Questions

  1. Which state and which school produced the most players?
  2. How does a player's age factor into making risky 3-base runs?
  3. Does height have any relation to the runs and home runs a player scores?
  4. Does crowd attendance affect a team's performance in any way?
In [1]:
%pylab inline
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
Populating the interactive namespace from numpy and matplotlib
In [2]:
# Directory that holds the CSV files. The default '.' reads them from the
# notebook's working directory, exactly as the bare file names did before;
# change this one constant to load the data from somewhere else.
DATA_DIR = Path('.')

# Tables referenced by the analysis cells in this notebook.
batting_df = pd.read_csv(DATA_DIR / 'Batting.csv')
pitching_df = pd.read_csv(DATA_DIR / 'Pitching.csv')
master_df = pd.read_csv(DATA_DIR / 'Master.csv')
teams_df = pd.read_csv(DATA_DIR / 'Teams.csv')
school_df = pd.read_csv(DATA_DIR / 'Schools.csv')
schoolplay_df = pd.read_csv(DATA_DIR / 'CollegePlaying.csv')

# Tables loaded for exploration; no visible analysis cell references them.
fielding_df = pd.read_csv(DATA_DIR / 'Fielding.csv')
fieldind_df = fielding_df  # original (misspelled) name kept as an alias
awards_df = pd.read_csv(DATA_DIR / 'AwardsPlayers.csv')
parks_df = pd.read_csv(DATA_DIR / 'Parks.csv')
halloffame_df = pd.read_csv(DATA_DIR / 'HallOfFame.csv')
In [3]:
# teams_df.head()
# batting_df.head()
# master_df.head()
# parks_df.head()
# awards_df.head()
# pitching_df.head()
# school_df.head()
# schoolplay_df.head()
# teams_df.describe()
# schoolplay_df.describe()
In [4]:
def combine_dfs(df1, df2, field):
    """Inner-join two DataFrames on one shared column.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Left and right frames; both must contain the column ``field``.
    field : str
        Name of the column to join on.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``field`` value occurs in both frames. Non-key columns
        present in both inputs receive pandas' default ``_x`` / ``_y``
        suffixes.
    """
    return pd.merge(df1, df2, how='inner', on=[field])

def plot_for_fields(df1, df2, join_field, field1, field2):
    """Scatter-plot the mean of ``field2`` for each distinct value of ``field1``.

    The two frames are inner-joined on ``join_field`` with ``combine_dfs``,
    the joined rows are grouped by ``field1``, and ``field2`` is averaged
    within each group. One point per group is drawn through the pyplot
    state machine, with the column names used as axis labels.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to join; both must contain ``join_field``.
    join_field : str
        Column to join on.
    field1 : str
        Column used for grouping and for the x axis. The special value
        ``'age'`` is derived on the joined frame as ``yearID - birthYear``,
        so those two columns must then be present after the join.
    field2 : str
        Column that is averaged and plotted on the y axis.

    Returns
    -------
    None
    """
    merged = combine_dfs(df1, df2, join_field)

    # 'age' is not a stored column: derive it from the joined columns.
    if field1 == 'age':
        merged['age'] = merged['yearID'] - merged['birthYear']

    pair = merged[[field1, field2]]
    per_group_mean = pair.groupby([field1], as_index=False).mean()

    plt.scatter(per_group_mean[field1], per_group_mean[field2])
    plt.xlabel(field1)
    plt.ylabel(field2)

Answer (1)

In [5]:
# Join every row of schoolplay_df to its school so each row carries the
# school's state.
schools_data_grouped = combine_dfs(schoolplay_df, school_df, 'schoolID')

# Tally joined rows per state. Rows without a state are dropped first because
# np.unique sorts its input and cannot order a mix of strings and NaN.
# NOTE(review): this counts rows of the joined table, not distinct players. If
# CollegePlaying.csv holds several rows per player, de-duplicate before
# counting -- confirm against the data.
states_0f_schools = np.array(schools_data_grouped['state'].dropna().values)
student_and_states = np.unique(states_0f_schools, return_counts=True)
state_names, state_counts = student_and_states

# Lay the states out on the smallest square grid that holds all of them.
# A hard-coded (7, 7) only works when there are exactly 49 distinct states;
# with 49 states this still yields the same 7x7 layout.
n_states = len(state_names)
side = max(1, int(np.ceil(np.sqrt(n_states))))
shape = (side, side)
n_padding = side * side - n_states

# Pad the flat arrays up to side*side entries before reshaping.
student_counts = np.concatenate(
    [state_counts, np.zeros(n_padding, dtype=state_counts.dtype)]
).reshape(shape)
states = np.concatenate(
    [state_names.astype(object), np.full(n_padding, '', dtype=object)]
).reshape(shape)

# Padding cells are masked so they are not drawn as zero-count states.
padding_mask = np.arange(side * side).reshape(shape) >= n_states

hm = sns.heatmap(student_counts, annot=states, fmt='s', mask=padding_mask,
                 cmap="YlGnBu", cbar_kws={'label': 'NUMBER  OF  PLAYERS  PER  STATE'},
                 linewidths=1.0)

Answer (2)

In [6]:
# Age (x) against mean '3B' (y), from batting rows joined to the master table.
plot_for_fields(df1=master_df, df2=batting_df, join_field='playerID',
                field1='age', field2='3B')

Answer (3)

In [7]:
# Height (x) against mean home runs 'HR' (y), from batting rows.
plot_for_fields(df1=master_df, df2=batting_df, join_field='playerID',
                field1='height', field2='HR')
In [8]:
# Height (x) against mean runs scored 'R' (y), from batting rows.
plot_for_fields(df1=master_df, df2=batting_df, join_field='playerID',
                field1='height', field2='R')
In [9]:
# Height (x) against mean runs conceded 'R' (y), from pitching rows.
plot_for_fields(df1=master_df, df2=pitching_df, join_field='playerID',
                field1='height', field2='R')

Answer (4)

In [10]:
# Average the attendance of all teams_df rows that share the same win total,
# giving one (wins, mean attendance) point per distinct value of 'W'.
team_data_by_win_attd = (
    teams_df[['W', 'attendance']]
    .groupby(['W'], as_index=False)
    .mean()
)

plt.scatter(team_data_by_win_attd['W'], team_data_by_win_attd['attendance'])
plt.xlabel('Wins')
plt.ylabel('Attendance')
# The trailing semicolon stops the notebook from echoing the repr of the last
# expression under the figure.
plt.title('Mean attendance for each win total');
Out[10]:
<matplotlib.collections.PathCollection at 0x1b242bb66a0>