- Which state and which school produced the most players?
- How does age factor into making risky 3-base runs?
- Does height have any relation to the runs and home runs a player can score?
- Does crowd attendance affect a team's performance in any way?
%pylab inline
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load every CSV table used by the analysis. Paths are relative to the
# current working directory.

# Player statistics tables.
batting_df = pd.read_csv("Batting.csv")
pitching_df = pd.read_csv("Pitching.csv")
# NOTE: the variable name is spelled "fieldind" (sic); it is kept unchanged
# because code outside this section may refer to it.
fieldind_df = pd.read_csv("Fielding.csv")

# Player details and honours.
master_df = pd.read_csv("Master.csv")
awards_df = pd.read_csv("AwardsPlayers.csv")
halloffame_df = pd.read_csv("HallOfFame.csv")

# Teams and parks.
teams_df = pd.read_csv("Teams.csv")
parks_df = pd.read_csv("Parks.csv")

# Schools and the player/school link table.
school_df = pd.read_csv("Schools.csv")
schoolplay_df = pd.read_csv("CollegePlaying.csv")
# teams_df.head()
# batting_df.head()
# master_df.head()
# parks_df.head()
# awards_df.head()
# pitching_df.head()
# school_df.head()
# schoolplay_df.head()
# teams_df.describe()
# schoolplay_df.describe()
def combine_dfs(df1, df2, field):
    """Inner-join two DataFrames on a single shared column.

    Args:
        df1: Left-hand DataFrame.
        df2: Right-hand DataFrame.
        field: Name of the column, present in both frames, to join on.

    Returns:
        A new DataFrame holding only the rows whose ``field`` value occurs
        in both ``df1`` and ``df2``.
    """
    join_keys = [field]
    joined = df1.merge(df2, how='inner', on=join_keys)
    return joined
def plot_for_fields(df1,df2,join_field,field1,field2):
    """Scatter-plot the mean of ``field2`` for every distinct ``field1`` value.

    ``df1`` and ``df2`` are joined with ``combine_dfs`` on ``join_field``; the
    joined rows are grouped by ``field1`` and the per-group mean of ``field2``
    is drawn as a scatter plot on a new figure.

    Args:
        df1: First DataFrame to join.
        df2: Second DataFrame to join.
        join_field: Column name used as the join key.
        field1: Column for the x axis. The special value ``'age'`` is not
            read from the data but computed as ``yearID - birthYear``, so
            both of those columns must exist in the joined frame.
        field2: Column whose group mean is plotted on the y axis.

    Returns:
        None. The plot is drawn through ``matplotlib.pyplot``.
    """
    merged = combine_dfs(df1, df2, join_field)
    if field1 == 'age':
        merged['age'] = merged['yearID'].subtract(merged['birthYear'])
    grouped = merged[[field1, field2]].groupby([field1], as_index=False).mean()

    # Open a new figure for every call. Without this, successive calls draw
    # onto the same current axes, overlaying unrelated scatters and
    # overwriting each other's axis labels.
    plt.figure()
    plt.xlabel(field1)
    plt.ylabel(field2)
    plt.scatter(grouped[field1], grouped[field2])
# Heatmap of how many joined CollegePlaying/Schools rows fall in each state.
# NOTE(review): this counts joined rows, not distinct players; if a player
# can appear in CollegePlaying more than once, the "players per state" label
# over-states the figure -- confirm against the data.
schools_data_grouped = combine_dfs(schoolplay_df, school_df,'schoolID')
states_0f_schools = np.array(schools_data_grouped['state'].values)
# Tuple of (sorted unique state labels, number of rows for each label).
student_and_states = np.unique(states_0f_schools,return_counts=True)

# Lay the states out on a near-square grid sized from the data. The grid used
# to be hard-coded to 7 x 7, which made reshape() raise unless there were
# exactly 49 distinct states. With 49 states this still yields (7, 7).
n_states = len(student_and_states[0])
n_cols = max(1, int(np.ceil(np.sqrt(n_states))))
n_rows = max(1, -(-n_states // n_cols))  # ceiling division
shape = (n_rows, n_cols)
n_pad = n_rows * n_cols - n_states

student_counts = student_and_states[1]
states = student_and_states[0]
if n_pad:
    # Fill the unused trailing cells: NaN counts (seaborn leaves cells with
    # missing values blank) and empty labels.
    student_counts = np.append(student_counts.astype(float), np.full(n_pad, np.nan))
    states = np.append(states.astype(object), np.full(n_pad, '', dtype=object))
student_counts = student_counts.reshape(shape)
states = states.reshape(shape)

# Colour encodes the count; each cell is annotated with its state label.
hm = sns.heatmap(student_counts,annot= states,fmt='s',
                 cmap="YlGnBu",cbar_kws={'label': 'NUMBER OF PLAYERS PER STATE'},linewidths=1.0)
# One scatter plot per entry: (statistics table, x field, y field).
# Every plot joins the statistics table to master_df on playerID.
for stats_df, x_field, y_field in (
    (batting_df, 'age', '3B'),      # AGE vs 3-Bases Run
    (batting_df, 'height', 'HR'),   # Height vs Home Runs Scored
    (batting_df, 'height', 'R'),    # Height vs Runs Scored
    (pitching_df, 'height', 'R'),   # Height vs Runs Conceded
):
    plot_for_fields(master_df, stats_df, 'playerID', x_field, y_field)
# Wins vs attendance: mean attendance over all team rows that share the same
# win total.
team_data_by_win_attd = teams_df[['W','attendance']].groupby(['W'], as_index=False).mean()

# Open a new figure so this scatter is not drawn on top of whichever plot
# happens to be current.
plt.figure()
plt.xlabel('Wins')
plt.ylabel('Attendance')
plt.scatter(team_data_by_win_attd['W'],team_data_by_win_attd['attendance'])