The data science problem we are trying to solve involves predicting pitches in baseball: given various attributes measured for a pitch, can we predict what kind of pitch it was?
Pitch classification is needed almost instantly in baseball: it feeds the many data models that run throughout a game, and the pitch type is shown to fans on the scoreboard after each pitch. Pitchers also often change their repertoire without telling anyone, so a system that detects such changes quickly helps batters adjust during the game. The results of this predictive model can help baseball teams and their batters improve their game plans against pitchers.
The dataset we are using consists of 22,526 records. Each record represents a single baseball pitch and contains 16 attributes that describe various aspects of that pitch. These attributes are listed below:
Pitch name is the class label we are trying to predict; the remaining attributes form the feature vector used to make this prediction. The possible class label values are: 4-Seam Fastball, Slider, Cutter, Changeup, Sinker, Split-Finger, Knuckle Curve, and Curveball.
Below is what we used to set up our Google Colab space. It is commented out now, but this is what allowed everyone in the group to access and work on the notebook at the same time.
# from google.colab import drive
# drive.mount('/content/gdrive')
# %cd gdrive/MyDrive/DataMiningProjectCopy
# %ls
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[Errno 2] No such file or directory: 'gdrive/MyDrive/DataMiningProjectCopy'
/content/gdrive/.shortcut-targets-by-id/1mp2LsCYxml8kmIBdoUz4nNQUfeHEXPdw/DataMiningProjectCopy
 classification-dataset.csv      Data_Mining_Project.ipynb
 classification-dataset.gsheet   DataMiningProject.ipynb
'Copy of Project Proposal.gdoc'  finalized_model.sav
After importing some of the basic libraries that we knew we would need for analysis, we read in our dataset and took a quick glance at what it looks like as a dataframe.
# These import statements are needed for the various models we use.
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
# Imports for basic data splitting, cross validation, scoring, dimensionality
# reduction, pipelines, and standardization.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# SMOTE Balancing Our Models
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import EditedNearestNeighbours
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
# Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
# Neural Network Classifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
# K-Nearest Neighbor Classifier
from sklearn.neighbors import KNeighborsClassifier
# Support Vector Machine Classifier
from sklearn.svm import SVC
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Ensemble Classifiers
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Upload Final Model
import pickle
data = pd.read_csv("classification-dataset.csv")
print(data.head())
   pitcher p_throws stand      description pitch_type pitch_name  \
0   453286        R     L  swinging_strike        NaN        NaN
1   453286        R     R    hit_into_play        NaN        NaN
2   453286        R     R             ball        NaN        NaN
3   453286        R     R             ball        NaN        NaN
4   453286        R     L    called_strike        NaN        NaN

   release_speed  release_spin_rate  spin_axis  pfx_x  pfx_z  release_pos_x  \
0           94.9             2356.0      219.0  -0.78   1.30          -3.06
1           94.5             2426.0      224.0  -0.84   1.17          -3.25
2           96.6             2429.0      212.0  -0.35   1.34          -2.99
3           95.1             2624.0      228.0  -0.81   1.23          -3.28
4           95.3             2385.0      224.0  -0.78   1.44          -3.15

   release_pos_z  release_extension  plate_x  plate_z  pitch_id
0           5.75                6.0    -0.87     3.24         1
1           5.49                6.3     0.53     2.86         2
2           5.72                6.3     1.19     3.55         3
3           5.11                6.7     1.00     1.15         4
4           5.52                6.4    -0.71     2.48         5
Based on the above output and after looking at background information regarding our dataset, we realized that pitch_type and pitch_name carry the same information: pitch_type is just the abbreviated form of pitch_name. Because of this, we deleted the pitch_type column from our dataframe.
del data['pitch_type']
print(data.head())
   pitcher p_throws stand      description pitch_name  release_speed  \
0   453286        R     L  swinging_strike        NaN           94.9
1   453286        R     R    hit_into_play        NaN           94.5
2   453286        R     R             ball        NaN           96.6
3   453286        R     R             ball        NaN           95.1
4   453286        R     L    called_strike        NaN           95.3

   release_spin_rate  spin_axis  pfx_x  pfx_z  release_pos_x  release_pos_z  \
0             2356.0      219.0  -0.78   1.30          -3.06           5.75
1             2426.0      224.0  -0.84   1.17          -3.25           5.49
2             2429.0      212.0  -0.35   1.34          -2.99           5.72
3             2624.0      228.0  -0.81   1.23          -3.28           5.11
4             2385.0      224.0  -0.78   1.44          -3.15           5.52

   release_extension  plate_x  plate_z  pitch_id
0                6.0    -0.87     3.24         1
1                6.3     0.53     2.86         2
2                6.3     1.19     3.55         3
3                6.7     1.00     1.15         4
4                6.4    -0.71     2.48         5
We then explored the records to see how many N/A values our dataset contained. Because the data contained only a small number of N/A values (4,126 missing entries, affecting roughly 4,000 of the 22,526 records), we decided to simply drop every record that had at least one N/A value. Most of the N/A's came from the pitch_name column; since this is the label we want to predict, those records would not be helpful when it came time for model building. Similarly, for the other columns with N/A's, we felt it was unnecessary to perform methods such as imputation, as we still had more than enough data to properly construct our models. In addition, imputation doesn't really make sense for the spin rate and axis of a pitch, as those values can differ dramatically from pitch to pitch.
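To illustrate the imputation alternative we decided against, below is a rough, hypothetical sketch (not part of our actual pipeline) of what median imputation of the three numeric columns with missing values could look like. Note that it would still leave the records with a missing pitch_name; the cleanup we actually performed follows in the next cell.
# Hypothetical sketch (not run as part of our pipeline): median imputation of
# the numeric columns that contain N/A values, applied to a copy of the data.
from sklearn.impute import SimpleImputer
numeric_na_cols = ['release_spin_rate', 'spin_axis', 'release_extension']
imputer = SimpleImputer(strategy='median')
data_imputed = data.copy()
data_imputed[numeric_na_cols] = imputer.fit_transform(data_imputed[numeric_na_cols])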
print(data.shape)
print(data.isna().sum())
data = data.dropna()
print(data.isna().sum())
print(data.shape)
(22526, 16)
pitcher                 0
p_throws                0
stand                   0
description             0
pitch_name           4000
release_speed           0
release_spin_rate      42
spin_axis              42
pfx_x                   0
pfx_z                   0
release_pos_x           0
release_pos_z           0
release_extension      42
plate_x                 0
plate_z                 0
pitch_id                0
dtype: int64
pitcher              0
p_throws             0
stand                0
description          0
pitch_name           0
release_speed        0
release_spin_rate    0
spin_axis            0
pfx_x                0
pfx_z                0
release_pos_x        0
release_pos_z        0
release_extension    0
plate_x              0
plate_z              0
pitch_id             0
dtype: int64
(18493, 16)
Although most of our features are numeric, two of them, p_throws and stand, were categorical. We therefore encoded them as binary indicators (equivalent to a one-hot encoding for a two-level variable): both variables now store either 1 (representing 'R') or 0 (representing 'L').
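As a point of reference, a hypothetical alternative (not what we ran) would be to let pandas build the indicator columns directly; with the first level dropped, this yields the same single 0/1 indicator per column as the replacement below.
# Hypothetical alternative: pandas one-hot encoding with the first level
# dropped produces a single 'R' indicator per column (1 = 'R', 0 = 'L').
onehot_check = pd.get_dummies(data[['p_throws', 'stand']], drop_first=True)
print(onehot_check.head())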
# One Hot Encoding of Categorical Variables
data['p_throws'].replace(['L', 'R'], [0,1], inplace = True)
data['stand'].replace(['L', 'R'], [0,1], inplace = True)
At the suggestion of Professor Beasley, we tried to keep the number of labels we wanted to predict relatively small (around 2 - 5 labels). With this in mind, we took a look at the possible pitch_name labels and the respective count for each of them.
Initially, we hoped to find groupings of the pitch labels so that we could preserve our entire dataset. Based on background knowledge, though, such groupings between these pitches don't really exist. Because of this, we decided to simply remove the labels that appeared least often in our dataset. Since very few records carried the Curveball, Knuckle Curve, and Split-Finger labels, we removed all records with these values.
The counts of each pitch label are shown below. The labels we ended up removing each appeared fewer than 1,000 times in our dataset, which is relatively small considering that the dataset initially contained over 20,000 records.
data['pitch_name'].value_counts().plot(kind = 'bar', color = 'orange')
plt.title("Pitch Type Distribution")
plt.ylabel("Number of Pitches")
plt.xlabel("Pitch Type")
After removing the three least common pitch labels from our data, these are the labels we have left. Our goal is to accurately predict one of these five pitches. The pitches and their counts are listed below.
values = ['Split-Finger', 'Knuckle Curve', 'Curveball']
data = data[~data.pitch_name.isin(values)]
data['pitch_name'].value_counts()
4-Seam Fastball    6502
Slider             3228
Cutter             2989
Changeup           2187
Sinker             1613
Name: pitch_name, dtype: int64
We examined the features and decided which ones to remove based on our domain knowledge, low variability, and high correlation.
Based strictly on our knowledge of this domain, the variables we decided to remove, and our reasons for removing them, are listed below:
pitch_id and pitcher – Both of these are identification values: an ID assigned to the pitcher and a running count of pitches. They are not helpful for prediction because they have no bearing on the type of pitch.
description - The result of a play normally does not depend on the type of pitch, as different batters deal with pitches in different ways. Because of this, we feel this variable would not be relevant for predicting the type of pitch thrown.
release_extension, release_pos_x, release_pos_z – The release position should be roughly the same for every type of pitch, so there is little variability in these values. If a pitcher deliberately changed their release point when throwing a specific pitch, they would be tipping off (giving away) the pitch they are about to throw to the batter. Since that would not be smart, these values remain relatively similar across pitch types.
*Note: In the variability section, we will verify that what we said above with regard to release_extension, release_pos_x, and release_pos_z is in fact true.*
We first took a look at the categorical variables that we encoded earlier (p_throws and stand). We made bar charts to get a quick look at whether there was enough variability among their values. Based on the charts below, it was apparent that there was in fact enough variability and no obvious skew.
pthrows_orig = data['p_throws'].replace([0, 1], ['Left','Right'], inplace = False)
pthrows_orig.value_counts().plot(kind = 'bar', color = 'orange')
plt.title("Throwing Hand Distribution")
plt.xlabel("Throwing Hand")
plt.show()
stand_orig = data['stand'].replace([0, 1], ['Left','Right'], inplace = False)
stand_orig.value_counts().plot(kind = 'bar', color = 'orange')
plt.title("Batting Side Distribution")
plt.xlabel("Batting Side")
plt.show()
We then took a look at the numeric variables. For these, we simply looked at their respective variance values.
print("Release Speed Variability: " + str(data['release_speed'].var()))
print("Release Spin Rate: " + str(data['release_spin_rate'].var()))
print("Spin Axis: " + str(data['spin_axis'].var()))
print("Pfx_x: " + str(data['pfx_x'].var()))
print("Pfx_z: " + str(data['pfx_z'].var()))
print("Release Pos X: " + str(data['release_pos_x'].var()))
print("Release Pos Z: " + str(data['release_pos_z'].var()))
print("Release Extension: " + str(data['release_extension'].var()))
print("Plate X: " + str(data['plate_x'].var()))
print("Plate Z: " + str(data['plate_z'].var()))
Release Speed Variability: 22.941476948165803
Release Spin Rate: 124952.40838277544
Spin Axis: 2613.2326132601306
Pfx_x: 0.6307944099938104
Pfx_z: 0.3238680291098727
Release Pos X: 4.014829424431769
Release Pos Z: 0.23950163194248636
Release Extension: 0.06935857025611841
Plate X: 0.6673774465403014
Plate Z: 0.8960389026249447
As mentioned before, we believed there would be little variability in the release_extension, release_pos_x, and release_pos_z variables, and the values above largely bear that out. However, other variables had low variance as well: pfx_x, pfx_z, plate_x, and plate_z. We were already skeptical about plate_x and plate_z, as we were on the fence about whether these features would be relevant at all. After seeing the above results, we decided to experiment with removing and re-adding these features for models that are sensitive to irrelevant features, such as K-Nearest Neighbors and SVM. For pfx_x and pfx_z, on the other hand, we felt the opposite, since we had reason to believe they do make a difference in the kind of pitch thrown. Because of this, we examined those two variables more closely with boxplots.
plt.boxplot(data['pfx_x'])
plt.title("Pfx_x Distribution")
plt.ylabel("Pfx_x")
plt.show()
plt.boxplot(data['pfx_z'])
plt.title("Pfx_z Distribution")
plt.ylabel("Pfx_z")
plt.show()
As seen in the plots above, the range of values for these two variables is fairly small. Relative to that range, their variance values are actually reasonably high, so we decided it was best to keep these features rather than remove them.
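To make that comparison concrete, a small check like the hypothetical sketch below (not in our original notebook) puts each variable's spread next to its observed range, which is the reasoning we applied when deciding to keep pfx_x and pfx_z.
# Hypothetical sketch: compare each low-variance variable's standard deviation
# to its observed range to judge how much spread it really has.
for col in ['pfx_x', 'pfx_z', 'plate_x', 'plate_z']:
    col_range = data[col].max() - data[col].min()
    print(col, "std:", round(data[col].std(), 3), "range:", round(col_range, 3))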
We examined the correlation values between all of the columns. As seen below, most attributes are only weakly correlated with one another. The strongest correlations are between pfx_z and release_speed (r = 0.72), p_throws and release_pos_x (r = -0.93), and spin_axis and pfx_x (r = -0.80). In the sections below, these high correlations are accounted for. Other than that, no features were removed or altered based on correlation.
data.corr()
| | pitcher | p_throws | stand | release_speed | release_spin_rate | spin_axis | pfx_x | pfx_z | release_pos_x | release_pos_z | release_extension | plate_x | plate_z | pitch_id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pitcher | 1.000000 | -0.238308 | 0.066568 | 0.133730 | -0.283217 | -0.175634 | 0.268299 | 0.168734 | 0.488674 | 0.431571 | -0.092489 | 0.001360 | 0.069290 | 0.025653 |
p_throws | -0.238308 | 1.000000 | -0.266280 | 0.172848 | 0.370252 | 0.245319 | -0.448812 | -0.150044 | -0.926289 | -0.377512 | -0.134075 | 0.125281 | 0.038587 | 0.012723 |
stand | 0.066568 | -0.266280 | 1.000000 | -0.042989 | -0.049127 | -0.207924 | 0.262913 | 0.026605 | 0.264665 | 0.120067 | 0.063111 | 0.147649 | -0.070705 | -0.009784 |
release_speed | 0.133730 | 0.172848 | -0.042989 | 1.000000 | 0.237348 | 0.379401 | -0.360720 | 0.717333 | -0.150657 | -0.141455 | 0.348640 | -0.033246 | 0.323141 | 0.048404 |
release_spin_rate | -0.283217 | 0.370252 | -0.049127 | 0.237348 | 1.000000 | -0.259015 | 0.044923 | -0.141406 | -0.329294 | -0.333564 | -0.020571 | 0.122162 | 0.065943 | 0.047460 |
spin_axis | -0.175634 | 0.245319 | -0.207924 | 0.379401 | -0.259015 | 1.000000 | -0.802807 | 0.370402 | -0.313282 | -0.082769 | 0.086186 | -0.209574 | 0.114017 | -0.031260 |
pfx_x | 0.268299 | -0.448812 | 0.262913 | -0.360720 | 0.044923 | -0.802807 | 1.000000 | -0.203264 | 0.481452 | 0.255377 | -0.104648 | 0.275152 | -0.076269 | 0.000455 |
pfx_z | 0.168734 | -0.150044 | 0.026605 | 0.717333 | -0.141406 | 0.370402 | -0.203264 | 1.000000 | 0.122595 | 0.262786 | 0.316208 | -0.103621 | 0.363891 | 0.034885 |
release_pos_x | 0.488674 | -0.926289 | 0.264665 | -0.150657 | -0.329294 | -0.313282 | 0.481452 | 0.122595 | 1.000000 | 0.472029 | 0.034220 | -0.104701 | -0.037110 | -0.005530 |
release_pos_z | 0.431571 | -0.377512 | 0.120067 | -0.141455 | -0.333564 | -0.082769 | 0.255377 | 0.262786 | 0.472029 | 1.000000 | -0.187604 | -0.116615 | 0.096348 | -0.005682 |
release_extension | -0.092489 | -0.134075 | 0.063111 | 0.348640 | -0.020571 | 0.086186 | -0.104648 | 0.316208 | 0.034220 | -0.187604 | 1.000000 | -0.008828 | -0.035151 | 0.113164 |
plate_x | 0.001360 | 0.125281 | 0.147649 | -0.033246 | 0.122162 | -0.209574 | 0.275152 | -0.103621 | -0.104701 | -0.116615 | -0.008828 | 1.000000 | -0.139354 | 0.015110 |
plate_z | 0.069290 | 0.038587 | -0.070705 | 0.323141 | 0.065943 | 0.114017 | -0.076269 | 0.363891 | -0.037110 | 0.096348 | -0.035151 | -0.139354 | 1.000000 | -0.002896 |
pitch_id | 0.025653 | 0.012723 | -0.009784 | 0.048404 | 0.047460 | -0.031260 | 0.000455 | 0.034885 | -0.005530 | -0.005682 | 0.113164 | 0.015110 | -0.002896 | 1.000000 |
We will now look at whether outliers are present in each of the numeric variables. Based on what we see, we will decide which outliers to remove and which to keep.
# The boxplots for each numeric variable
plt.boxplot(data['pfx_x'])
plt.title("Pfx_x Distribution")
plt.ylabel("Pfx_x")
plt.show()
plt.boxplot(data['pfx_z'])
plt.title("Pfx_z Distribution")
plt.ylabel("Pfx_z")
plt.show()
plt.boxplot(data['release_speed'])
plt.title("Release Speed Distribution")
plt.ylabel("Release Speed (mph)")
plt.show()
plt.boxplot(data['release_spin_rate'])
plt.title("Release Spin Rate Distribution")
plt.ylabel("Release Spin Rate")
plt.show()
plt.boxplot(data['spin_axis'])
plt.title("Spin Axis Distribution")
plt.ylabel("Spin Axis (Degrees)")
plt.show()
plt.boxplot(data['plate_x'])
plt.title("Plate X Distribution")
plt.ylabel("Plate X")
plt.show()
plt.boxplot(data['plate_z'])
plt.title("Plate Z Distribution")
plt.ylabel("Plate Z")
plt.show()
After making the above boxplots, there were definitely some features that needed further investigation. One thing we looked at was the outliers in both the Pfx_z and Release Speed variables. Looking at the plots, the outliers seemed to correspond to one another. To see whether this was the case, we made a scatter plot of these two variables.
import seaborn as sns
sns.scatterplot(x='pfx_z', y='release_speed', data=data)
plt.title("Pfx_z vs Release Speed")
plt.xlabel("Pfx_z")
plt.ylabel("Release Speed (mph)")
plt.show()
palette = ['blue', 'green', 'orange', 'red', 'black', 'yellow', 'pink', 'purple']
sns.scatterplot(x='pfx_z', y='release_speed', data=data, hue="pitcher", palette=palette)
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
plt.title("Pfx_z vs Release Speed by Pitcher")
plt.xlabel("Pfx_z")
plt.ylabel("Release Speed (mph)")
plt.show()
palette = ['blue', 'green', 'orange', 'red', 'black']
sns.scatterplot(x='pfx_z', y='release_speed', data=data, hue="pitch_name", palette=palette)
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
plt.title("Pfx_z vs Release Speed by Pitch Name")
plt.xlabel("Pfx_z")
plt.ylabel("Release Speed (mph)")
plt.show()
Our hunch appeared to be right, as the three main outliers stood out. To examine further, we made two more scatterplots in which the records were color-coded by pitcher and by pitch name. From these it became apparent that the outliers are all associated with one particular pitcher and are all classified as the same pitch. Based on background knowledge and the last plot, it appears these pitches were probably misclassified as sinkers, since they sit so far from the other sinkers. Because of this, we will remove these outliers from the dataset entirely.
For the other variables, we believe the outliers do not need to be removed. Our reasoning for each of the respective variables is listed below:
release_spin_rate: Since this data is from the 2021 MLB season, outliers are to be expected, as there was controversy around the use of sticky substances throughout that season. These substances were commonly used by many players and greatly increased spin rates. Midway through the season, however, MLB started cracking down on their use, and spin rates declined dramatically. Because the crackdown only took place mid-season, there is great variance and some extreme values.
spin_axis: This is simply the axis around which the ball spins, so the range of values is 0 to 360 degrees. Even though some values appear as outliers in the boxplot, they are all reasonable since they fall within this expected range.
plate_x and plate_z: Given the many factors that go into a pitch, the location of pitches is expected to vary widely. That is why there are so many outliers, but this is fine: nothing is misclassified, and the spread is expected. In addition, for later models such as K-Nearest Neighbors and SVM we try removing these features entirely to see if that makes a difference in predicting the pitches.
# Removed based on Domain knowledge
del data['pitch_id']
del data['description']
del data['release_extension']
del data['release_pos_x']
del data['release_pos_z']
del data['pitcher']
# Altered based on outlier detection
data = data[data['release_speed'] > 70]
# What data looks like now after changes made
print(data.head())
      p_throws  stand       pitch_name  release_speed  release_spin_rate  \
4000         1      1  4-Seam Fastball           94.3             2486.0
4001         1      1           Slider           84.7             2365.0
4003         1      1           Slider           85.0             2442.0
4004         1      1           Slider           85.5             2331.0
4005         1      1           Slider           84.2             2343.0

      spin_axis  pfx_x  pfx_z  plate_x  plate_z
4000      230.0  -1.22   1.44    -0.37     3.24
4001      173.0  -0.16   0.39    -0.78     1.42
4003      155.0   0.22   0.21     1.32     1.29
4004      215.0   0.04   0.37     1.02     1.35
4005      175.0   0.05   0.34    -0.02     2.13
Note: As mentioned before, variables plate_x and plate_z are ones that we will experiment with in terms of leaving in or out for models that are sensitive to irrelevant features like K-Nearest Neighbor and SVM.
In order to begin building classification models, we split our dataset into two dataframes. One dataframe contains all of the features of a pitch, while the other holds the correct label for that pitch. These dataframes will be used to build and evaluate our classification models.
data_labels = data['pitch_name'].to_frame()
data_features = data.drop(['pitch_name'], axis=1)
print(data_labels.head())
print(data_features.head())
           pitch_name
4000  4-Seam Fastball
4001           Slider
4003           Slider
4004           Slider
4005           Slider

      p_throws  stand  release_speed  release_spin_rate  spin_axis  pfx_x  \
4000         1      1           94.3             2486.0      230.0  -1.22
4001         1      1           84.7             2365.0      173.0  -0.16
4003         1      1           85.0             2442.0      155.0   0.22
4004         1      1           85.5             2331.0      215.0   0.04
4005         1      1           84.2             2343.0      175.0   0.05

      pfx_z  plate_x  plate_z
4000   1.44    -0.37     3.24
4001   0.39    -0.78     1.42
4003   0.21     1.32     1.29
4004   0.37     1.02     1.35
4005   0.34    -0.02     2.13
In the sections below, we build a number of different models in the hope of finding the one that best predicts the pitch label of a given pitch. The main metric we used to evaluate each model was accuracy.
For some of the models, we also display other evaluation metrics, such as precision and recall. In the context of baseball, though, we felt these metrics were not nearly as important as accuracy, since we have no preference for predicting one specific pitch label over another. We also felt that our classes were fairly balanced, which reduces the need to focus on precision and recall. Although there technically is an imbalance in our classes, we did not feel it was significant enough (as discussed in the next cell) to prefer precision and recall over accuracy.
In the bar graph shown earlier in the notebook, it can be seen that there is a slight imbalance between classes in our data. The majority pitch label, 4-Seam Fastball, occurs 6,502 times in our dataset, while the other pitch labels each occur somewhere between roughly 1,600 and 3,300 times. We feel that, although a disparity exists between label types, the differences are not large enough to significantly alter our modeling techniques. Thus, for every model type, we create a model that uses our original dataset.
For the sake of completeness, though, we also make use of SMOTE (Synthetic Minority Oversampling Technique) to artificially create a dataset in which all label counts are equal. Alongside each model built on our original dataset, we build a model on the dataset created by SMOTE. This process is illustrated below.
smote = SMOTE(random_state = 0, sampling_strategy = 'not majority')
print("Data Counts Before SMOTE:")
print(data_labels.value_counts())
print()
x_smote, y_smote = smote.fit_resample(data_features, data_labels)
print("Data Counts After SMOTE")
print(y_smote.value_counts())
Data Counts Before SMOTE:
pitch_name
4-Seam Fastball    6502
Slider             3228
Cutter             2989
Changeup           2187
Sinker             1610
dtype: int64

Data Counts After SMOTE
pitch_name
4-Seam Fastball    6502
Changeup           6502
Cutter             6502
Sinker             6502
Slider             6502
dtype: int64
The first model we tried was a Decision Tree. We felt a Decision Tree was appropriate because it is a simple classifier that is easy to build and interpret. The accuracy of a Decision Tree is also not greatly affected by irrelevant attributes, so we thought that even if our data still contained irrelevant features we would still get an accurate classifier. In order to build our Decision Tree, we made use of nested cross-validation, with max_depth, min_samples_leaf, and max_features as the hyperparameters to tune.
The first Decision Tree model listed is the one created using our entire dataset. This model achieved a fairly high accuracy score of 90.97%. In the cross-validation loop, we tuned the hyperparameters max_depth, min_samples_leaf, max_features, and the split criterion. Along with accuracy, we display the precision, recall, and f1-score of the model. Even though our data might be considered unbalanced, these scores still appear to be fairly decent.
clf = DecisionTreeClassifier()
param_grid = {"max_depth": [5,10,15,20, 25], "min_samples_leaf": [5,10,15,20], "max_features": [4,6,8], "criterion": ['entropy', 'gini']}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
predict_y = cross_val_predict(grid_search, data_features, data_labels, cv = 10)
print("Accuracy:", accuracy_score(data_labels, predict_y) * 100)
print()
print(confusion_matrix(data_labels, predict_y))
print(classification_report(data_labels, predict_y))
Accuracy: 90.96633567449746

[[6294   53   33   98   24]
 [  40 2116    1   23    7]
 [  17    4 2570   33  365]
 [ 157   29   16 1397   11]
 [  11   51  510    9 2647]]

                 precision    recall  f1-score   support
4-Seam Fastball       0.97      0.97      0.97      6502
       Changeup       0.94      0.97      0.95      2187
         Cutter       0.82      0.86      0.84      2989
         Sinker       0.90      0.87      0.88      1610
         Slider       0.87      0.82      0.84      3228

       accuracy                           0.91     16516
      macro avg       0.90      0.90      0.90     16516
   weighted avg       0.91      0.91      0.91     16516
The Decision Tree model made using our SMOTE dataset is listed below. This model achieved an accuracy score of 89.08%, slightly worse than the model built without SMOTE. Based on this output, it appears our assumption that our data was fairly balanced was (somewhat) correct.
from imblearn.pipeline import Pipeline as imbpipeline
smote = SMOTE(random_state = 0, sampling_strategy = 'not majority')
clf = DecisionTreeClassifier()
pipe = imbpipeline(steps = [('smote', smote), ('clf', DecisionTreeClassifier())])
param_grid = {"clf__max_depth": [5,10,15,20, 25], "clf__min_samples_leaf": [5,10,15,20], "clf__max_features": [4,6,8], "clf__criterion": ['entropy', 'gini']}
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, cv=5, scoring='accuracy')
predict_y = cross_val_predict(grid_search, data_features, data_labels, cv = 10)
print("Accuracy:", accuracy_score(data_labels, predict_y) * 100)
print()
print(confusion_matrix(data_labels, predict_y))
print(classification_report(data_labels, predict_y))
Accuracy: 89.083313150884

[[6045   45   94  300   18]
 [  37 2106    4   24   16]
 [  35    4 2559   40  351]
 [  96   33   18 1456    7]
 [   8   35  632    6 2547]]

                 precision    recall  f1-score   support
4-Seam Fastball       0.97      0.93      0.95      6502
       Changeup       0.95      0.96      0.96      2187
         Cutter       0.77      0.86      0.81      2989
         Sinker       0.80      0.90      0.85      1610
         Slider       0.87      0.79      0.83      3228

       accuracy                           0.89     16516
      macro avg       0.87      0.89      0.88     16516
   weighted avg       0.90      0.89      0.89     16516
The second model we tried was a Naive Bayes Classifier. We once again made use of cross-validation and got a much lower accuracy score of 76.32%. This relatively poor accuracy can likely be explained by the fact that some of the attributes in the data are not independent of one another. A Naive Bayes Classifier operates under the assumption that all input features are statistically independent, and we don't believe that holds for this dataset, as certain features, such as pfx_z and release_speed, are not independent of one another. Because of this, coupled with the relatively low accuracy score, we don't believe this is a suitable model for our dataset.
clf = GaussianNB()
nested_score = cross_val_score(clf, data_features, data_labels.values.ravel(), cv = 10)
print("Accuracy:", nested_score.mean()*100)
predict_y = cross_val_predict(clf, data_features, data_labels.values.ravel(), cv = 10)
print()
print(classification_report(data_labels, predict_y))
Accuracy: 76.32498757081703

                 precision    recall  f1-score   support
4-Seam Fastball       0.95      0.98      0.96      6502
       Changeup       0.79      0.88      0.83      2187
         Cutter       0.53      0.57      0.55      2989
         Sinker       0.69      0.66      0.68      1610
         Slider       0.59      0.49      0.53      3228

       accuracy                           0.76     16516
      macro avg       0.71      0.71      0.71     16516
   weighted avg       0.76      0.76      0.76     16516
For Naive Bayes, the model with SMOTE performs slightly worse than the model trained on all of the original data, achieving an accuracy score of 74.92%. As stated above, though, Naive Bayes probably isn't the best way to model our data, so we will not be using either of these models.
from imblearn.pipeline import Pipeline as imbpipeline
smote = SMOTE(random_state=0, sampling_strategy='not majority')
clf = GaussianNB()
pipe = imbpipeline(steps = [("smote", smote), ("clf", clf)])
nested_score = cross_val_score(pipe, data_features, data_labels.values.ravel(), cv = 10)
print("Accuracy:", nested_score.mean()*100)
predict_y = cross_val_predict(pipe, data_features, data_labels.values.ravel(), cv = 10)
print()
print(classification_report(data_labels, predict_y))
Accuracy: 74.91993992928198

                 precision    recall  f1-score   support
4-Seam Fastball       0.96      0.96      0.96      6502
       Changeup       0.78      0.87      0.83      2187
         Cutter       0.51      0.51      0.51      2989
         Sinker       0.57      0.69      0.63      1610
         Slider       0.60      0.50      0.55      3228

       accuracy                           0.75     16516
      macro avg       0.69      0.71      0.69     16516
   weighted avg       0.75      0.75      0.75     16516
In this cell, we make use of a K-Nearest Neighbors model. We felt this classification model could be applicable to our data because we expected pitches that share the same label to share similar features, and thus be "close" to one another. In order to build this model, we scaled all of our data and made use of nested cross-validation, tweaking the neighborhood size hyperparameter. In addition, since this model tends to suffer from the curse of dimensionality, we also treated the number of PCA components as a hyperparameter.
This model performed fairly well, achieving an accuracy score of 87.34%. This score is superior to that of the Naive Bayes Classifier, but slightly lower than that of our Decision Tree Classifier. One possible explanation is that, even after feature engineering, our dataset might still contain some irrelevant features; the accuracy of K-Nearest Neighbors tends to suffer when irrelevant features are present, while the accuracy of a Decision Tree does not. It could also simply be the case that a Decision Tree models our data better than K-Nearest Neighbors does.
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
PCA = PCA()
knn = KNeighborsClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('PCA', PCA), ('knn', knn)])
param_grid = {
'PCA__n_components': list(range(1, 10)),
'knn__n_neighbors': list(range(1, 26))
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(data_features, data_labels.values.ravel())
nested_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
Accuracy: 87.34003096458648
Similar to our other models, we ran K-Nearest Neighbors while using SMOTE to balance out our data. The SMOTE model achieved an accuracy score of 87.32%, which is essentially the same as the non-SMOTE model.
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline as imbpipeline
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
smote = SMOTE(random_state=0, sampling_strategy='not majority')
knn = KNeighborsClassifier()
enn = EditedNearestNeighbours()
PCA = PCA()
pipe = imbpipeline(steps = [('scaler', scaler), ('PCA', PCA), ('enn', enn), ('smote', smote), ('knn', knn)])
param_grid = {
'PCA__n_components': list(range(1, 10)),
'knn__n_neighbors': list(range(1, 26))
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
nested_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
Accuracy: 87.32193530131461
We understand that K-Nearest Neighbors is sensitive to irrelevant features, so we experimented with three different variance thresholds (1, 0.75, 0.5) to try to eliminate potentially irrelevant features.
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
sel = VarianceThreshold(threshold=(1))
transformed_features = sel.fit_transform(data_features)
number_of_features = len(transformed_features[0])
print("Number of Features: " + str(number_of_features))
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
PCA = PCA()
knn = KNeighborsClassifier()
pipe = Pipeline(steps = [('sel', sel), ('scaler', scaler), ('PCA', PCA), ('knn', knn)])
param_grid = {
'PCA__n_components': list(range(1, number_of_features + 1)),
'knn__n_neighbors': list(range(1, 26)),
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(data_features, data_labels.values.ravel())
nested_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
Number of Features: 3
Accuracy: 71.11923711586576
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
sel = VarianceThreshold(threshold=(0.75))
transformed_features = sel.fit_transform(data_features)
number_of_features = len(transformed_features[0])
print("Number of Features: " + str(number_of_features))
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
PCA = PCA()
knn = KNeighborsClassifier()
pipe = Pipeline(steps = [('sel', sel), ('scaler', scaler), ('PCA', PCA), ('knn', knn)])
param_grid = {
'PCA__n_components': list(range(1, number_of_features + 1)),
'knn__n_neighbors': list(range(1, 26))
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(data_features, data_labels.values.ravel())
nested_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
Number of Features: 4
Accuracy: 70.55001176566317
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
sel = VarianceThreshold(threshold=(0.5))
transformed_features = sel.fit_transform(data_features)
number_of_features = len(transformed_features[0])
print("Number of Features: " + str(number_of_features))
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
PCA = PCA()
knn = KNeighborsClassifier()
pipe = Pipeline(steps = [('sel', sel), ('scaler', scaler), ('PCA', PCA), ('knn', knn)])
param_grid = {
'PCA__n_components': list(range(1, number_of_features + 1)),
'knn__n_neighbors': list(range(1, 26))
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(data_features, data_labels.values.ravel())
nested_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
Number of Features: 6
Accuracy: 77.65251561607724
Based on the above results, it is apparent that low variance filtering led to significantly worse results. Due to this, we believe all our features are relevant with regards to variance.
As mentioned before, we experimented with removing the plate_x, plate_z, and spin_axis variables. This can be seen in the cells below.
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
PCA = PCA()
data_features2 = data_features.drop(['plate_x', 'plate_z'], axis=1)
print(data_features2.head())
knn = KNeighborsClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('PCA', PCA), ('knn', knn)])
param_grid = {
'PCA__n_components': list(range(1, 8)),
'knn__n_neighbors': list(range(1, 26))
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(data_features2, data_labels.values.ravel())
nested_score = cross_val_score(grid_search, data_features2, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
      p_throws  stand  release_speed  release_spin_rate  spin_axis  pfx_x  \
4000         1      1           94.3             2486.0      230.0  -1.22
4001         1      1           84.7             2365.0      173.0  -0.16
4003         1      1           85.0             2442.0      155.0   0.22
4004         1      1           85.5             2331.0      215.0   0.04
4005         1      1           84.2             2343.0      175.0   0.05

      pfx_z
4000   1.44
4001   0.39
4003   0.21
4004   0.37
4005   0.34

Accuracy: 89.72573909257046
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
PCA = PCA()
data_features3 = data_features.drop(['plate_x', 'plate_z', 'spin_axis'], axis=1)
print(data_features3.head())
knn = KNeighborsClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('PCA', PCA), ('knn', knn)])
param_grid = {
'PCA__n_components': list(range(1, 7)),
'knn__n_neighbors': list(range(1, 26))
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(data_features3, data_labels.values.ravel())
nested_score = cross_val_score(grid_search, data_features3, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
      p_throws  stand  release_speed  release_spin_rate  pfx_x  pfx_z
4000         1      1           94.3             2486.0  -1.22   1.44
4001         1      1           84.7             2365.0  -0.16   0.39
4003         1      1           85.0             2442.0   0.22   0.21
4004         1      1           85.5             2331.0   0.04   0.37
4005         1      1           84.2             2343.0   0.05   0.34

Accuracy: 87.1463630172585
The removal of the plate_x and plate_z variables actually led to an improved accuracy of 89.72%, which appears to indicate that these two features may be unnecessary. On the other hand, when the spin_axis variable was dropped as well, the accuracy dropped to 87.14%, which seems to imply that spin_axis is a relevant feature. We will see whether these findings hold for the next model.
Another model we tried was a Support Vector Machine. This model performed the best of the models so far, achieving an accuracy score of 92.72%. In order to create this model, we made use of nested cross-validation, tweaking the dimensionality of our data (via PCA) and the kernel function. We also had to scale all of our data, which we did with scikit-learn's StandardScaler.
One possible explanation for this score is that an SVM solves a convex optimization problem: it is less susceptible to the curse of dimensionality and is guaranteed to arrive at a globally optimal solution. The one downside to this model is that building it was quite costly in terms of time, taking nearly an hour, which was almost double the time needed for the Decision Tree and K-Nearest Neighbors classifiers. The long run time ultimately proved worthwhile, as this model gave us the best accuracy score so far.
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
svc = SVC()
pipe = Pipeline(steps = [('scaler', scaler), ('PCA', PCA), ('svc', svc)])
param_grid = {
'PCA__n_components': list(range(1, 10)),
'svc__kernel': ['linear', 'rbf', 'poly']
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
predict_y = cross_val_predict(grid_search, data_features, data_labels.values.ravel(), cv = 10)
print(classification_report(data_labels, predict_y))
print("Accuracy:", accuracy_score(data_labels, predict_y) * 100)
                 precision    recall  f1-score   support
4-Seam Fastball       0.97      0.99      0.98      6502
       Changeup       0.98      0.98      0.98      2187
         Cutter       0.81      0.92      0.86      2989
         Sinker       0.94      0.89      0.91      1610
         Slider       0.92      0.80      0.86      3228

       accuracy                           0.93     16516
      macro avg       0.92      0.91      0.92     16516
   weighted avg       0.93      0.93      0.93     16516

Accuracy: 92.72220876725599
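The hour-long runtime quoted above was observed informally while the cell ran; below is a minimal sketch of how a run like this could be timed with the time module imported earlier (a hypothetical wrapper, not the exact code we executed, and re-running it repeats the lengthy computation).
# Hypothetical timing wrapper around the nested cross-validation of the SVC.
start = time.time()
timed_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
elapsed_minutes = (time.time() - start) / 60
print("Nested CV accuracy:", timed_score.mean() * 100)
print("Elapsed time (minutes):", round(elapsed_minutes, 1))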
The SVC model built with SMOTE is listed below. Similar to our SVC model trained on all of the data, this model performs very well, achieving an even better accuracy score of 93.18%. Like the regular SVC, however, it took quite a long time to build; the code ran for nearly 4 hours.
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.decomposition import PCA
svc = SVC()
PCA = PCA()
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
smote = SMOTE(random_state=0, sampling_strategy='not majority')
pipe = imbpipeline(steps = [('scaler', scaler), ('PCA', PCA), ('smote', smote), ('svc', svc)])
param_grid = {
'PCA__n_components': list(range(1, 10)),
'svc__kernel': ['linear', 'rbf', 'poly']
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
predict_y = cross_val_predict(grid_search, data_features, data_labels.values.ravel(), cv = 10)
print(classification_report(data_labels, predict_y))
print("Accuracy:", accuracy_score(data_labels, predict_y) * 100)
                 precision    recall  f1-score   support
4-Seam Fastball       0.98      0.97      0.98      6502
       Changeup       0.98      0.98      0.98      2187
         Cutter       0.83      0.92      0.87      2989
         Sinker       0.89      0.94      0.91      1610
         Slider       0.93      0.82      0.87      3228

       accuracy                           0.93     16516
      macro avg       0.92      0.93      0.92     16516
   weighted avg       0.93      0.93      0.93     16516

Accuracy: 93.17631387745216
As mentioned before, we experimented with removing the plate_x, plate_z, and spin_axis variables for this model as well. This can be seen in the cells below.
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
svc = SVC()
PCA = PCA()
data_features4 = data_features.drop(['plate_x', 'plate_z'], axis=1)
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
smote = SMOTE(random_state=0, sampling_strategy='not majority')
pipe = imbpipeline(steps = [('scaler', scaler), ('PCA', PCA), ('smote', smote), ('svc', svc)])
param_grid = {
'PCA__n_components': list(range(1, 8)),
'svc__kernel': ['linear', 'rbf', 'poly']
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
predict_y = cross_val_predict(grid_search, data_features4, data_labels.values.ravel(), cv = 10)
print(classification_report(data_labels, predict_y))
print("Accuracy:", accuracy_score(data_labels, predict_y) * 100)
                 precision    recall  f1-score   support
4-Seam Fastball       0.98      0.97      0.98      6502
       Changeup       0.98      0.98      0.98      2187
         Cutter       0.83      0.93      0.88      2989
         Sinker       0.90      0.95      0.92      1610
         Slider       0.93      0.83      0.88      3228

       accuracy                           0.93     16516
      macro avg       0.93      0.93      0.93     16516
   weighted avg       0.94      0.93      0.94     16516

Accuracy: 93.4972148219908
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.decomposition import PCA
svc = SVC()
PCA = PCA()
data_features5 = data_features.drop(['plate_x', 'plate_z', 'spin_axis'], axis=1)
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
smote = SMOTE(random_state=0, sampling_strategy='not majority')
pipe = imbpipeline(steps = [('scaler', scaler), ('PCA', PCA), ('smote', smote), ('svc', svc)])
param_grid = {
'PCA__n_components': list(range(1, 7)),
'svc__kernel': ['linear', 'rbf', 'poly']
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
predict_y = cross_val_predict(grid_search, data_features5, data_labels.values.ravel(), cv = 10)
print(classification_report(data_labels, predict_y))
print("Accuracy:", accuracy_score(data_labels, predict_y) * 100)
                 precision    recall  f1-score   support
4-Seam Fastball       0.98      0.96      0.97      6502
       Changeup       0.98      0.97      0.98      2187
         Cutter       0.80      0.90      0.84      2989
         Sinker       0.85      0.94      0.89      1610
         Slider       0.89      0.79      0.84      3228

       accuracy                           0.91     16516
      macro avg       0.90      0.91      0.90     16516
   weighted avg       0.92      0.91      0.91     16516

Accuracy: 91.4446597239041
As with the K-Nearest Neighbors model, removing the plate_x and plate_z features led to a model with higher accuracy. In fact, this gave us our best model yet, with an accuracy score of 93.49%! Likewise, removing the spin_axis variable as well led to a model with worse accuracy, at 91.44%. This all seems to imply that the plate_x and plate_z features are irrelevant, while the spin_axis feature is relevant.
Note: Based on the results of the K-Nearest Neighbors models, we didn't feel it was necessary to repeat low variance filtering here, as it performed much worse there.
A Neural Network was also considered as we evaluated various models. Similar to other models, we built this Neural Network using nested cross-validation, with the hidden layer size and activation function as hyperparameters. This model achieved an accuracy of 88.22%, which is a good score, but not quite at the level of our SVM and Decision Tree models. One possible explanation for this performance is that Neural Networks are susceptible to getting stuck in a local minimum, leading to a non-optimal solution.
from sklearn.neural_network import MLPClassifier
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
@ignore_warnings(category=ConvergenceWarning)
def run_nn():
neural_network = MLPClassifier()
pipe = Pipeline(steps = [('scaler', scaler), ('neural_network', neural_network)])
param_grid = {
'neural_network__hidden_layer_sizes': list(range(30, 61, 10)),
'neural_network__activation': ['logistic', 'tanh', 'relu']
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
nested_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
run_nn()
Accuracy: 88.21821859795813
The other Neural Network we built made use of SMOTE in the pipeline. This model performed very similarly to our basic Neural Network, achieving an accuracy score of 88.70%.
from sklearn.neural_network import MLPClassifier
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from imblearn.pipeline import Pipeline as imbpipeline
@ignore_warnings(category=ConvergenceWarning)
def run_nn():
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
smote = SMOTE(random_state=0, sampling_strategy='not majority')
neural_network = MLPClassifier()
pipe = imbpipeline(steps = [('scaler', scaler), ('smote', smote), ('neural_network', neural_network)])
param_grid = {
'neural_network__hidden_layer_sizes': list(range(30, 61, 10)),
'neural_network__activation': ['logistic', 'tanh', 'relu']
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
nested_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
run_nn()
Accuracy: 88.69646531621778
In the section below, we make use of various ensemble methods when building our models. We hoped that ensemble methods would further improve our (already fairly accurate) models. In all of the ensemble methods, we use Decision Trees as the base classifier. Since there was almost no difference between using SMOTE and using our full dataset with the regular Decision Tree classifier, we do not use SMOTE for these models.
In this cell, a Random Forest Classifier is built to evaluate our data. We hoped that, like our Decision Tree Classifier, this model would be accurate and resistant to noise and irrelevant attributes. As with other models, we evaluated it with nested cross-validation, using max_depth, min_samples_leaf, and max_features as the hyperparameters to tune. This model achieved an accuracy score of 87.85%, which is not quite as good as previous models we have looked at.
from sklearn.ensemble import RandomForestClassifier
forest_classifier = RandomForestClassifier()
param_grid = {
'max_depth': list(range(10, 25)),
'min_samples_leaf': [8, 10, 12],
'max_features': ['sqrt', 'log2']
}
grid_search = GridSearchCV(forest_classifier, param_grid, cv=5, scoring='accuracy')
nested_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
Accuracy: 87.84885374584263
Along with a Random Forest Classifier, we also made use of the AdaBoost ensemble method. Using Decision Trees as the base classifier, we hoped that AdaBoost would create a highly accurate model. For this model, the hyperparameter we tuned in our nested cross-validation was n_estimators. Surprisingly, this model did not perform very well relative to our other models, achieving an accuracy score of only 67.32%.
from sklearn.ensemble import AdaBoostClassifier
adaboost_classifier = AdaBoostClassifier()
param_grid = {'n_estimators': list(range(100, 170, 10))}
grid_search = GridSearchCV(adaboost_classifier, param_grid, cv=5, scoring='accuracy')
nested_score = cross_val_score(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", nested_score.mean()*100)
predict_y = cross_val_predict(grid_search, data_features, data_labels.values.ravel(), cv = 5)
print(classification_report(data_labels, predict_y))
Accuracy: 67.31701644773736

                 precision    recall  f1-score   support
4-Seam Fastball       0.92      0.79      0.85      6502
       Changeup       0.48      0.79      0.60      2187
         Cutter       0.60      0.54      0.57      2989
         Sinker       0.61      0.62      0.62      1610
         Slider       0.55      0.50      0.53      3228

       accuracy                           0.67     16516
      macro avg       0.63      0.65      0.63     16516
   weighted avg       0.70      0.67      0.68     16516
The final ensemble method we tried was a Gradient Boosting Classifier. Although we did not learn about this method in class, Gradient Boosting is another boosting technique we read about and decided to try for this project. As with the other ensemble methods, Decision Trees serve as the base learners, and the model was tuned through nested cross-validation. Overall, the Gradient Boosting Classifier performed better than the other ensemble methods, but not quite as well as the other models we created. It achieved an accuracy score of 88.36% (and took almost 4 hours to run).
gradient_classifier = GradientBoostingClassifier()
param_grid = {"max_depth": [5,10,15], "min_samples_leaf": [5,10,15], "max_features": [4,6,8]}
grid_search = GridSearchCV(gradient_classifier, param_grid, cv=5, scoring='accuracy')
predict_y = cross_val_predict(grid_search, data_features, data_labels.values.ravel(), cv=5)
print("Accuracy:", accuracy_score(data_labels, predict_y) * 100)
print()
print(classification_report(data_labels, predict_y))
Accuracy: 88.3567449745701

                 precision    recall  f1-score   support
4-Seam Fastball       0.96      0.98      0.97      6502
       Changeup       0.97      0.88      0.93      2187
         Cutter       0.74      0.86      0.80      2989
         Sinker       0.82      0.86      0.84      1610
         Slider       0.86      0.73      0.79      3228

       accuracy                           0.88     16516
      macro avg       0.87      0.86      0.86     16516
   weighted avg       0.89      0.88      0.88     16516
In the end, based on the accuracy of each of our models, the model that performed best on our dataset was the Support Vector Machine (SVM) that used the SMOTE technique and dropped the plate_x and plate_z features. Although the SMOTE model performed only slightly better than the non-SMOTE model, we chose it as our final model because of that slight increase in accuracy. In the cell below, this final SVM model is built. The model is not fit inside a nested cross-validation loop; instead, the grid search is fit on our entire dataset. Based on the output, the final SVC model keeps all 7 dimensions (PCA performs no reduction) and uses the RBF kernel function to transform the data. Based on the cross-validation results from the previous models, this final model should predict the pitch label of a given pitch with roughly 93.49% accuracy.
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
svc = SVC()
PCA = PCA()
data_features4 = data_features.drop(['plate_x', 'plate_z'], axis=1)
scaler = StandardScaler()
scaler.mean_ = 0
scaler.var_ = 1
smote = SMOTE(random_state=0, sampling_strategy='not majority')
pipe = imbpipeline(steps = [('scaler', scaler), ('PCA', PCA), ('smote', smote), ('svc', svc)])
param_grid = {
'PCA__n_components': list(range(1, 8)),
'svc__kernel': ['linear', 'rbf', 'poly']
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(data_features4, data_labels.values.ravel())
print(grid_search.best_params_)
final_model = grid_search.best_estimator_
filename = 'finalized_model.sav'
pickle.dump(final_model, open(filename, 'wb'))
{'PCA__n_components': 7, 'svc__kernel': 'rbf'}
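To close the loop, below is a short sketch of how the saved model could be loaded back and used to classify a new pitch. The feature values here are made up purely for illustration; the columns must match data_features4 (everything except plate_x and plate_z), and the prediction comes back as a pitch_name string.
# Load the pickled pipeline and classify a single (made-up) pitch.
loaded_model = pickle.load(open('finalized_model.sav', 'rb'))
new_pitch = pd.DataFrame([{
    'p_throws': 1, 'stand': 0, 'release_speed': 95.1,
    'release_spin_rate': 2400.0, 'spin_axis': 220.0,
    'pfx_x': -0.80, 'pfx_z': 1.30
}])
print(loaded_model.predict(new_pitch))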
As mentioned above, the SMOTE SVM model worked the best. However, there were other takeaways from this analysis as well. For instance, it was interesting to see how removing the plate_x and plate_z variables improved model accuracy; those variables were not as important as we initially thought. It was also surprising how well all of the models did, not just our best one, with many reaching accuracies in the high 80s. We didn't think this data would be that predictable beforehand, but our models showed otherwise. It goes to show the power of data mining techniques and how they can uncover things you would never have expected.