Grid Search Diabetes Example
Import Libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")
1. Read Data
# Load the diabetes dataset from the CSV stored next to this notebook.
d = pd.read_csv('grid-search-diabetes-example/diabetes.csv')

# Report the table dimensions, followed by a blank line.
print('{} rows, {} columns\n'.format(*d.shape))

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() also emits a trailing "None" line after the report.
print(d.info())

# Preview the first rows of the frame (shown as the cell output in a notebook).
d.head()
768 rows, 9 columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 768 non-null int64
2 BloodPressure 768 non-null int64
3 SkinThickness 768 non-null int64
4 Insulin 768 non-null int64
5 BMI 768 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
2. Perform Basic Exploratory Data Analysis
Define a function to plot histograms, boxplots, and basic summary stats.
def plot_stats(field, bins=None, xlim_list=None, xtick_list=None):
    """Plot a histogram and a horizontal boxplot of one column, side by side.

    The values are read from the module-level DataFrame ``d``. Both panels
    share their x axis.

    Args:
        field: Name of the column of ``d`` to plot; also used as the figure
            title.
        bins: Bin specification forwarded to ``Axes.hist``.
        xlim_list: x-axis limits forwarded to ``Axes.set_xlim``.
        xtick_list: x tick positions applied to both panels. When ``None``
            the automatically chosen ticks are kept.
    """
    _, (h, b) = plt.subplots(1, 2, sharex=True, figsize=(12, 3))

    h.hist(d[field],
           bins=bins,
           align='left')
    h.set_xlim(xlim_list)
    # set_xticks() requires an explicit sequence of positions; forwarding the
    # None default would raise instead of keeping the automatic ticks.
    if xtick_list is not None:
        h.set_xticks(xtick_list)
    h.grid(True)

    b.boxplot(d[field], vert=False)
    b.set_yticks([])  # the single box needs no y tick
    if xtick_list is not None:
        b.set_xticks(xtick_list)

    plt.suptitle(field)

    # Strip the frame and the tick marks from both panels.
    for ax in [h, b]:
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.tick_params(
            axis='x',
            bottom=False)
        ax.tick_params(
            axis='y',
            left=False,
            right=False)
def plot_stats(field, bins=None, xlim_list=None, xtick_list=None):
    """Plot a horizontal boxplot above a histogram of one column.

    The values are read from the module-level DataFrame ``d``. The figure is
    laid out with a two-row gridspec in which the histogram row is four times
    as tall as the boxplot row.

    Args:
        field: Name of the column of ``d`` to plot; also used as the title of
            the boxplot panel.
        bins: Bin specification forwarded to ``Axes.hist``.
        xlim_list: x-axis limits forwarded to ``Axes.set_xlim`` on the
            histogram panel.
        xtick_list: x tick positions applied to both panels. When ``None``
            the automatically chosen ticks are kept.
    """
    fig = plt.figure(constrained_layout=True)
    gs = fig.add_gridspec(ncols=1, nrows=2,
                          height_ratios=[1, 4])
    b = fig.add_subplot(gs[0, :])
    h = fig.add_subplot(gs[1:, :])

    h.hist(d[field],
           bins=bins,
           align='left')
    h.set_xlim(xlim_list)
    # set_xticks() requires an explicit sequence of positions; forwarding the
    # None default would raise instead of keeping the automatic ticks.
    if xtick_list is not None:
        h.set_xticks(xtick_list)
    h.grid(True)

    b.boxplot(d[field], vert=False)
    b.set_yticks([])  # the single box needs no y tick
    if xtick_list is not None:
        b.set_xticks(xtick_list)
    b.set_title(field)

    # Strip the frame and the tick marks from both panels.
    for ax in [b, h]:
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.tick_params(
            axis='x',
            bottom=False)
        ax.tick_params(
            axis='y',
            left=False,
            right=False)
def plot_stats(field, bins=None, xlim_list=None, xtick_list=None):
    """Plot a horizontal boxplot stacked on top of a histogram of one column.

    The values are read from the module-level DataFrame ``d``. The two panels
    share their x axis, and the histogram row is three times as tall as the
    boxplot row.

    Args:
        field: Name of the column of ``d`` to plot; also used as the title of
            the boxplot panel.
        bins: Bin specification forwarded to ``Axes.hist``.
        xlim_list: x-axis limits forwarded to ``Axes.set_xlim`` on the
            histogram panel.
        xtick_list: x tick positions applied to both panels. When ``None``
            the automatically chosen ticks are kept.
    """
    _, (b, h) = plt.subplots(ncols=1, nrows=2,
                             sharex=True,
                             gridspec_kw={'height_ratios': [1, 3]},
                             figsize=(12, 3))

    b.boxplot(d[field],
              widths=0.6,
              vert=False)
    b.set_title(field,
                fontsize=14)
    b.set_yticks([])  # the single box needs no y tick
    # set_xticks() requires an explicit sequence of positions; forwarding the
    # None default would raise instead of keeping the automatic ticks.
    if xtick_list is not None:
        b.set_xticks(xtick_list)

    h.hist(d[field],
           bins=bins,
           align='left')
    h.set_xlim(xlim_list)
    if xtick_list is not None:
        h.set_xticks(xtick_list)
    h.grid(True)

    # Strip the frame and the tick marks from both panels.
    for ax in [b, h]:
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.tick_params(
            axis='x',
            bottom=False)
        ax.tick_params(
            axis='y',
            left=False,
            right=False)
# Plot settings, one tuple per column:
# (field name, histogram bins, x-axis limits, x tick positions).
params = [
    ('Pregnancies', np.arange(0, 19, 1), [-1, 19], np.arange(0, 21, 3)),
    ('Glucose', np.arange(0, 220, 10), [-10, 210], np.arange(0, 220, 20)),
    ('BloodPressure', np.arange(0, 150, 7.5), [-10, 160], np.arange(0, 165, 15)),
    ('SkinThickness', np.arange(0, 100, 5), [-5, 105], np.arange(0, 105, 5)),
    ('Insulin', np.arange(0, 1000, 50), [-50, 1050], np.arange(0, 1000, 100)),
    ('BMI', np.arange(0, 70, 5), [-5, 75], np.arange(0, 80, 10)),
    ('DiabetesPedigreeFunction', np.arange(0, 2.5, 0.125),
     [-0.125, 2.625], np.arange(0, 3, 0.25)),
    ('Age', np.arange(20, 90, 2.5), [15, 85], np.arange(20, 100, 10)),
    ('Outcome', [0, 1, 2], [-1, 2], [0, 1]),
]

# Each tuple lines up positionally with plot_stats' four parameters.
for plot_args in params:
    plot_stats(*plot_args)
# Pairwise correlations between all columns of the dataset.
correlations = d.corr()

# Draw the annotated correlation matrix on a 12x6 inch figure. The trailing
# semicolon keeps the notebook from echoing the returned Axes object.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(correlations, annot=True, cmap='YlGnBu', ax=ax);
# Share of rows whose Outcome is 1.
diabetes_proportion = (d['Outcome'] == 1).mean()
print('Proportion of diabetes outcomes:\t{:1.2%}'.format(diabetes_proportion))

# Total count of NaN cells across the whole frame.
missing_data_points = d.isna().sum().sum()
print('Number of missing data points:\t\t{}'.format(missing_data_points))

# Look up the feature with the highest correlation to Outcome instead of
# hard-coding its name, so the report stays truthful if the data changes.
# Outcome's correlation with itself is excluded from the search.
outcome_corr = correlations['Outcome'].drop('Outcome')
highest_corr_field = outcome_corr.idxmax()
highest_corr = outcome_corr[highest_corr_field]
print('Highest correlation with Outcomes:\t{}, {:3.3f}'.format(highest_corr_field,
                                                               highest_corr))
Proportion of diabetes outcomes: 34.90%
Number of missing data points: 0
Highest correlation with Outcomes: Glucose, 0.467
3. Split into features, X, and labels, y. Then, split into Training and Testing Datasets.
# Labels are the Outcome column; features are every column except the last.
y = d['Outcome']
X = d.iloc[:, :-1]

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Report the shapes of the four resulting pieces.
shape_report = 'X_train, y_train: {}, {}\nX_test, y_test: {}, {}'
print(shape_report.format(X_train.shape, y_train.shape,
                          X_test.shape, y_test.shape))
X_train, y_train: (614, 8), (614,)
X_test, y_test: (154, 8), (154,)
4. Perform randomized search
Random Forest
# Seed the forest so repeated runs build the same trees; the train/test split
# is already seeded with the same value.
clf_rf = RandomForestClassifier(random_state=42)

# Candidate values the randomized search samples from.
param_dist = {"max_depth": [3, None],
              "n_estimators": list(range(10, 200)),
              # From 1 up to the number of feature columns; the count is taken
              # from the training data rather than from the held-out test set.
              "max_features": list(range(1, X_train.shape[1] + 1)),
              "min_samples_split": list(range(2, 11)),
              "min_samples_leaf": list(range(1, 11)),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Seed the search as well so the same parameter combinations are drawn on
# every run.
random_search = RandomizedSearchCV(clf_rf,
                                   param_distributions=param_dist,
                                   random_state=42)
random_search.fit(X_train, y_train)

# Predict the held-out labels with the best estimator found by the search.
rf_preds = random_search.best_estimator_.predict(X_test)
This content is taken from notes I took while pursuing the Intro to Machine Learning with PyTorch nanodegree certification.