Grid Search Diabetes Example

Import Libraries

import pandas as pd
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")

1. Read Data

d = pd.read_csv('grid-search-diabetes-example/diabetes.csv')
print(str(d.shape[0]) + ' rows, ' + str(d.shape[1]) + ' columns\n')
print(d.info())
d.head()
768 rows, 9 columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72             35        0  33.6                     0.627   50        1
1            1       85             66             29        0  26.6                     0.351   31        0
2            8      183             64              0        0  23.3                     0.672   32        1
3            1       89             66             23       94  28.1                     0.167   21        0
4            0      137             40             35      168  43.1                     2.288   33        1
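The first rows already show zeros in SkinThickness and Insulin. A measured value of 0 is implausible for those columns (and for Glucose, BloodPressure, and BMI), so the zeros likely stand in for missing readings even though no cell is NaN. A quick count, as a sketch (the column list is an assumption, not part of the original notebook):

# Columns where 0 probably means "not recorded" rather than a true measurement.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print((d[zero_as_missing] == 0).sum())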

2. Perform Basic Exploratory Data Analysis

Define a helper function that plots each column's distribution: a boxplot (summary statistics at a glance) above a histogram, sharing the x-axis.

def plot_stats(field, bins=None, xlim_list=None, xtick_list=None):
    # Boxplot on top, histogram underneath, sharing the x-axis.
    fig, (b, h) = plt.subplots(ncols=1, nrows=2,
                               sharex=True,
                               gridspec_kw={'height_ratios':[1,3]},
                               figsize=(12,3))
    # Boxplot: median, quartiles, and outliers at a glance.
    b.boxplot(d[field],
              widths=0.6,
              vert=False)
    b.set_title(field,
                fontsize=14)
    b.set_yticks([])
    b.set_xticks(xtick_list)

    # Histogram: full shape of the distribution.
    h.hist(d[field],
           bins=bins,
           align='left')
    h.set_xlim(xlim_list)
    h.set_xticks(xtick_list)
    h.grid(True)

    # Hide spines and tick marks on both axes for a cleaner look.
    for ax in [b, h]:
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.tick_params(
            axis='x',
            bottom=False)
        ax.tick_params(
            axis='y',
            left=False,
            right=False)
params = [('Pregnancies',np.arange(0,19,1),[-1,19],np.arange(0,21,3)),
          ('Glucose',np.arange(0,220,10),[-10,210],np.arange(0,220,20)),
          ('BloodPressure',np.arange(0,150,7.5),[-10,160],np.arange(0,165,15)),
          ('SkinThickness',np.arange(0,100,5),[-5,105],np.arange(0,105,5)),
          ('Insulin',np.arange(0,1000,50),[-50,1050],np.arange(0,1000,100)),
          ('BMI',np.arange(0,70,5),[-5,75],np.arange(0,80,10)),
          ('DiabetesPedigreeFunction',np.arange(0,2.5,0.125),
           [-0.125,2.625],np.arange(0,3,0.25)),
          ('Age',np.arange(20,90,2.5),[15,85],np.arange(20,100,10)),
          ('Outcome',[0,1,2],[-1,2],[0,1])]
for field, bins, xlim_list, x_tick_list in params:
    plot_stats(field, bins, xlim_list, x_tick_list)

(Nine figures: a boxplot-over-histogram panel for each column, from Pregnancies through Outcome)

correlations = d.corr()
plt.figure(figsize=(12,6))
sns.heatmap(correlations, annot=True, cmap='YlGnBu');

(Figure: correlation heatmap of all nine columns)

diabetes_proportion = d[d['Outcome']==1].shape[0] / d.shape[0]
print('Proportion of diabetes outcomes:\t{:1.2%}'.format(diabetes_proportion))
missing_data_points = d.isna().sum().sum()
print('Number of missing data points:\t\t{}'.format(missing_data_points))
highest_corr = correlations['Outcome']['Glucose']
print('Highest correlation with Outcomes:\tGlucose, {:3.3f}'.format(highest_corr))
Proportion of diabetes outcomes:    34.90%
Number of missing data points:      0
Highest correlation with Outcomes:  Glucose, 0.467
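Rather than hard-coding 'Glucose', the strongest correlate of Outcome can also be read off programmatically; a small sketch (not in the original cell):

# Rank features by their correlation with Outcome, strongest first.
outcome_corr = correlations['Outcome'].drop('Outcome').sort_values(ascending=False)
print(outcome_corr.head())   # Glucose tops the list at about 0.467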

3. Split into Features (X) and Labels (y), Then into Training and Testing Sets

y = d['Outcome']
X = d[d.columns[:-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('''X_train, y_train: {}, {}
  X_test, y_test: {}, {}'''.format(X_train.shape, y_train.shape,
                                   X_test.shape, y_test.shape))
X_train, y_train: (614, 8), (614,)
  X_test, y_test: (154, 8), (154,)
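Only about 35% of the outcomes are positive, so a stratified split keeps that proportion the same in the training and test sets. A sketch using train_test_split's stratify argument (this differs from the plain split above and is only illustrative):

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
# Positive rate should now be ~35% in both splits.
print('Train: {:1.2%}, test: {:1.2%}'.format(y_train_s.mean(), y_test_s.mean()))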

Random Forest

clf_rf = RandomForestClassifier()

# Hyperparameter distributions for the randomized search to sample from.
param_dist = {"max_depth": [3, None],
              "n_estimators": list(range(10, 200)),
              "max_features": list(range(1, X_train.shape[1]+1)),
              "min_samples_split": list(range(2, 11)),
              "min_samples_leaf": list(range(1, 11)),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# RandomizedSearchCV samples a fixed number of candidate settings (10 by default)
# from param_dist and cross-validates each one on the training data.
random_search = RandomizedSearchCV(clf_rf, param_distributions=param_dist)

random_search.fit(X_train, y_train)

# Predict on the held-out test set with the best estimator found.
rf_preds = random_search.best_estimator_.predict(X_test)
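The scoring functions imported at the top can now evaluate the tuned forest on the held-out set. A minimal sketch reusing the variables defined above (this evaluation cell is not part of the original notebook):

print('Best parameters:', random_search.best_params_)
print('Accuracy:  {:.3f}'.format(accuracy_score(y_test, rf_preds)))
print('Precision: {:.3f}'.format(precision_score(y_test, rf_preds)))
print('Recall:    {:.3f}'.format(recall_score(y_test, rf_preds)))
print('F1 score:  {:.3f}'.format(f1_score(y_test, rf_preds)))

For an exhaustive grid search, as the title suggests, GridSearchCV can stand in for RandomizedSearchCV, but the search space above (over a million combinations) would have to be trimmed first. A sketch with a much smaller, illustrative grid (the specific values are assumptions, not the original tuning choices):

from sklearn.model_selection import GridSearchCV

small_param_grid = {"max_depth": [3, None],
                    "n_estimators": [50, 100, 200],
                    "max_features": [2, 4, 8],
                    "criterion": ["gini", "entropy"]}
grid_search = GridSearchCV(clf_rf, param_grid=small_param_grid)
grid_search.fit(X_train, y_train)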