Random Search Diabetes Example
Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")
1. Read Data
d = pd.read_csv('random-search-diabetes-example/diabetes.csv')
print(str(d.shape[0]) + ' rows, ' + str(d.shape[1]) + ' columns\n')
print(d.info())
d.head()
768 rows, 9 columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 768 non-null int64
2 BloodPressure 768 non-null int64
3 SkinThickness 768 non-null int64
4 Insulin 768 non-null int64
5 BMI 768 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
2. Perform Basic Exploratory Data Analysis
Define a function that plots a boxplot and histogram (i.e., the basic summary stats) for each feature.
def plot_stats(field, bins=None, xlim_list=None, xtick_list=None):
fig, (b, h) = plt.subplots(ncols=1, nrows=2,
sharex=True,
gridspec_kw={'height_ratios':[1,3]},
figsize=(12,3))
b.boxplot(d[field],
widths=0.6,
vert=False)
b.set_title(field,
fontsize=14)
b.set_yticks([])
b.set_xticks(xtick_list)
h.hist(d[field],
bins=bins,
align='left')
h.set_xlim(xlim_list)
h.set_xticks(xtick_list)
h.grid(True)
for ax in [b, h]:
for spine in ax.spines.values():
spine.set_visible(False)
ax.tick_params(
axis='x',
bottom=False)
ax.tick_params(
axis='y',
left=False,
right=False)
params = [('Pregnancies',np.arange(0,19,1),[-1,19],np.arange(0,21,3)),
('Glucose',np.arange(0,220,10),[-10,210],np.arange(0,220,20)),
('BloodPressure',np.arange(0,150,7.5),[-10,160],np.arange(0,165,15)),
('SkinThickness',np.arange(0,100,5),[-5,105],np.arange(0,105,5)),
('Insulin',np.arange(0,1000,50),[-50,1050],np.arange(0,1000,100)),
('BMI',np.arange(0,70,5),[-5,75],np.arange(0,80,10)),
('DiabetesPedigreeFunction',np.arange(0,2.5,0.125),
[-0.125,2.625],np.arange(0,3,0.25)),
('Age',np.arange(20,90,2.5),[15,85],np.arange(20,100,10)),
('Outcome',[0,1,2],[-1,2],[0,1])]
for field, bins, xlim_list, x_tick_list in params:
plot_stats(field, bins, xlim_list, x_tick_list)
correlations = d.corr()
plt.figure(figsize=(12,6))
sns.heatmap(correlations, annot=True, cmap='YlGnBu');
diabetes_proportion = d[d['Outcome']==1].shape[0] / d.shape[0]
print('Proportion of diabetes outcomes:\t{:1.2%}'.format(diabetes_proportion))
missing_data_points = d.isna().sum().sum()
print('Number of missing data points:\t\t{}'.format(missing_data_points))
highest_corr = correlations['Outcome']['Glucose']
print('Highest correlation with Outcomes:\tGlucose, {:3.3f}'.format(highest_corr))
Proportion of diabetes outcomes: 34.90%
Number of missing data points: 0
Highest correlation with Outcomes: Glucose, 0.467
Some takeaways:
- Proportion of diabetes outcomes is roughly one third.
- There are no explicitly missing values, but several columns contain zeros where a zero is not physically plausible (e.g., BloodPressure, BMI), suggesting that missing values were filled with zeros somewhere upstream; see the quick check after this list.
- The Age feature is right-skewed.
- The Glucose feature distribution is roughly symmetric. It is also the variable with the strongest correlation with the outcome.
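As a quick follow-up on the implausible-zero observation, the affected columns can be counted and, if desired, converted to NaN for later imputation. This is a minimal sketch, and the list of columns treated as "zero means missing" is an assumption, not part of the original notebook:
# Count physiologically implausible zeros; optionally treat them as missing.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print(d[zero_as_missing].eq(0).sum())
# d[zero_as_missing] = d[zero_as_missing].replace(0, np.nan)  # optional follow-up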
3. Split into Features (X) and Labels (y), Then into Training and Testing Datasets
y = d['Outcome']
X = d[d.columns[:-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('''X_train, y_train: {}, {}
X_test, y_test: {}, {}'''.format(X_train.shape, y_train.shape,
X_test.shape, y_test.shape))
X_train, y_train: (614, 8), (614,)
X_test, y_test: (154, 8), (154,)
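Because only about a third of the outcomes are positive, it can be worth confirming that the split preserves the class balance. A small optional check (passing stratify=y to train_test_split would enforce this, though that was not done above):
# Verify that the positive-class proportion is similar in both splits.
print('Train positives: {:1.2%}'.format(y_train.mean()))
print('Test positives:  {:1.2%}'.format(y_test.mean()))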
4. Perform Randomized Search and Make Predictions
Random Forest
random_forest = RandomForestClassifier()
param_dict = {"max_depth": [3, None],
"n_estimators": list(range(10, 200)),
"max_features": list(range(1, X_test.shape[1]+1)),
"min_samples_split": list(range(2, 11)),
"min_samples_leaf": list(range(1, 11)),
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
random_search = RandomizedSearchCV(random_forest,
param_distributions=param_dict)
random_search.fit(X_train, y_train)
best_random_forest = random_search.best_estimator_
rf_preds = best_random_forest.predict(X_test)
best_random_forest
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features=7,
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=9, min_samples_split=10,
min_weight_fraction_leaf=0.0, n_estimators=60,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
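For reference, the fitted RandomizedSearchCV object also records the cross-validated score of each sampled candidate. A short sketch of how the search itself could be inspected (only n_iter=10 parameter settings are sampled by default, so results will vary from run to run):
# Inspect the search: best cross-validated score, winning parameters,
# and the per-candidate results table.
print('Best CV accuracy: {:.3f}'.format(random_search.best_score_))
print('Best parameters: ', random_search.best_params_)
cv_results = pd.DataFrame(random_search.cv_results_)
cv_results[['mean_test_score', 'std_test_score', 'params']].head()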
AdaBoost
ada_boost = AdaBoostClassifier()
param_dict = {'n_estimators' : [10, 50, 100, 200, 400],
'learning_rate' : [0.001, 0.005, .01, 0.05, 0.1, 0.2,
0.3, 0.4, 0.5, 1, 2, 10, 20]}
random_search = RandomizedSearchCV(ada_boost,
param_distributions=param_dict)
random_search.fit(X_train, y_train)
best_adaboost = random_search.best_estimator_
ab_preds = best_adaboost.predict(X_test)
best_adaboost
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1,
n_estimators=50, random_state=None)
SVM
svm = SVC()
param_dict = {'C' : [0.1, 0.5, 1, 3, 5, 10],
'kernel' : ['linear','poly','rbf','sigmoid'],
'degree' : [2, 3, 4, 5, 6]}
random_search = RandomizedSearchCV(svm,
param_distributions=param_dict)
random_search.fit(X_train, y_train)
best_svm = random_search.best_estimator_
sv_preds = best_svm.predict(X_test)
best_svm
SVC(C=0.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=6, gamma='scale', kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
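One caveat not addressed in the original search: SVC kernels are sensitive to feature scale, and these features span very different ranges (e.g., Insulin vs. DiabetesPedigreeFunction). A sketch of how a scaled variant could be searched instead, assuming a StandardScaler + SVC pipeline (the parameter names gain the svc__ prefix that Pipeline uses); this is an optional extension, not part of the original notebook:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features before the SVC, then search the same hyperparameter space.
scaled_svm = make_pipeline(StandardScaler(), SVC())
scaled_param_dict = {'svc__C': [0.1, 0.5, 1, 3, 5, 10],
                     'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                     'svc__degree': [2, 3, 4, 5, 6]}
scaled_search = RandomizedSearchCV(scaled_svm, param_distributions=scaled_param_dict)
scaled_search.fit(X_train, y_train)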
5. Print results
def print_accuracy_stats(model, preds):
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
print('\n--- {} ---'.format(model))
print(' Accuracy score: {:2.1f}%'.format(accuracy*100.))
print('Precision score: {:2.1f}%'.format(precision*100.))
print(' Recall score: {:2.1f}%'.format(recall*100.))
print(' F1 score: {:2.1f}%'.format(f1*100.))
print_accuracy_stats('Random Forest', rf_preds)
print_accuracy_stats('AdaBoost', ab_preds)
print_accuracy_stats('SVM', sv_preds)
--- Random Forest ---
Accuracy score: 76.0%
Precision score: 66.1%
Recall score: 67.3%
F1 score: 66.7%
--- AdaBoost ---
Accuracy score: 76.6%
Precision score: 71.1%
Recall score: 58.2%
F1 score: 64.0%
--- SVM ---
Accuracy score: 75.3%
Precision score: 65.5%
Recall score: 65.5%
F1 score: 65.5%
Takeaways:
- The Random Forest classifier outperformed both the AdaBoost and SVM classifiers on the basis of F1 score.
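For a side-by-side view, the same metrics can be collected into a single table; a small convenience sketch reusing the predictions computed above:
# Gather all four metrics for each model into one DataFrame.
results = pd.DataFrame(
    {name: {'Accuracy': accuracy_score(y_test, preds),
            'Precision': precision_score(y_test, preds),
            'Recall': recall_score(y_test, preds),
            'F1': f1_score(y_test, preds)}
     for name, preds in [('Random Forest', rf_preds),
                         ('AdaBoost', ab_preds),
                         ('SVM', sv_preds)]}).T
print(results.round(3))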
6. Determine the Most Important Features and State Conclusions
def plot_feature_importances():
    plt.figure(figsize=(12,6))
    # Exclude the Outcome label column; importances cover only the 8 features.
    features = d.columns[:-1]
importances = best_random_forest.feature_importances_
indices = np.argsort(importances)
plt.title('Random Forest - Feature Importances')
plt.barh(range(len(indices)),
importances[indices],
color='b',
align='center')
plt.yticks(range(len(indices)),
features[indices])
plt.xlabel('Relative Importance')
for spine in plt.gca().spines.values():
spine.set_visible(False)
plt.gca().tick_params(
axis='x',
bottom=False)
plt.gca().tick_params(
axis='y',
left=False)
plt.grid(True,
axis='x')
def plot_correlations():
    plt.figure(figsize=(12,6))
    # Correlation of each feature with Outcome (Outcome itself excluded), ascending.
    correlations = (d.corr()['Outcome']
                    .drop('Outcome')
                    .sort_values())
    plt.title('Correlations with Outcome')
    plt.barh(range(len(correlations)),
             correlations)
    plt.yticks(range(len(correlations)),
               correlations.index)
plt.xlabel('Correlations')
for spine in plt.gca().spines.values():
spine.set_visible(False)
plt.gca().tick_params(
axis='x',
bottom=False)
plt.gca().tick_params(
axis='y',
left=False)
plt.grid(True,
axis='x')
plot_correlations()
plot_feature_importances()
Conclusions:
- Glucose is decidedly the most important indicator for determining whether a person has diabetes, followed by BMI and Age.
- The relative importances of the features in the Random Forest classifier roughly match the rank order of the features in the initial correlation analysis, which lends confidence to the overall conclusions.
- One notable difference is that Pregnancies appears less important to the outcome than its correlation suggests. This is possibly because Age and Pregnancies are themselves strongly correlated (0.54), and Age ultimately proved the better indicator; a quick check of that correlation follows below.
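A one-line sanity check of the Age/Pregnancies correlation mentioned above:
# Pairwise correlation between Age and Pregnancies.
print('Corr(Age, Pregnancies): {:.2f}'.format(d['Age'].corr(d['Pregnancies'])))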
This content is taken from notes I took while pursuing the Intro to Machine Learning with PyTorch Nanodegree certification.