Random Search Diabetes Example
Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")
1. Read Data
d = pd.read_csv('random-search-diabetes-example/diabetes.csv')
print(str(d.shape[0]) + ' rows, ' + str(d.shape[1]) + ' columns\n')
print(d.info())
d.head()
768 rows, 9 columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 768 non-null int64
2 BloodPressure 768 non-null int64
3 SkinThickness 768 non-null int64
4 Insulin 768 non-null int64
5 BMI 768 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
2. Perform Basic Exploratory Data Analysis
Define a function that plots a boxplot and histogram (i.e., the basic summary stats) for each feature.
def plot_stats(field, bins=None, xlim_list=None, xtick_list=None):
fig, (b, h) = plt.subplots(ncols=1, nrows=2,
sharex=True,
gridspec_kw={'height_ratios':[1,3]},
figsize=(12,3))
b.boxplot(d[field],
widths=0.6,
vert=False)
b.set_title(field,
fontsize=14)
b.set_yticks([])
b.set_xticks(xtick_list)
h.hist(d[field],
bins=bins,
align='left')
h.set_xlim(xlim_list)
h.set_xticks(xtick_list)
h.grid(True)
for ax in [b, h]:
for spine in ax.spines.values():
spine.set_visible(False)
ax.tick_params(
axis='x',
bottom=False)
ax.tick_params(
axis='y',
left=False,
right=False)
params = [('Pregnancies',np.arange(0,19,1),[-1,19],np.arange(0,21,3)),
('Glucose',np.arange(0,220,10),[-10,210],np.arange(0,220,20)),
('BloodPressure',np.arange(0,150,7.5),[-10,160],np.arange(0,165,15)),
('SkinThickness',np.arange(0,100,5),[-5,105],np.arange(0,105,5)),
('Insulin',np.arange(0,1000,50),[-50,1050],np.arange(0,1000,100)),
('BMI',np.arange(0,70,5),[-5,75],np.arange(0,80,10)),
('DiabetesPedigreeFunction',np.arange(0,2.5,0.125),
[-0.125,2.625],np.arange(0,3,0.25)),
('Age',np.arange(20,90,2.5),[15,85],np.arange(20,100,10)),
('Outcome',[0,1,2],[-1,2],[0,1])]
for field, bins, xlim_list, x_tick_list in params:
plot_stats(field, bins, xlim_list, x_tick_list)
correlations = d.corr()
plt.figure(figsize=(12,6))
sns.heatmap(correlations, annot=True, cmap='YlGnBu');
diabetes_proportion = d[d['Outcome']==1].shape[0] / d.shape[0]
print('Proportion of diabetes outcomes:\t{:1.2%}'.format(diabetes_proportion))
missing_data_points = d.isna().sum().sum()
print('Number of missing data points:\t\t{}'.format(missing_data_points))
highest_corr = correlations['Outcome']['Glucose']
print('Highest correlation with Outcomes:\tGlucose, {:3.3f}'.format(highest_corr))
Proportion of diabetes outcomes: 34.90%
Number of missing data points: 0
Highest correlation with Outcomes: Glucose, 0.467
Some takeaways:
- Proportion of diabetes outcomes is roughly one third.
- There are no explicitly missing values, but several columns contain zeros where a zero is not physically plausible (e.g., BloodPressure, BMI), suggesting that missing values were filled with zeros somewhere upstream; see the quick check after this list.
- The Age feature is right-skewed.
- The Glucose feature distribution is roughly symmetric. It is also the variable with the strongest correlation with the outcome.
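As a quick follow-up on the implausible-zero observation, the affected columns can be counted and, if desired, converted to NaN for later imputation. This is a minimal sketch, and the list of columns treated as "zero means missing" is an assumption, not part of the original notebook:
# Count physiologically implausible zeros; optionally treat them as missing.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print(d[zero_as_missing].eq(0).sum())
# d[zero_as_missing] = d[zero_as_missing].replace(0, np.nan)  # optional follow-up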
3. Split into Features (X) and Labels (y), Then into Training and Testing Datasets
y = d['Outcome']
X = d[d.columns[:-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('''X_train, y_train: {}, {}
X_test, y_test: {}, {}'''.format(X_train.shape, y_train.shape,
X_test.shape, y_test.shape))
X_train, y_train: (614, 8), (614,)
X_test, y_test: (154, 8), (154,)
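Because only about a third of the outcomes are positive, it can be worth confirming that the split preserves the class balance. A small optional check (passing stratify=y to train_test_split would enforce this, though that was not done above):
# Verify that the positive-class proportion is similar in both splits.
print('Train positives: {:1.2%}'.format(y_train.mean()))
print('Test positives:  {:1.2%}'.format(y_test.mean()))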
4. Perform Randomized Search and Make Predictions
Random Forest
random_forest = RandomForestClassifier()
param_dict = {"max_depth": [3, None],
"n_estimators": list(range(10, 200)),
"max_features": list(range(1, X_test.shape[1]+1)),
"min_samples_split": list(range(2, 11)),
"min_samples_leaf": list(range(1, 11)),
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
random_search = RandomizedSearchCV(random_forest,
param_distributions=param_dict)
random_search.fit(X_train, y_train)
best_random_forest = random_search.best_estimator_
rf_preds = best_random_forest.predict(X_test)
best_random_forest
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features=7,
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=9, min_samples_split=10,
min_weight_fraction_leaf=0.0, n_estimators=60,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
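For reference, the fitted RandomizedSearchCV object also records the cross-validated score of each sampled candidate. A short sketch of how the search itself could be inspected (only n_iter=10 parameter settings are sampled by default, so results will vary from run to run):
# Inspect the search: best cross-validated score, winning parameters,
# and the per-candidate results table.
print('Best CV accuracy: {:.3f}'.format(random_search.best_score_))
print('Best parameters: ', random_search.best_params_)
cv_results = pd.DataFrame(random_search.cv_results_)
cv_results[['mean_test_score', 'std_test_score', 'params']].head()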
AdaBoost
ada_boost = AdaBoostClassifier()
param_dict = {'n_estimators' : [10, 50, 100, 200, 400],
'learning_rate' : [0.001, 0.005, .01, 0.05, 0.1, 0.2,
0.3, 0.4, 0.5, 1, 2, 10, 20]}
random_search = RandomizedSearchCV(ada_boost,
param_distributions=param_dict)
random_search.fit(X_train, y_train)
best_adaboost = random_search.best_estimator_
ab_preds = best_adaboost.predict(X_test)
best_adaboost
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1,
n_estimators=50, random_state=None)
SVM
svm = SVC()
param_dict = {'C' : [0.1, 0.5, 1, 3, 5, 10],
'kernel' : ['linear','poly','rbf','sigmoid'],
'degree' : [2, 3, 4, 5, 6]}
random_search = RandomizedSearchCV(svm,
param_distributions=param_dict)
random_search.fit(X_train, y_train)
best_svm = random_search.best_estimator_
sv_preds = best_svm.predict(X_test)
best_svm
SVC(C=0.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=6, gamma='scale', kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
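One caveat not addressed in the original search: SVC kernels are sensitive to feature scale, and these features span very different ranges (e.g., Insulin vs. DiabetesPedigreeFunction). A sketch of how a scaled variant could be searched instead, assuming a StandardScaler + SVC pipeline (the parameter names gain the svc__ prefix that Pipeline uses); this is an optional extension, not part of the original notebook:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features before the SVC, then search the same hyperparameter space.
scaled_svm = make_pipeline(StandardScaler(), SVC())
scaled_param_dict = {'svc__C': [0.1, 0.5, 1, 3, 5, 10],
                     'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                     'svc__degree': [2, 3, 4, 5, 6]}
scaled_search = RandomizedSearchCV(scaled_svm, param_distributions=scaled_param_dict)
scaled_search.fit(X_train, y_train)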
5. Print results
def print_accuracy_stats(model, preds):
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
print('\n--- {} ---'.format(model))
print(' Accuracy score: {:2.1f}%'.format(accuracy*100.))
print('Precision score: {:2.1f}%'.format(precision*100.))
print(' Recall score: {:2.1f}%'.format(recall*100.))
print(' F1 score: {:2.1f}%'.format(f1*100.))
print_accuracy_stats('Random Forest', rf_preds)
print_accuracy_stats('AdaBoost', ab_preds)
print_accuracy_stats('SVM', sv_preds)
--- Random Forest ---
Accuracy score: 76.0%
Precision score: 66.1%
Recall score: 67.3%
F1 score: 66.7%
--- AdaBoost ---
Accuracy score: 76.6%
Precision score: 71.1%
Recall score: 58.2%
F1 score: 64.0%
--- SVM ---
Accuracy score: 75.3%
Precision score: 65.5%
Recall score: 65.5%
F1 score: 65.5%
Takeaways:
- The Random Forest classifier outperformed both the AdaBoost and SVM classifiers on the basis of F1 score.
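For a side-by-side view, the same metrics can be collected into a single table; a small convenience sketch reusing the predictions computed above:
# Gather all four metrics for each model into one DataFrame.
results = pd.DataFrame(
    {name: {'Accuracy': accuracy_score(y_test, preds),
            'Precision': precision_score(y_test, preds),
            'Recall': recall_score(y_test, preds),
            'F1': f1_score(y_test, preds)}
     for name, preds in [('Random Forest', rf_preds),
                         ('AdaBoost', ab_preds),
                         ('SVM', sv_preds)]}).T
print(results.round(3))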
6. Determine the Most Important Features and State Conclusions
def plot_feature_importances():
    plt.figure(figsize=(12,6))
    # Exclude the Outcome label column; importances cover only the 8 features.
    features = d.columns[:-1]
importances = best_random_forest.feature_importances_
indices = np.argsort(importances)
plt.title('Random Forest - Feature Importances')
plt.barh(range(len(indices)),
importances[indices],
color='b',
align='center')
plt.yticks(range(len(indices)),
features[indices])
plt.xlabel('Relative Importance')
for spine in plt.gca().spines.values():
spine.set_visible(False)
plt.gca().tick_params(
axis='x',
bottom=False)
plt.gca().tick_params(
axis='y',
left=False)
plt.grid(True,
axis='x')
def plot_correlations():
    plt.figure(figsize=(12,6))
    # Correlation of each feature with Outcome (Outcome itself excluded), ascending.
    correlations = (d.corr()['Outcome']
                    .drop('Outcome')
                    .sort_values())
    plt.title('Correlations with Outcome')
    plt.barh(range(len(correlations)),
             correlations)
    plt.yticks(range(len(correlations)),
               correlations.index)
plt.xlabel('Correlations')
for spine in plt.gca().spines.values():
spine.set_visible(False)
plt.gca().tick_params(
axis='x',
bottom=False)
plt.gca().tick_params(
axis='y',
left=False)
plt.grid(True,
axis='x')
plot_correlations()
plot_feature_importances()
Conclusions:
- Glucose is decidedly the most important indicator for determining whether a person has diabetes, followed by BMI and Age.
- The relative importances of the features in the Random Forest classifier roughly match the rank order of the features in the initial correlation analysis, which lends confidence to the overall conclusions.
- One notable difference is that Pregnancies appears less important to the outcome than its correlation suggests. This is possibly because Age and Pregnancies are themselves strongly correlated (0.54), and Age ultimately proved the better indicator; a quick check of that correlation follows below.
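A one-line sanity check of the Age/Pregnancies correlation mentioned above:
# Pairwise correlation between Age and Pregnancies.
print('Corr(Age, Pregnancies): {:.2f}'.format(d['Age'].corr(d['Pregnancies'])))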
This content is taken from notes I took while pursuing the Intro to Machine Learning with PyTorch Nanodegree certification.