Grid Search Examples

These notes demonstrate using Grid Search to tune the hyper-parameters of a model so that it does not overfit.

Read and plot the data.

%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def load_pts(dataframe):
    """Split a dataframe into features/labels and scatter-plot the two classes."""
    data = np.asarray(dataframe)
    X = data[:, 0:2]  # first two columns are the features
    y = data[:, 2]    # third column is the 0/1 class label

    plt.figure()
    
    plt.xlim(-2.05,2.05)
    plt.ylim(-2.05,2.05)
    plt.grid(True, zorder=0)
    
    plt.scatter(X[np.argwhere(y==0).flatten(),0], 
                X[np.argwhere(y==0).flatten(),1],
                s = 20, 
                color = 'blue',
                zorder=3)
    plt.scatter(X[np.argwhere(y==1).flatten(),0], 
                X[np.argwhere(y==1).flatten(),1],
                s = 20, 
                color = 'red',
                zorder=3)
    
    for spine in plt.gca().spines.values():
        spine.set_visible(False)
    plt.tick_params(
        axis='x',
        bottom=False)
    plt.tick_params(
        axis='y',
        left=False)
    
    plt.title('The Data')    

    return X, y
df = pd.read_csv('grid-search-examples/data.csv', header=None)
df.head()

          0         1    2
0  0.336494 -0.985951  0.0
1 -0.011043 -0.105529  0.0
2  0.238160 -0.617417  1.0
3 -0.366783 -0.713819  1.0
4  1.221923 -1.039399  0.0
X, y = load_pts(df)
[Figure: scatter plot of the two classes, titled 'The Data']

Split the data into training and testing sets

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer

# random_state makes the split reproducible; seeding Python's random module is not needed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
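Since this is a classification problem, it can also help to stratify the split so both sets keep the same class balance. A minimal variant (optional; the rest of these notes use the unstratified split above):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)  # preserves the 0/1 class ratio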

Fit a Decision Tree model

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=42)

clf.fit(X_train, y_train)

original_train_predictions = clf.predict(X_train)
original_test_predictions = clf.predict(X_test)

Plot the Decision Boundaries

from sklearn.metrics import f1_score
def plot_model(clf, train_predictions, test_predictions):
    """Plot the data, the model's decision regions, and the train/test F1 scores."""
    plt.figure()
    plt.scatter(X[np.argwhere(y==0).flatten(),0], 
                X[np.argwhere(y==0).flatten(),1],
                s = 20, 
                color = 'blue',
                zorder=3)
    plt.scatter(X[np.argwhere(y==1).flatten(),0], 
                X[np.argwhere(y==1).flatten(),1],
                s = 20, 
                color = 'red',
                zorder=3)
    
    for spine in plt.gca().spines.values():
        spine.set_visible(False)
    plt.tick_params(
        axis='x',
        bottom=False)
    plt.tick_params(
        axis='y',
        left=False)

    plt.xlim(-2.05,2.05)
    plt.ylim(-2.05,2.05)
    plt.grid(True, zorder=0)

    # Evaluate the classifier on a 300 x 300 mesh covering the plot area
    r = np.linspace(-2.1, 2.1, 300)
    s, t = np.meshgrid(r, r)
    s = np.reshape(s, (np.size(s), 1))
    t = np.reshape(t, (np.size(t), 1))
    h = np.concatenate((s, t), 1)

    z = clf.predict(h)

    s = s.reshape((np.size(r),np.size(r)))
    t = t.reshape((np.size(r),np.size(r)))
    z = z.reshape((np.size(r),np.size(r)))

    plt.contourf(s,
                 t,
                 z,
                 colors = ['blue','red'],
                 alpha = 0.2,
                 levels = range(-1,2))
    if len(np.unique(z)) > 1:
        plt.contour(s,
                    t,
                    z,
                    colors = 'k',
                    linewidths = 0.125)
    
    # f1_score expects (y_true, y_pred) in that order
    title_str = ('Training / Test F1 Scores:   {:1.2} / {:1.2}'
                 .format(f1_score(y_train, train_predictions),
                         f1_score(y_test, test_predictions)))
    plt.title(title_str)
plot_model(clf, original_train_predictions, original_test_predictions)
[Figure: decision boundaries of the unpruned tree, with training/test F1 scores in the title]

Both the jagged shape of the decision boundaries and the gap between the training and test F1 scores indicate that the model has overfit the data.
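To put numbers on that gap, we can print both scores directly, reusing the predictions computed above:

print('Training F1: {:1.2}'.format(f1_score(y_train, original_train_predictions)))
print('Test F1:     {:1.2}'.format(f1_score(y_test, original_test_predictions)))

An unpruned decision tree can memorize the training set, so the training F1 will sit at or near 1.0 while the test F1 lags well behind.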

Use Grid Search to improve the model

1. Import libraries.

from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV

2. Define the model.

clf = DecisionTreeClassifier(random_state=42)

3. Define the hyper-parameter search space.

parameters = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
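This grid is larger than it looks: 10 x 10 x 9 = 900 candidate settings, and GridSearchCV refits the model once per candidate per cross-validation fold (five folds by default in recent scikit-learn versions, so 4500 fits here). ParameterGrid confirms the count:

from sklearn.model_selection import ParameterGrid
print(len(ParameterGrid(parameters)))  # 900 candidate hyper-parameter combinations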

4. Define the scorer.

scorer = make_scorer(f1_score)
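make_scorer wraps a plain metric into the callable signature GridSearchCV expects, scorer(estimator, X, y). For example, applying it to the overfit tree from above reproduces its test F1:

scorer(clf, X_test, y_test)  # equivalent to f1_score(y_test, clf.predict(X_test))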

5. Create a GridSearchCV object with the parameters and the scorer. Then, fit it to the training data.

grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train, y_train)
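Before pulling out the estimator itself, it is worth checking which settings won and how well they cross-validated (best_score_ is the mean F1 across the CV folds on the training data, not the test score):

print(grid_fit.best_params_)
print('Best CV F1: {:1.2}'.format(grid_fit.best_score_))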

6. Get the best estimator.

best_clf = grid_fit.best_estimator_
best_clf
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

7. Fit the best estimator. (With GridSearchCV's default refit=True, best_estimator_ has already been refit on the full training set, so this call is redundant but harmless.)

best_clf.fit(X_train, y_train);

8. Make predictions using the new model.

best_train_predictions = best_clf.predict(X_train)
best_test_predictions = best_clf.predict(X_test)

9. Plot the new Model.

plot_model(best_clf, best_train_predictions, best_test_predictions)
[Figure: decision boundaries of the tuned tree, with training/test F1 scores in the title]
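As a final check, compare the two models on the held-out test set, reusing the predictions already computed:

print('Original test F1: {:1.2}'.format(f1_score(y_test, original_test_predictions)))
print('Tuned test F1:    {:1.2}'.format(f1_score(y_test, best_test_predictions)))

The tuned tree gives up some training accuracy (it can no longer memorize every point) in exchange for a higher test score and smoother boundaries, which is exactly the overfitting reduction Grid Search was meant to deliver.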