Learning Curve Examples

The difference between underfit (high bias), overfit (high variance), and appropriately fit models is shown below.

Read Data

import pandas as pd
import numpy as np
data = pd.read_csv('learning-curve-examples/data.csv')
print(data.shape)
X = np.array(data[['x1', 'x2']])
y = np.array(data['y'])
(100, 3)
np.random.seed(42)  # fix the seed so the shuffle below is reproducible

Visualize Data

%matplotlib notebook

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
plt.figure()

cmap_bold  = ListedColormap(['#FF1111', '#1111FF'])
plt.scatter(X[:, 0],
            X[:, 1],
            s=20,
            c=y,
            cmap=cmap_bold,
            zorder=3)

plt.grid(True, zorder=0)
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.tick_params(
    axis='x',
    bottom=False)
plt.tick_params(
    axis='y',
    left=False)
    
plt.title('The Data');

Initialize Three Machine Learning Models

from sklearn.model_selection import learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
### Logistic Regression
logistic_regression = LogisticRegression()

### Decision Tree (implemented here as a GradientBoostingClassifier,
### i.e. a boosted ensemble of decision trees)
decision_tree = GradientBoostingClassifier()

### Support Vector Machine
### (gamma=1000 gives a very narrow RBF kernel, which encourages overfitting)
support_vector_machine = SVC(kernel='rbf', gamma=1000)

Define Utility Functions

Randomize Data

def randomize(X, Y):
    # shuffle X and Y together with one random permutation of the row indices
    permutation = np.random.permutation(Y.shape[0])
    X2 = X[permutation, :]
    Y2 = Y[permutation]
    
    return X2, Y2
X_rand, y_rand = randomize(X, y)

Draw Learning Curves

def draw_learning_curves(estimator, title):
    plt.figure()
    
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X_rand, y_rand, cv=None, n_jobs=1,
        train_sizes=range(5, 85, 5))

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()

    plt.title(title + " - Learning Curves")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    
    plt.gca().set_ylim([0.35, 1.05])
    for spine in plt.gca().spines.values():
        spine.set_visible(False)
    plt.tick_params(
        axis='x',
        bottom=False)
    plt.tick_params(
        axis='y',
        left=False)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='g')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='y')
    plt.plot(train_sizes, train_scores_mean, 'o-', color="g",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="y",
             label="Cross-validation score")

    plt.legend(loc="best")

    plt.show()

Create Plots

for model, title in [(logistic_regression,    'Logistic Regression'),
                     (decision_tree,          'Decision Tree'),
                     (support_vector_machine, 'Support Vector Machine')]:
    draw_learning_curves(model, title)

The curves above have characteristic shapes. In particular:

  • Logistic Regression: underfits. Both the training and cross-validation scores are low, below 70%.

  • Decision Tree: appropriately fit. The cross-validation score rises with more training examples and approaches the training score.

  • Support Vector Machine: overfits. The training score stays high, but the cross-validation score never rises to meet it. (A quick numeric check of these gaps is sketched below.)
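
These readings can be checked numerically. The sketch below is an addition (it is not original notebook output) and assumes the same learning_curve call used in draw_learning_curves; it prints each model's final mean training and cross-validation scores and the gap between them. Low scores with a small gap point to underfitting, while a large gap points to overfitting.

# Sketch: final mean training and cross-validation scores and their gap,
# reusing X_rand, y_rand, and the estimators defined above.
for model, title in [(logistic_regression,    'Logistic Regression'),
                     (decision_tree,          'Decision Tree'),
                     (support_vector_machine, 'Support Vector Machine')]:
    sizes, train_scores, test_scores = learning_curve(
        model, X_rand, y_rand, cv=None, n_jobs=1,
        train_sizes=range(5, 85, 5))
    gap = train_scores[-1].mean() - test_scores[-1].mean()
    print(f'{title}: train={train_scores[-1].mean():.2f}, '
          f'cv={test_scores[-1].mean():.2f}, gap={gap:.2f}')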

Visualize the Decision Boundaries

The decision boundaries plotted below confirm these characterizations; a short accuracy check after the plots backs them up numerically.

from sklearn.model_selection import train_test_split
cmap_light = ListedColormap(['#FFCCCC', '#CCCCFF'])
cmap_bold = ListedColormap(['#FF1111', '#1111FF'])

mesh_step_size = .01
mark_symbol_size = 5
def plot_decision_boundaries(model, title):
    plt.figure()
    
    # hold out part of the data, then fit the model on the training split only
    X_train, X_test, y_train, y_test = train_test_split(X_rand, y_rand, random_state=42)

    model.fit(X_train, y_train)

    x_min = X_rand[:,0].min()
    x_max = X_rand[:,0].max()
    y_min = X_rand[:,1].min()
    y_max = X_rand[:,1].max()
    
    # Plot Decision Boundaries
    xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step_size),
                         np.arange(y_min, y_max, mesh_step_size))
    Z = model.predict(np.c_[xx.ravel(), 
                            yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot Points
    plt.scatter(X_rand[:, 0],
                X_rand[:, 1],
                s=mark_symbol_size,
                c=y_rand,
                cmap=cmap_bold)
    plt.title(title)
    
    for spine in plt.gca().spines.values():
        spine.set_visible(False)
    plt.xticks([x_min,x_max])
    plt.yticks([y_min,y_max]);
for model, title in [(logistic_regression,    'Logistic Regression'),
                     (decision_tree,          'Decision Tree'),
                     (support_vector_machine, 'Support Vector Machine')]:
    plot_decision_boundaries(model, title)
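
As a final check (an added sketch, not original notebook output), the same split parameters used in plot_decision_boundaries can be reused to compare each model's training accuracy with its held-out accuracy: the underfit logistic regression should score low on both, while the overfit support vector machine should show a large drop from train to test.

# Sketch: compare training vs. held-out accuracy for each model, using the
# same split parameters as plot_decision_boundaries (random_state=42).
X_train, X_test, y_train, y_test = train_test_split(X_rand, y_rand, random_state=42)
for model, title in [(logistic_regression,    'Logistic Regression'),
                     (decision_tree,          'Decision Tree'),
                     (support_vector_machine, 'Support Vector Machine')]:
    model.fit(X_train, y_train)
    print(f'{title}: train accuracy = {model.score(X_train, y_train):.2f}, '
          f'test accuracy = {model.score(X_test, y_test):.2f}')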