Learning Curve Examples
The differences between underfit (high bias), overfit (high variance), and appropriately fit models are shown below, first through learning curves and then through decision boundaries.
Read Data
import pandas as pd
import numpy as np
data = pd.read_csv('learning-curve-examples/data.csv')
print(data.shape)
X = np.array(data[['x1', 'x2']])
y = np.array(data['y'])
(100, 3)
np.random.seed(42)
Visualize Data
%matplotlib notebook
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
plt.figure()
cmap_bold = ListedColormap(['#FF1111', '#1111FF'])
plt.scatter(X[:, 0],
            X[:, 1],
            s=20,
            c=y,
            cmap=cmap_bold,
            zorder=3)
plt.grid(True, zorder=0)
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.tick_params(
    axis='x',
    bottom=False)
plt.tick_params(
    axis='y',
    left=False)
plt.title('The Data');
(Figure: scatter plot of the data, colored by class, titled "The Data")
Initialize Three Machine Learning Models
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
### Logistic Regression
logistic_regression = LogisticRegression()
### Decision Tree (a GradientBoostingClassifier, i.e. an ensemble of decision trees)
decision_tree = GradientBoostingClassifier()
### Support Vector Machine
support_vector_machine = SVC(kernel='rbf', gamma=1000)  # very large gamma makes the RBF kernel highly local, encouraging overfitting
Define Utility Functions
Randomize Data
def randomize(X, Y):
    # Shuffle X and Y together so any ordering in the data file
    # does not bias the cross-validation splits used below.
    permutation = np.random.permutation(Y.shape[0])
    X2 = X[permutation, :]
    Y2 = Y[permutation]
    return X2, Y2

X_rand, y_rand = randomize(X, y)
Draw Learning Curves
def draw_learning_curves(estimator, title):
    plt.figure()
    # learning_curve retrains the estimator on increasingly large subsets of the
    # data and reports training and cross-validation scores at each subset size.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X_rand, y_rand, cv=None, n_jobs=1,
        train_sizes=range(5, 85, 5))
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.title(title + " - Learning Curves")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.gca().set_ylim([0.35, 1.05])
    for spine in plt.gca().spines.values():
        spine.set_visible(False)
    plt.tick_params(
        axis='x',
        bottom=False)
    plt.tick_params(
        axis='y',
        left=False)
    # Shaded bands show +/- one standard deviation across the CV folds.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='g')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='y')
    plt.plot(train_sizes, train_scores_mean, 'o-', color="g",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="y",
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
Create Plots
for model, title in [(logistic_regression, 'Logistic Regression'),
                     (decision_tree, 'Decision Tree'),
                     (support_vector_machine, 'Support Vector Machine')]:
    draw_learning_curves(model, title)
(Figures: learning curves for Logistic Regression, Decision Tree, and Support Vector Machine)
The curves above have very characteristic shapes; a rough numeric check of these readings is sketched after the list. In particular:

- Logistic Regression: underfits; both the training and cross-validation scores are low, below roughly 70%.
- Decision Tree: appropriately fit; the cross-validation score increases with more training examples and approaches the training score.
- Support Vector Machine: overfits; the training score stays high, but the cross-validation score never rises to approach it.
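One way to make these readings concrete is to compare the final training and cross-validation scores directly. The helper below is a minimal sketch, not part of the original notebook; it re-runs learning_curve with the same settings as draw_learning_curves, and the gap_threshold and low_score cutoffs are arbitrary assumptions chosen only for illustration.

def summarize_fit(estimator, title, gap_threshold=0.1, low_score=0.7):
    # Re-run learning_curve on the shuffled data with the same settings as above.
    sizes, train_scores, test_scores = learning_curve(
        estimator, X_rand, y_rand, cv=None, n_jobs=1,
        train_sizes=range(5, 85, 5))
    final_train = train_scores[-1].mean()  # training score at the largest training size
    final_cv = test_scores[-1].mean()      # cross-validation score at the largest training size
    gap = final_train - final_cv
    if final_train < low_score and final_cv < low_score:
        verdict = 'likely underfitting (both scores low)'
    elif gap > gap_threshold:
        verdict = 'likely overfitting (large train/CV gap)'
    else:
        verdict = 'reasonable fit (scores converge)'
    print(f"{title}: train={final_train:.2f}, cv={final_cv:.2f}, gap={gap:.2f} -> {verdict}")

for model, title in [(logistic_regression, 'Logistic Regression'),
                     (decision_tree, 'Decision Tree'),
                     (support_vector_machine, 'Support Vector Machine')]:
    summarize_fit(model, title)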
Visualize the Decision Boundaries
The decision boundaries plotted below confirm these fit classifications.
from sklearn.model_selection import train_test_split
cmap_light = ListedColormap(['#FFCCCC', '#CCCCFF'])
cmap_bold = ListedColormap(['#FF1111', '#1111FF'])
mesh_step_size = .01
mark_symbol_size = 5
def plot_decision_boundaries(model, title):
    plt.figure()
    X_train, X_test, y_train, y_test = train_test_split(X_rand, y_rand, random_state=42)
    model.fit(X_train, y_train)
    x_min = X_rand[:, 0].min()
    x_max = X_rand[:, 0].max()
    y_min = X_rand[:, 1].min()
    y_max = X_rand[:, 1].max()
    # Plot decision boundaries by predicting the class of every point on a fine mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step_size),
                         np.arange(y_min, y_max, mesh_step_size))
    Z = model.predict(np.c_[xx.ravel(),
                            yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot the data points on top of the decision regions
    plt.scatter(X_rand[:, 0],
                X_rand[:, 1],
                s=mark_symbol_size,
                c=y_rand,
                cmap=cmap_bold)
    plt.title(title)
    for spine in plt.gca().spines.values():
        spine.set_visible(False)
    plt.xticks([x_min, x_max])
    plt.yticks([y_min, y_max]);
for model, title in [(logistic_regression, 'Logistic Regression'),
                     (decision_tree, 'Decision Tree'),
                     (support_vector_machine, 'Support Vector Machine')]:
    plot_decision_boundaries(model, title)
(Figures: decision boundaries for Logistic Regression, Decision Tree, and Support Vector Machine)
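The fit classifications can also be checked numerically by comparing accuracy on the training split with accuracy on the held-out test split. This is a minimal sketch, not part of the original notes, reusing the same train_test_split settings as plot_decision_boundaries.

X_train, X_test, y_train, y_test = train_test_split(X_rand, y_rand, random_state=42)
for model, title in [(logistic_regression, 'Logistic Regression'),
                     (decision_tree, 'Decision Tree'),
                     (support_vector_machine, 'Support Vector Machine')]:
    model.fit(X_train, y_train)
    # An overfit model scores much higher on the data it was trained on than on the
    # held-out split; an underfit model scores low on both; a well-fit model scores
    # comparably on both.
    print(f"{title}: train accuracy = {model.score(X_train, y_train):.2f}, "
          f"test accuracy = {model.score(X_test, y_test):.2f}")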
This content is taken from notes I took while pursuing the Intro to Machine Learning with PyTorch Nanodegree certification.