Classification Model Evaluation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (accuracy_score, 
                             precision_score, 
                             recall_score, 
                             f1_score)
from sklearn.ensemble import (BaggingClassifier, 
                              RandomForestClassifier, 
                              AdaBoostClassifier)
from sklearn.svm import SVC
import tests as t

Import and Prepare Data

This example uses the SMS Text Spam classification dataset explored in the Naive Bayes notes. The following imports the data, changes the label to a binary integer, splits it into training and testing data, and transforms the text into sparse matrices of token counts.

# Load the tab-separated SMS collection: column 0 is the label
# ('ham'/'spam'), column 1 is the raw message text.
# pd.read_csv(sep='\t') replaces the deprecated pd.read_table.
df = pd.read_csv('classification-model-evaluation/SMSSpamCollection.txt',
                 sep='\t',
                 header=None,
                 names=['label', 'sms_message'])
# Encode the label as a binary integer: ham -> 0, spam -> 1.
df['label'] = df.label.map({'ham': 0, 'spam': 1})

# Split messages/labels into train and test sets (default 75/25 split);
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                    df['label'],
                                                    random_state=1)

# Learn the vocabulary on the training texts ONLY, then project both
# splits into the same sparse bag-of-words count space.
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

print("Training, Testing Data shape = " + \
      str(training_data.shape) + ', ' + \
      str(testing_data.shape))
Training, Testing Data shape = (4179, 7456), (1393, 7456)

Instantiate Models

# One classifier per family under comparison. NOTE(review): SVC() is
# built with its defaults, so it presumably lacks predict_proba — which
# is consistent with it being left out of the ROC section below.
naive_bayes = MultinomialNB()
svm_mod = SVC()
bag_mod = BaggingClassifier(n_estimators=200)
rf_mod = RandomForestClassifier(n_estimators=200)
ada_mod = AdaBoostClassifier(
    n_estimators=300,
    learning_rate=0.2,
)

Train Models

# Fit every model on the vectorized training documents, in the same
# order as the original cell.
for clf in (naive_bayes, bag_mod, rf_mod, ada_mod, svm_mod):
    clf.fit(training_data, y_train)

Make Predictions

# Predict test-set labels with each fitted model; unpack into the
# per-model names the metric and ROC cells below expect.
naive_bayes_preds, bag_mod_preds, rf_mod_preds, ada_mod_preds, svm_mod_preds = (
    clf.predict(testing_data)
    for clf in (naive_bayes, bag_mod, rf_mod, ada_mod, svm_mod)
)

Calculate Metrics

# Table of (output format, scoring function) pairs; each scorer is
# called as scorer(y_true, y_pred) and reported as a percentage.
scorers = (
    (' Accuracy score: {:2.1f}%', accuracy_score),
    ('Precision score: {:2.1f}%', precision_score),
    ('   Recall score: {:2.1f}%', recall_score),
    ('       F1 score: {:2.1f}%', f1_score),
)

labelled_preds = (
    ('Naive Bayes',   naive_bayes_preds),
    ('Bagging',       bag_mod_preds),
    ('Random Forest', rf_mod_preds),
    ('AdaBoost',      ada_mod_preds),
    ('SVM',           svm_mod_preds),
)

# Report the four classification metrics for every model.
for label, preds in labelled_preds:
    print('\n--- {} ---'.format(label))
    for fmt, scorer in scorers:
        print(fmt.format(scorer(y_test, preds) * 100.))
--- Naive Bayes ---
 Accuracy score: 98.9%
Precision score: 97.2%
   Recall score: 94.1%
       F1 score: 95.6%

--- Bagging ---
 Accuracy score: 97.5%
Precision score: 91.7%
   Recall score: 89.2%
       F1 score: 90.4%

--- Random Forest ---
 Accuracy score: 98.1%
Precision score: 100.0%
   Recall score: 85.4%
       F1 score: 92.1%

--- AdaBoost ---
 Accuracy score: 97.7%
Precision score: 96.9%
   Recall score: 85.4%
       F1 score: 90.8%

--- SVM ---
 Accuracy score: 98.5%
Precision score: 99.4%
   Recall score: 89.2%
       F1 score: 94.0%

From the metrics above, Naive Bayes is the best model on every metric except precision, where the Random Forest outperformed it. Note that despite that outperformance on precision, the Random Forest has the worst overall recall score.

Build the ROC Curve and Calculate the AUC

%matplotlib notebook

import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.metrics import roc_curve, auc, roc_auc_score
from scipy import interp
def build_roc_auc(label, model, X_train, X_test, y_train, y_test):
    """Fit *model*, plot its ROC curve on the test set, and title with AUC.

    Parameters
    ----------
    label : str
        Model name shown in the figure title.
    model : estimator
        A classifier exposing fit() and predict_proba(); column 1 of the
        probability matrix is taken as the positive-class score.
    X_train, X_test
        Feature matrices (e.g. the sparse count matrices built above).
    y_train, y_test
        Binary (0/1) target vectors.
    """
    plt.figure()

    # Probability of the positive class for each test sample.
    y_scores = model.fit(X_train, y_train).predict_proba(X_test)[:, 1]

    # A binary problem needs exactly one ROC curve. (The original code
    # recomputed this identical curve len(y_test) times and then plotted
    # the arbitrary copy at index 2.)
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_area = auc(fpr, tpr)

    plt.plot(fpr, tpr, color='darkorange',
             lw=2, label='ROC curve (area = %0.2f)' % roc_area)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    # AUC must be computed on the raw scores: rounding them first (as the
    # original did) collapses the ranking to hard 0/1 predictions and
    # misreports the area.
    auc_score = roc_auc_score(y_test, y_scores) * 100.

    plt.title('{}\nReceiver Operating Characteristic, AUC = {:2.1f}%'.format(label, auc_score))
# Draw one ROC figure per model. The SVM is omitted here — it was
# instantiated with SVC() defaults, which presumably leaves it without
# predict_proba (probability=True was not set) — TODO confirm.
roc_models = {
    'Naive Bayes': naive_bayes,
    'Bagging': bag_mod,
    'Random Forest': rf_mod,
    'AdaBoost': ada_mod,
}
for label, model in roc_models.items():
    build_roc_auc(label, model, training_data, testing_data, y_train, y_test)
<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>