In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

K-Folds cross-validator
===

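Provides train/test indices to split data into train/test sets. The dataset is split into k consecutive folds (without shuffling by default); each fold is then used once as the test set while the remaining k-1 folds form the training set.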

In [None]:
# fill in the folder where you saved the data
my_folder = 'my/data/folder'

# Read the feature table and the label table, discarding the ID column
# right away so that it is not used for training.
X = pd.read_csv(my_folder+'/X_train.csv').drop(columns=['ID'])
y = pd.read_csv(my_folder+'/y_train.csv').drop(columns=['ID'])

How much can we trust the random forest classifier with standard parameters?
---

In [None]:
from sklearn.model_selection import KFold

# Baseline experiment: estimate the accuracy of a RandomForestClassifier with
# its default hyper-parameters using k-fold cross-validation.
n_splits = 5

# initialize classifier (random_state is fixed so reruns are reproducible)
clf = RandomForestClassifier(random_state=0)

# KFold with default settings: consecutive folds, no shuffling
k_fold = KFold(n_splits=n_splits)

# one accuracy value per fold
accuracy_scores = np.zeros(n_splits)
for k, (train_index, test_index) in enumerate(k_fold.split(X, y)):
    print("Fold:", k, " TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # the first column of y is used as the label vector
    y_train, y_test = y.iloc[train_index, 0], y.iloc[test_index, 0]

    # train classifier on the training folds
    clf.fit(X_train, y_train)

    # test classifier on the held-out fold
    y_pred = clf.predict(X_test)

    # calculate accuracy score
    accuracy_scores[k] = accuracy_score(y_test, y_pred)
    print("Accuracy: %.3f\n" % accuracy_scores[k])

# summary over all folds
print("Average accuracy score: %.2f" % np.mean(accuracy_scores))
print("Accuracy score standard deviation: %.2f" % np.std(accuracy_scores))

Try a different number of trees (`n_estimators`; the default is 100 since scikit-learn 0.22, it was 10 in earlier versions)
---

In [None]:
from sklearn.model_selection import KFold

# Same k-fold experiment as before, but with an explicit number of trees.
n_splits = 5
n_estimators = 5
# alternative value to try: n_estimators = 20

# initialize classifier (random_state is fixed so reruns are reproducible)
clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0)

# KFold with default settings: consecutive folds, no shuffling
k_fold = KFold(n_splits=n_splits)

# one accuracy value per fold
accuracy_scores = np.zeros(n_splits)
for k, (train_index, test_index) in enumerate(k_fold.split(X, y)):
    print("Fold:", k, " TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # the first column of y is used as the label vector
    y_train, y_test = y.iloc[train_index, 0], y.iloc[test_index, 0]

    # train classifier on the training folds
    clf.fit(X_train, y_train)

    # test classifier on the held-out fold
    y_pred = clf.predict(X_test)

    # calculate accuracy score
    accuracy_scores[k] = accuracy_score(y_test, y_pred)
    print("Accuracy: %.3f\n" % accuracy_scores[k])

# summary over all folds
print("Average accuracy score: %.2f" % np.mean(accuracy_scores))
print("Accuracy score standard deviation: %.2f" % np.std(accuracy_scores))

Automatically find the best parameters with GridSearchCV
---

In [None]:
from sklearn.model_selection import GridSearchCV

# Random forest hyper-parameters that are natural candidates for tuning:
#   n_estimators - the number of trees in the forest
#   max_features - the number of features to consider when looking for the best split
# Only n_estimators is part of the grid searched below.
parameters = {
    'n_estimators': [5, 10, 15, 20],
}
rf = RandomForestClassifier(random_state=0)

# wrap the forest in a grid search over the candidate values
clf = GridSearchCV(estimator=rf, param_grid=parameters)

# fit on all rows, using the first column of y as the label vector
clf.fit(X, y.iloc[:, 0])

In [None]:
# Report the parameter combination selected by the grid search and the
# score recorded for it (`best_params_` / `best_score_`).
best_params, best_score = clf.best_params_, clf.best_score_
print("Best parameters:", best_params)
print("Best score:", best_score)

Now try finding the optimal parameters for another classifier.