In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

# Apply matplotlib's built-in 'bmh' style sheet to the figures drawn in this notebook.
plt.style.use('bmh')

Exploratory data analysis
===

In [None]:
# read data

# fill in the folder where you saved the data
my_folder = 'my/data/folder'

# Load the feature table (X) and the label table (y). The 'ID' column of each
# is dropped right after reading so that it is not used for training.
X = pd.read_csv(my_folder + '/X_train.csv').drop(columns=['ID'])
y = pd.read_csv(my_folder + '/y_train.csv').drop(columns=['ID'])

In [None]:
# Load the activity lookup table. The file is read without a header row, so
# the two columns are named explicitly.
activity_labels = pd.read_csv(my_folder + '/activity_labels.txt',
        sep=r'\s+',  # any run of whitespace separates fields (replaces the deprecated delim_whitespace=True)
        header=None,
        names=('ID', 'activity'))  # use these names as the header

Look at the names of the activities.

In [None]:
# Display the activity lookup table (columns 'ID' and 'activity').
activity_labels

Look at the names of the variables in this dataset.

In [None]:
# Collect the column names of X and print them one per line.
feature_names = X.columns.tolist()
for column_name in feature_names:
    print(column_name)

Basic stats
---

In [None]:
# Print pandas' concise summary of X (index, column dtypes, memory usage).
X.info()

In [None]:
# Show the first five rows of X (the default for head()).
X.head()

In [None]:
# Per-column descriptive statistics of X (count, mean, std, min, quartiles, max).
X.describe()

Histograms
---

In [None]:
# Overlay count histograms of the three tBodyAcc-Mean columns on one axis.
# Drawn with matplotlib's Axes.hist because seaborn.distplot is deprecated and
# scheduled for removal; with kde=False it only forwarded to Axes.hist anyway.
fig, ax = plt.subplots(figsize=(15, 6))
for column, colour in zip(('tBodyAcc-Mean-1', 'tBodyAcc-Mean-2', 'tBodyAcc-Mean-3'),
                          ('g', 'r', 'b')):
    # distplot discarded missing values before binning; do the same explicitly
    ax.hist(X[column].dropna(), bins=200, color=colour, alpha=0.4, label=column)
ax.set_xlabel('tBodyAcc-Mean')
ax.set_ylabel('count')
ax.legend();

In [None]:
# Overlay count histograms of the three tBodyAcc-STD columns on one axis.
# Drawn with matplotlib's Axes.hist because seaborn.distplot is deprecated and
# scheduled for removal; with kde=False it only forwarded to Axes.hist anyway.
fig, ax = plt.subplots(figsize=(15, 6))
for column, colour in zip(('tBodyAcc-STD-1', 'tBodyAcc-STD-2', 'tBodyAcc-STD-3'),
                          ('g', 'r', 'b')):
    # distplot discarded missing values before binning; do the same explicitly
    ax.hist(X[column].dropna(), bins=200, color=colour, alpha=0.4, label=column)
ax.set_xlabel('tBodyAcc-STD')
ax.set_ylabel('count')
ax.legend();

Scatter plots
---

In [None]:
# Scatter tBodyAcc-Mean-1 against tBodyAcc-Mean-2, one point per row of X,
# coloured by the first column of y with a 12-level 'nipy_spectral' colormap.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
plt.figure(figsize=(10,10))
plt.scatter(X['tBodyAcc-Mean-1'], X['tBodyAcc-Mean-2'],
            c=y.iloc[:,0], cmap=plt.get_cmap('nipy_spectral', 12))
plt.xlabel('tBodyAcc-Mean-1')
plt.ylabel('tBodyAcc-Mean-2')
plt.show()

In [None]:
# Scatter tBodyAcc-Mean-1 against tBodyAcc-Mean-3, one point per row of X,
# coloured by the first column of y with a 12-level 'nipy_spectral' colormap.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
plt.figure(figsize=(10,10))
plt.scatter(X['tBodyAcc-Mean-1'], X['tBodyAcc-Mean-3'],
            c=y.iloc[:,0], cmap=plt.get_cmap('nipy_spectral', 12))
plt.xlabel('tBodyAcc-Mean-1')
plt.ylabel('tBodyAcc-Mean-3')
plt.show()

Correlation matrices
---

In [None]:
# Pairwise correlations of the first 30 columns of X. Only entries that are
# >= 0.5 or <= -0.4 are kept; all others become NaN before plotting.
corr = X.iloc[:, 0:30].corr()
strong = (corr >= 0.5) | (corr <= -0.4)

plt.figure(figsize=(15, 14))
sns.heatmap(
    corr.where(strong),
    vmin=-1.0, vmax=1.0, cmap='viridis',
    square=True, linewidths=0.1,
    annot=True, annot_kws={"size": 8},
);

In [None]:
# Pairwise correlations of columns 300-329 of X. Only entries that are
# >= 0.5 or <= -0.4 are kept; all others become NaN before plotting.
corr = X.iloc[:, 300:330].corr()
strong = (corr >= 0.5) | (corr <= -0.4)

plt.figure(figsize=(15, 14))
sns.heatmap(
    corr.where(strong),
    vmin=-1.0, vmax=1.0, cmap='viridis',
    square=True, linewidths=0.1,
    annot=True, annot_kws={"size": 8},
);

Plotting average acceleration for the first subject
---

In [None]:
# Read the subject ids. The file is read without a header row and its column
# is named 'subject_id'. The first rows are shown as a quick check.
subject_id_train = pd.read_csv(
    my_folder + '/subject_id_train.txt',
    names=['subject_id'],
    header=None,
)
subject_id_train.head()

In [None]:
# Plot the three tBodyAcc-Mean columns for the rows whose subject_id is 1:
# one panel per column, row index on the x-axis, points coloured by TARGET.
# NOTE(review): assumes subject_id_train is row-aligned with X and y — confirm.
idx = subject_id_train == 1
row_idx = idx.iloc[:,0]                        # boolean Series, True where subject_id == 1
indices = idx.index.values[row_idx.values]     # index labels of the selected rows

# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
cmap = plt.get_cmap('nipy_spectral', 12)
# A single .loc lookup instead of the chained y[row_idx]['TARGET'] indexing.
target = y.loc[row_idx, 'TARGET'].values

f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,8))
for ax, column in zip((ax1, ax2, ax3),
                      ('tBodyAcc-Mean-1', 'tBodyAcc-Mean-2', 'tBodyAcc-Mean-3')):
    ax.scatter(indices, X.loc[row_idx, column], c=target, cmap=cmap)
    ax.set_title(column)
plt.show()

PCA
---

In [None]:
# Print the concise summary of X again (same call as in the "Basic stats" section).
X.info()

In [None]:
from sklearn.decomposition import PCA
# No n_components given, so scikit-learn keeps min(n_samples, n_features) components.
pca = PCA()
# Fit on the whole feature table X; PCA is unsupervised, so y is not passed.
pca.fit(X)

In [None]:
# Cumulative share of the total explained variance, component by component.
plt.figure(figsize=(15, 10))
variances = pca.explained_variance_
plt.plot(variances.cumsum() / variances.sum())

In [None]:
# Weights of the first principal component, one per feature of X.
# scikit-learn stores components_ with shape (n_components, n_features), so a
# component is a ROW. Slicing column 0 would instead return the weight of the
# first feature in every component.
pc0 = pca.components_[0]
plt.figure(figsize=(15,4))
plt.plot(pc0)

In [None]:
# Position of the largest-magnitude entry of pc0, and the name of the column
# of X found at that position.
ind0 = np.abs(pc0).argmax(axis=0)
print(ind0)
headers = X.columns.tolist()
headers[ind0]

In [None]:
# Weights of the second principal component, one per feature of X.
# components_ has shape (n_components, n_features), so component 1 is row 1.
# Slicing column 1 would instead return the weight of the second feature in
# every component.
pc1 = pca.components_[1]
# Position of the largest-magnitude weight and the matching column name of X.
ind1 = np.argmax(np.abs(pc1), axis=0)
print(ind1)
headers = list(X.columns.values)
headers[ind1]

In [None]:
# Scatter the two columns of X selected via ind0 and ind1 against each other,
# coloured by the first column of y with a 12-level 'nipy_spectral' colormap.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
plt.figure(figsize=(10,10))
plt.scatter(X[headers[ind0]], X[headers[ind1]],
            c=y.iloc[:,0], cmap=plt.get_cmap('nipy_spectral', 12))
plt.xlabel(headers[ind0])
plt.ylabel(headers[ind1])
plt.show()

Logistic Regression
---

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model

# Hold out 40% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.iloc[:, 0], test_size=0.4, random_state=0)

# Fit a logistic regression with scikit-learn's default settings on the
# training part (fit() returns the estimator itself).
clf = linear_model.LogisticRegression().fit(X_train, y_train)

# Predict the held-out part.
y_pred = clf.predict(X_test)

# Report the confusion matrix (rows: actual, columns: predicted) and the
# fraction of held-out rows predicted correctly.
accuracy = (y_test == y_pred).mean()
t = pd.crosstab(y_test, y_pred, rownames=['actual'], colnames=['predictions'])
print("\nConfusion matrix:")
print(t)
print("\nAccuracy of Logistic Regression: %.3f\n" % accuracy)

Perform PCA
---

In [None]:
# Logistic regression on a 200-component PCA projection of the features.
#
# Split FIRST, then fit the PCA on the training rows only and merely apply it
# to the test rows. Fitting the projection on all of X would let the held-out
# rows shape the features the classifier is later scored on (data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y.iloc[:,0], test_size=0.4, random_state=0)

# random_state keeps the projection repeatable if scikit-learn selects its
# randomized SVD solver.
pca = PCA(n_components=200, random_state=0)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Projection of the full table, kept under its original name; it is not used
# for the evaluation below.
X_pca = pca.transform(X)

# initialize classifier
clf = linear_model.LogisticRegression()

# train classifier
clf.fit(X_train,y_train)

# test classifier
y_pred = clf.predict(X_test)

# print results: confusion matrix (rows: actual, columns: predicted) and accuracy
t = pd.crosstab(y_test, y_pred, rownames=['actual'], colnames=['predictions'])
print("\nConfusion matrix:")
print(t)
print("\nAccuracy of Logistic Regression: %.3f\n" % np.mean(y_test == y_pred))

Chaining a PCA and a logistic regression
---

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Two-step pipeline: PCA projection followed by logistic regression.
pca = PCA()
logistic = linear_model.LogisticRegression()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# Candidate values explored by the grid search.
n_components = [100, 200, 300]
Cs = np.logspace(-2, 2, 3)  # 0.01, 1, 100

# Pipeline parameters are addressed as '<step name>__<parameter name>'.
param_grid = {
    'pca__n_components': n_components,
    'logistic__C': Cs,
}
estimator = GridSearchCV(pipe, param_grid)
estimator.fit(X, y.iloc[:, 0])

In [None]:
# Plot the PCA spectrum
# Fit the PCA object on all of X to obtain the explained-variance spectrum.
pca.fit(X)

# Cumulative fraction of the variance explained by the leading components.
cumulative = np.cumsum(pca.explained_variance_) / np.sum(pca.explained_variance_)
# Entry i of the cumulative sum covers i + 1 components, so plot against
# 1..n. The vertical marker below then sits on the curve value for exactly the
# chosen number of components instead of the value for one component more.
component_counts = np.arange(1, len(cumulative) + 1)

plt.figure(1, figsize=(15, 10))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(component_counts, cumulative, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
# The curve is the cumulative ratio, not the raw explained_variance_ values.
plt.ylabel('cumulative explained variance ratio')

# Mark the number of components selected by the grid search.
plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
plt.legend(prop=dict(size=12))
plt.show()

In [None]:
# Parameter combination that gave the best cross-validated result in the grid search.
estimator.best_params_