In [0]:
%matplotlib inline
from pylab import *



---
# Get the data

* Load the Olivetti Face dataset
* Import the smile/no smile reference data

In [0]:
# Load the Olivetti faces dataset through scikit-learn's dataset helpers.
from sklearn import datasets
faces = datasets.fetch_olivetti_faces()
# Last expression of the cell: displays the names of the fields available on `faces`.
faces.keys()

In [0]:
# Preview the first ten faces side by side on a single row.
for col in range(10):
    subplot(1, 10, col + 1)  # subplot positions are numbered from 1
    imshow(faces.images[col].reshape((64, 64)), cmap='gray')
    axis('off')

In [0]:
# The smile / no-smile annotations are stored in results-smile-GT-BLS.xml, to be downloaded
# from https://blesaux.github.io/teaching/IOGS-machine-learning
# Select that file in the upload dialog opened below; a later cell reads it from "/content/".
import json
from google.colab import files
uploaded = files.upload()

# Report the name and size of every uploaded file.
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(payload)))


In [0]:
# Open reference data
# NOTE: despite its .xml extension the file is parsed as JSON. The context manager
# makes sure the file handle is closed once the annotations are loaded.
with open('/content/results-smile-GT-BLS.xml') as gt_file:
    results = json.load(gt_file)

# Order the labels by numeric image index (the keys are strings) rather than by dict order,
# so that y[k] / yy[k] describe the same image as faces.images[k] and hog_vec[k].
y = [results[key] for key in sorted(results, key=int)]
print(y)
# Integer labels: 1 = smile, 0 = no smile.
yy = [int(label == True) for label in y]
print(yy)

In [0]:
# Analyze reference data: count the annotations of each class and plot the class balance.
yes, no = (sum(results[x] == True for x in results),
           sum(results[x] == False for x in results))
# Centre the bars on x = 0 and x = 1 explicitly and put the ticks at the same positions,
# so the labels line up with the bars whatever the matplotlib default alignment is
# (the previous 0.4 / 1.4 tick offsets only matched left-aligned bars).
bar([0, 1], [no, yes], align='center')
ylim(0, max(yes, no))
xticks([0, 1], ['no smile', 'smile']);

In [0]:
# Integer indices of the faces annotated as smiling (the annotation keys are strings).
smiling_indices = [int(key) for key, flag in results.items() if flag == True]

In [0]:
# Show every smiling face in one large figure, tiled on a 20x20 grid.
fig = plt.figure(figsize=(12, 12))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i, face_index in enumerate(smiling_indices):
    # one grid slot per smiling face (slots are numbered from 1)
    p = fig.add_subplot(20, 20, i + 1)
    p.imshow(faces.images[face_index], cmap=plt.cm.bone)

    # annotate with the class name and the position in the list of smiling faces
    p.text(0, 14, "smiling")
    p.text(0, 60, str(i))
    p.axis('off')



---

# Feature extraction

* Compute Histogram of Oriented Gradients (HOG) features on **all images**
* Understand what HOG features are

In [0]:
from __future__ import division, print_function
from time import time

import numpy as np
import matplotlib.pyplot as plt

from skimage import feature

# Compute HoG features
# hog_vec[k] is the flat HOG descriptor of image k; hog_vis[k] is the matching
# visualisation image returned by skimage.
hog_vec = []
hog_vis = []
for image in faces.images:
  # `visualize` is the keyword accepted by current scikit-image releases; the former
  # `visualise` spelling was deprecated and then removed, and now raises a TypeError.
  hvec, hvis = feature.hog(image, visualize=True)
  hog_vec.append(hvec)
  hog_vis.append(hvis)

print('Number of features of size... ',np.array(hog_vec).shape)

In [0]:
# Understand HOG features: show a randomly chosen face next to its HOG visualisation.
from random import randint
# random.randint includes BOTH endpoints, so the upper bound has to be the last valid
# index; using len(faces.images) would occasionally raise an IndexError below.
ii = randint(0, len(faces.images) - 1)
print(len(faces.images), ii)

fig, ax = plt.subplots(1, 2, figsize=(12, 6),
                       subplot_kw=dict(xticks=[], yticks=[]))
ax[0].imshow( faces.images[ii], cmap='gray')
ax[0].set_title('input image')

ax[1].imshow(hog_vis[ii])
ax[1].set_title('visualization of HOG features');

In [0]:
# Inspect the descriptor of the image selected above, then the extreme values
# found over all descriptors.
hog_sample = hog_vec[ii]
print(hog_sample)
print(hog_sample.shape)
hog_highest = np.max(hog_vec)
print( hog_highest )
hog_lowest = np.min(hog_vec)
print( hog_lowest )



---

# Prepare train and test data

* Split data in training / test set

Info: https://scikit-learn.org/stable/model_selection.html

In [0]:
from sklearn.model_selection import train_test_split

# Split the HOG descriptors and their smile labels into a training and a test set.
# The `...` placeholder could not be unpacked into four names (TypeError), which also
# left yy_test undefined for the cells below.
# A quarter of the images is held out for testing; the fixed random_state makes the
# split identical on every run.
hog_train, hog_test, yy_train, yy_test = train_test_split(
    hog_vec, yy, test_size=0.25, random_state=0)




---
# Trees


* Build a tree classifier on HOG features.
* Visualize the decision tree
* Info: https://scikit-learn.org/stable/modules/tree.html
* Evaluate the trained model
* Info: https://scikit-learn.org/stable/modules/model_evaluation.html

In [0]:
# Train decision tree
from sklearn import tree

# TODO(exercise): `...` is only a placeholder (Python's Ellipsis object), not a model.
# Replace it with a classifier built from `tree` and fitted on hog_train / yy_train.
clf = ...

In [0]:
# Visualize the decision tree

In [0]:
# Test the model on a single image
from random import randrange
# Draw one position inside the test set and keep it under both names.
random_index = randrange(len(yy_test))
rnd_test = random_index




In [0]:
# Compute predictions for all test, and estimate performance statistics




---

# Random Forest: a bagging example

In this part, the objective is to build a **custom random forest**.

**After the course**, check built-in scikit-learn random forests: https://scikit-learn.org/stable/modules/ensemble.html

* Train various trees on subsets of the training set
* Use a voting procedure to aggregate the individual predictions on the test set
* Evaluate:
  * Evaluate the custom forest;
  * Compare with previous decision tree;
  * Compare with individual trees of the forest.

Numpy tips: [np.arange](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.arange.html), [numpy.sum](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.sum.html), [numpy.mean](https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.mean.html), [numpy.where](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.where.html)

In [0]:
# The training data are: hog_train, yy_train
# The test data are: hog_test, yy_test
# First prepare several subsets
# Second train decision trees on each subset

# TODO(exercise): `...` is a placeholder (Python's Ellipsis object); replace it with the
# number of trees to train.
nb_trees = ...
# Container for the trained trees; it stays empty until the training code is written.
forest = []

In [0]:
# Get predictions on the test dataset

# Sanity check: number of trees currently stored in `forest`.
print(len(forest))



In [0]:
# Vote
