This notebook is the basis for both the programs and the write-up. Add your contributions directly to it:
Send the ipynb file before February 20 to: bertrand.le_saux@onera.fr
Mention the students' names in both the email subject and the file name.
This lab deals with the analysis of statistics extracted from the Game of Thrones book series. More precisely, the goal is to predict whether characters survive or die, based on the data made available.
The data comes from the fan site: https://got.show/machine-learning-algorithm-predicts-death-game-of-thrones
The GoT data file, slightly modified for this lab, is available here: character-predictions-new.csv
The main steps are:
- load the data and explore it with survival histograms,
- train and evaluate a first decision tree, then tune its hyper-parameters,
- train random forests and inspect feature importances,
- train boosting ensembles (AdaBoost, gradient boosting) and compare.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
character_predictions = pd.read_csv('character-predictions-new.csv')
df = character_predictions
# Example function plotting a survival histogram for a given column
def plot(cat):
    df.groupby(cat).isAlive.mean().plot(kind='bar')
    plt.ylabel('Percent Alive')
    plt.ylim([0.0, 1.0])
    plt.show()
df.keys()
df
plot('isPopular')
# Derived binary feature: is the character over 30 years old?
df['is_over'] = df['age'].apply(lambda x: 1 if x > 30 else 0)
plot('is_over')
df.drop('is_over', axis=1, inplace=True)
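The same binarisation idea generalises to several age bands; a minimal sketch using pandas' pd.cut (the bin edges and the 'age_band' column name are illustrative choices, not part of the assignment):

# Illustrative: discretize age into bands with pd.cut, plot survival per band,
# then drop the temporary column (bin edges chosen arbitrarily).
df['age_band'] = pd.cut(df['age'], bins=[0, 15, 30, 50, 120],
                        labels=['child', 'young', 'adult', 'old'])
plot('age_band')
df.drop('age_band', axis=1, inplace=True)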
plot("culture")
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
## the 'age' column has NaN values, which scikit-learn does not accept
## (sklearn.preprocessing.Imputer was removed; SimpleImputer replaces it)
from sklearn.impute import SimpleImputer
## replace NaN by the average age
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
df["age"] = imp.fit_transform(df[["age"]]).ravel()
print(df['age'].mean())
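Mean imputation is simple but sensitive to outliers; the median is a one-line alternative. A sketch reusing the SimpleImputer import above (the median imputer is kept commented out since it would replace the mean imputer, not run after it):

# Sanity check: which columns still contain missing values after imputation?
na_counts = df.isna().sum()
print(na_counts[na_counts > 0])
# Alternative strategy (illustrative, would replace the mean imputer above):
# imp = SimpleImputer(missing_values=np.nan, strategy="median")
# df["age"] = imp.fit_transform(df[["age"]]).ravel()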
#df
# prepare training set and corresponding labels
feature_cols = ['male', 'book1', 'book2', 'book3', 'book4', 'book5', 'isMarried', 'isNoble',
                'popularity', 'name_in_house', 'boolDeadRelations', 'age', 'numDeadRelations']
X = df[feature_cols]
y = df.isAlive
indices = np.arange( len(y) )
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, y, indices, random_state=0)
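By default train_test_split shuffles but does not preserve the alive/dead ratio across the two sets; if the classes turn out to be imbalanced, stratifying on y is a safer choice. A sketch (kept commented out so the split used in the rest of the notebook is unchanged):

# Check the class balance of the labels...
print(y.value_counts(normalize=True))
# ...and, if needed, stratify the split so both sets keep that ratio:
# X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
#     X, y, indices, random_state=0, stratify=y)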
from sklearn.tree import DecisionTreeClassifier
# train a decision tree
# default parameters (Gini coefficient)
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train,y_train)
# test the classifier
y_pred=clf.predict(X_test)
# compute classification accuracy
print(clf.score(X_test, y_test))
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print(clf.classes_)
print(cnf_matrix)
print(classification_report(y_test, y_pred))
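To inspect what the tree actually learned, scikit-learn can print its decision rules as text; a minimal sketch with sklearn.tree.export_text, truncated to the first three levels for readability:

from sklearn.tree import export_text
# Print the top of the learned tree as nested if/else rules.
print(export_text(clf, feature_names=feature_cols, max_depth=3))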
def next_popular_dead(df, y_pred, y_test, idx_test):
    # print the names of the characters predicted dead;
    # idx_test maps test-set positions back to rows of df
    badguys = df['name'].values
    pop = df['isPopular'].values
    true_dead = np.where((y_pred == 0) & (y_pred == y_test))[0]
    false_dead = np.where((y_pred == 0) & (y_pred != y_test))[0]
    #print(badguys[idx_test[true_dead]])
    #print(badguys[idx_test[false_dead]])
    # predicted dead but still alive: potential popular deaths
    # in seasons 6 and 7... SPOILER!
    false_dead_badguys = badguys[idx_test[false_dead]]
    return false_dead_badguys[pop[idx_test[false_dead]] == 1]
print('next popular dead: ')
print(next_popular_dead(df, y_pred, y_test, idx_test))
# train a decision tree
# hand-tuned hyper-parameters: entropy criterion, deeper tree allowed,
# and a larger minimum fraction of samples required to split a node
clf = DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=12, min_samples_split=0.35)
clf.fit(X_train,y_train)
# test the classifier
y_pred=clf.predict(X_test)
# compute classification accuracy
print(clf.score(X_test, y_test))
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print(clf.classes_)
print(cnf_matrix)
print(classification_report(y_test, y_pred))
print('next popular dead: ')
print(next_popular_dead(df, y_pred, y_test, idx_test))
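The hyper-parameters above were picked by hand; a grid search with cross-validation is the systematic way to choose them. A sketch with GridSearchCV over a small, illustrative grid (the candidate values are assumptions, not the ones the assignment asks for):

from sklearn.model_selection import GridSearchCV
# Small illustrative grid; 5-fold cross-validation on the training set.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 8, 12, None],
    'min_samples_split': [2, 0.1, 0.35],
}
grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)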
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestClassifier(max_depth=3, random_state=0)
rf.fit(X_train, y_train)
# All possible parameters:
#RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
# max_depth=2, max_features='auto', max_leaf_nodes=None,
# min_impurity_decrease=0.0, min_impurity_split=None,
# min_samples_leaf=1, min_samples_split=2,
# min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
# oob_score=False, random_state=0, verbose=0, warm_start=False)
# sort features by importance and print them (most important first)
idx_imp = np.argsort(rf.feature_importances_, axis=None)
#print(rf.feature_importances_[idx_imp[::-1]])
#print(idx_imp[::-1])
for i in idx_imp[::-1]:
    print(feature_cols[i])
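The same ranking reads more easily as a bar chart; a short sketch reusing idx_imp from above:

# Bar chart of feature importances, most important first.
order = idx_imp[::-1]
plt.bar(range(len(order)), rf.feature_importances_[order])
plt.xticks(range(len(order)), [feature_cols[i] for i in order], rotation=90)
plt.ylabel('Importance')
plt.show()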
# test the classifier
y_pred=rf.predict(X_test)
# compute classification accuracy
print(rf.score(X_test, y_test))
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print(rf.classes_)
print(cnf_matrix)
print(classification_report(y_test, y_pred))
print('next popular dead: ')
print(next_popular_dead(df, y_pred, y_test, idx_test))
# a larger forest: more trees, deeper, with out-of-bag scoring enabled
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                            n_estimators=130, oob_score=True, max_depth=12, random_state=0)
rf.fit(X_train, y_train)
# test the classifier
y_pred=rf.predict(X_test)
# compute classification accuracy
print(rf.score(X_test, y_test))
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print(rf.classes_)
print(cnf_matrix)
print(classification_report(y_test, y_pred))
print('next popular dead: ')
print(next_popular_dead(df, y_pred, y_test, idx_test))
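Since oob_score=True was passed above, the forest also computed an out-of-bag accuracy estimate during training, which comes for free without touching the test set:

# Out-of-bag accuracy estimate (available because oob_score=True).
print('OOB score:', rf.oob_score_)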
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
adaboo = AdaBoostClassifier(random_state=0, algorithm='SAMME')
# all parameters (not that many!)
#AdaBoostClassifier(base_estimator=None, n_estimators=50,
#                   learning_rate=1.0, algorithm='SAMME.R', random_state=None)
adaboo.fit(X_train, y_train)
# test the classifier
y_pred=adaboo.predict(X_test)
# compute classification accuracy
print(adaboo.score(X_test, y_test))
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print(adaboo.classes_)
print(cnf_matrix)
print(classification_report(y_test, y_pred))
print('next popular dead: ')
print(next_popular_dead(df, y_pred, y_test, idx_test))
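Boosting adds weak learners one at a time, so we can watch the test accuracy evolve round by round; a sketch using AdaBoost's staged_score:

# Test accuracy after each boosting round (one value per weak learner added).
staged = list(adaboo.staged_score(X_test, y_test))
plt.plot(range(1, len(staged) + 1), staged)
plt.xlabel('number of weak learners')
plt.ylabel('test accuracy')
plt.show()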
graboo = GradientBoostingClassifier(random_state=0, max_depth=5, subsample=0.5)
# all parameters
#GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100,
#                           subsample=1.0, criterion='friedman_mse', min_samples_split=2,
#                           min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
#                           min_impurity_decrease=0.0, min_impurity_split=None, init=None,
#                           random_state=None, max_features=None, verbose=0,
#                           max_leaf_nodes=None, warm_start=False, presort='auto')
graboo.fit(X_train, y_train)
# test the classifier
y_pred=graboo.predict(X_test)
# compute classification accuracy
print(graboo.score(X_test, y_test))
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print(graboo.classes_)
print(cnf_matrix)
print(classification_report(y_test, y_pred))
print('next popular dead: ')
print(next_popular_dead(df, y_pred, y_test, idx_test))
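All the predictions above are hard 0/1 decisions; ranking test characters by their predicted probability of death gives a more graded "who dies next" list. A sketch using predict_proba on the gradient-boosting model (the top-10 cut-off is arbitrary):

# Probability of class 0 ('dead') for each test character, most at risk first.
proba_dead = graboo.predict_proba(X_test)[:, 0]
order = np.argsort(proba_dead)[::-1]
names = df['name'].values[idx_test[order]]
popular = df['isPopular'].values[idx_test[order]] == 1
print(names[popular][:10])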