Analyse R.O.C. (receiver operating characteristic) pour tester la performance d'une classification discrète avec Python.
Introduction
Question de départ : pour un $x$ donné, appartient-il à la population A ou non ? Soit une classification simple définie par un seuil $x_s$ (par exemple $x_s = 10$) : si $x \geq x_s$ alors $x \in A$ ; si $x < x_s$ alors $x \notin A$.
import matplotlib.pyplot as plt
import scipy.stats
import numpy as np
# Plot the probability density functions of populations A and B together with
# the decision threshold, then save the figure to
# roc_curve_discrete_classifier_02.png and display it.

x_min = 0.0
x_max = 30.0

# Both curves are sampled on the same 100-point grid over [x_min, x_max].
x = np.linspace(x_min, x_max, 100)

#----------------------------------------------------------------------------------------#
# Population B

mean = 9.0
std = 2.0

y = scipy.stats.norm.pdf(x, mean, std)

plt.plot(x, y, color='black')
# alpha has to be a number: Matplotlib rejects the string '1.0' with a TypeError.
plt.fill_between(x, y, color='#89bedc', alpha=1.0)

#----------------------------------------------------------------------------------------#
# Population A

mean = 15.0
std = 4.0

y = scipy.stats.norm.pdf(x, mean, std)

plt.plot(x, y, color='black')
plt.fill_between(x, y, color='#0b559f', alpha=1.0)

#----------------------------------------------------------------------------------------#
# Legend: fill_between returns no labelled handle here, so colored patches
# are built by hand to stand in for the two filled areas.

import matplotlib.patches as mpatches

pop_a = mpatches.Patch(color='#0b559f', label='Population A')
pop_b = mpatches.Patch(color='#89bedc', label='Population B')

plt.legend(handles=[pop_a, pop_b])

# Vertical red line marking the classification threshold x_s = 10.
plt.axvline(x=10, color='red')

plt.grid()

plt.xlim(x_min, x_max)
plt.ylim(0, 0.25)

plt.title('How to use ROC curve to test a dicrete classifier ?', fontsize=10)

plt.xlabel('x')
plt.ylabel('Probability Density Function')

plt.savefig("roc_curve_discrete_classifier_02.png")
plt.show()
Calculer TP, TN, FP, FN
On peut ensuite calculer les proportions suivantes :
- $x \in A$ et $x \geq x_s$ (TP, true positive)
- $x \in A$ et $x < x_s$ (FN, false negative)
- $x \notin A$ et $x \geq x_s$ (FP, false positive)
- $x \notin A$ et $x < x_s$ (TN, true negative)
from scipy.integrate import quad
import matplotlib.pyplot as plt
import scipy.stats
import numpy as np
# Plotting / integration range on the x axis.
x_min, x_max = 0.0, 30.0

# Decision threshold x_s separating the two classes.
x_threshold = 10.0
def normal_distribution_function(x, mean, std):
    """Return the normal probability density evaluated at ``x``.

    Thin wrapper around ``scipy.stats.norm.pdf`` with a plain positional
    signature so it can be handed to ``scipy.integrate.quad`` together with
    ``args=(mean, std)``.

    Args:
        x: Point (or array of points) at which to evaluate the density.
        mean: Location parameter passed to ``norm.pdf``.
        std: Scale parameter passed to ``norm.pdf``.

    Returns:
        The value returned by ``scipy.stats.norm.pdf(x, mean, std)``.
    """
    return scipy.stats.norm.pdf(x, mean, std)
#----------------------------------------------------------------------------------------#
# Shade the parts of population A lying on either side of x_threshold, print
# the two integrals (false negative / true positive), then save the figure to
# roc_curve_discrete_classifier_03.png and display it.

# Population B (outline only)

mean = 9.0
std = 2.0

x = np.linspace(x_min, x_max, 100)
y = scipy.stats.norm.pdf(x, mean, std)

plt.plot(x, y, color='gray')

#----------------------------------------------------------------------------------------#
# Population A

mean = 15.0
std = 4.0

y = scipy.stats.norm.pdf(x, mean, std)

plt.plot(x, y, color='gray')

# False negatives: members of A that fall below the threshold.
ptx = np.linspace(x_min, x_threshold, 100)
pty = scipy.stats.norm.pdf(ptx, mean, std)

# alpha has to be a number: Matplotlib rejects the string '1.0' with a TypeError.
plt.fill_between(ptx, pty, color='#e1b1b4', alpha=1.0)

# The integral is taken over [x_min, x_threshold], i.e. the plotted range only.
fn_res, err = quad(normal_distribution_function, x_min, x_threshold, args=(mean, std,))

print('False Negative (FN)', fn_res)

# True positives: members of A that fall at or above the threshold.
ptx = np.linspace(x_threshold, x_max, 100)
pty = scipy.stats.norm.pdf(ptx, mean, std)

plt.fill_between(ptx, pty, color='#b77495', alpha=1.0)

tp_res, err = quad(normal_distribution_function, x_threshold, x_max, args=(mean, std,))

print('True Positive (TP)', tp_res)

#----------------------------------------------------------------------------------------#
# Legend entries carry the two rates rounded to two decimals.

import matplotlib.patches as mpatches

pop_a = mpatches.Patch(color='#e1b1b4', label='False Negative (FN): ' + str(round(fn_res, 2)))
pop_b = mpatches.Patch(color='#b77495', label='True Positive (TP): ' + str(round(tp_res, 2)))

plt.legend(handles=[pop_a, pop_b])

plt.axvline(x=x_threshold, color='red')

plt.grid()

plt.xlim(x_min, x_max)
plt.ylim(0, 0.25)

plt.title('How to use ROC curve to test a dicrete classifier ?', fontsize=10)

plt.xlabel('x')
plt.ylabel('Probability Density Function')

plt.savefig("roc_curve_discrete_classifier_03.png")
plt.show()
Tracer la matrice de Confusion
Tracer la matrice de confusion à partir des taux obtenus (TP = 0.89, FP = 0.31, FN = 0.11, TN = 0.69) :
#!/usr/bin/env python
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import seaborn as sns
import math
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib as mpl
# Draw the 2x2 confusion matrix as an annotated heatmap and save it to
# roc_curve_discrete_classifier_05.png.

# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in Matplotlib 3.6 and
# the old name was removed afterwards; try the new name first and fall back
# to the old one on older Matplotlib releases.
try:
    mpl.style.use('seaborn-v0_8')
except OSError:
    mpl.style.use('seaborn')

# Rows: classifier decision (A, then B); columns: actual population (A, B).
conf_arr = np.array([[0.89, 0.31], [0.11, 0.69]])

df_cm = pd.DataFrame(conf_arr,
                     index=['A', 'B'],
                     columns=['A', 'B'])

fig = plt.figure()

plt.clf()

ax = fig.add_subplot(111)
ax.set_aspect(1)

cmap = sns.cubehelix_palette(light=1, as_cmap=True)

# Row labels are handed to seaborn through `yticklabels` (given in row order)
# instead of being forced afterwards with plt.yticks at fixed positions: the
# hard-coded positions labelled the first row 'Classifier B' on seaborn
# versions that place row 0 at y = 0.5.
res = sns.heatmap(df_cm, annot=True, vmin=0.0, vmax=1.0, fmt='.2f', cmap=cmap,
                  yticklabels=['Classifier A', 'Classifier B'], ax=ax)

plt.setp(ax.get_yticklabels(), va='center')

# Put the column labels above the matrix.
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

plt.savefig('roc_curve_discrete_classifier_05.png', dpi=100, bbox_inches='tight')
plt.close()
Tracer la courbe R.O.C
Tracer la courbe ROC et en déduire l'AUC (Area Under the Curve) :
from scipy.integrate import quad
from scipy.integrate import simps
import matplotlib.pyplot as plt
import numpy as np
def slope(x1, y1, x2, y2):
    """Return the slope of the straight line through (x1, y1) and (x2, y2).

    Args:
        x1: Abscissa of the first point.
        y1: Ordinate of the first point.
        x2: Abscissa of the second point.
        y2: Ordinate of the second point.

    Returns:
        ``(y2 - y1) / (x2 - x1)``.

    Raises:
        ZeroDivisionError: If both points are plain Python numbers sharing
            the same abscissa (vertical line).
    """
    return (y2 - y1) / (x2 - x1)
# Draw the ROC curve of the discrete classifier as two straight segments,
# (0, 0) -> (fp, tp) -> (1, 1), compute the area under it both numerically and
# with the closed-form expression, and save the figure to
# roc_curve_discrete_classifier_07.png.

fp = 0.31
tp = 0.89

# First segment: from the origin to the classifier's operating point.
a = slope(0.0, 0.0, fp, tp)
b = 0.0

ptx = np.linspace(0, fp, 100)
pty = a * ptx + b

# alpha has to be a number: Matplotlib rejects the string '1.0' with a TypeError.
plt.fill_between(ptx, pty, color='#89bedc', alpha=1.0)

# NOTE(review): `simps` was removed from scipy.integrate in SciPy 1.14 in
# favour of `simpson`; the import above this snippet needs updating there.
area_1 = simps(pty, ptx)

# Second segment: from the operating point to (1, 1).
a = slope(fp, tp, 1.0, 1.0)
b = tp - a * fp  # intercept chosen so that the line passes through (fp, tp)

ptx = np.linspace(fp, 1.0, 100)
pty = a * ptx + b

plt.fill_between(ptx, pty, color='#89bedc', alpha=1.0)

area_2 = simps(pty, ptx)

# Numerical AUC: sum of the areas under the two segments.
auc_area = area_1 + area_2

print(auc_area)

# Closed-form AUC: rectangle to the right of the operating point plus the two
# triangles left of it and above it.
auc_area = (1.0 - fp) * tp + 0.5 * fp * tp + 0.5 * (1.0 - fp) * (1.0 - tp)

print(auc_area)

plt.text(0.6, 0.25, 'AUC: ' + str(round(auc_area, 2)), color='white', fontsize=14)

plt.scatter(fp, tp)
plt.plot([0, fp, 1], [0, tp, 1])

# Dashed diagonal: reference line from (0, 0) to (1, 1).
plt.plot([0.0, 1.0], [0.0, 1.0], 'k--')

plt.xlim(0, 1)
plt.ylim(0, 1)

plt.xlabel('False Positive (FP)', fontsize=8)
plt.ylabel('True Positive (TP)', fontsize=8)

plt.title('Receiver operating characteristics (R.O.C) Curve', fontsize=10)

plt.savefig("roc_curve_discrete_classifier_07.png")
Trouver le seuil de classification maximisant l'AUC
from scipy.integrate import quad
import matplotlib.pyplot as plt
import scipy.stats
import numpy as np
# Range of x scanned for the threshold and used as integration bounds.
x_min, x_max = 0.0, 30.0
def normal_distribution_function(x, mean, std):
    """Return the normal probability density evaluated at ``x``.

    Thin wrapper around ``scipy.stats.norm.pdf`` with a plain positional
    signature so it can be handed to ``scipy.integrate.quad`` together with
    ``args=(mean, std)``.

    Args:
        x: Point (or array of points) at which to evaluate the density.
        mean: Location parameter passed to ``norm.pdf``.
        std: Scale parameter passed to ``norm.pdf``.

    Returns:
        The value returned by ``scipy.stats.norm.pdf(x, mean, std)``.
    """
    return scipy.stats.norm.pdf(x, mean, std)
#----------------------------------------------------------------------------------------#
# Scan candidate thresholds x_s over [x_min, x_max) in steps of 0.1 and keep
# the one whose single-point ROC curve has the largest area under the curve.

# Population A

mean_a = 15.0
std_a = 4.0

x_a = np.linspace(x_min, x_max, 100)
y_a = scipy.stats.norm.pdf(x_a, mean_a, std_a)

#----------------------------------------------------------------------------------------#
# Population B

mean_b = 9.0
std_b = 2.0

x_b = np.linspace(x_min, x_max, 100)
y_b = scipy.stats.norm.pdf(x_b, mean_b, std_b)

#----------------------------------------------------------------------------------------#
# Threshold search

auc_max = 0.0
x_s_opt = 0.0

for x_s in np.arange(x_min, x_max, 0.1):

    # True-positive rate: share of population A lying above the threshold.
    tp, err = quad(normal_distribution_function, x_s, x_max, args=(mean_a, std_a,))

    # False-positive rate: share of population B lying above the threshold.
    fp, err = quad(normal_distribution_function, x_s, x_max, args=(mean_b, std_b,))

    # Area under the two-segment curve (0, 0) -> (fp, tp) -> (1, 1).
    auc_area = (1.0 - fp) * tp + 0.5 * fp * tp + 0.5 * (1.0 - fp) * (1.0 - tp)

    # Strict comparison: on ties the smallest threshold is kept.
    if auc_area > auc_max:
        x_s_opt = x_s
        auc_max = auc_area

print('Best xs found: ', x_s_opt)
print('Best AUC found: ', auc_max)
ce qui donne ici :
Best xs found: 11.8
Best AUC found: 0.853649762448816
Références
Liens | Site |
---|---|
Courbe ROC | wikipedia |
Classement automatique | wikipedia |
Bayesian Decision Theory | gatech.edu |
An introduction to ROC analysis | people.inf.elte.hu |
ROC curves and Area Under the Curve explained | dataschool.io |
Understanding ROC curves | dataschool.io |
Receiver Operating Characteristic (ROC) | scikit-learn.org |
Machine Learning FAQ | sebastianraschka.com |
Micro Average vs Macro average Performance in a Multiclass classification setting | stackexchange |