Analyse R.O.C (receiver operating characteristic) pour tester la performance d'une classification discrète avec Python.
Introduction
Question de départ : pour un x donné, appartient-il à la population A ou non ? Soit une classification simple définie par un seuil (par exemple $x_s = 10$) : si $x \geq x_s$ alors $x \in A$ ; si $x < x_s$ alors $x \notin A$.
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats

# Illustrate the classification problem: the densities of two normal
# populations, plus the decision threshold x_s = 10 drawn as a red line.

x_min = 0.0
x_max = 30.0


def draw_population(mean, std, fill_color):
    """Plot the N(mean, std) density over [x_min, x_max] and shade beneath it.

    Args:
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.
        fill_color: Matplotlib colour used for the shaded area.
    """
    x = np.linspace(x_min, x_max, 100)
    y = scipy.stats.norm.pdf(x, mean, std)
    plt.plot(x, y, color='black')
    # alpha has to be a number: Matplotlib rejects the string '1.0' with a
    # TypeError.
    plt.fill_between(x, y, color=fill_color, alpha=1.0)


# Population B is drawn first, so population A is painted over it where the
# two densities overlap.
draw_population(9.0, 2.0, '#89bedc')   # Population B
draw_population(15.0, 4.0, '#0b559f')  # Population A

# The shaded areas carry no label, so the legend is built from colour patches.
pop_a = mpatches.Patch(color='#0b559f', label='Population A')
pop_b = mpatches.Patch(color='#89bedc', label='Population B')
plt.legend(handles=[pop_a, pop_b])

# Decision threshold x_s.
plt.axvline(x=10, color='red')

plt.grid()
plt.xlim(x_min, x_max)
plt.ylim(0, 0.25)

plt.title('How to use ROC curve to test a dicrete classifier ?', fontsize=10)
plt.xlabel('x')
plt.ylabel('Probability Density Function')

plt.savefig("roc_curve_discrete_classifier_02.png")
plt.show()
Calculer TP, TN, FP, FN
On peut ensuite calculer les pourcentages suivants:
- $x \in A$ et $x \geq x_s$ (TP true positive)
- $x \in A$ et $x < x_s$ (FN false negative)
- $x \notin A$ et $x \geq x_s$ (FP false positive)
- $x \notin A$ et $x < x_s$ (TN true negative)
from scipy.integrate import quad
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import scipy.stats
import numpy as np

# Split population A at the threshold: the part of its density below
# x_threshold is the false-negative rate, the part above it is the
# true-positive rate. Both areas are shaded and integrated numerically.

x_min = 0.0
x_max = 30.0

x_threshold = 10.0


def normal_distribution_function(x, mean, std):
    """Return the N(mean, std) probability density evaluated at x."""
    return scipy.stats.norm.pdf(x, mean, std)


def plot_density_outline(mean, std):
    """Draw the N(mean, std) density over [x_min, x_max] as a gray curve."""
    x = np.linspace(x_min, x_max, 100)
    plt.plot(x, scipy.stats.norm.pdf(x, mean, std), color='gray')


def shade_and_integrate(lower, upper, mean, std, fill_color):
    """Shade the N(mean, std) density on [lower, upper] and return its area.

    Args:
        lower: Left bound of the interval.
        upper: Right bound of the interval.
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.
        fill_color: Matplotlib colour used for the shaded area.

    Returns:
        The integral of the density over [lower, upper] as computed by
        scipy.integrate.quad (the error estimate is discarded).
    """
    ptx = np.linspace(lower, upper, 100)
    pty = scipy.stats.norm.pdf(ptx, mean, std)
    # alpha has to be a number: Matplotlib rejects the string '1.0' with a
    # TypeError.
    plt.fill_between(ptx, pty, color=fill_color, alpha=1.0)
    area, _ = quad(normal_distribution_function, lower, upper, args=(mean, std))
    return area


# Population B (outline only).
plot_density_outline(9.0, 2.0)

# Population A: outline, then the two shaded regions.
mean_a = 15.0
std_a = 4.0
plot_density_outline(mean_a, std_a)

fn_res = shade_and_integrate(x_min, x_threshold, mean_a, std_a, '#e1b1b4')
print('False Negative (FN)',fn_res)

tp_res = shade_and_integrate(x_threshold, x_max, mean_a, std_a, '#b77495')
print('True Positive (TP)',tp_res)

# The shaded areas carry no label, so the legend is built from colour patches
# that also show the rounded rates.
fn_patch = mpatches.Patch(color='#e1b1b4', label='False Negative (FN): ' + str(round(fn_res,2)))
tp_patch = mpatches.Patch(color='#b77495', label='True Positive (TP): ' + str(round(tp_res,2)))
plt.legend(handles=[fn_patch, tp_patch])

# Decision threshold x_s.
plt.axvline(x=x_threshold, color='red')

plt.grid()
plt.xlim(x_min, x_max)
plt.ylim(0, 0.25)

plt.title('How to use ROC curve to test a dicrete classifier ?', fontsize=10)
plt.xlabel('x')
plt.ylabel('Probability Density Function')

plt.savefig("roc_curve_discrete_classifier_03.png")
plt.show()
Tracer la matrice de Confusion
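Tracer la matrice de confusion avec les proportions obtenues pour le seuil $x_s = 10$ : TP ≈ 0,89 et FN ≈ 0,11 pour la population A, FP ≈ 0,31 et TN ≈ 0,69 pour la population B.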

#!/usr/bin/env python
import math

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import seaborn as sns  # same module as `sn`; both aliases are kept
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Render a 2x2 confusion matrix as an annotated heatmap and save it as a PNG.

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases no longer ship the old names, so use whichever name this
# installation provides.
style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in mpl.style.available else 'seaborn'
mpl.style.use(style_name)

# One row per matrix line, columns labelled 'A' and 'B'.
conf_arr = np.array([[0.89, 0.31],
                     [0.11, 0.69]])

df_cm = pd.DataFrame(conf_arr,
                     index=['A', 'B'],
                     columns=['A', 'B'])

fig = plt.figure()
plt.clf()

ax = fig.add_subplot(111)
ax.set_aspect(1)  # square cells

cmap = sns.cubehelix_palette(light=1, as_cmap=True)

# Drawn on the current axes (ax); the colour scale is pinned to [0, 1].
sns.heatmap(df_cm, annot=True, vmin=0.0, vmax=1.0, fmt='.2f', cmap=cmap)

# NOTE(review): the DataFrame rows are indexed 'A' then 'B', while the tick
# labels below read 'Classifier B' then 'Classifier A'. Whether y=0.5 is the
# first or the last row depends on how the installed seaborn orients the
# y axis -- confirm the labels match the rows.
plt.yticks([0.5, 1.5], ['Classifier B', 'Classifier A'], va='center')

# Put the column labels above the matrix.
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

plt.savefig('roc_curve_discrete_classifier_05.png', dpi=100, bbox_inches='tight')
plt.close()
Tracer la courbe R.O.C
Tracer la courbe ROC et en déduire l'AUC (Area Under the Curve)
from scipy.integrate import quad
try:
    from scipy.integrate import simpson
except ImportError:
    # Older SciPy releases only provide the function under its former name.
    from scipy.integrate import simps as simpson
import matplotlib.pyplot as plt
import numpy as np

# Draw the ROC "curve" of a discrete classifier -- the broken line
# (0, 0) -> (fp, tp) -> (1, 1) -- shade the area beneath it and report the AUC.


def slope(x1, y1, x2, y2):
    """Return the slope of the straight line through (x1, y1) and (x2, y2)."""
    return (y2 - y1) / (x2 - x1)


def shade_segment(x_start, y_start, x_end, y_end):
    """Shade the area under one straight ROC segment and return that area.

    Args:
        x_start: Abscissa of the segment's first point.
        y_start: Ordinate of the segment's first point.
        x_end: Abscissa of the segment's last point.
        y_end: Ordinate of the segment's last point.

    Returns:
        The area under the segment, integrated with Simpson's rule over 100
        sample points.
    """
    a = slope(x_start, y_start, x_end, y_end)
    b = y_start - a * x_start
    ptx = np.linspace(x_start, x_end, 100)
    pty = a * ptx + b
    # alpha has to be a number: Matplotlib rejects the string '1.0' with a
    # TypeError.
    plt.fill_between(ptx, pty, color='#89bedc', alpha=1.0)
    # x is passed by keyword: recent SciPy no longer accepts it positionally.
    return simpson(pty, x=ptx)


fp = 0.31
tp = 0.89

area_1 = shade_segment(0.0, 0.0, fp, tp)
area_2 = shade_segment(fp, tp, 1.0, 1.0)

# AUC from numerical integration of the two segments ...
auc_area = area_1 + area_2
print(auc_area)

# ... and from geometry: a rectangle plus two triangles.
auc_area = (1.0 - fp) * tp + 0.5 * fp * tp + 0.5 * (1.0 - fp) * (1.0 - tp)
print(auc_area)

plt.text(0.6, 0.25, 'AUC: ' + str(round(auc_area, 2)), color='white', fontsize=14)

# Operating point of the classifier and the ROC broken line through it.
plt.scatter(fp, tp)
plt.plot([0, fp, 1], [0, tp, 1])

# Diagonal of a random classifier, for reference.
plt.plot([0.0, 1.0], [0.0, 1.0], 'k--')

plt.xlim(0, 1)
plt.ylim(0, 1)

plt.xlabel('False Positive (FP)', fontsize=8)
plt.ylabel('True Positive (TP)', fontsize=8)
plt.title('Receiver operating characteristics (R.O.C) Curve', fontsize=10)

plt.savefig("roc_curve_discrete_classifier_07.png")
Trouver la classification maximisant "AUC"
import numpy as np
import scipy.stats

# Scan candidate thresholds x_s and keep the one whose single-point ROC curve
# has the largest area under the curve (AUC).

x_min = 0.0
x_max = 30.0


def normal_distribution_function(x, mean, std):
    """Return the N(mean, std) probability density evaluated at x."""
    return scipy.stats.norm.pdf(x, mean, std)


def mass_between_threshold_and_x_max(x_s, mean, std):
    """Return the N(mean, std) probability mass lying in [x_s, x_max].

    This is the exact value of the integral of normal_distribution_function
    from x_s to x_max, written with the survival function so that no numerical
    quadrature is needed.

    Args:
        x_s: Threshold, either a scalar or a NumPy array of thresholds.
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.

    Returns:
        The probability mass, with the same shape as x_s.
    """
    distribution = scipy.stats.norm(mean, std)
    return distribution.sf(x_s) - distribution.sf(x_max)


# Population A (the positive class).
mean_a = 15.0
std_a = 4.0

# Population B (the negative class).
mean_b = 9.0
std_b = 2.0

# Evaluate every candidate threshold at once instead of looping.
thresholds = np.arange(x_min, x_max, 0.1)
tp = mass_between_threshold_and_x_max(thresholds, mean_a, std_a)
fp = mass_between_threshold_and_x_max(thresholds, mean_b, std_b)

# Area under the broken line (0, 0) -> (fp, tp) -> (1, 1): a rectangle plus two
# triangles. Algebraically this equals 0.5 * (1 + tp - fp).
auc_values = (1.0 - fp) * tp + 0.5 * fp * tp + 0.5 * (1.0 - fp) * (1.0 - tp)

# argmax returns the first maximum, i.e. the smallest threshold in case of a
# tie -- the same choice a "strictly greater than" scan would make.
best_index = int(np.argmax(auc_values))
x_s_opt = thresholds[best_index]
auc_max = float(auc_values[best_index])

print('Best xs found: ', x_s_opt)
print('Best AUC found: ', auc_max)
donne ici:
Best xs found: 11.8
Best AUC found: 0.853649762448816
Références
| Liens | Site |
|---|---|
| Courbe ROC | wikipedia |
| Classement automatique | wikipedia |
| Bayesian Decision Theory | gatech.edu |
| An introduction to ROC analysis | people.inf.elte.hu |
| ROC curves and Area Under the Curve explained | dataschool.io |
| Understanding ROC curves | dataschool.io |
| Receiver Operating Characteristic (ROC) | scikit-learn.org |
| Machine Learning FAQ | sebastianraschka.com |
| Micro Average vs Macro average Performance in a Multiclass classification setting | stackexchange |
