How to use the ROC curve to test the performance of a discrete classifier in Python?


R.O.C analysis (receiver operating characteristic) to test the performance of a discrete classifier in python

Introduction

Question: for a given $x$, does it belong to population A or not? Let's consider a simple classifier defined by a threshold (for example $x_s = 10$): if $x \geq x_s$ then $x \in A$; if $x < x_s$ then $x \notin A$.

Use the ROC curve to test the performance of a discrete classifier in python ?

import matplotlib.pyplot as plt
import scipy.stats
import numpy as np

# Range of x values shown on the plot.
x_min = 0.0
x_max = 30.0

# Decision threshold x_s of the classifier, drawn as the red vertical line.
x_threshold = 10.0

#----------------------------------------------------------------------------------------#
# Population B

mean = 9.0
std = 2.0

x = np.linspace(x_min, x_max, 100)

y = scipy.stats.norm.pdf(x,mean,std)

plt.plot(x,y, color='black')

# alpha has to be a number: matplotlib raises a TypeError for the string '1.0'.
plt.fill_between(x, y, color='#89bedc', alpha=1.0)

#----------------------------------------------------------------------------------------#
# Population A

mean = 15.0
std = 4.0

x = np.linspace(x_min, x_max, 100)

y = scipy.stats.norm.pdf(x,mean,std)

plt.plot(x,y, color='black')

plt.fill_between(x, y, color='#0b559f', alpha=1.0)

#----------------------------------------------------------------------------------------#
# Legend, threshold line and figure decoration

import matplotlib.patches as mpatches

# fill_between() areas carry no label here, so colored patches stand in as legend handles.
pop_a = mpatches.Patch(color='#0b559f', label='Population A')
pop_b = mpatches.Patch(color='#89bedc', label='Population B')

plt.legend(handles=[pop_a,pop_b])

plt.axvline(x=x_threshold,color='red')

plt.grid()

plt.xlim(x_min,x_max)
plt.ylim(0,0.25)

plt.title('How to use ROC curve to test a dicrete classifier ?',fontsize=10)

plt.xlabel('x')
plt.ylabel('Probability Density Function')

# Save before show(): the figure may be gone once the interactive window is closed.
plt.savefig("roc_curve_discrete_classifier_02.png")
plt.show()

Calculate TP, TN, FP, FN

  • $x \in A$ and $x \geq x_s$ (TP true positive)
  • $x \in A$ and $x < x_s$ (FN false negative)
  • $x \notin A$ and $x \geq x_s$ (FP false positive)
  • $x \notin A$ and $x < x_s$ (TN true negative)

Use the ROC curve to test the performance of a discrete classifier in python ?

from scipy.integrate import quad

import matplotlib.pyplot as plt
import scipy.stats
import numpy as np

# Bounds of the x range used for plotting and integration.
x_min, x_max = 0.0, 30.0

# Decision threshold x_s of the classifier.
x_threshold = 10.0

def normal_distribution_function(x,mean,std):
    """Return the normal probability density evaluated at x.

    mean and std are forwarded to scipy.stats.norm.pdf as its location and
    scale arguments.
    """
    return scipy.stats.norm.pdf(x, mean, std)

#----------------------------------------------------------------------------------------#
# Population B

mean = 9.0
std = 2.0

x = np.linspace(x_min, x_max, 100)

y = scipy.stats.norm.pdf(x,mean,std)

plt.plot(x,y, color='gray')

#----------------------------------------------------------------------------------------#
# Population A

# From here on mean/std hold the parameters of population A; the areas below use them.
mean = 15.0
std = 4.0

x = np.linspace(x_min, x_max, 100)

y = scipy.stats.norm.pdf(x,mean,std)

plt.plot(x,y, color='gray')

# False negatives: the part of population A lying below the threshold.
ptx = np.linspace(x_min, x_threshold, 100)
pty = scipy.stats.norm.pdf(ptx,mean,std)

# alpha has to be a number: matplotlib raises a TypeError for the string '1.0'.
plt.fill_between(ptx, pty, color='#e1b1b4', alpha=1.0)

# Area under A's density on [x_min, x_threshold]; quad also returns an error estimate.
fn_res, err = quad(normal_distribution_function, x_min, x_threshold, args=(mean,std,))

print('False Negative (FN)',fn_res)

# True positives: the part of population A lying above the threshold.
ptx = np.linspace(x_threshold, x_max, 100)
pty = scipy.stats.norm.pdf(ptx,mean,std)
plt.fill_between(ptx, pty, color='#b77495', alpha=1.0)

# Area under A's density on [x_threshold, x_max].
tp_res, err = quad(normal_distribution_function, x_threshold, x_max, args=(mean,std,))

print('True Positive (TP)',tp_res)

#----------------------------------------------------------------------------------------#
# Legend, threshold line and figure decoration

import matplotlib.patches as mpatches

# Despite their names, these two handles label the FN and TP areas of population A.
pop_a = mpatches.Patch(color='#e1b1b4', label='False Negative (FN): ' + str(round(fn_res,2)))
pop_b = mpatches.Patch(color='#b77495', label='True Positive (TP): ' + str(round(tp_res,2)))

plt.legend(handles=[pop_a,pop_b])

plt.axvline(x=x_threshold,color='red')

plt.grid()

plt.xlim(x_min,x_max)
plt.ylim(0,0.25)

plt.title('How to use ROC curve to test a dicrete classifier ?',fontsize=10)

plt.xlabel('x')
plt.ylabel('Probability Density Function')

# Save before show(): the figure may be gone once the interactive window is closed.
plt.savefig("roc_curve_discrete_classifier_03.png")
plt.show()

Plot the confusion matrix

Use the ROC curve to test the performance of a discrete classifier in python ?

#!/usr/bin/env python

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

import seaborn as sns
import math

from mpl_toolkits.axes_grid1 import make_axes_locatable

import matplotlib as mpl

# The 'seaborn' style sheet is called 'seaborn-v0_8' in recent matplotlib
# releases; use whichever of the two names this installation provides.
style_name = 'seaborn' if 'seaborn' in mpl.style.available else 'seaborn-v0_8'
mpl.style.use(style_name)

# Rounded rates for x_s = 10, laid out as [[TP, FP], [FN, TN]]:
# rows are the class chosen by the classifier, columns the true population.
conf_arr = np.array([[0.89,0.31],[0.11,0.69]])

df_cm = pd.DataFrame(
    conf_arr,
    index=['A', 'B'],
    columns=['A', 'B'],
)

fig = plt.figure()

plt.clf()

ax = fig.add_subplot(111)
ax.set_aspect(1)  # square cells

cmap = sns.cubehelix_palette(light=1, as_cmap=True)

# No ax argument is given, so the heatmap is drawn on the current axes (ax).
res = sns.heatmap(df_cm, annot=True, vmin=0.0, vmax=1.0, fmt='.2f', cmap=cmap)

# Prefix the row labels seaborn took from the DataFrame index instead of
# hard-coding tick positions: each label then stays attached to its own data
# row whichever way the installed seaborn orients the y axis.
ax.set_yticklabels(
    ['Classifier ' + tick.get_text() for tick in ax.get_yticklabels()],
    va='center',
)

# Column labels go above the matrix.
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

plt.savefig('roc_curve_discrete_classifier_05.png', dpi=100, bbox_inches='tight' )

plt.close()

Plot the R.O.C curve and calculate the AUC (Area Under the Curve)

Use the ROC curve to test the performance of a discrete classifier in python ?

from scipy.integrate import quad

# scipy.integrate.simps is gone from recent SciPy releases (simpson replaces it);
# keep the historical name available whichever release is installed.
try:
    from scipy.integrate import simpson as simps
except ImportError:
    from scipy.integrate import simps

import matplotlib.pyplot as plt
import numpy as np

def slope(x1, y1, x2, y2):
    """Return the slope of the straight line through (x1, y1) and (x2, y2)."""
    rise = y2 - y1
    run = x2 - x1
    return rise / run

# Operating point of the classifier: false positive and true positive rates.
fp = 0.31
tp = 0.89

# First ROC segment, from (0, 0) to (fp, tp): y = a * x + b.
a = slope(0.0, 0.0, fp, tp)
b = 0.0

ptx = np.linspace(0, fp, 100)
pty = a * ptx + b

# alpha has to be a number: matplotlib raises a TypeError for the string '1.0'.
plt.fill_between(ptx, pty, color='#89bedc', alpha=1.0)

# x is passed by keyword because recent SciPy no longer accepts it positionally.
area_1 = simps(pty, x=ptx)

# Second ROC segment, from (fp, tp) to (1, 1).
a = slope(fp, tp, 1.0, 1.0)
b = tp - a * fp

ptx = np.linspace(fp, 1.0, 100)
pty = a * ptx + b

plt.fill_between(ptx, pty, color='#89bedc', alpha=1.0)

area_2 = simps(pty, x=ptx)

# AUC obtained by numerical integration of the two segments.
auc_area = area_1 + area_2
print(auc_area)

# Same AUC in closed form: rectangle under (fp, tp) to the right of fp, plus
# the triangle left of fp and the triangle above the rectangle.
auc_area = (1.0 - fp) * tp + 0.5 * fp * tp + 0.5 * (1.0 - fp) * (1.0 -tp)
print(auc_area)

plt.text(0.6, 0.25, 'AUC: '+str(round(auc_area,2)),color='white',fontsize=14)

plt.scatter(fp,tp)

# ROC curve of the classifier and, dashed, the diagonal from (0, 0) to (1, 1).
plt.plot([0,fp,1],[0,tp,1])
plt.plot([0.0,1.0],[0.0,1.0],'k--')
plt.xlim(0,1)
plt.ylim(0,1)

plt.xlabel('False Positive (FP)',fontsize=8)
plt.ylabel('True Positive (TP)',fontsize=8)

plt.title('Receiver operating characteristics (R.O.C) Curve',fontsize=10)

plt.savefig("roc_curve_discrete_classifier_07.png")

Find the threshold that maximizes the AUC

Use the ROC curve to test the performance of a discrete classifier in python ?

from scipy.integrate import quad

import matplotlib.pyplot as plt
import scipy.stats
import numpy as np

# Bounds of the x range scanned for the threshold and used as integration limits.
x_min, x_max = 0.0, 30.0

def normal_distribution_function(x,mean,std):
    """Evaluate the normal density with the given mean and std at x.

    Thin wrapper around scipy.stats.norm.pdf so it can be handed to quad
    as the integrand.
    """
    density = scipy.stats.norm
    return density.pdf(x, mean, std)

#----------------------------------------------------------------------------------------#
# Population A

mean_a = 15.0
std_a = 4.0

# Sampled density of A; the threshold search below does not use it.
x_a = np.linspace(x_min, x_max, 100)

y_a = scipy.stats.norm.pdf(x_a,mean_a,std_a)

#----------------------------------------------------------------------------------------#
# Population B

mean_b = 9.0
std_b = 2.0

# Sampled density of B; the threshold search below does not use it.
x_b = np.linspace(x_min, x_max, 100)

y_b = scipy.stats.norm.pdf(x_b,mean_b,std_b)

#----------------------------------------------------------------------------------------#
# Scan candidate thresholds and keep the one giving the largest AUC

auc_max = 0.0
x_s_opt = 0.0

# Candidates run from x_min up to (but excluding) x_max in steps of 0.1.
for x_s in np.arange(x_min, x_max, 0.1):

    # True positive rate: mass of population A between x_s and x_max.
    tp, err = quad(normal_distribution_function, x_s, x_max, args=(mean_a,std_a,))

    # False positive rate: mass of population B between x_s and x_max.
    fp, err = quad(normal_distribution_function, x_s, x_max, args=(mean_b,std_b,))

    # Area under the two-segment ROC curve (0,0) -> (fp,tp) -> (1,1).
    auc_area = (1.0 - fp) * tp + 0.5 * fp * tp + 0.5 * (1.0 - fp) * (1.0 -tp)

    # Strict comparison: on ties the first (smallest) threshold is kept.
    if auc_area > auc_max:
        x_s_opt = x_s
        auc_max = auc_area

print('Best xs found: ', x_s_opt)
print('Best AUC found: ', auc_max)

which prints:

Best xs found:  11.8
Best AUC found:  0.853649762448816

References

Image

of