Exemples montrant comment estimer une moyenne à partir d'un ensemble de données distribuées selon une distribution gaussienne en Python.
1 -- Créer un ensemble de nombres aléatoires distribués suivant une Gaussienne
# Build a synthetic dataset: 100,000 samples drawn from a Gaussian
# distribution with mean mu_0 = 2.0 and standard deviation srd_0 = 4.0.
import scipy.stats
import numpy as np
import matplotlib.pyplot as plt

mu_0 = 2.0
srd_0 = 4.0

# np.random.randn draws from N(0, 1); scale and shift to obtain
# samples from N(mu_0, srd_0**2).
data = np.random.randn(100000)
data = data * srd_0 + mu_0
data = data.reshape(-1, 1)  # column vector, as used by the later steps
2 -- Données complètes
# Plot the histogram of the complete (uncensored) dataset and save it.
hx, hy, _ = plt.hist(data, bins=50, density=1, color="lightblue")
plt.ylim(0.0, max(hx) + 0.05)
plt.title('Mean estimation from a censored dataset')
plt.grid()
plt.xlim(-4 * srd_0, 4 * srd_0)
plt.savefig("censored_data_01.png", bbox_inches='tight')
plt.show()

# Empirical mean and standard deviation of the complete dataset;
# they should be close to the true parameters mu_0 = 2.0 and srd_0 = 4.0.
print('Mean (Complete Data): ', np.mean(data))
print('Std (Complete Data): ', np.std(data))
donne
Mean (Complete Data): 2.0104953814107076Std (Complete Data): 3.9865860565580946
3 -- Données incomplètes
# Right-censor the data: every value above max_x is clipped to max_x,
# then plot the histogram of the censored sample.
max_x = 5
data_trunc_2 = np.copy(data)
data_trunc_2[data_trunc_2 > max_x] = max_x
data_trunc_2 = data_trunc_2.reshape(-1, 1)

hx, hy, _ = plt.hist(data_trunc_2, density=1, bins=50, color="lightblue")
plt.ylim(0.0, max(hx) + 0.05)
plt.title('Mean estimation from a censored dataset')
plt.grid()
plt.xlim(-4 * srd_0, 4 * srd_0)
plt.savefig("censored_data_02.png", bbox_inches='tight')
plt.show()

# Maximum-likelihood estimation of the mean from the censored sample.
# Non-censored points contribute log pdf(x | mu, sigma); censored points
# (clipped at max_x) contribute log P(X >= max_x) = log sf(max_x | mu, sigma).
data_trunc_2_not_cens = data_trunc_2[data_trunc_2 < max_x]
data_trunc_2_cens = data_trunc_2[data_trunc_2 == max_x]

# Grid of candidate values for the mean; sigma is assumed known (= 4).
x = np.linspace(-10, 10, 1000, endpoint=True)

y = []
for i in x:
    # logpdf / logsf are the numerically stable equivalents of
    # log(pdf) and log(1 - cdf): they avoid underflow in the tails.
    p1 = scipy.stats.norm.logpdf(data_trunc_2_not_cens, i, 4).sum()
    p2 = scipy.stats.norm.logsf(data_trunc_2_cens, i, 4).sum()
    y.append(p1 + p2)

plt.plot(x, y)
plt.title('Mean estimation from a censored dataset')
plt.savefig("censored_data_03.png", bbox_inches='tight')
plt.show()

# Naive estimates computed directly on the censored data are biased:
# the mean is pulled down because the largest values were clipped.
print('Mean (Censored Data): ', np.mean(data_trunc_2))
print('Std (Censored Data): ', np.std(data_trunc_2))
donne
Mean (Censored Data): 1.4878815869535522Std (Censored Data): 3.2348435672740012

# The MLE of the mean is the grid point that maximizes the log-likelihood.
# np.argmax replaces the slower y.index(max(y)) double scan.
y_min = int(np.argmax(y))
print('Mean (max log likelihood): ', x[y_min])
donne
Mean (max log likelihood): 2.0120120120120113
