"Parmi les thèmes à propos desquels les statisticiens ne sont pas d'accord, se trouve la définition de leur science."
― Maurice Kendall, 1935
Retenons à ce stade :
"It's tough to make predictions, especially about the future." ― Yogi Berra
(dans les entreprises)
(en ce qui vous concerne)
# Taille d'une population en cm
x = np.random.normal(size=int(10e6),loc=179,scale=8)
plt.hist(x,bins=50);
plt.grid()
from matplotlib.ticker import PercentFormatter
plt.hist(x,bins=50, weights=np.ones(len(x)) / len(x))
plt.grid()
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
from matplotlib.ticker import PercentFormatter
plt.hist(x,bins=100, weights=np.ones(len(x)) / len(x),cumulative=1)
plt.grid()
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
from matplotlib.ticker import PercentFormatter
plt.hist(x,bins=100, weights=np.ones(len(x)) / len(x),cumulative=1,alpha=0.7)
plt.grid()
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.plot([120,np.median(x)],[50e-2,50e-2],'k');plt.plot([np.median(x),np.median(x)],[50e-2,0],'k')
print(np.median(x))
179.00757425195704
# loi normal
from scipy import stats
x = np.linspace(120,220,num=100)
y = stats.norm.pdf(x,179,8)
plt.plot(x,y);plt.grid()
y = stats.norm.cdf(x,179,8);
plt.plot(x,y);plt.grid()
X = np.random.multivariate_normal(mean=[179,75],cov = [[8,4],[4,8]],size=1000)
plt.scatter(X[:,0],X[:,1],alpha=1,s=2)
<matplotlib.collections.PathCollection at 0x1b08443e4a0>
plt.hist2d(X[:,0],X[:,1],bins=50);
# Create grid and multivariate normal
x = np.linspace(160, 190, 200) # Create a mesh of 200 x-points
y = np.linspace(65, 85, 200) # Create a mesh of 200 y-points
X, Y = np.meshgrid(x,y)
pos = np.dstack((X, Y))
Z = stats.multivariate_normal.pdf(pos,[179,75],[[8,4],[4,8]])
ax = plt.figure(figsize=[5,10]).add_subplot(projection='3d')
# Plot the 3D surface
ax.plot_surface(X, Y, Z, edgecolor='royalblue', lw=0.5, rstride=8, cstride=8,
alpha=0.1)
ax.set(ylim=(65, 85), xlim=(160, 190), zlim=(0, 0.03),
xlabel='X', ylabel='Y', zlabel='Z');
Évènements incompatibles :
Évènements compatibles :
Évènements Indépendants :
~ Les propriétés d'un échantillon tendent vers les propriétés de la distribution de probabilité qui la gouverne.
$\rightarrow$ Fait le pont entre les statistiques et les probabilités
import math
math.comb(5,2)
10
Quelle est la probabilité d'obtenir deux 6 sur 5 lancés de dé?
N = 10
x = np.random.randint(1,7,[N,5])
print(x)
print(sum(np.sum(x == 6,axis=1)==2)/N)
[[1 6 1 4 3] [4 1 3 2 4] [6 3 5 5 5] [6 4 5 3 4] [4 3 2 3 5] [4 6 1 5 4] [2 3 1 5 5] [3 5 3 2 1] [3 6 3 2 6] [1 2 1 3 6]] 0.1
N = 1000000
x = np.random.randint(1,7,[N,5])
print(x)
print(sum(np.sum(x == 6,axis=1)==2)/N)
[[2 2 2 5 3] [4 1 4 1 3] [3 1 5 3 1] ... [6 4 5 5 4] [5 4 1 5 6] [5 5 4 2 1]] 0.160558
N = 100
x = np.random.randint(1,7,[N,20])
plt.figure(figsize=[15,5])
plt.hist(np.sum(x == 6,axis=1),weights=np.ones(N)/N,bins=np.linspace(0,12,13),label='expriences');
n = np.array(range(13))
plt.bar(n,[math.comb(20,n_i)*(1/6)**n_i*(5/6)**(20-n_i) for n_i in n],color='orange',label='loi binomiale')
plt.grid()
plt.title('Nombre de 6 sur 20 lancés ({} exp.)'.format(N))
plt.xlabel("nombre de 6")
plt.ylabel("Fréquence d'occurance")
plt.legend();
N = 1000
x = np.random.randint(1,7,[N,20])
plt.figure(figsize=[15,5])
plt.hist(np.sum(x == 6,axis=1),weights=np.ones(N)/N,bins=np.linspace(0,12,13),label='expriences');
n = np.array(range(13))
plt.bar(n,[math.comb(20,n_i)*(1/6)**n_i*(5/6)**(20-n_i) for n_i in n],color='orange',label='loi binomiale')
plt.grid()
plt.title('Nombre de 6 sur 20 lancés ({} exp.)'.format(N))
plt.xlabel("nombre de 6")
plt.ylabel("Fréquence d'occurrence")
plt.legend();
N = 100000
x = np.random.randint(1,3,[N,20])
plt.figure(figsize=[15,5])
plt.hist(np.sum(x == 1,axis=1),weights=np.ones(N)/N,bins=np.linspace(0,20,21),label='expriences');
n = np.array(range(20))
plt.bar(n,[math.comb(20,n_i)*(0.5)**n_i*(0.5)**(20-n_i) for n_i in n],color='orange',label='loi binomiale')
plt.grid()
plt.title("Nombre de 'faces' sur 100 lancés ({} exp.)".format(N))
plt.xlabel("nombre de 'faces'")
plt.ylabel("Fréquence d'occurrence")
plt.legend();
np.random.seed(12)
b1 = 120
b2 = 240
def f(x,b1,b2):
if (type(x)==int) or (type(x)==float) :
if ((x>b1) and (x<b2)):
return 1/(b2-b1)
else:
return 0
else:
p = np.zeros(len(x))
p[(x>b1) & (x<b2)]=1/(b2-b1)
return p
def weight_n_potatoes(N,b1,b2):
return b1+np.random.rand(N)*(b2-b1)
plt.figure(figsize=[15,5])
x1 = np.linspace(100,250,500)
plt.plot(x1,f(x1,b1,b2),c='red')
plt.xlabel("x = masse [g]")
plt.ylabel("pdf(x)")
plt.title(r'$f(x) = {:0.5f}$ '.format(1/(b2-b1)));
plt.figure(figsize=[15,5])
N = 6
x = weight_n_potatoes(N,b1,b2)
bins=np.linspace(120,240,25)
plt.xlabel("x = masse [g]")
plt.ylabel("Nombre de pdt")
plt.hist(x,bins);
plt.figure(figsize=[15,5])
N = 100
x = weight_n_potatoes(N,b1,b2)
bins=np.linspace(120,240,25)
plt.xlabel("x = masse [g]")
plt.ylabel("Nombre de pdt")
plt.hist(x,bins);
plt.figure(figsize=[15,5])
N = 100000
x = weight_n_potatoes(N,b1,b2)
bins=np.linspace(120,240,25)
plt.xlabel("x = masse [g]")
plt.ylabel("Nombre de pdt")
plt.hist(x,bins);
plt.figure(figsize=[15,5])
N = 1000000
x = weight_n_potatoes(N,b1,b2)
bins=np.linspace(120,240,25)
plt.xlabel("x = masse [g]")
plt.ylabel("Proportion de pdt")
plt.hist(x,bins,weights=np.ones(N)/N);
plt.figure(figsize=[15,5])
N = 100000
x = weight_n_potatoes(N,b1,b2)
bins=np.linspace(120,240,25)
plt.xlabel("x = masse [g]")
plt.ylabel("Densité de pdt")
plt.hist(x,bins,weights=np.ones(N)/N/120*24,label=r'expérience'); # 120 grammes - 24 bins
x1 = np.linspace(100,250,500)
plt.plot(x1,f(x1,b1,b2),c='red',label=r'$f(x)$')
plt.xlabel("x = masse [g]")
plt.ylabel("pdf(x)");
plt.legend()
plt.grid()
plt.figure(figsize=[15,5])
x1 = np.linspace(100,250,500)
plt.plot(x1,f(x1,b1,b2),c='red')
plt.xlabel("x = masse [g]")
plt.ylabel("pdf(x)");
print(f(182.3,b1,b2))
Il s'agit d'une densité de probabilité. Elle est exprimée "par unité".
$ f(x) = 0.008333 \neq \mathbb {P}(X=x) $
plt.figure(figsize=[15,5])
x1 = np.linspace(100,250,500)
plt.plot(x1,f(x1,b1,b2),c='red')
plt.xlabel("x = masse [g]")
plt.ylabel("pdf(x)")
plt.bar(np.array(range(175,200)),1/(b2-b1))
plt.title(r'$f(x) = {:0.5f}$ '.format(1/(b2-b1)));
Somme des surfaces de chaque bar : hauteur x largeur $(200-175) \times 0.0083333 = 0.208$
def cdf(x,b1,b2):
if (type(x)==int) or (type(x)==np.int64) or (type(x)==np.int32):
if ((x>b1) and (x<b2)):
return 1/(b2-b1)*(x-b1)
else:
return 0
else:
x = np.array(x)
p = np.zeros(len(x))
zone_prop = (x>b1) & (x<=b2)
zone_1 = x>b2
p[zone_prop] = 1/(b2-b1)*(x[zone_prop]-b1)
p[zone_1] = np.ones(sum(zone_1))
return p
plt.figure(figsize=[15,5])
x1 = np.linspace(100,250,500)
plt.plot(x1,cdf(x1,b1,b2),c='red')
plt.xlabel("x = masse [g]")
plt.ylabel("pdf(x)")
plt.title(r'$F(x) = min(1,{:0.5f}\times (x-{}) )$ '.format(1/(b2-b1),b1));
plt.figure(figsize=[15,5])
x1 = np.linspace(100,250,500)
plt.plot(x1,cdf(x1,b1,b2),c='red')
plt.xlabel("x = masse [g]")
plt.ylabel("pdf(x)")
plt.title(r'$F(x) = {:0.5f}\times (x-{}) $ '.format(1/(b2-b1),b1));
bounds = np.array([175,200])
cdfs = cdf(bounds,b1,b2)
plt.scatter(bounds,cdfs);
plt.plot([100,bounds[0]],[cdfs[0],cdfs[0]],'k')
plt.plot([100,bounds[1]],[cdfs[1],cdfs[1]],'k')
plt.plot([bounds[0],bounds[0]],[0,cdfs[0]],'k')
plt.plot([bounds[1],bounds[1]],[0,cdfs[1]],'k')
plt.text(95,cdfs[0]+0.03,'{:.4f}'.format(cdfs[0]));
plt.text(95,cdfs[1]+0.03,'{:.4f}'.format(cdfs[1]));
print("cdf(200) - cdf(175) = ")
print("{:.5f} - {:.5f} = {:.5f}".format(cdf(bounds[1],b1,b2), cdf(bounds[0],b1,b2),cdf(bounds[1],b1,b2)-cdf(bounds[0],b1,b2)))
np.random.seed(1)
m = 180
s = 20
def f(x,m=0,s=1):
t = (x-m)/s
return 1/(s*np.sqrt(2*np.pi))*np.exp(-0.5*(t**2))
plt.figure(figsize=[15,5])
x1 = np.linspace(m-4*s,m+4*s,100)
plt.plot(x1,f(x1,m,s),c='red')
plt.xlabel("x = masse [g]")
plt.ylabel("pdf(x)")
plt.plot([m,m],[0,0.025],'k')
plt.arrow(m,0.023,s,0,color='k',length_includes_head=True,shape='full',width=0.0005);
plt.title(r'$X\sim {\mathcal {N}}(\mu ,\sigma ^{2}) = {\mathcal {N}}(180 ,20 ^{2})$ ');
plt.figure(figsize=[15,5])
x1 = np.linspace(-4,4,100)
plt.plot(x1,f(x1),c='red')
plt.xlabel("t")
plt.ylabel("pdf(x)")
plt.title(r'$X\sim {\mathcal {N}}(\mu ,\sigma ^{2}) = {\mathcal {N}}(0 ,1)$ ');
def f(x,m=0,s=1):
t = (x-m)/s
return 1/(s*np.sqrt(2*np.pi))*np.exp(-0.5*(t**2))
def weight_n_potatoes(N,m,s):
return np.random.randn(N)*s+m
plt.figure(figsize=[15,5])
N = 6
x = weight_n_potatoes(N,m,s)
bins=np.linspace(120,240,25)
plt.xlabel("x = masse [g]")
plt.ylabel("Nombre de pdt")
plt.hist(x,bins);
plt.figure(figsize=[15,5])
N = 100
x = weight_n_potatoes(N,m,s)
bins=np.linspace(120,240,25)
plt.xlabel("x = masse [g]")
plt.ylabel("Nombre de pdt")
plt.hist(x,bins);
plt.figure(figsize=[15,5])
N = 50000
x = weight_n_potatoes(N,m,s)
bins=np.linspace(120,240,25)
plt.xlabel("x = masse [g]")
plt.ylabel("Nombre de pdt")
plt.hist(x,bins);
plt.figure(figsize=[15,5])
bins=np.linspace(120,240,25)
plt.ylabel("Proportion de pdt")
plt.hist(x,bins,weights=np.ones(N)/N)
plt.xlabel("x = masse [g]");
plt.figure(figsize=[15,5])
bins=np.linspace(120,240,25)
plt.ylabel("Proportion de pdt")
plt.hist(x,bins,weights=np.ones(N)/N/(bins[1]-bins[0]));
x1 = np.linspace(m-4*s,m+4*s,100)
plt.plot(x1,f(x1,m,s));
plt.xlabel("x = masse [g]");
plt.figure(figsize=[15,5])
bins=np.linspace(120,240,25)
plt.ylabel("Proportion de pdt")
plt.hist(x,bins,weights=np.ones(N)/N/(bins[1]-bins[0]));
x1 = np.linspace(m-4*s,m+4*s,100)
plt.plot(x1,f(x1,m,s)); plt.xlabel("x = masse [g]");
print('f(182) = {}'.format(f(182,m,s),180,20))
plt.figure(figsize=[15,5])
bins=np.linspace(120,240,25)
plt.ylabel("Proportion de pdt")
plt.hist(x,bins,weights=np.ones(N)/N/(bins[1]-bins[0]));
x1 = np.linspace(m-4*s,m+4*s,100)
plt.plot(x1,f(x1));
plt.xlabel("x = masse [g]");
bins=np.linspace(175,200,6)
a = plt.hist(x,bins,weights=np.ones(N)/N/(bins[1]-bins[0]))
plt.show()
print('Aire verte = {}'.format(sum(a[0]*5)))
plt.figure(figsize=[15,5])
x1 = np.linspace(m-4*s,m+4*s,100)
plt.plot(x1,f(x1,180,20));
plt.xlabel("x = masse [g]");
plt.ylabel('pdf(x)')
bins=np.linspace(175,200,6)
x2 = np.array(range(175,200))
plt.bar(x2,f(x2,m,s))
plt.show()
print('Aire = {}'.format(sum(f(x2,m,s))))
plt.figure(figsize=[15,5])
x1 = np.linspace(m-4*s,m+4*s,100)
plt.plot(x1,f(x1,180,20));
plt.xlabel("x = masse [g]");
bins=np.linspace(175,200,6)
x2 = np.linspace(175,200,25*100)
plt.bar(x2,f(x2,m,s))
plt.show()
print('Aire = {}'.format(sum(f(x2,m,s))/100))
def figure1():
plt.figure(figsize=[15,5])
x1 = np.linspace(m-4*s,m+4*s,100)
plt.plot(x1,f(x1,180,20));
plt.xlabel("x = masse [g]");
bins=np.linspace(175,200,6)
x2 = np.linspace(175,200,50)
plt.bar(x2,f(x2,m,s))
plt.ylabel('pdf(x)')
import scipy.stats as st
plt.figure(figsize=[15,5])
x1 = np.linspace(m-4*s,m+4*s,200)
plt.plot(x1,st.norm.cdf((x1-m)/s));
plt.xlabel('Weight [g]')
plt.ylabel('cdf(x)\n ')
bounds = np.array([175,200])
cdf = st.norm.cdf((bounds-m)/s)
figure1()
def figure1():
import scipy.stats as st
plt.figure(figsize=[15,5])
x1 = np.linspace(m-4*s,m+4*s,100)
plt.plot(x1,f(x1,180,20.));
plt.xlabel("x = masse [g]");
bins=np.linspace(175,200,6)
x2 = np.linspace(175,200,50)
plt.bar(x2,f(x2,m,s))
plt.ylabel('pdf(x)')
plt.figure(figsize=[15,5])
x1 = np.linspace(m-4*s,m+4*s,200)
plt.plot(x1,st.norm.cdf((x1-m)/s));
plt.xlabel('Weight [g]')
plt.ylabel('cdf(x)\n ')
bounds = np.array([175,200])
cdf = st.norm.cdf((bounds-m)/s)
plt.scatter(bounds,cdf);
plt.plot([100,bounds[0]],[cdf[0],cdf[0]],'k')
plt.plot([100,bounds[1]],[cdf[1],cdf[1]],'k')
plt.plot([bounds[0],bounds[0]],[0,cdf[0]],'k')
plt.plot([bounds[1],bounds[1]],[0,cdf[1]],'k')
plt.text(95,cdf[0]+0.03,'{:.4f}'.format(cdf[0]));
plt.text(95,cdf[1]+0.03,'{:.4f}'.format(cdf[1]));
figure1()
print('cdf[1]-cdf[0] = {:.6f}'.format(cdf[1]-cdf[0]))
$p(175<x<200) = p(\frac{175-180}{20}<t<\frac{200-180}{20}) = p(-0.25<t<1)$
0.0987 + 0.3413 = 0.44