利用Python做假设检验、参数估计、方差分析、线性回归
生活随笔
收集整理的這篇文章主要介紹了
利用Python做假设检验、参数估计、方差分析、线性回归
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
目錄
參數估計
方差比的置信區間
均值差的置信區間
一個正態總體方差的點估計和置信區間
一個正態總體均值的點估計和置信區間
單樣本t檢驗的SciPy實現方式
單樣本t檢驗的statsmodels實現方式
兩樣本t檢驗SciPy的實現方式
兩樣本t檢驗statsmodels的實現方式
配對t檢驗的SciPy實現
方差分析
單因素方差分析的SciPy實現
事后檢驗
非參數方法
SciPy實現有符號秩和檢驗
SciPy實現秩和檢驗
一元線性回歸
參數估計
方差比的置信區間
import numpy as np from scipy.stats import f # 定義一個實現方差比置信區間的函數 def var_ratio_ci_est(data1,data2,alpha):n1 = len(data1) # 樣本1的樣本容量n2 = len(data2) # 樣本2的樣本容量f_lower_value = f.ppf(alpha/2,n1-1,n2-1) # 左側臨界值f_upper_value = f.ppf(1-alpha/2,n1-1,n2-1) # 右側臨界值var_ratio = np.var(data1) / np.var(data2)return var_ratio / f_upper_value,var_ratio / f_lower_value salary_18 = [1484, 785, 1598, 1366, 1716, 1020, 1716, 785, 3113, 1601] salary_35 = [902, 4508, 3809, 3923, 4276, 2065, 1601, 553, 3345, 2182]print(var_ratio_ci_est(salary_18,salary_35,0.05)) (0.05314530971751432, 0.8614126063098585)均值差的置信區間
import numpy as np from scipy.stats import t # 兩個正態總體均值差的置信區間 def mean_diff_ci_t_est(data1,data2,alpha,equal=True):n1 = len(data1) # 樣本1的樣本容量n2 = len(data2) # 樣本2的樣本容量mean_diff = np.mean(data1) - np.mean(data2) # 兩個樣本的均值差(兩個總體均值差的無偏點估計)sample1_var = np.var(data1) # 樣本1的方差sample2_var = np.var(data2) # 樣本2的方差if equal: # 方差未知且相等sw = np.sqrt(((n1-1)*sample1_var + (n2-1)*sample2_var) / (n1+n2-2))t_value = np.abs(t.ppf(alpha/2,n1+n2-2)) # t值return mean_diff - sw*np.sqrt(1/n1+1/n2)*t_value, \mean_diff + sw*np.sqrt(1/n1+1/n2)*t_valueelse: # 方差未知且不等df_numerator = (sample1_var/n1+sample2_var/n2)**2 # 自由度分子# 自由度分母df_denominator = (sample1_var/n1)**2/(n1-1) + (sample2_var/n2)**2/(n2-1)df = df_numerator / df_denominator # 自由度t_value = np.abs(t.ppf(alpha/2,df))return mean_diff - np.sqrt(sample1_var/n1 + sample2_var/n2)*t_value, \mean_diff + np.sqrt(sample1_var/n1 + sample2_var/n2)*t_value salary_18 = [1484, 785, 1598, 1366, 1716, 1020, 1716, 785, 3113, 1601] salary_35 = [902, 4508, 3809, 3923, 4276, 2065, 1601, 553, 3345, 2182]print(mean_diff_ci_t_est(salary_18,salary_35,0.05,equal=True)) # 方差相等 print(mean_diff_ci_t_est(salary_18,salary_35,0.05,equal=False)) # 方差不等 (-2196.676574582891, -199.32342541710875) (-2227.5521493017823, -168.4478506982175)一個正態總體方差的點估計和置信區間
import numpy as np from scipy.stats import chi2 # 定義計算方差的置信區間的函數 def var_ci_est(data,alpha):n = len(data) # 樣本容量s2 = np.var(data) # 樣本方差# chi2.ppf(alpha/2,n-1)的意思是返回左側面積為alpha/2的卡方值chi2_lower_value = chi2.ppf(alpha/2,n-1)# # chi2.ppf(1-alpha/2,n-1)的意思是返回左側面積為1-alpha/2的卡方值chi2_upper_value = chi2.ppf(1-alpha/2,n-1)return (n-1) * s2 / chi2_upper_value,(n-1) * s2 / chi2_lower_value salary_18 = [1484, 785, 1598, 1366, 1716, 1020, 1716, 785, 3113, 1601] salary_35 = [902, 4508, 3809, 3923, 4276, 2065, 1601, 553, 3345, 2182]# 樣本方差是總體方差的無偏點估計,再調用函數計算方差的置信區間 print(np.std(salary_18),np.var(salary_18),var_ci_est(salary_18,0.05)) print(np.std(salary_35),np.var(salary_35),var_ci_est(salary_35,0.05)) 631.0754629994736 398256.24 (188421.9056837747, 1327329.3204650925) 1364.3074580167038 1861334.8399999999 (880629.6611156773, 6203554.546528138)一個正態總體均值的點估計和置信區間
import numpy as np from scipy.stats import norm, t # 計算一個正態總體均值的置信區間的函數 def mean_ci_est(data,alpha,sigma=None):n = len(data) # 樣本容量sample_mean = np.mean(data) # 樣本均值if sigma is None: # 方差未知的情況s = np.std(data)se = s / np.sqrt(n)# t.ppf(alpha / 2,n-1)返回的是左側面積為alpha / 2對應的t值t_value = np.abs(t.ppf(alpha / 2,n-1))# 根據公式返回置信區間return sample_mean - se * t_value,sample_mean + se * t_valueelse: # 方差已知的情況se = sigma / np.sqrt(n) # 標準誤# norm.ppf(alpha / 2)返回的是左側面積為alpha / 2對應的z值z_value = np.abs(norm.ppf(alpha / 2))# 根據公式返回置信區間return sample_mean - se * z_value,sample_mean + se * z_value salary_18 = [1484, 785, 1598, 1366, 1716, 1020, 1716, 785, 3113, 1601] salary_35 = [902, 4508, 3809, 3923, 4276, 2065, 1601, 553, 3345, 2182]# 樣本均值是總體均值的無偏點估計,并調用函數計算均值的置信區間 print(np.mean(salary_18),mean_ci_est(salary_18,0.05)) print(np.mean(salary_35),mean_ci_est(salary_35,0.05)) 1518.4 (1066.9558093661096, 1969.8441906338905) 2716.4 (1740.433238065152, 3692.366761934848)假設檢驗
初始化設置
import pandas as pd import scipy.stats as ss import matplotlib # 解決繪圖的兼容問題 %matplotlib inline matplotlib.rcParams['font.sans-serif'] = ['SimHei']單樣本t檢驗的SciPy實現方式
ccss = pd.read_excel("CCSS_sample.xlsx",sheet_name="CCSS") # 讀取Excel文件中名稱為CCSS的表單 ccss.head() # 廣州基期的消費信心指數的描述統計 ccss.query("s0 == '廣州' & time == 203004" ).index1.describe()from scipy import stats as ss # 單樣本t檢驗 ss.ttest_1samp(ccss.query("s0 == '廣州' & time == 203004" ).index1,100) Ttest_1sampResult(statistic=-1.3625667518512996, pvalue=0.17611075148299993)我們可以看出P值大于0.05,接受H0,沒有顯著性差異
單樣本t檢驗的statsmodels實現方式
from statsmodels.stats import weightstats as wsdes = ws.DescrStatsW(ccss.query("s0 == '廣州' & time == 203004").index1) des.mean 97.16472701710536 # 置信水平為95%的置信區間(可信區間) des.tconfint_mean() (93.03590418536487, 101.29354984884586) # 單樣本t檢驗 des.ttest_mean(100) (-1.3625667518512996, 0.17611075148299993, 99.0)我們可以看出P值大于0.05,接受H0,沒有顯著性差異
兩樣本t檢驗SciPy的實現方式
from scipy import stats as ss # 消費的信心指數分布的對稱性考察 ccss.index1.plot.hist() # 不同婚姻狀況的消費信心指數分組描述 ccss.groupby('s7').index1.describe() # 方差齊性檢驗 ss.levene(ccss.index1[ccss.s7 == '未婚'],ccss.index1[ccss.s7 == '已婚']) LeveneResult(statistic=0.6178738290825966, pvalue=0.4320031337363959) # 兩樣本t檢驗(方差齊性) ss.ttest_ind(ccss.index1[ccss.s7 == '未婚'],ccss.index1[ccss.s7 == '已婚']) Ttest_indResult(statistic=2.4052614244262576, pvalue=0.01632071963816213) # 兩樣本t檢驗(方差不齊性) ss.ttest_ind(ccss.index1[ccss.s7 == '未婚'],ccss.index1[ccss.s7 == '已婚'],equal_var=False) Ttest_indResult(statistic=2.466907208318663, pvalue=0.013870358702526698)兩樣本t檢驗statsmodels的實現方式
from statsmodels.stats import weightstats as ws d1 = ws.DescrStatsW(ccss.index1[ccss.s7 == '未婚']) # 未婚的消費信心指數數據 d2 = ws.DescrStatsW(ccss.index1[ccss.s7 == '已婚']) # 已婚的消費信心指數數據 comp = ws.CompareMeans(d1,d2) # 創建CompareMeans對象 comp.ttest_ind() # 兩組獨立樣本t檢驗(方差齊性) (2.4052614244262576, 0.01632071963816213, 1131.0) comp.ttest_ind(usevar='unequal') # 兩組獨立樣本t檢驗(方差不齊性) (2.4669072083186636, 0.013870358702526675, 690.0875773844764)配對t檢驗的SciPy實現
import scipy.stats as ss import pandas as pd ccss_p.loc[:,['index1','index1n']].describe() # 取出4月和12月信心指數,并描述 # 用相關分析確認配對信息是否的確存在 ss.pearsonr(ccss_p.index1,ccss_p.index1n) (0.2638011798615908, 0.01301162367951006) # 配對t檢驗 ss.ttest_rel(ccss_p.index1,ccss_p.index1n) Ttest_relResult(statistic=1.1616334792419984, pvalue=0.24856144386191056)方差分析
單因素方差分析的SciPy實現
導包
import pandas as pd import scipy.stats as ss import matplotlib # 解決繪圖的兼容問題 %matplotlib inline matplotlib.rcParams['font.sans-serif'] = ['SimHei'] ccss = pd.read_excel("CCSS_sample.xlsx",sheet_name="CCSS") # 讀取Excel文件中名稱為CCSS的表單 # 分別提取出北京四個時間段的消費信心數據 a = ccss.query("s0 == '北京' & time == 203004 ").index1 b = ccss.query("s0 == '北京' & time == 203012 ").index1 c = ccss.query("s0 == '北京' & time == 203112 ").index1 d = ccss.query("s0 == '北京' & time == 203212 ").index1 ss.levene(a,b,c,d) # 方差齊性檢驗 LeveneResult(statistic=0.44332330387152036, pvalue=0.7221678627997157) ss.f_oneway(a,b,c,d) # 單因素方差分析 F_onewayResult(statistic=5.630155391280303, pvalue=0.0008777240313291846)事后檢驗
# 需要先在環境中安裝scikit_posthocs包:pip install scikit_posthocs import scikit_posthocs as sp# 創建對象,該對象接收事后檢驗的數據,并且設置p值校正的方法(控制兩兩比較的α值)為'bonferroni' pc = sp.posthoc_conover(ccss,val_col='index1',group_col='time',p_adjust='bonferroni') # 使用熱力圖顯示比較結果 heatmap_args = {'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]} sp.sign_plot(pc,**heatmap_args) # 繪制熱力圖非參數方法
SciPy實現有符號秩和檢驗
import numpy as np from scipy import stats data = [36, 32, 31, 25, 28, 36, 40, 32, 41, 26, 35, 35, 32, 87, 33, 35] data = np.array(data) # 轉換為numpy數組,方便運算# 檢驗總體均值是否為37 k = min(len(data[data>37]),len(data[data<37])) pvalue = 2*stats.binom.cdf(k,len(data),0.5) print(pvalue) 0.021270751953125SciPy實現秩和檢驗
import scipy.stats as stats weight_high=[134,146,104,119,124,161,107,83,113,129] # 高蛋白人群的體重的樣本 weight_low=[70,118,101,85,112,132,94,100,68,86] # 低蛋白人群的體重的樣本# mannwhitneyu方法針對樣本容量相等或不等都是適用的 stats.mannwhitneyu(weight_high,weight_low,alternative='two-sided') MannwhitneyuResult(statistic=81.0, pvalue=0.0211339281291611)一元線性回歸
import pandas as pd import scipy.stats as ss ccss = pd.read_excel("CCSS_sample.xlsx",sheet_name="CCSS") # 讀取Excel文件中名稱為CCSS的表單 # 建立年齡和總消費信心指數的回歸方程 ss.linregress(ccss.s3,ccss.index1) LinregressResult(slope=-0.3576848241677336, intercept=108.89832302796889, rvalue=-0.2190793138600294, pvalue=6.243013355018415e-14, stderr=0.047077765734542185, intercept_stderr=1.815504369310354) # 以k*2形式的二維數組提供數據 ss.linregress(ccss.loc[:,['s3','index1']]) LinregressResult(slope=-0.3576848241677336, intercept=108.89832302796889, rvalue=-0.2190793138600294, pvalue=6.243013355018415e-14, stderr=0.047077765734542185, intercept_stderr=1.815504369310354)總結
以上是生活随笔為你收集整理的利用Python做假设检验、参数估计、方差分析、线性回归的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: spring开发_Annotation_
- 下一篇: 丢了的文件怎么找回来-如何恢复文件