【Python学习系列十七】基于scikit-learn库逻辑回归训练模型(delta比赛代码2)
生活随笔
收集整理的這篇文章主要介紹了
【Python学习系列十七】基于scikit-learn库逻辑回归训练模型(delta比赛代码2)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
機器學習任務流程:學習任務定義->數學建模->訓練樣本采樣->特征分析和抽取->算法設計和代碼->模型訓練和優化(性能評估和度量)->泛化能力評估(重采樣和重建模);
算法思路:應用半監督學習思路,先用訓練集訓練出一個模型,然后用模型給預測集打標簽,之后將打上標簽的預測集也加入到訓練集中用模型再訓練,用f1-score作為性能評估的依據。這個代碼和之前比,主要是增加model.predict_proba()函數返回正例概率,自己設置閾值來選擇正例樣本。代碼如下:
# -*- coding: utf-8 -*-
"""Self-training (semi-supervised) logistic regression for the delta contest.

Workflow:

1. Train a logistic-regression model on the labeled samples.
2. Use it to label the unlabeled samples and add them to the training set.
3. Retrain and score with f1 on a random 30% sample of the labeled data;
   repeat while the score keeps improving.
4. With the last model, write every unlabeled sample whose positive-class
   probability reaches ``POSITIVE_THRESHOLD`` to ``OUTPUT_CSV``.
"""
import time

import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

# Input / output locations (hard-coded by the original script).
CITY_CSV = r"D:\city.csv"
LABEL_CSV = r"D:\label.csv"
UNLABEL_CSV = r"D:\unlabel.csv"
OUTPUT_CSV = r"D:\gd_delta.csv"

# Province name -> numeric province code.
# NOTE(review): these names must match the `province` column of the CSV files
# exactly. The spellings look like they went through an automatic
# simplified-to-traditional conversion; verify against the data.
PROVINCE_DATA = {
    "province": [
        '河北省', '山西省', '內蒙古自治區', '遼寧省', '吉林省', '黑龍江省',
        '江蘇省', '浙江省', '安徽省', '福建省', '江西省', '山東省', '河南省',
        '湖北省', '湖南省', '廣東省', '廣西壯族自治區', '海南省', '四川省',
        '貴州省', '云南省', '西藏自治區', '陜西省', '甘肅省', '青海省',
        '寧夏回族自治區', '新疆維吾爾自治區', '北京市', '天津市', '上海市',
        '重慶市',
    ],
    "pro_code": [
        13, 14, 15, 21, 22, 23, 32, 33, 34, 35, 36, 37, 41, 42, 43, 44, 45,
        46, 51, 52, 53, 54, 61, 62, 63, 64, 65, 11, 12, 31, 50,
    ],
}

# Columns kept from the labeled data after the province/city merges.
LABEL_COLUMNS = [
    'denomination', 'min_amount', 'pro_code', 'age', 'sex', 'account_age',
    'txn_count', 'use_nums', 'txn_min_amount', 'txn_amount_mean',
    'avg_discount', 'voucher_num', 'avg_txn_amt', 'use_ratio',
    'voucher_ratio', 'batch_no', 'voucher_no', 'city_id', 'label',
]

# Columns kept from the unlabeled data after the province/city merges.
UNLABEL_COLUMNS = [
    'denomination', 'min_amount', 'pro_code', 'age', 'sex', 'account_age',
    'txn_count', 'use_nums', 'txn_min_amount', 'txn_amount_mean',
    'avg_discount', 'voucher_num', 'avg_txn_amt', 'use_ratio',
    'voucher_ratio', 'batch_no', 'city_id', 'phone', 'voucher_no',
]

# Target dtype of every column used above.
COLUMN_TYPES = {
    'denomination': 'int', 'min_amount': 'int', 'pro_code': 'int',
    'age': 'int', 'sex': 'int', 'account_age': 'int', 'txn_count': 'int',
    'use_nums': 'int', 'txn_min_amount': 'int', 'txn_amount_mean': 'int',
    'avg_discount': 'int', 'voucher_num': 'int', 'avg_txn_amt': 'int',
    'use_ratio': 'float', 'voucher_ratio': 'float', 'batch_no': 'int',
    'city_id': 'int', 'voucher_no': 'str', 'phone': 'str', 'label': 'int',
}

# Features actually fed to the model.
FEATURE_COLUMNS = [
    'pro_code', 'city_id', 'age', 'sex', 'account_age',
    'txn_count', 'txn_amount_mean', 'txn_min_amount',
]

# Starting f1 value; per the original author's note it is higher than the
# score obtained by labeling every sample 1.
INITIAL_F1 = 0.3
# Self-training continues only while f1 improves by more than this amount.
MIN_F1_GAIN = 0.0001
# Fraction of the labeled data sampled for scoring in each round.
VALIDATION_FRACTION = 0.3
# Samples with a positive-class probability >= this value are written out.
POSITIVE_THRESHOLD = 0.57


def _load_dataset(path, province, citydata, columns):
    """Read a CSV, attach province/city codes and cast the kept columns.

    The file is left-joined with ``province`` on ``province`` and with
    ``citydata`` on ``city``; only ``columns`` are kept, each cast to the
    dtype listed in ``COLUMN_TYPES``.  A row without a match in either
    mapping gets NaN from the left join, which makes the integer cast raise.
    (Presumably ``citydata`` supplies ``city_id`` -- confirm against the file.)
    """
    merged = pd.read_csv(path)
    merged = pd.merge(merged, province, how="left", on="province")
    merged = pd.merge(merged, citydata, how="left", on="city")
    # Copy so the casts below never write into a view of ``merged``.
    frame = merged[columns].copy()
    for column in columns:
        frame[column] = frame[column].astype(COLUMN_TYPES[column])
    return frame


def _report_shape(title, frame):
    """Print ``title`` followed by the row and column counts of ``frame``."""
    # Single-argument print keeps the output identical on Python 2 and 3.
    print("%s %d %s %d %s" % (title, frame.shape[0], "行", frame.shape[1], "列"))


def _fit_model(train_df):
    """Fit a scaler and a logistic regression on ``train_df``.

    Returns ``(scaler, model)``.  The scaler is fitted on the training
    features only and must be reused for every later prediction, so that
    validation and unlabeled samples are standardized with the same mean and
    standard deviation the model was trained with.
    """
    scaler = preprocessing.StandardScaler()
    features = scaler.fit_transform(train_df[FEATURE_COLUMNS])
    model = LogisticRegression()
    # The original author also considered a decision tree for the retraining
    # rounds; logistic regression is used for every round.
    model.fit(features, train_df['label'])
    return scaler, model


def _write_predictions(model, scaler, unlabel_df):
    """Write the unlabeled samples predicted positive to ``OUTPUT_CSV``.

    The file has no header and no index; its columns are ``phone``,
    ``voucher_no`` and ``label`` (always 1, since only positives are kept).
    """
    features = scaler.transform(unlabel_df[FEATURE_COLUMNS])
    result = unlabel_df[['phone', 'voucher_no']].copy()
    # Column 1 of predict_proba is the second entry of model.classes_;
    # assumes labels are 0/1 so that this is the positive class -- TODO confirm.
    result['prob'] = model.predict_proba(features)[:, 1]
    result['label'] = (result['prob'] >= POSITIVE_THRESHOLD).astype(int)
    result = result[result['label'] == 1]
    result = result[['phone', 'voucher_no', 'label']]
    result.to_csv(OUTPUT_CSV, index=False, header=False)
    # The original author recorded an f1 of 0.855946148093 when scoring the
    # model against its own thresholded predictions on the unlabeled data.


def main():
    """Run the self-training loop and write the final predictions.

    Reads ``CITY_CSV``, ``LABEL_CSV`` and ``UNLABEL_CSV``, prints the data
    set sizes and the f1 score of every scored round, and writes
    ``OUTPUT_CSV``.  Takes no arguments and returns ``None``.
    """
    province = pd.DataFrame(PROVINCE_DATA, columns=["province", "pro_code"])
    citydata = pd.read_csv(CITY_CSV)

    # The labeled data never changes; pseudo-labeled rows are added to a
    # separate training frame that is rebuilt every round.
    label_df_cons = _load_dataset(LABEL_CSV, province, citydata, LABEL_COLUMNS)
    unlabel_df = _load_dataset(UNLABEL_CSV, province, citydata, UNLABEL_COLUMNS)
    unlabel_features = unlabel_df[FEATURE_COLUMNS]

    train_df = label_df_cons
    f1_score_old = 0.0
    f1_score = INITIAL_F1
    first_round = True

    while True:
        # The first round is not scored, so the previous score is only
        # updated from the second round on.
        if not first_round:
            f1_score_old = f1_score

        _report_shape("總樣本,有", train_df)
        # The whole (augmented) training set is used for fitting.
        _report_shape("訓練集,有", train_df)
        # Scoring uses a random sample of the labeled data, i.e. rows the
        # model has also been trained on.
        test_label_df = label_df_cons.sample(frac=VALIDATION_FRACTION)
        _report_shape("驗證集,有", test_label_df)

        scaler, model = _fit_model(train_df)

        if not first_round:
            predicted = model.predict(
                scaler.transform(test_label_df[FEATURE_COLUMNS]))
            f1_score = metrics.f1_score(test_label_df['label'], predicted)
            print(f1_score)
        first_round = False

        # Converged (or got worse): stop here and fall through to the output
        # step, so the result file is written however the loop ends.
        if f1_score - f1_score_old <= MIN_F1_GAIN:
            break

        # Label the unlabeled samples with the current model and rebuild the
        # training set as labeled data + pseudo-labeled data.
        pseudo_df = unlabel_features.copy()
        pseudo_df['label'] = model.predict(scaler.transform(unlabel_features))
        train_df = pd.concat([label_df_cons, pseudo_df], ignore_index=True)

    _write_predictions(model, scaler, unlabel_df)


# Script entry point: run the pipeline and report the elapsed wall-clock time.
if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print('finish all in %s' % str(end - start))

# Three ways to improve further:
1)可以嘗試給預測集打標簽用一個模型,迭代訓練用另一個模型;
2)可以嘗試抽取不同的特征來建模,其次對特征值做離散化處理;
3)可以嘗試用部分特征來預訓練,另一部分特征來做訓練模型,可以降低過擬合問題;
總結
以上是生活随笔為你收集整理的【Python学习系列十七】基于scikit-learn库逻辑回归训练模型(delta比赛代码2)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【Python学习系列十六】基于scik
- 下一篇: 【Python学习系列十八】基于scik