【Python学习系列十六】基于scikit-learn库逻辑回归训练模型(delta比赛代码)
生活随笔
收集整理的這篇文章主要介紹了
【Python学习系列十六】基于scikit-learn库逻辑回归训练模型(delta比赛代码)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
delta比賽的場景:給定數據樣本,設計模型訓練預測二分類結果,並通過f1-score評估結果。比賽中對特徵抽取、樣本擾動、過擬合、強相關特徵、歸一化等概念有實際的理解和應用。
這裡給出的代碼是基於邏輯回歸訓練的模型,代碼貼出來主要是用於後續做類似程序的參考,主要是dataframe操作、模型訓練、半監督學習思路應用等,如下:
# -*- coding: utf-8 -*-import pandas as pd import time from sklearn import metrics from sklearn.linear_model import LogisticRegression from sklearn import preprocessingdef main():#省份和地市映射data = {"province":['河北省', '山西省', '內蒙古自治區', '遼寧省', '吉林省', '黑龍江省', '江蘇省', '浙江省', '安徽省', '福建省', '江西省', '山東省', '河南省', '湖北省', '湖南省', '廣東省', '廣西壯族自治區', '海南省', '四川省', '貴州省', '云南省', '西藏自治區', '陜西省', '甘肅省', '青海省', '寧夏回族自治區', '新疆維吾爾自治區', '北京市', '天津市', '上海市', '重慶市'],"pro_code":[13,14,15,21,22,23,32,33,34,35,36,37,41,42,43,44,45,46,51,52,53,54,61,62,63,64,65,11,12,31,50]}province = pd.DataFrame(data, columns = ["province", "pro_code"])citydata=pd.read_csv(r"D:\city.csv")#加載地市映射表#加載帶標記數據label_ds=pd.read_csv(r"D:\label.csv")label_ds = pd.merge(label_ds, province, how = "left", on = "province")label_ds = pd.merge(label_ds, citydata, how = "left", on = "city")label_df = pd.DataFrame(label_ds[['denomination','min_amount','pro_code','age','sex','account_age','txn_count','use_nums',\'txn_min_amount','txn_amount_mean','avg_discount','voucher_num','avg_txn_amt',\'use_ratio','voucher_ratio','batch_no','voucher_no','city_id','label']])label_df["denomination"] = label_df["denomination"].astype("int")label_df["min_amount"] = label_df["min_amount"].astype("int")label_df["pro_code"] = label_df["pro_code"].astype("int") label_df["age"] = label_df["age"].astype("int")label_df["sex"] = label_df["sex"].astype("int")label_df["account_age"] = label_df["account_age"].astype("int")label_df["txn_count"] = label_df["txn_count"].astype("int")label_df["use_nums"] = label_df["use_nums"].astype("int")label_df["txn_min_amount"] = label_df["txn_min_amount"].astype("int")label_df["txn_amount_mean"] = label_df["txn_amount_mean"].astype("int")label_df["avg_discount"] = label_df["avg_discount"].astype("int")label_df["voucher_num"] = label_df["voucher_num"].astype("int")label_df["avg_txn_amt"] = label_df["avg_txn_amt"].astype("int")label_df["use_ratio"] = label_df["use_ratio"].astype("float")label_df["voucher_ratio"] = 
label_df["voucher_ratio"].astype("float")label_df["batch_no"] = label_df["batch_no"].astype("int")label_df["voucher_no"] = label_df["voucher_no"].astype("str")label_df["city_id"] = label_df["city_id"].astype("int")label_df["label"] = label_df["label"].astype("int")#加載未標記數據unlabel_ds=pd.read_csv(r"D:\unlabel.csv")unlabel_ds = pd.merge(unlabel_ds, province, how = "left", on = "province")unlabel_ds = pd.merge(unlabel_ds, citydata, how = "left", on = "city")unlabel_df = pd.DataFrame(unlabel_ds[['denomination','min_amount','pro_code','age','sex','account_age','txn_count','use_nums',\'txn_min_amount','txn_amount_mean','avg_discount','voucher_num','avg_txn_amt',\'use_ratio','voucher_ratio','batch_no','city_id','phone','voucher_no']]) unlabel_df["denomination"] = unlabel_df["denomination"].astype("int")unlabel_df["min_amount"] = unlabel_df["min_amount"].astype("int") unlabel_df["pro_code"] = unlabel_df["pro_code"].astype("int") unlabel_df["age"] = unlabel_df["age"].astype("int")unlabel_df["sex"] = unlabel_df["sex"].astype("int")unlabel_df["account_age"] = unlabel_df["account_age"].astype("int")unlabel_df["txn_count"] = unlabel_df["txn_count"].astype("int")unlabel_df["use_nums"] = unlabel_df["use_nums"].astype("int")unlabel_df["txn_min_amount"] = unlabel_df["txn_min_amount"].astype("int")unlabel_df["txn_amount_mean"] = unlabel_df["txn_amount_mean"].astype("int")unlabel_df["avg_discount"] = unlabel_df["avg_discount"].astype("int")unlabel_df["voucher_num"] = unlabel_df["voucher_num"].astype("int")unlabel_df["avg_txn_amt"] = unlabel_df["avg_txn_amt"].astype("int")unlabel_df["use_ratio"] = unlabel_df["use_ratio"].astype("float")unlabel_df["voucher_ratio"] = unlabel_df["voucher_ratio"].astype("float")unlabel_df["batch_no"] = unlabel_df["batch_no"].astype("int")unlabel_df["city_id"] = unlabel_df["city_id"].astype("int")unlabel_df["phone"] = unlabel_df["phone"].astype("str")unlabel_df["voucher_no"] = unlabel_df["voucher_no"].astype("str") 
#模型訓練和預測f1_score_old=float(0)#f1-scoref1_score=float(0.3)#高于全部設置1的分數outset=[]flag=int(1) label_df_cons=label_df#訓練樣本數不變while (f1_score-f1_score_old)>0.0001 :#迭代收斂到f1-score不再提升if flag==0 :#第一次訓練排除樣本數量帶來的問題f1_score_old=f1_score#訓練數據采樣,80%訓練,20%驗證 print "總樣本,有", label_df.shape[0], "行", label_df.shape[1], "列"train_label_df=label_df.sample(frac=0.8) print "訓練集,有", train_label_df.shape[0], "行", train_label_df.shape[1], "列"test_label_df=label_df.sample(frac=0.2) print "驗證集,有", test_label_df.shape[0], "行", test_label_df.shape[1], "列"#模型訓練label_X = train_label_df[['pro_code','city_id','age','sex','account_age',\'txn_count','txn_amount_mean','txn_min_amount']]label_X = preprocessing.scale(label_X)#歸一化label_y = train_label_df['label']model = LogisticRegression()#參數要設置model.fit(label_X, label_y)if flag==0 :#模型驗證,第一次訓練不評分expected = test_label_df['label']predicted_X=test_label_df[['pro_code','city_id','age','sex','account_age',\'txn_count','txn_amount_mean','txn_min_amount']]predicted_X=preprocessing.scale(predicted_X)#歸一化predicted = model.predict(predicted_X)f1_score = metrics.f1_score(expected, predicted) #模型評估print f1_scoreprint(metrics.classification_report(expected, predicted))flag=int(0)if f1_score_old>f1_score :break#利用模型為未標記樣本打上標簽unlabel_X=unlabel_df[['pro_code','city_id','age','sex','account_age',\'txn_count','txn_amount_mean','txn_min_amount']]unlabel_X_noScale=unlabel_Xunlabel_info = unlabel_df[['phone','voucher_no']]unlabel_X=preprocessing.scale(unlabel_X)#歸一化unlabel_y=model.predict(unlabel_X)out_y=pd.DataFrame(unlabel_y.reshape(-1,1),columns=['label'])unlabel_X_new=unlabel_X_noScale.join(out_y,how='left')label_df=pd.DataFrame()#原樣本清空label_df=label_df_cons.append(unlabel_X_new)#構成新的訓練集outset=unlabel_info.join(out_y,how='left')#輸出結果#迭代優化結束,輸出結果集 outset= outset[outset['label']==1] outset.to_csv('D:\gd_delta.csv',index=False,header=None)#輸出預測數據#執行 if __name__ == '__main__': start = time.clock() main()end = time.clock() print('finish all in %s' % str(end - 
start))比賽最后發現不是比的算法和建模,而是比sql數據處理,坑爹無比。比賽中的樣本也不均勻,個別特征非常集中,模型如何訓練都無法提升f1-score分數。
總結
以上是生活随笔為你收集整理的【Python学习系列十六】基于scikit-learn库逻辑回归训练模型(delta比赛代码)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【正一专栏】希望才是深深让人绝望的东西-
- 下一篇: 【Python学习系列十七】基于scik