金融风控实战——集成学习
生活随笔
收集整理的這篇文章主要介紹了
金融风控实战——集成学习
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
xgb依然要去除共線性、變量選擇
LR 評分卡的 bivar 圖要求變量與壞賬率之間嚴格單調;XGBoost、LightGBM 等樹模型則不需要單調性約束
LightGBM評分卡
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import numpy as np
import random
import math
import time
import lightgbm as lgb

data = pd.read_csv('Bcard.txt')
data.head()
data.shape  # (95806, 13)

# Month distribution: the last month is held out as the out-of-time validation set.
data.obs_mth.unique()
# array(['2018-10-31', '2018-07-31', '2018-09-30', '2018-06-30',
#        '2018-11-30'], dtype=object)
df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()

# Full candidate feature list: *_info columns come from an in-house
# unsupervised system; *_score columns are paid external credit-bureau scores.
lst = ['person_info', 'finance_info', 'credit_info', 'act_info',
       'td_score', 'jxl_score', 'mj_score', 'rh_score']

# Sort newest-first by observation month (original code ran this sort twice;
# once is enough).
df_train = df_train.sort_values(by='obs_mth', ascending=False)
df_train.head()

# Assign each row a percentile rank (1/N .. 1), then bucket into 5
# equal-frequency groups: rank <= 0.2 -> 1, <= 0.4 -> 2, ... else 5.
# searchsorted with side='left' reproduces the original <= boundary logic exactly.
pct_rank = np.arange(1, len(df_train) + 1) / len(df_train)
df_train['rank'] = np.searchsorted([0.2, 0.4, 0.6, 0.8], pct_rank, side='left') + 1

# train = train.drop('obs_mth', axis=1)
df_train.head()
df_train['rank'].groupby(df_train['rank']).count()
# rank
# 1    15966
# 2    15966
# 3    15966
# 4    15966
# 5    15967
# Name: rank, dtype: int64


def LGB_test(train_x, train_y, test_x, test_y):
    """Fit a LightGBM binary classifier with early stopping.

    Parameters are the train/test feature frames and label series.
    Returns (fitted_model, best AUC on the held-out eval set 'valid_1').
    """
    from multiprocessing import cpu_count
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=2, n_estimators=800, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=None,
        n_jobs=cpu_count() - 1,
        # NOTE: the original also passed num_iterations=800, which is just an
        # alias of n_estimators and triggered a duplicate-parameter warning.
    )
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (test_x, test_y)],
            eval_metric='auc',
            early_stopping_rounds=100)
    print(clf.n_features_)
    return clf, clf.best_score_['valid_1']['auc']


# 5-fold cross-validation over the time buckets: each bucket in turn is the
# test fold, the other four train the model.
feature_lst = {}
ks_train_lst = []
ks_test_lst = []
for rk in set(df_train['rank']):
    ttest = df_train[df_train['rank'] == rk]
    ttrain = df_train[df_train['rank'] != rk]
    train = ttrain[lst]
    train_y = ttrain.bad_ind
    test = ttest[lst]
    test_y = ttest.bad_ind
    start = time.time()
    model, auc = LGB_test(train, train_y, test, test_y)
    end = time.time()

    # Feature importances for this fold.
    feature = pd.DataFrame({
        'name': model.booster_.feature_name(),
        'importance': model.feature_importances_,
    }).sort_values(by=['importance'], ascending=False)

    # KS and AUC on the train and test folds.
    y_pred_train_lgb = model.predict_proba(train)[:, 1]
    y_pred_test_lgb = model.predict_proba(test)[:, 1]
    train_fpr_lgb, train_tpr_lgb, _ = roc_curve(train_y, y_pred_train_lgb)
    test_fpr_lgb, test_tpr_lgb, _ = roc_curve(test_y, y_pred_test_lgb)
    train_ks = abs(train_fpr_lgb - train_tpr_lgb).max()
    test_ks = abs(test_fpr_lgb - test_tpr_lgb).max()
    train_auc = metrics.auc(train_fpr_lgb, train_tpr_lgb)
    test_auc = metrics.auc(test_fpr_lgb, test_tpr_lgb)
    ks_train_lst.append(train_ks)
    ks_test_lst.append(test_ks)
    # Keep only features whose split importance reaches 20 in this fold.
    feature_lst[str(rk)] = feature[feature.importance >= 20].name

train_ks = np.mean(ks_train_lst)
test_ks = np.mean(ks_test_lst)

# Final feature set: features that survive the importance filter in ALL 5 folds.
ft_lst = {str(i): feature_lst[str(i)] for i in range(1, 6)}
fn_lst = list(set(ft_lst['1']) & set(ft_lst['2']) & set(ft_lst['3'])
              & set(ft_lst['4']) & set(ft_lst['5']))

print('train_ks: ', train_ks)
print('test_ks: ', test_ks)
print('ft_lst: ', fn_lst)
# [LightGBM] [Warning] Unknown parameter: max_features
# [1] training's auc: 0.726731  training's binary_logloss: 0.0827979  valid_1's auc: 0.742666  valid_1's binary_logloss: 0.12066
# [2] training's auc: 0.769499  training's binary_logloss: 0.0822062  valid_1's auc: 0.753919  valid_1's binary_logloss: 0.119728
# [3] training's auc: 0.788952  training's binary_logloss: 0.0816227  valid_1's auc: 0.762911  valid_1's binary_logloss: 0.118777
# ...
# [188] training's auc: 0.827082  valid_1's auc: 0.786679
# [189] training's auc: 0.827128  valid_1's auc: 0.786756
# [190] training's auc: 0.827162  valid_1's auc: 0.786696
# train_ks: 0.4907124806547195
# test_ks:  0.47382530047645305
# ft_lst: ['credit_info', 'person_info', 'finance_info']

# Retrain on the reduced feature set (act_info kept in addition to the three
# features surviving all folds), using the last month as out-of-time validation.
lst = ['person_info', 'finance_info', 'credit_info', 'act_info']

train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
evl = data[data.obs_mth == '2018-11-30'].reset_index().copy()

x = train[lst]
y = train['bad_ind']
evl_x = evl[lst]
evl_y = evl['bad_ind']

model, auc = LGB_test(x, y, evl_x, evl_y)

# KS on the training months.
y_pred = model.predict_proba(x)[:, 1]
fpr_lgb_train, tpr_lgb_train, _ = roc_curve(y, y_pred)
train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()
print('train_ks : ', train_ks)

# KS on the out-of-time validation month.
y_pred = model.predict_proba(evl_x)[:, 1]
fpr_lgb, tpr_lgb, _ = roc_curve(evl_y, y_pred)
evl_ks = abs(fpr_lgb - tpr_lgb).max()
print('evl_ks : ', evl_ks)

from matplotlib import pyplot as plt
# Fix: legend labels previously said 'train LR' / 'evl LR', but these curves
# come from the LightGBM model, not a logistic regression.
plt.plot(fpr_lgb_train, tpr_lgb_train, label='train LGB')
plt.plot(fpr_lgb, tpr_lgb, label='evl LGB')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()
# [1] training's binary_logloss: 0.090317  training's auc: 0.712883  valid_1's binary_logloss: 0.0986629  valid_1's auc: 0.678619
# Training until validation scores don't improve for 100 rounds.
# [2] training's binary_logloss: 0.0896369  training's auc: 0.779216  valid_1's binary_logloss: 0.0978883  valid_1's auc: 0.755811
# [3] training's binary_logloss: 0.0885026  training's auc: 0.779149  valid_1's binary_logloss: 0.0966811  valid_1's auc: 0.749375
# [4] training's binary_logloss: 0.087998   training's auc: 0.780539  valid_1's binary_logloss: 0.0961527  valid_1's auc: 0.759009
# ...
#[179] training's binary_logloss: 0.0784288 training's auc: 0.812571 valid_1's binary_logloss: 0.0900886 valid_1's auc: 0.779962 #[180] training's binary_logloss: 0.0784267 training's auc: 0.812602 valid_1's binary_logloss: 0.0900914 valid_1's auc: 0.779887 #[181] training's binary_logloss: 0.078425 training's auc: 0.812601 valid_1's binary_logloss: 0.0900941 valid_1's auc: 0.779927 #[182] training's binary_logloss: 0.0784229 training's auc: 0.8126 valid_1's binary_logloss: 0.0900964 valid_1's auc: 0.779932 #Early stopping, best iteration is: #[82] training's binary_logloss: 0.0788374 training's auc: 0.811646 valid_1's binary_logloss: 0.089958 valid_1's auc: 0.779946 #4 #train_ks : 0.4801091876625077 #evl_ks : 0.4416674980164514
LightGBM其實效果確實是比較LR要好的,但是我們LR也可以逼近這個效果,下節課我們會具體來做。
評分卡公式變形
$$score = 600 + 50 \times \frac{\ln \frac{P_{0}}{P_{1}}}{\ln 2}$$ 其中 $P_{0}$ 為好人概率,$P_{1}$ 為壞人概率。令模型輸出的壞人概率為 $xbeta = P_{1}$,則 $P_{0} = 1 - xbeta$,代入得 $$score = 600 + 50 \times \frac{\ln \frac{1-xbeta}{xbeta}}{\ln 2} = 600 + 50 \times \log_{2} \frac{1-xbeta}{xbeta}$$
總結
以上是生活随笔為你收集整理的金融风控实战——集成学习的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: LightGBM用法速查表
- 下一篇: 金融风控实战——不均衡学习