利用xgb筛选模型变量
生活随笔
收集整理的這篇文章主要介紹了
利用xgb筛选模型变量
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
做評分卡模型時要做特征分析,小編一般都是等頻分箱。但是做一些策略時,小編經常要找出極端好和極端壞的客戶所在的特征分組,于是經常要用到最優分箱來進行特征分析,以下是代碼:
?
/*
 * num_iv: tree-based ("optimal") binning and per-bin IV contribution for
 * every variable listed in WORK.WOE_SCORE (column NAME).
 *
 * For each variable a single-input PROC SPLIT tree is fitted against the
 * binary target; each leaf becomes one bin. Per-bin counts, bad rate and the
 * bin's IV contribution are appended to WORK.VARNAME_TOTAL. Missing values of
 * the variable are reported as a separate bin labelled 'null' with interval
 * bounds -9999.
 *
 * Parameters:
 *   data      - input data set containing the target and all listed variables
 *   dvar      - binary target variable (1 = bad, 0 = good)
 *   splitsize - minimum number of observations a node needs to be split
 *   maxbranch - maximum number of branches per node
 *   nsurrs    - number of surrogate rules
 *   method    - splitting criterion passed to CRITERION= (e.g. chisq)
 *   maxdepth  - maximum tree depth
 *   dir       - existing directory that receives the generated tree score
 *               code (.sas) and the rule description (.txt) per variable
 *
 * Side effects: replaces WORK.VARNAME_TOTAL; creates and removes the
 * intermediate tables D_<i>, N_D_<i>, T_<i> is left in WORK.
 */
%macro num_iv(data=,dvar=,splitsize=,maxbranch=,nsurrs=,method=,maxdepth=,dir=);
    %local i var_list var_num cur_var total n_missing tot_bad tot_good
           bad_share good_share;

    /* Start from an empty result table. */
    proc datasets lib=work nodetails nowarn;
        delete varname_total;
    run;
    quit;

    /* Collect the variable names to analyse. The count must be taken over
       DISTINCT names so that it matches the number of words in the list. */
    %let var_list = ;
    proc sql noprint;
        select distinct name into :var_list separated by ' '
            from woe_score;
        select count(distinct name) into :var_num
            from woe_score;
    quit;
    %let var_num = &var_num.;   /* strip leading blanks */
    %put &var_list.;

    %do i = 1 %to &var_num.;
        %let cur_var = %scan(&var_list., &i.);
        %put &cur_var.;

        /* Fit a one-variable decision tree; its leaves define the bins. */
        proc split data=&data. splitsize=&splitsize. maxbranch=&maxbranch.
                   maxdepth=&maxdepth. nsurrs=&nsurrs. assess=lift
                   criterion=&method.;
            input &cur_var. / level=interval;
            target &dvar. / level=binary;
            score data=&data. out=d_&i.;
            code file="&dir.\treecode_tic_&cur_var..sas";
            describe file="&dir.\treerule_tic_&cur_var..txt";
        run;

        /* Re-score with the generated code; the predicted bad probability
           is constant within a leaf and therefore identifies the bin. */
        data n_d_&i.;
            set d_&i.;
            %include "&dir.\treecode_tic_&cur_var..sas";
            rename p_&dvar.1 = p_&cur_var.;
        run;

        /* Overall totals plus the number of missing values of the variable. */
        proc sql noprint;
            select count(*),
                   nmiss(&cur_var.),
                   sum(case when &dvar. = 1 then 1 else 0 end),
                   sum(case when &dvar. = 0 then 1 else 0 end)
                into :total, :n_missing, :tot_bad, :tot_good
                from n_d_&i.;
        quit;
        %put &n_missing.;

        /* Share of all bads / goods falling into the current group. */
        %let bad_share  = (sum(case when &dvar. = 1 then 1 else 0 end) / &tot_bad.);
        %let good_share = (sum(case when &dvar. = 0 then 1 else 0 end) / &tot_good.);

        proc sql noprint;
            create table total as
            select "&cur_var." as varname,
                   min(&cur_var.) as interval_1,
                   max(&cur_var.) as interval_2,
                   compress(put(min(round(&cur_var., 0.0001)), best32.)) || '-' ||
                       compress(put(max(round(&cur_var., 0.0001)), best32.)) as interval,
                   count(*) as total_num label = "賬戶數",
                   sum(&dvar.) as bad_num label = "壞客戶數",
                   input(compress(put(count(*) / &total., percent10.2), '%'), best32.)
                       as cnt_pct label = %nrstr("%賬戶數占比"),
                   input(compress(put(sum(&dvar.) / count(*), percent10.2), '%'), best32.)
                       as bad_rate label = %nrstr("%壞賬率"),
                   (&bad_share. - &good_share.) * log(&bad_share. / &good_share.) as pre_iv
                from n_d_&i.(where=(&cur_var. ^= .))
                group by p_&cur_var.
            %if &n_missing. > 0 %then %do;
            /* UNION keeps each distinct row once; adds the missing-value bin. */
            union
            select "&cur_var." as varname,
                   -9999 as interval_1,
                   -9999 as interval_2,
                   'null' as interval,
                   count(*) as total_num label = "賬戶數",
                   sum(&dvar.) as bad_num label = "壞客戶數",
                   input(compress(put(count(*) / &total., percent10.2), '%'), best32.)
                       as cnt_pct label = %nrstr("%賬戶數占比"),
                   input(compress(put(sum(&dvar.) / count(*), percent10.2), '%'), best32.)
                       as bad_rate label = %nrstr("%壞賬率"),
                   (&bad_share. - &good_share.) * log(&bad_share. / &good_share.) as pre_iv
                from n_d_&i.(where=(&cur_var. = .))
                group by p_&cur_var.
            %end;
            order by interval_1;
        quit;

        /* Number the bins in interval order and append to the result. */
        data t_&i.;
            length varname $100.;
            set total;
            group = _n_;
        run;

        proc append base=varname_total data=t_&i. force;
        run;

        /* Remove only this iteration's intermediates (not every n_: / d_:
           table in WORK, which could hit unrelated user data). */
        proc datasets lib=work nodetails noprint nowarn;
            delete total n_d_&i. d_&i. _namedat;
        quit;
    %end;
%mend;

/* Create the output folders. DCREATE returns a blank value when the folder
   cannot be created, which includes the case that it already exists; fall
   back to the expected path so that a rerun does not leave &dir empty. */
data _null_;
    length new new1 $256;
    new = dcreate("特征分析2", "E:\AnalystPersonal\yuqing\決策樹");
    if missing(new) then new = "E:\AnalystPersonal\yuqing\決策樹\特征分析2";
    new1 = dcreate("9wgini特征分析2", "E:\AnalystPersonal\yuqing");
    if missing(new1) then new1 = "E:\AnalystPersonal\yuqing\9wgini特征分析2";
    call symputx("dir1", new1);
    call symputx("dir", new);
run;
%put &dir.;

%num_iv(data = work.all9w, dvar = default, splitsize = 1000, maxbranch = 2,
        nsurrs = 5, method = chisq, maxdepth = 5, dir = &dir.);

dm "odsresults" clear;
總結
以上是生活随笔為你收集整理的利用xgb筛选模型变量的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 3个最常用的分类模型评估指标!
- 下一篇: 基于正则化的特征选择