Advertisement

金融风控项目

阅读量:

通过一个用户的核心数据及其行为轨迹来判断是否会违约。采用拍拍贷平台提供的详细资料作为样本。https://www.kesci.com/home/competition/56cd5f02b89b5bd026cb39c9/content/1

在此数据中提供了三种不同类型的数据:

  1. Master: 用户的主要信息
  2. Loginfo: 登录信息
  3. Userupdateinfo: 修改信息

本次项目中,我们仅使用 Master 这一份数据来预测用户是否会逾期。
数据中名为 target 的字段即为样本标签(label)。

复制代码
 # Numeric / data-frame stack.
 import numpy as np
 import math 
 import pandas as pd 
 # Show floats with three decimal places whenever pandas renders them.
 pd.set_option('display.float_format',lambda x:'%.3f' % x)

 # Plotting stack and its global styling.
 import matplotlib.pyplot as plt 
 plt.style.use('ggplot')
 # IPython/Jupyter magic rather than Python syntax: this cell only runs in a notebook.
 %matplotlib inline
 import seaborn as sns 
 sns.set_palette('muted')
 sns.set_style('darkgrid')

 # Silence every warning for the rest of the session.
 import warnings
 warnings.filterwarnings('ignore')
 import os
复制代码
 # Load the Master table; the file is decoded as GB18030.
 data = pd.read_csv(
     'data/Training/PPD_Training_Master_GBK_3_1_Training_Set.csv',
     encoding='gb18030',
 )
 print(data.shape)

 # Preview the first records.
 print(data.head())

 # Samples per label; the original author notes that the classes are imbalanced.
 data['target'].value_counts()

1. 数据的预处理

缺失值。数据里有大量的缺失值,需要做一些处理。

字符串的清洗。比如“北京市”和“北京”合并成“北京”, 统一转换成小写等

二值化。具体方法请参考课程里的介绍

衍生特征:比如户籍地和当前城市是否是同一个?

特征的独热编码:对于类别型特征使用独热编码形式

连续型特征的处理:根据情况来处理

复制代码
 from pandas.api.types import is_numeric_dtype

    
 from pandas.api.types import is_string_dtype
    
 from sklearn.preprocessing import OneHotEncoder
    
  
    
 def process_missing(df, kind, missing_threshold=0.5):
     """Drop sparse rows/columns, impute missing values and split the listing date.

     The frame is modified in place and nothing is returned. Index labels of
     the surviving rows are kept as they were.

     Steps, in order:
       1. Drop every row whose fraction of missing values exceeds
          ``missing_threshold``.
       2. Drop every column whose fraction of missing values (measured on the
          remaining rows) exceeds ``missing_threshold``.
       3. Fill the remaining gaps: column mean for numeric columns, most
          frequent value for every other column.
       4. Derive the string columns ``Month`` and ``Year`` from ``ListingInfo``
          and drop ``ListingInfo``.

     Args:
         df: DataFrame to clean. Must contain a ``ListingInfo`` column holding
             '/'-separated dates.
         kind: ``'test'`` reads the year from the third date field; any other
             value reads it from the first field. The month is always taken
             from the second field.
         missing_threshold: Fraction of missing values above which a row or a
             column is dropped. Defaults to 0.5, the value that used to be
             hard-coded.

     Raises:
         KeyError: If ``df`` has no ``ListingInfo`` column when the date step runs.
         IndexError: If a ``ListingInfo`` value has too few '/'-separated fields.
     """
     # Build the row mask once, before anything is removed. Reading rows by
     # position while dropping them by label from a shrinking frame removes the
     # wrong rows and eventually indexes past the end.
     row_missing_ratio = df.isnull().mean(axis=1)
     df.drop(index=df.index[row_missing_ratio > missing_threshold], inplace=True)

     # Column sparsity is measured after the sparse rows are gone.
     column_missing_ratio = df.isnull().mean(axis=0)
     df.drop(columns=df.columns[column_missing_ratio > missing_threshold], inplace=True)

     for column in df.columns:
         if not df[column].isnull().any():
             continue
         if is_numeric_dtype(df[column]):
             fill_value = df[column].mean()
         else:
             fill_value = df[column].mode()[0]
         df[column] = df[column].fillna(fill_value)

     # Only the position of the year differs between the two date layouts.
     year_field = 2 if kind == 'test' else 0
     date_fields = df['ListingInfo'].astype(str).apply(lambda text: text.split('/'))
     df['Month'] = date_fields.apply(lambda fields: fields[1])
     df['Year'] = date_fields.apply(lambda fields: fields[year_field])

     df.drop(['ListingInfo'], axis=1, inplace=True)
    
  
    
         
    
 def string_cleanup(df):
     """Normalize text columns in place.

     1) Lower-case every string value in the string-typed columns.
     2) Unify city names in ``UserInfo_8`` and ``UserInfo_20`` by removing the
        character '市', so that e.g. '北京市' and '北京' become the same value.

     Non-string cells (such as missing values) are left untouched. Nothing is
     returned.

     Args:
         df: DataFrame to clean; must contain ``UserInfo_8`` and ``UserInfo_20``.

     Raises:
         KeyError: If ``UserInfo_8`` or ``UserInfo_20`` is missing.
     """
     def lower_text(value):
         return value.lower() if isinstance(value, str) else value

     def strip_city_suffix(value):
         return value.replace('市', '') if isinstance(value, str) else value

     for column in df.columns:
         if is_string_dtype(df[column]):
             # The lowered values must be assigned back; calling .str.lower()
             # and discarding the result leaves the column unchanged.
             df[column] = df[column].map(lower_text)

     # Mapping the whole column is independent of the index labels, whereas
     # looping over range(len(df)) with .loc only works for a 0..n-1 index.
     for column in ('UserInfo_8', 'UserInfo_20'):
         df[column] = df[column].map(strip_city_suffix)
    
     
    
 def city_coding(df):
     """Collapse the four city columns to a short whitelist plus '其他'.

     For ``UserInfo_2``, ``UserInfo_4``, ``UserInfo_8`` and ``UserInfo_20``,
     every value that is not in the whitelist is overwritten in place with
     '其他' ("other"). Nothing is returned.
     """
     # The original author describes these as cities more likely to be overdue.
     city_list = ['北京', '上海', '广州', '深圳', '成都', '青岛', '苏州']

     for city_column in ('UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_20'):
         outside_list = ~df[city_column].isin(city_list)
         df.loc[outside_list, city_column] = '其他'
    
     
    
 # def one_hot(df):
    
     
    
 #     """
    
 #     one-hot encoding for categorical columns
    
 #     """
    
 #     df = pd.get_dummies(df,columns=['UserInfo_2','UserInfo_4','UserInfo_7','UserInfo_8','UserInfo_9','UserInfo_19','UserInfo_20','UserInfo_22','UserInfo_23','UserInfo_24','Education_Info2','Education_Info3','Education_Info4','Education_Info6','Education_Info7','Education_Info8','WeblogInfo_19','WeblogInfo_20','WeblogInfo_21'
    
 #     ])
复制代码
    # Preprocessing pipeline: each helper updates `data` in place.
    process_missing(data, kind='train')  # sparse rows/columns, NA filling, listing date
    string_cleanup(data)  # string normalisation and city-name unification
    city_coding(data)  # group the city columns into the whitelist vs. '其他'

    # One-hot encode the categorical columns.
    data = pd.get_dummies(
        data,
        columns=[
            'UserInfo_2', 'UserInfo_4', 'UserInfo_7', 'UserInfo_8', 'UserInfo_9',
            'UserInfo_19', 'UserInfo_20', 'UserInfo_22', 'UserInfo_23', 'UserInfo_24',
            'Education_Info2', 'Education_Info3', 'Education_Info4',
            'Education_Info6', 'Education_Info7', 'Education_Info8',
            'WeblogInfo_19', 'WeblogInfo_20', 'WeblogInfo_21',
        ],
    )

2. 特征选择

在200多个特征中,能够显著提升性能的特征数量有限。这一环节的任务是特征选择,建议采用基于树的模型来完成。例如可以使用sklearn提供的ExtraTreesClassifier配合SelectFromModel(即下方代码所用的方法),或者直接使用XGBoost进行筛选。模型训练完成后,可通过属性feature_importances_获取各特征的重要性。

复制代码
 # Per the original author's note, the test file has no 'target' column, so a
 # quarter of the training data is held out for evaluation instead.
 from sklearn.model_selection import train_test_split

 X_train, X_test, y_train, y_test = train_test_split(
     data.drop(columns=['target']),
     data['target'],
     test_size=0.25,
     random_state=100,
 )
复制代码
 from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.feature_selection import SelectFromModel
 from sklearn.metrics import accuracy_score

 # Feature selection: rank the features with an extra-trees ensemble fitted on
 # the training split only, then keep those whose importance reaches the
 # threshold. random_state pins the ensemble so the same features are selected
 # on every run (the train/test split is already seeded with 100).
 clf = ExtraTreesClassifier(n_estimators=50, random_state=100)
 clf = clf.fit(X_train, y_train)
 model = SelectFromModel(clf, threshold=0.007, prefit=True)
 X_new_train = model.transform(X_train)
 X_new_train.shape

3. 使用XGBoost来训练风控模型,结果以AUC为准

https://github.com/dmlc/xgboost 是XGBoost官方的GitHub仓库, 提供了完整的文档和技术细节。https://pypi.org/project/xgboost/ 上提供了Python包的安装说明和依赖信息。请尝试调整它的超参数, 以获得最佳的模型性能。

复制代码
 # Train an XGBoost classifier (default hyper-parameters) on the selected features only.
 from xgboost import XGBClassifier

 selected_model = XGBClassifier()
 selected_model.fit(X_new_train, y_train)
复制代码
 # Restrict the held-out split to the selected features.
 X_new_test = model.transform(X_test)

 # Hard class predictions, kept so that any later use of `y_pred` still works.
 y_pred = selected_model.predict(X_new_test)

 # ROC AUC is a ranking metric and needs a continuous score. Scoring it on hard
 # labels collapses the ROC curve to a single operating point, so use the
 # predicted probability of the positive class instead.
 # NOTE(review): assumes the labels are 0/1, so column 1 is the positive class.
 y_score = selected_model.predict_proba(X_new_test)[:, 1]

 # Calculate AUC.
 from sklearn import metrics

 auc_roc = metrics.roc_auc_score(y_test, y_score)
 print(auc_roc)

全部评论 (0)

还没有任何评论哟~