金融风控项目
发布时间
阅读量:
阅读量
通过一个用户的核心数据及其行为轨迹来判断是否会违约。采用拍拍贷平台提供的详细资料作为样本。https://www.kesci.com/home/competition/56cd5f02b89b5bd026cb39c9/content/1。
在此数据中提供了三种不同类型的数据:
- Master: 用户的主要信息
- Loginfo: 登录信息
- Userupdateinfo: 修改信息
本次项目中,我们仅利用 `Master` 这一份数据来预测用户是否会逾期。
数据里有一个名为 `target` 的字段,即样本标签(label)。
import numpy as np
import math
import pandas as pd
pd.set_option('display.float_format',lambda x:'%.3f' % x)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
sns.set_palette('muted')
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
import os
# Load the Master table; the CSV is decoded as gb18030
data = pd.read_csv(
    'data/Training/PPD_Training_Master_GBK_3_1_Training_Set.csv',
    encoding='gb18030',
)
print(data.shape)
# Show the first few records
print(data.head())
# Ratio of positive to negative samples; the classes turn out to be imbalanced
data['target'].value_counts()
1. 数据的预处理
缺失值。数据里有大量的缺失值,需要做一些处理。
字符串的清洗。比如“北京市”和“北京”合并成“北京”, 统一转换成小写等
二值化。具体方法请参考课程里的介绍
衍生特征:比如户籍地和当前城市是否是同一个?
特征的独热编码:对于类别型特征使用独热编码形式
连续性特征的处理:根据情况来处理
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype
from sklearn.preprocessing import OneHotEncoder
def process_missing(df, kind):
    """Drop sparse rows/columns, impute missing values and split the date.

    The frame is modified in place and nothing is returned. Steps, in order:

    1. Drop every row in which more than half of the columns are missing.
    2. Drop every column in which more than half of the remaining rows
       are missing.
    3. Fill the remaining gaps: column mean for numeric columns, most
       frequent value (mode) for all other columns.
    4. Replace ``ListingInfo`` with string-valued ``Month`` and ``Year``
       columns.

    Row labels are left untouched, so the index may contain gaps after
    sparse rows have been removed.

    Args:
        df: DataFrame to clean. Must contain a ``ListingInfo`` column whose
            values are '/'-separated dates.
        kind: ``'test'`` when the year is the third '/'-separated field of
            ``ListingInfo``; any other value (e.g. ``'train'``) means the
            year is the first field. The month is always the second field.

    Raises:
        KeyError: if ``df`` has no ``ListingInfo`` column.
        IndexError: if a date has too few '/'-separated fields.
    """
    # Decide which rows to drop from one vectorized mask. Dropping inside a
    # positional loop shifts the remaining rows and runs past the end of
    # the shrunken frame.
    row_missing_ratio = df.isnull().mean(axis=1)
    sparse_rows = row_missing_ratio[row_missing_ratio > 0.5].index
    df.drop(index=sparse_rows, inplace=True)

    # Column sparsity is measured on the rows that survived the step above.
    column_missing_ratio = df.isnull().mean(axis=0)
    sparse_columns = column_missing_ratio[column_missing_ratio > 0.5].index
    df.drop(columns=sparse_columns, inplace=True)

    # Impute whatever is still missing.
    for column in df.columns:
        if not df[column].isnull().any():
            continue
        if is_numeric_dtype(df[column]):
            fill_value = df[column].mean()
        else:
            fill_value = df[column].mode()[0]
        df[column] = df[column].fillna(fill_value)

    # The two data sets order the date fields differently; only the position
    # of the year changes.
    year_position = 2 if kind == 'test' else 0
    listing_dates = df['ListingInfo'].astype(str)
    df['Month'] = listing_dates.apply(lambda text: text.split('/')[1])
    df['Year'] = listing_dates.apply(lambda text: text.split('/')[year_position])
    df.drop(['ListingInfo'], axis=1, inplace=True)
def string_cleanup(df):
    """Normalise text columns in place.

    1. Lower-case the values of every column that pandas reports as a
       string dtype.
    2. Unify city names in ``UserInfo_8`` and ``UserInfo_20`` by removing
       every occurrence of the character '市', so that e.g. '北京市' and
       '北京' become the same value.

    Non-string cells (such as NaN) are passed through unchanged.

    Args:
        df: DataFrame to clean; must contain the ``UserInfo_8`` and
            ``UserInfo_20`` columns.

    Raises:
        KeyError: if one of the two city columns is missing.
    """
    def _lower(value):
        # Leave non-string cells alone instead of turning them into NaN.
        return value.lower() if isinstance(value, str) else value

    def _strip_city_suffix(value):
        return value.replace('市', '') if isinstance(value, str) else value

    for column in df.columns:
        if is_string_dtype(df[column]):
            # The result has to be assigned back; .lower() does not modify
            # the column in place.
            df[column] = df[column].map(_lower)

    # Work column-wise so the result does not depend on the row labels
    # being a gap-free 0..n-1 range.
    for column in ('UserInfo_8', 'UserInfo_20'):
        df[column] = df[column].map(_strip_city_suffix)
def city_coding(df):
    """Collapse the city columns into a short list of cities plus '其他'.

    For each of ``UserInfo_2``, ``UserInfo_4``, ``UserInfo_8`` and
    ``UserInfo_20``, every value that is not in the city list is overwritten
    with '其他'. The frame is modified in place and nothing is returned.
    """
    # Cities the author singled out as more likely to be overdue
    city_list = ['北京', '上海', '广州', '深圳', '成都', '青岛', '苏州']
    for column in ('UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_20'):
        not_listed = ~df[column].isin(city_list)
        df.loc[not_listed, column] = '其他'
# def one_hot(df):
# """
# one-hot encoding for categorical columns
# """
# df = pd.get_dummies(df,columns=['UserInfo_2','UserInfo_4','UserInfo_7','UserInfo_8','UserInfo_9','UserInfo_19','UserInfo_20','UserInfo_22','UserInfo_23','UserInfo_24','Education_Info2','Education_Info3','Education_Info4','Education_Info6','Education_Info7','Education_Info8','WeblogInfo_19','WeblogInfo_20','WeblogInfo_21'
# ])
# Step 1: delete sparse rows/columns and fill the remaining missing values
process_missing(data, kind='train')
# Step 2: string normalisation and unified city names
string_cleanup(data)
# Step 3: keep the listed cities and group every other city as '其他'
# (columns UserInfo_2, UserInfo_4, UserInfo_8, UserInfo_20)
city_coding(data)
# Step 4: one-hot encode the categorical columns
data = pd.get_dummies(
    data,
    columns=[
        'UserInfo_2', 'UserInfo_4', 'UserInfo_7', 'UserInfo_8',
        'UserInfo_9', 'UserInfo_19', 'UserInfo_20', 'UserInfo_22',
        'UserInfo_23', 'UserInfo_24',
        'Education_Info2', 'Education_Info3', 'Education_Info4',
        'Education_Info6', 'Education_Info7', 'Education_Info8',
        'WeblogInfo_19', 'WeblogInfo_20', 'WeblogInfo_21',
    ],
)
2. 特征选择
在200多个特征中,能够显著提升性能的特征数量有限。在这一环节中完成特征选择任务,建议采用基于树结构的模型进行分析,例如可以参考sklearn官方文档中基于树模型的特征选择(tree-based feature selection)示例,或者直接采用XGBoost等方法进行筛选。模型训练完成后,可通过属性 `feature_importances_` 获取各特征的重要性权重。
# The test data has no 'target' column, so part of the training data is held
# out as X_test / y_test for evaluation.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['target'], axis=1),
    data['target'],
    test_size=0.25,
    random_state=100,
)
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
# Feature selection: rank features with an extra-trees ensemble and keep those
# whose importance reaches the threshold. The ensemble is seeded like the
# split above so that the set of selected features is the same on every run.
clf = ExtraTreesClassifier(n_estimators=50, random_state=100)
clf = clf.fit(X_train, y_train)
model = SelectFromModel(clf, threshold=0.007, prefit=True)
X_new_train = model.transform(X_train)
X_new_train.shape
3. XGBoost来训练风控模型,结果以AUC为准
https://github.com/dmlc/xgboost 是XGBoost官方开发者的GitHub存储位置, 提供了完整的开发文档和技术细节。https://pypi.org/project/xgboost/ 上提供了完整的Python包安装指南和依赖信息。试着去尝试优化它的超参数设置, 通过微调参数实现最佳模型性能。
# Train an XGBoost classifier on the selected features only
from xgboost import XGBClassifier
selected_model = XGBClassifier()
selected_model.fit(X_new_train, y_train)
# Reduce the held-out set to the same selected features
X_new_test = model.transform(X_test)
# Hard class labels, kept for any label-based metric
y_pred = selected_model.predict(X_new_test)
# ROC AUC ranks samples by score, so it must be fed the predicted probability
# of the positive class. Hard 0/1 labels collapse the curve to a single
# operating point and understate the AUC.
# NOTE(review): column 1 is assumed to be the positive class (target == 1);
# confirm against selected_model.classes_.
y_score = selected_model.predict_proba(X_new_test)[:, 1]
# Calculate AUC
from sklearn import metrics
auc_roc = metrics.roc_auc_score(y_test, y_score)
print(auc_roc)
全部评论 (0)
还没有任何评论哟~
