Advertisement

金融风控-贷款违约预测 Task2 数据分析

阅读量:

GitHub链接:[FinancialRiskControl/Task2's Data Analysis.md](https://github.com/datawhalechina/team-learning-data-mining/blob/master/FinancialRiskControl/Task2's%20Data%20Analysis.md)

复制代码
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import datetime
    import warnings
    # Silence all warnings so they do not clutter the analysis output.
    warnings.filterwarnings('ignore')
    # Load the training set and the "A" test set from CSV files given by relative path.
    data_train = pd.read_csv("train.csv")
    data_test_a = pd.read_csv("testA.csv")

总体了解

复制代码
    # Summarise the training frame: row count, per-column non-null counts, dtypes and memory usage.
    data_train.info()

<class ‘pandas.core.frame.DataFrame’>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
id 800000 non-null int64
loanAmnt 800000 non-null float64
term 800000 non-null int64
interestRate 800000 non-null float64
installment 800000 non-null float64
grade 800000 non-null object
subGrade 800000 non-null object
employmentTitle 799999 non-null float64
employmentLength 753201 non-null object
homeOwnership 800000 non-null int64
annualIncome 800000 non-null float64
verificationStatus 800000 non-null int64
issueDate 800000 non-null object
isDefault 800000 non-null int64
purpose 800000 non-null int64
postCode 799999 non-null float64
regionCode 800000 non-null int64
dti 799761 non-null float64
delinquency_2years 800000 non-null float64
ficoRangeLow 800000 non-null float64
ficoRangeHigh 800000 non-null float64
openAcc 800000 non-null float64
pubRec 800000 non-null float64
pubRecBankruptcies 799595 non-null float64
revolBal 800000 non-null float64
revolUtil 799469 non-null float64
totalAcc 800000 non-null float64
initialListStatus 800000 non-null int64
applicationType 800000 non-null int64
earliesCreditLine 800000 non-null object
title 799999 non-null float64
policyCode 800000 non-null float64
n0 759730 non-null float64
n1 759730 non-null float64
n2 759730 non-null float64
n2.1 759730 non-null float64
n4 766761 non-null float64
n5 759730 non-null float64
n6 759730 non-null float64
n7 759730 non-null float64
n8 759729 non-null float64
n9 759730 non-null float64
n10 766761 non-null float64
n11 730248 non-null float64
n12 759730 non-null float64
n13 759730 non-null float64
n14 759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB

查看数据集中特征缺失值,唯一值等

查看缺失值

复制代码
    data_train.isnull().any() # Default axis=0 reduces down the rows, i.e. reports for each column whether it contains any missing value; with axis=1 the check is made per row instead

id False
loanAmnt False
term False
interestRate False
installment False
grade False
subGrade False
employmentTitle True
employmentLength True
homeOwnership False
annualIncome False
verificationStatus False
issueDate False
isDefault False
purpose False
postCode True
regionCode False
dti True
delinquency_2years False
ficoRangeLow False
ficoRangeHigh False
openAcc False
pubRec False
pubRecBankruptcies True
revolBal False
revolUtil True
totalAcc False
initialListStatus False
applicationType False
earliesCreditLine False
title True
policyCode False
n0 True
n1 True
n2 True
n2.1 True
n4 True
n5 True
n6 True
n7 True
n8 True
n9 True
n10 True
n11 True
n12 True
n13 True
n14 True
dtype: bool

复制代码
    # Count the columns with at least one missing value: .any() flags such
    # columns and .sum() counts the True flags.
    print(f"There are {data_train.isnull().any().sum()} columns in train dataset with missing values.")
    # There are 22 columns in train dataset with missing values.

    # 22 feature columns contain missing values; next, look for features whose
    # missing rate exceeds 50%.
    # data_train.isnull().sum() gives the number of missing values per column;
    # dividing by the row count turns it into a missing rate.
    have_null_fea_dict = (data_train.isnull().sum() / len(data_train)).to_dict()  # column name -> missing rate
    print(have_null_fea_dict)

    # Keep only the features whose missing rate is above one half.
    fea_null_moreThanHalf = {
        key: value for key, value in have_null_fea_dict.items() if value > 0.5
    }
    print(fea_null_moreThanHalf)  # no feature has a missing rate above 50%  # {}
    
    # Look at the individual features with missing values and their missing rates.

    # Bar chart of the missing-value ratio, restricted to features that have gaps.
    missing = data_train.isnull().sum() / len(data_train)
    missing = missing[missing > 0].sort_values()  # ascending order for the chart
    missing.plot.bar()
在这里插入图片描述

查看唯一值

复制代码
    # Find features with at most one distinct non-null value in the training and test sets.
    train_unique_counts = data_train.nunique()
    one_value_fea = train_unique_counts[train_unique_counts <= 1].index.tolist()
    print(one_value_fea)
    test_unique_counts = data_test_a.nunique()
    one_value_fea_test = test_unique_counts[test_unique_counts <= 1].index.tolist()
    print(one_value_fea_test)
    # ['policyCode']
    # ['policyCode']
    # Show the constant column in both data sets.
    print(data_train["policyCode"])
    print(data_test_a["policyCode"])

0 1.0
1 1.0
2 1.0
3 1.0
4 1.0

799995 1.0
799996 1.0
799997 1.0
799998 1.0
799999 1.0
Name: policyCode, Length: 800000, dtype: float64
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0

199995 1.0
199996 1.0
199997 1.0
199998 1.0
199999 1.0
Name: policyCode, Length: 200000, dtype: float64

查看特征的数值类型有哪些,对象类型有哪些

复制代码
    # Numeric features: every column whose dtype is not "object".
    numerical_fea = data_train.select_dtypes(exclude=["object"]).columns.tolist()
    print("numerical_fea:\n", numerical_fea)
    # Categorical features: the remaining columns, kept in their original order.
    category_fea = [col for col in data_train.columns if col not in numerical_fea]
    print("\ncategory_fea:\n",category_fea)
    print(data_train.grade)

numerical_fea:
['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'isDefault', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']
category_fea:
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']

0 E
1 D
2 D
3 A
4 C

799995 C
799996 A
799997 C
799998 A
799999 B
Name: grade, Length: 800000, dtype: object

数值型变量分析

划分数值型变量中的连续变量和离散型变量

复制代码
    # 过滤数值型类别特征
    def get_numerical_serial_fea(data, feas, max_discrete_unique=10):
        """Split numeric features into continuous and discrete groups.

        A feature counts as discrete when its number of distinct values, as
        reported by ``data[fea].nunique()``, is at most ``max_discrete_unique``.

        Args:
            data: DataFrame-like object; ``data[fea].nunique()`` must work for
                every name in ``feas``.
            feas: Iterable of column names to classify.
            max_discrete_unique: Largest distinct-value count still treated as
                discrete. Defaults to 10, the cut-off that was previously
                hard-coded.

        Returns:
            A tuple ``(numerical_serial_fea, numerical_noserial_fea)`` of two
            lists: the continuous feature names and the discrete feature
            names, each in the order given by ``feas``.
        """
        numerical_serial_fea = []
        numerical_noserial_fea = []
        for fea in feas:
            if data[fea].nunique() <= max_discrete_unique:
                numerical_noserial_fea.append(fea)  # few distinct values: discrete
            else:
                numerical_serial_fea.append(fea)  # many distinct values: continuous
        return numerical_serial_fea, numerical_noserial_fea
    # Split the numeric features of the training set into the two groups returned by the helper and print both.
    numerical_serial_fea, numerical_noserial_fea = get_numerical_serial_fea(data_train,numerical_fea)
    print("numerical_serial_fea:\n", numerical_serial_fea)
    print("\nnumerical_noserical_fea:\n", numerical_noserial_fea)

numerical_serial_fea:
['id', 'loanAmnt', 'interestRate', 'installment', 'employmentTitle', 'annualIncome', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'title', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n13', 'n14']

numerical_noserical_fea:
['term', 'homeOwnership', 'verificationStatus', 'isDefault', 'initialListStatus', 'applicationType', 'policyCode', 'n11', 'n12']

数值类别型变量分析

复制代码
    # Frequency tables of the discrete numeric features.
    data_train['verificationStatus'].value_counts()
    data_train['initialListStatus'].value_counts()
    data_train['applicationType'].value_counts()
    # policyCode is discrete but useless: every row holds the same value.
    data_train['policyCode'].value_counts()
    # n11 and n12 are discrete with heavily imbalanced counts; whether to use
    # them is left for further analysis.
    data_train['n11'].value_counts()
    data_train['n12'].value_counts()
    # Drop the unhelpful columns and save the result.
    new_data_train = data_train.drop(columns=['policyCode', 'n11', 'n12'])
    print(new_data_train.columns)
    new_data_train.to_csv("new_train.csv", sep=',', index=False)  # do not write the row index

全部评论 (0)

还没有任何评论哟~