Advertisement

Python金融数据挖掘 第6章 复习思考题3

阅读量:

3、对于泰坦尼克号的数据集,试分析幸存与否与独立登船是否相关(alone数据列),进一步地,可以分析与年龄段(age数据列)是否相关。

复制代码
 # 引入库

    
 import matplotlib.pyplot as plt
    
 import numpy as np
    
 import seaborn as sns
    
 import pandas as pd
    
  
    
 # Work around missing Chinese glyphs in embedded charts: point the
 # sans-serif family at SimHei and turn off the Unicode minus sign.
 plt.rcParams.update({
     'font.sans-serif': ['SimHei'],
     'axes.unicode_minus': False,
 })
 # Switch seaborn to the white-grid style while keeping a SimHei-first font list.
 sns.set_style(style='whitegrid', rc={'font.sans-serif': ['simhei', 'Arial']})
    
  
    
 # Load the Titanic passenger table from 'titanic.csv' (path is relative to
 # the current working directory).
 titanic = pd.read_csv(filepath_or_buffer='titanic.csv')
    
  
    
 # Survival broken down by whether the passenger boarded alone.
 # Rows are the 'alone' flag, columns the outcome (0 = died, 1 = survived).
 # fill_value=0 keeps an absent (alone, outcome) combination as a zero count
 # instead of NaN, so the totals and the survival rate stay well defined.
 survived = titanic.groupby(['alone', 'survived']).size().unstack(fill_value=0)
 survived['sum'] = survived[0] + survived[1]
 survived['生还率'] = survived[1] / survived['sum']
 print('幸存者按照是否独立登船统计的生还者、遇难者:')
 print('0:遇难,1:生还')
 print(survived)

 # Compute the correlation once and reuse it for both the printed value and
 # the sign classification.
 mm = titanic['survived'].corr(titanic['alone'])
 print('幸存与否与独立登船的相关系数:')
 print(mm)
 if mm > 0:
     print('幸存与否与独立登船呈正相关')
 elif mm < 0:
     print('幸存与否与独立登船呈负相关')
 else:
     # Reached for an exact zero and for NaN (both comparisons above are
     # False for NaN); neither case should be reported as negative.
     print('幸存与否与独立登船无线性相关')
 print()
    
  
    
 # Draw the correlation matrix as an annotated heatmap and save it.
 # The matrix is built from the raw passenger columns; the grouped count
 # table is not a correlation matrix, and on a colour scale capped at 1.0
 # all of its count cells would saturate to a single colour.
 # astype(float) turns the boolean 'alone' flag into 0.0/1.0 for corr().
 corr_matrix = titanic[['survived', 'alone', 'age']].astype(float).corr()

 plt.figure(figsize=(10, 8))
 sns.heatmap(
     corr_matrix,
     linewidths=0.1,
     vmin=-1.0,  # correlations span [-1, 1]; anchor both ends of the scale
     vmax=1.0,
     square=True,
     linecolor='white',
     annot=True,
     annot_kws={'size': 20, 'weight': 'bold', 'color': 'green'},
 )
 plt.savefig('相关矩阵.png', dpi=300, bbox_inches='tight')
 plt.show()
    
  
    
 # Relationship between age and survival.
 # Split the 'age' column by outcome (1 = survived, 0 = died) and print
 # summary statistics for each group.
 age_surv = titanic.loc[titanic['survived'] == 1, 'age']
 age_unsurv = titanic.loc[titanic['survived'] == 0, 'age']
 print('生还者年龄统计描述:')
 print(age_surv.describe())
 print('遇难者年龄统计描述:')
 print(age_unsurv.describe())

 # Compute the correlation once and reuse it for both the printed value and
 # the sign classification.
 nn = titanic['survived'].corr(titanic['age'])
 print('幸存与否与年龄的相关系数:')
 print(nn)
 if nn > 0:
     print('幸存与否与年龄呈正相关')
 elif nn < 0:
     print('幸存与否与年龄呈负相关')
 else:
     # Reached for an exact zero and for NaN (both comparisons above are
     # False for NaN); neither case should be reported as negative.
     print('幸存与否与年龄无线性相关')
    
  
    
 # Plot the age density curves of survivors and non-survivors on one axes
 # and save the figure.
 f, ax = plt.subplots(figsize=(7, 7))
 # Reset the seaborn theme but pass the CJK font settings along with it:
 # a bare sns.set() replaces the sans-serif font list configured at the top
 # of the script, and the Chinese title below would lose its glyphs.
 sns.set(rc={'font.sans-serif': ['SimHei', 'Arial'], 'axes.unicode_minus': False})
 # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
 # consider sns.kdeplot once the installed version is confirmed.
 sns.distplot(np.array(age_surv), hist=False, label='survived',
              kde_kws={'color': 'r', 'lw': 1, 'marker': 'o'}, ax=ax)
 sns.distplot(np.array(age_unsurv), hist=False, label='unsurvived',
              kde_kws={'color': 'k', 'lw': 3}, ax=ax)
 ax.set(title='幸存/遇难者年龄分布图:')
 # Show the curve labels explicitly; presumably newer seaborn versions do
 # not add the legend on their own.
 ax.legend()
 # The file name must not contain '/', which is a path separator and would
 # target a '幸存' directory; bbox_inches must be exactly 'tight' (the
 # original value had a stray leading space).
 plt.savefig('幸存_遇难者年龄分布图.png', dpi=300, bbox_inches='tight')
 plt.show()
    
  
    
    
    
    

1)

幸存者按照是否独立登船统计的生还者、遇难者:
0:遇难,1:生还
survived 0 1 sum 生还率
alone
False 175 179 354 0.505650
True 374 163 537 0.303538
幸存与否与独立登船的相关系数:
-0.2033670856998918
幸存与否与独立登船呈负相关

2)

生还者年龄统计描述:
count 290.000000
mean 28.343690
std 14.950952
min 0.420000
25% 19.000000
50% 28.000000
75% 36.000000
max 80.000000
Name: age, dtype: float64
遇难者年龄统计描述:
count 424.000000
mean 30.626179
std 14.172110
min 1.000000
25% 21.000000
50% 28.000000
75% 39.000000
max 74.000000
Name: age, dtype: float64
幸存与否与年龄的相关系数:
-0.07722109457217761
幸存与否与年龄呈负相关


全部评论 (0)

还没有任何评论哟~