Advertisement

4、pandas数据清洗与预处理

阅读量:

1、数据清洗

在这里插入图片描述

2、数据预处理

在这里插入图片描述
实例:排序、排名
复制代码
    import pandas as pd
    st=pd.DataFrame({'id':[0,1,3,2],'name':['alice','cindy','bob','bruce']},index=[0,1,2,3])
    gr=pd.DataFrame({'id_no':[0,3,2],'name':['alice','cindy','bob'],'age':[18,18,20],'grade':['A-90','B-85','C-80']},index=[1,3,2])
    #列索引升序
    print(gr.sort_index(axis=1))
    print('\n')
    #按age降序
    print(gr.sort_values(['age'],ascending=False))
    print('\n')
    #排名
    gr['rank']=gr['age'].rank()
    gr['rank_1']=gr['age'].rank(method='first',ascending=False)
    gr['rank_max']=gr['age'].rank(method='max')
    gr['rank_min']=gr['age'].rank(method='min')
    gr['rank_2']=gr['age'].rank(method='dense')
    print(gr.rank(method='dense'))
    print('\n')
    print(gr.sort_values(['age']))
    
    runfile('C:/Users/xyy/.spyder-py3/temp.py', wdir='C:/Users/xyy/.spyder-py3')
       age grade  id_no   name
    1   18  A-90      0  alice
    3   18  B-85      3  cindy
    2   20  C-80      2    bob
    
    
       id_no   name  age grade
    2      2    bob   20  C-80
    1      0  alice   18  A-90
    3      3  cindy   18  B-85
    
    
       id_no  name  age  grade  rank  rank_1  rank_max  rank_min  rank_2
    1    1.0   1.0  1.0    1.0   1.0     2.0       1.0       1.0     1.0
    3    3.0   3.0  1.0    2.0   1.0     3.0       1.0       1.0     1.0
    2    2.0   2.0  2.0    3.0   2.0     1.0       2.0       2.0     2.0
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
    AI写代码
实例:分组、筛选、透视
复制代码
    import numpy as np
    import pandas as pd
    st=pd.DataFrame({'id':[0,1,3,2],'name':['alice','cindy','bob','bruce']},index=[0,1,2,3])
    gr=pd.DataFrame({'id_no':[0,3,2],'name':['alice','cindy','bob'],'age':[18,18,20],'grade':['A-90','B-85','C-80']},index=[1,3,2])
    #分组,并将分组结果重置为新字段的值
    gr['age_group']=np.where(gr['name']=='alice','alice','others') #单条件
    gr.loc[(gr['age']==18)|(gr['name'].str[:1]=='a'),'group']=gr['name'] #多条件
    print(gr)
    print('\n')
    #按条件返回指定字段,并透视
    print('18岁或除了姓名首字母为a的所有人:\n',gr.loc[(gr['age']==18)|(gr['name'].str[:1]!='a'),['name','age']].sort_values(['age']))
    print('18岁或除了姓名首字母为a的所有人的年龄和:',gr.loc[(gr['age']==18)|(gr['name'].str[:1]!='a'),['name','age']].sort_values(['age']).age.sum())
    
    runfile('C:/Users/xyy/.spyder-py3/temp.py', wdir='C:/Users/xyy/.spyder-py3')
       id_no   name  age grade age_group  group
    1      0  alice   18  A-90     alice  alice
    3      3  cindy   18  B-85    others  cindy
    2      2    bob   20  C-80    others    NaN
    
    
    18岁或除了姓名首字母为a的所有人:
     name  age
    1  alice   18
    3  cindy   18
    2    bob   20
    18岁或除了姓名首字母为a的所有人的年龄和: 56
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
    AI写代码
实例:表合并与字段分列
复制代码
    import pandas as pd
    st=pd.DataFrame({'id':[0,1,3,2],'name':['alice','cindy','bob','bruce']},index=[0,1,2,3])
    gr=pd.DataFrame({'id_no':[0,3,2],'name':['alice','cindy','bob'],'age':[18,18,20],'grade':['A-90','B-85','C-80']},index=[1,3,2])
    #表合并
    rs=pd.merge(st,gr,how='left',left_on='id',right_on='id_no',sort=True)
    print(rs)
    print('\n')
    #单字段分列并建立分列表,索引为原表索引,将分列数据命名为新的两列
    gr_split=pd.DataFrame((x.split('-') for x in gr['grade']),index=gr.index,columns=['grade_x','grade_y'])
    print(gr_split)
    print('\n')
    #将原表与分列表合并
    gr1=pd.merge(gr,pd.DataFrame((x.split('-') for x in gr['grade']),columns=['grade_x','grade_y']),left_index=True,right_index=True) #left/right_index=True表示保留原表索引并按照索引关联
    print(gr1)
    print('\n')
    #单字段分列,将分列结果作为原字段值
    gr['grade_y']=gr['grade'].apply(lambda x:x.split('-')[1])
    gr['grade']=gr['grade'].apply(lambda x:x.split('-')[0])
    print(gr)
    print('\n')
    #截取字符进行分列
    gr['name_first']=gr['name'].str[:1]
    print(gr)
    
    runfile('C:/Users/xyy/.spyder-py3/temp.py', wdir='C:/Users/xyy/.spyder-py3')
    id name_x  id_no name_y   age grade
    0   0  alice    0.0  alice  18.0  A-90
    1   1  cindy    NaN    NaN   NaN   NaN
    2   2  bruce    2.0    bob  20.0  C-80
    3   3    bob    3.0  cindy  18.0  B-85
    
    
      grade_x grade_y
    1       A      90
    3       B      85
    2       C      80
    
    
       id_no   name  age grade grade_x grade_y
    1      0  alice   18  A-90       B      85
    2      2    bob   20  C-80       C      80
    
    
       id_no   name  age grade grade_y
    1      0  alice   18     A      90
    3      3  cindy   18     B      85
    2      2    bob   20     C      80
    
    
       id_no   name  age grade grade_y name_first
    1      0  alice   18     A      90          a
    3      3  cindy   18     B      85          c
    2      2    bob   20     C      80          b
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
    AI写代码

全部评论 (0)

还没有任何评论哟~