4、pandas数据清洗与预处理
发布时间
阅读量:
阅读量
1、数据清洗

2、数据预处理

实例:排序、排名
import pandas as pd
st=pd.DataFrame({'id':[0,1,3,2],'name':['alice','cindy','bob','bruce']},index=[0,1,2,3])
gr=pd.DataFrame({'id_no':[0,3,2],'name':['alice','cindy','bob'],'age':[18,18,20],'grade':['A-90','B-85','C-80']},index=[1,3,2])
#列索引升序
print(gr.sort_index(axis=1))
print('\n')
#按age降序
print(gr.sort_values(['age'],ascending=False))
print('\n')
#排名
gr['rank']=gr['age'].rank()
gr['rank_1']=gr['age'].rank(method='first',ascending=False)
gr['rank_max']=gr['age'].rank(method='max')
gr['rank_min']=gr['age'].rank(method='min')
gr['rank_2']=gr['age'].rank(method='dense')
print(gr.rank(method='dense'))
print('\n')
print(gr.sort_values(['age']))
runfile('C:/Users/xyy/.spyder-py3/temp.py', wdir='C:/Users/xyy/.spyder-py3')
age grade id_no name
1 18 A-90 0 alice
3 18 B-85 3 cindy
2 20 C-80 2 bob
id_no name age grade
2 2 bob 20 C-80
1 0 alice 18 A-90
3 3 cindy 18 B-85
id_no name age grade rank rank_1 rank_max rank_min rank_2
1 1.0 1.0 1.0 1.0 1.0 2.0 1.0 1.0 1.0
3 3.0 3.0 1.0 2.0 1.0 3.0 1.0 1.0 1.0
2 2.0 2.0 2.0 3.0 2.0 1.0 2.0 2.0 2.0
AI写代码
实例:分组、筛选、透视
import numpy as np
import pandas as pd
st=pd.DataFrame({'id':[0,1,3,2],'name':['alice','cindy','bob','bruce']},index=[0,1,2,3])
gr=pd.DataFrame({'id_no':[0,3,2],'name':['alice','cindy','bob'],'age':[18,18,20],'grade':['A-90','B-85','C-80']},index=[1,3,2])
#分组,并将分组结果重置为新字段的值
gr['age_group']=np.where(gr['name']=='alice','alice','others') #单条件
gr.loc[(gr['age']==18)|(gr['name'].str[:1]=='a'),'group']=gr['name'] #多条件
print(gr)
print('\n')
#按条件返回指定字段,并透视
print('18岁或除了姓名首字母为a的所有人:\n',gr.loc[(gr['age']==18)|(gr['name'].str[:1]!='a'),['name','age']].sort_values(['age']))
print('18岁或除了姓名首字母为a的所有人的年龄和:',gr.loc[(gr['age']==18)|(gr['name'].str[:1]!='a'),['name','age']].sort_values(['age']).age.sum())
runfile('C:/Users/xyy/.spyder-py3/temp.py', wdir='C:/Users/xyy/.spyder-py3')
id_no name age grade age_group group
1 0 alice 18 A-90 alice alice
3 3 cindy 18 B-85 others cindy
2 2 bob 20 C-80 others NaN
18岁或除了姓名首字母为a的所有人:
name age
1 alice 18
3 cindy 18
2 bob 20
18岁或除了姓名首字母为a的所有人的年龄和: 56
AI写代码
实例:表合并与字段分列
import pandas as pd
st=pd.DataFrame({'id':[0,1,3,2],'name':['alice','cindy','bob','bruce']},index=[0,1,2,3])
gr=pd.DataFrame({'id_no':[0,3,2],'name':['alice','cindy','bob'],'age':[18,18,20],'grade':['A-90','B-85','C-80']},index=[1,3,2])
#表合并
rs=pd.merge(st,gr,how='left',left_on='id',right_on='id_no',sort=True)
print(rs)
print('\n')
#单字段分列并建立分列表,索引为原表索引,将分列数据命名为新的两列
gr_split=pd.DataFrame((x.split('-') for x in gr['grade']),index=gr.index,columns=['grade_x','grade_y'])
print(gr_split)
print('\n')
#将原表与分列表合并
gr1=pd.merge(gr,pd.DataFrame((x.split('-') for x in gr['grade']),columns=['grade_x','grade_y']),left_index=True,right_index=True) #left/right_index=True表示保留原表索引并按照索引关联
print(gr1)
print('\n')
#单字段分列,将分列结果作为原字段值
gr['grade_y']=gr['grade'].apply(lambda x:x.split('-')[1])
gr['grade']=gr['grade'].apply(lambda x:x.split('-')[0])
print(gr)
print('\n')
#截取字符进行分列
gr['name_first']=gr['name'].str[:1]
print(gr)
runfile('C:/Users/xyy/.spyder-py3/temp.py', wdir='C:/Users/xyy/.spyder-py3')
id name_x id_no name_y age grade
0 0 alice 0.0 alice 18.0 A-90
1 1 cindy NaN NaN NaN NaN
2 2 bruce 2.0 bob 20.0 C-80
3 3 bob 3.0 cindy 18.0 B-85
grade_x grade_y
1 A 90
3 B 85
2 C 80
id_no name age grade grade_x grade_y
1 0 alice 18 A-90 B 85
2 2 bob 20 C-80 C 80
id_no name age grade grade_y
1 0 alice 18 A 90
3 3 cindy 18 B 85
2 2 bob 20 C 80
id_no name age grade grade_y name_first
1 0 alice 18 A 90 a
3 3 cindy 18 B 85 c
2 2 bob 20 C 80 b
AI写代码
全部评论 (0)
还没有任何评论哟~
