Advertisement

Python 用pandas 进行数据清洗处理

阅读量:

1.数据读取

复制代码
 import pandas as pd

    
 import numpy as np
    
 import pymongo
    
  
    
 # Load the spreadsheet. read_excel already returns a DataFrame, so no extra
 # pd.DataFrame(...) wrapper is needed, and it accepts no `index` keyword
 # (that option belongs to the writers such as to_excel/to_csv).
 data = pd.read_excel('000.xlsx')

 # Open the MongoDB client; the credentials and host in the URI are placeholders.
 client = pymongo.MongoClient("mongodb://XX:XXXXX@192.168.3.7:2018", connect=False)
 db = client["test"]
 table = db["python"]

 # Materialize every document of the collection into a DataFrame.
 df = pd.DataFrame(list(table.find()))

可以从 Excel、CSV、MongoDB 等数据源读取数据。

2.遍历

复制代码
 # Iterate over a snapshot of the real index labels. The previous
 # range(data.index.max()) never visited the last row and raised KeyError as
 # soon as a label was absent; the snapshot also keeps the loop stable while
 # rows are being dropped.
 # NOTE(review): range(25) accepts 24 as a valid hour - confirm whether the
 # data really uses 0..24 rather than 0..23.
 for i in list(data.index):
     row_is_abnormal = (
         'missing' in data.loc[i, :].values
         or data.loc[i, 'hour'] not in range(25)
     )
     if row_is_abnormal:
         print('已删除存在异常值 %s 行数据' % i)
         data.drop([i], inplace=True)
    
 # Walk the frame by its actual index labels: .loc is label-based, so a
 # positional counter only works when the index happens to be 0..n-1.
 for i in df.index:
     # Convert the current row into a plain {column: value} dict.
     info = df.loc[i].to_dict()

3. 去空(NA)

3.1直接去除

复制代码
 from numpy import nan as NA

 # Series.dropna() simply removes every missing entry.
 data = pd.Series([1, NA, 3.5, NA, 7])
 print(data.dropna())

 # `thresh` exists only on DataFrame.dropna (a Series has no such argument):
 # a row is kept when it holds at least `thresh` non-NA values.
 frame = pd.DataFrame([[1, NA, NA], [2, 3.5, NA], [NA, NA, NA], [4, 5, 6]])
 print(frame.dropna(thresh=2))

3.2 用中位数或者平均数进行填充

复制代码
 # Fill gaps with each numeric column's median. numeric_only=True skips
 # non-numeric columns, which otherwise make median()/mean() raise TypeError
 # on mixed-dtype frames in pandas 2.x.
 df = df.fillna(df.median(numeric_only=True))

 # Alternative strategy: fill with the column mean instead. Once the median
 # fill above has run, the numeric gaps are already filled, so in practice
 # choose one of the two.
 print(df.fillna(df.mean(numeric_only=True)))

4.对字段进行处理

复制代码
 def get_salary(salary):
     """Return the midpoint of a salary range written as "<low>-<high>".

     Each side of the range is expected to end with exactly one unit
     character (for example "10k-20k"); that trailing character is removed
     before the number is parsed.

     Args:
         salary: Raw salary text. Non-string values (such as NaN from a
             missing cell) are treated as "no salary".

     Returns:
         The sum of the bounds divided by two, truncated to ``int``; or
         ``np.nan`` when the value is not a string or contains no "-".

     Raises:
         ValueError: If a bound is not numeric once its last character is
             stripped.
     """
     # Guard first: missing cells arrive as float NaN, for which the
     # substring test below would raise TypeError.
     if not isinstance(salary, str) or "-" not in salary:
         return np.nan

     # NOTE(review): the original had a separate branch for parts containing
     # "万", but it parsed the number exactly like the other branch (no unit
     # scaling). That behavior is kept; confirm whether scaling was intended.
     total = sum(float(part[:-1]) for part in salary.split("-"))
     return int(total / 2.0)
    
 # Run every raw salary value through get_salary and store the result back.
 df["salary"] = df["salary"].apply(get_salary)
 df.head()


 def _company_name(raw):
     """Return the text before the first "/" with surrounding double quotes stripped."""
     return raw.split("/")[0].strip('"')


 df["company"] = df["company"].apply(_company_name)

5.删除重复

复制代码
    # Unique company names, keeping the first occurrence of each value.
    df["company"].drop_duplicates(keep="first")

6.只留部分

复制代码
 # Select the two columns of interest by label.
 df[["address", "company"]]

 # Keep only the columns at positions 4 and 5.
 keep_positions = [4, 5]
 df_c = df_c.iloc[:, keep_positions]

 # Remove the two grade columns from `data` in place, one after the other.
 for column in ("name_grade", "info_grade"):
     del data[column]

7. 排序

复制代码
    # Rows ordered by 'col1', largest first; the sorted frame is returned and
    # df itself is left untouched.
    df.sort_values('col1', ascending=False)

8. isin

复制代码
    # isin() needs a list-like of candidate values; a bare scalar is rejected.
    wanted = [1]
    mask = df['A'].isin(wanted)

9. merge

复制代码
 # Two small frames that share the same people under differently named key columns.
 people = ['kate', 'herz', 'catherine', 'sally']
 df1 = pd.DataFrame({'name': people, 'age': [25, 28, 39, 35]})
 df2 = pd.DataFrame({'name_t': people, 'score': [70, 60, 90, 100]})

 # Join on name == name_t, then discard the duplicated key column before printing.
 joined = df1.merge(df2, left_on="name", right_on="name_t")
 print(joined.drop(columns='name_t'))

10. 保存为 CSV,或写入 MongoDB

复制代码
 # Write the de-duplicated company names to a UTF-8 encoded CSV file.
 df["company"].drop_duplicates().to_csv("company.csv", encoding="utf-8")

 # Collection.insert() was deprecated in PyMongo 3 and removed in PyMongo 4;
 # insert_one() is the single-document replacement.
 # NOTE(review): `row` and `MONGO_TABLE` are not defined in this snippet -
 # presumably one DataFrame row and the target collection name; confirm.
 db[MONGO_TABLE].insert_one(row.to_dict())

http://www.codeblogbt.com/archives/102061

全部评论 (0)

还没有任何评论哟~