Advertisement

利用R语言的dplyr包进行数据转换

阅读量:
复制代码
 library(tidyverse)

    
 library(nycflights13)    #利用该包中的flights数据
    
  
    
 # Print the flights tibble (the data set taken from nycflights13) so its
 # columns and their type abbreviations can be inspected.
 flights

 #### Variable types in R
 # int  -- integer
 # dbl  -- double-precision floating point, i.e. a real number
 # chr  -- character string
 # dttm -- date-time
 # lgl  -- logical
 # fctr -- factor: a categorical variable with a fixed number of possible values
 #         NOTE(review): recent tibble versions appear to print this as `fct` -- confirm
 # date -- date
    
  
    
  
    
  
    
 #### Selecting rows with filter()
 # Keep only the flights from January 1st; store the result and display it.
 jan_1 <- flights %>%
   filter(month == 1, day == 1)
 jan_1

 # Floating-point numbers must not be compared with ==; use near() instead.
 near(sqrt(2)^2, 2)

 # Logical OR: flights from November or December.
 flights %>%
   filter(month == 11 | month == 12)

 # Equivalent selection written with %in%; store the result and display it.
 nov_dec <- flights %>%
   filter(month %in% c(11, 12))
 nov_dec

 # Test whether a value is missing.
 is.na(3)

 # filter() drops rows whose condition is FALSE or NA. To keep missing values,
 # request them explicitly.
 df <- tibble(x = c(1, NA, 3))
 df %>%
   filter(is.na(x) | x > 1)
    
  
    
  
    
  
    
 #### Ordering rows with arrange()
 # Ascending order is the default.
 flights %>%
   arrange(year, month, day)

 # Wrap a column in desc() for descending order.
 flights %>%
   arrange(desc(arr_delay))

 # Missing values are always sorted to the end.
 df <- tibble(x = c(5, 2, NA))
 df %>%
   arrange(x)
    
  
    
  
    
  
    
 #### Choosing columns with select()
 # Select columns by name.
 flights %>%
   select(year, month, day)

 # Select every column from year through day (both inclusive).
 flights %>%
   select(year:day)

 # Select every column except those from year through day.
 flights %>%
   select(-(year:day))

 # Rename a variable (new_name = old_name); all other columns are kept.
 flights %>%
   rename(tail_num = tailnum)

 # Move a few variables to the front of the data frame; everything() keeps
 # the remaining columns after them.
 flights %>%
   select(time_hour, air_time, everything())
    
  
    
  
    
  
    
 #### Adding variables with mutate()
 # New variables are appended as the last columns, so first build a narrower
 # data set in which the new columns are easy to see.
 flights_sml <- flights %>%
   select(year:day, ends_with("delay"), distance, air_time)
 flights_sml

 flights_sml %>%
   mutate(
     gain = arr_delay - dep_delay,
     speed = distance / air_time * 60
   )

 # Columns created inside mutate() can be used immediately by later arguments.
 flights_sml %>%
   mutate(
     gain = arr_delay - dep_delay,
     hours = air_time / 60,
     gain_per_hour = gain / hours
   )

 # To keep only the newly created variables, use transmute().
 flights_sml %>%
   transmute(
     gain = arr_delay - dep_delay,
     hours = air_time / 60,
     gain_per_hour = gain / hours
   )
    
 # Vectorised functions that work well together with mutate()

 # Arithmetic operators: +, -, *, /, ^

 # Modular arithmetic: %/% (integer division) and %% (remainder)
 flights %>%
   transmute(
     dep_time,
     hour = dep_time %/% 100,
     minute = dep_time %% 100
   )

 # Logarithms: log(), log2() and log10()

 # Offsets: lead() and lag()
 x <- 1:10
 x
 lead(x)
 lag(x)

 # Cumulative and rolling aggregates
 cumsum(x)   # running sum
 cumprod(x)  # running product
 cummin(x)   # running minimum
 cummax(x)   # running maximum

 # Logical comparisons: <, <=, >, >=, != (and ==)

 # Ranking
 y <- c(1, 2, 2, NA, 3, 4)
 min_rank(y)
 row_number(y)
    
  
    
  
    
  
    
 #### Grouped summaries with summarize()
 # Group the flights by calendar day, then compute the mean departure delay
 # of each day, ignoring missing delays.
 by_day <- flights %>%
   group_by(year, month, day)
 by_day %>%
   summarize(delay = mean(dep_delay, na.rm = TRUE))
    
  
    
  
    
 ### Combining multiple operations with the pipe
 # Goal: study the relationship between distance and average delay for each
 # destination.
 # Plain version: every step is stored in a named intermediate object.
 by_dest <- group_by(flights, dest)
 delay <- summarise(
   by_dest,
   count = n(),
   dist = mean(distance, na.rm = TRUE),
   delay = mean(arr_delay, na.rm = TRUE)
 )
 delay <- filter(delay, count > 20, dest != "HNL")

 # Point size encodes the number of flights; the smoother is drawn without
 # its confidence band.
 ggplot(delay, aes(dist, delay)) +
   geom_point(aes(size = count), alpha = 1 / 3) +
   geom_smooth(se = FALSE)
    
  
    
  
    
 ### Pipe version
 # The pipe puts the focus on the sequence of transformations rather than on
 # the intermediate objects.
 delays <- flights %>%
   group_by(dest) %>%
   summarise(
     count = n(),
     dist = mean(distance, na.rm = TRUE),
     delay = mean(arr_delay, na.rm = TRUE)
   ) %>%
   filter(count > 20) %>%
   filter(dest != "HNL")
    
  
    
  
    
 ### Missing values
 # Mean departure delay per day, dropping missing delays inside mean().
 summarize(
   group_by(flights, year, month, day),
   mean = mean(dep_delay, na.rm = TRUE)
 )

 # Alternatively, remove the rows with missing delays first and summarise
 # the remaining flights by group.
 not_cancelled <- filter(flights, !is.na(dep_delay), !is.na(arr_delay))
 summarize(
   group_by(not_cancelled, year, month, day),
   mean = mean(dep_delay)
 )
    
  
    
  
    
 ### Counts
 # Including a count makes it possible to check that a conclusion is not
 # based on a very small amount of data.
 delays <- summarize(
   group_by(not_cancelled, tailnum),
   delay = mean(arr_delay)
 )
 ggplot(delays, aes(x = delay)) +
   geom_freqpoly(binwidth = 10)

 # Scatter plot of the number of flights against the average delay.
 delays <- summarize(
   group_by(not_cancelled, tailnum),
   delay = mean(arr_delay, na.rm = TRUE),
   n = n()
 )
 ggplot(delays, aes(x = n, y = delay)) +
   geom_point(alpha = 1 / 10)

 # Drop the planes that have only a few observations.
 ggplot(filter(delays, n > 25), aes(x = n, y = delay)) +
   geom_point(alpha = 1 / 10)
    
  
    
 ## Use the Lahman package data to compute the batting average
 ## (hits / at-bats) of every major-league baseball player.
 batting <- as_tibble(Lahman::Batting)
 batting

 batters <- summarize(
   group_by(batting, playerID),
   ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),  # skill: batting average
   ab = sum(AB, na.rm = TRUE)                          # number of at-bats
 )

 # Inside a pipe the data argument does not have to be named again.
 batters %>%
   filter(ab > 100) %>%
   ggplot(aes(x = ab, y = ba)) +
   geom_point() +
   geom_smooth(se = FALSE)
    
  
    
  
    
 ### 常用的摘要函数
    
 ##位置度量:mean()、median()
    
 ##分散程度度量:sd()、IQR(四分位距)、mad(绝对中位差)
    
 ##秩的度量:min()、quantile(x,0.25)、max()
    
 ##定位度量:first()、nth(x,2)、last()
    
 ##计数:n()、n_distinct(计算唯一值的数量)、count(仅用于计数)
    
 ##逻辑值的计数和比例:sum(x > 10)、mean(y == 10)
    
  
    
  
    
 ### Grouping by multiple variables
 # Each summary uses up one level of the grouping.
 daily <- flights %>%
   group_by(year, month, day)

 per_day <- daily %>%
   summarize(flights = n())
 per_day

 per_month <- per_day %>%
   summarize(flights = sum(flights))
 per_month

 per_year <- per_month %>%
   summarize(flights = sum(flights))
 per_year

 ## Removing the grouping
 summarize(ungroup(daily), flights = n())
    
  
    
  
    
 ### Grouped mutates and filters
 # Find the worst members of each group (largest arrival delays per day).
 filter(
   group_by(flights_sml, year, month, day),
   rank(desc(arr_delay)) < 10
 )

 # Find all groups larger than a threshold; store the result and display it.
 popular_dests <- filter(group_by(flights, dest), n() > 365)
 popular_dests

全部评论 (0)

还没有任何评论哟~