R语言的dplyr应用
发布时间
阅读量:
阅读量
| ####### Some dplyr | |
|---|---|
| library(dplyr) | |
| install.packages("hflights") | |
| library(hflights) | |
| flights<- as.data.frame(hflights) | |
| ### Filter function#### | |
| filter(flights, UniqueCarrier %in% c("AA", "UA")) | |
| ### Select##### | |
| select(flights, DepTime, ArrTime, FlightNum) | |
| select(flights, Year:DayofMonth, contains("Taxi")) | |
| ### Select and Filter##### | |
| flights %>% | |
| select(UniqueCarrier, DepDelay) %>% | |
| filter(DepDelay >60) | |
| ### arrange##### | |
| flights %>% | |
| select(UniqueCarrier, DepDelay) %>% | |
| arrange(desc(DepDelay)) | |
| ### Mutate##### | |
| flights<- flights %>% | |
| mutate(speed = Distance/AirTime*60) | |
| ### Summarise##### | |
| n() | |
| n_distinct() | |
| flights %>% | |
| group_by(Dest) %>% | |
| summarize(ave_delay = mean(ArrDelay, na.rm= TRUE)) | |
| ## Summarize mean to multiple columns | |
| flights %>% | |
| group_by(UniqueCarrier) %>% | |
| summarise_each(funs(mean), Cancelled, Diverted) | |
| ## Summarize min and max to multiple columns by match function to select columns | |
| flights %>% | |
| group_by(UniqueCarrier) %>% | |
| summarise_each(funs(min(.,na.rm = TRUE), max(.,na.rm=TRUE)),matches("Delay")) | |
| ## Count n() | |
| flights %>% | |
| group_by(Month, DayofMonth)%>% | |
| summarise(flight_count = n())%>% | |
| arrange(desc(flight_count)) | |
| ## Count n_distinct() | |
| flights %>% | |
| group_by(Dest) %>% | |
| summarise(flight_count = n(), plane_count= n_distinct(TailNum)) | |
| ## Group without summarising | |
| flights %>% | |
| group_by(Dest) %>% | |
| select(Cancelled) %>% | |
| table() | |
| #### Window function ####### | |
| ## for each carrier, which 2 days of the year had the longest departure delays | |
| flights %>% | |
| group_by(UniqueCarrier)%>% | |
| select(Month, DayofMonth, DepDelay)%>% | |
| filter(min_rank(desc(DepDelay)) <=2 )%>% | |
| arrange(UniqueCarrier, desc(DepDelay)) | |
| ## using top_n function | |
| flights %>% | |
| group_by(UniqueCarrier)%>% | |
| select(Month, DayofMonth, DepDelay)%>% | |
| top_n(2)%>% | |
| arrange(UniqueCarrier, desc(DepDelay)) | |
| ## Calculate the month diff from last month | |
| flights %>% | |
| group_by(Month)%>% | |
| summarise(flight_count = n())%>% | |
| mutate(change = flight_count - lag(flight_count)) | |
| #### Other function############ | |
| ## Sample data | |
| flights%>% sample_n(5) | |
| flights %>% sample_frac(0.25, replace = FALSE) | |
| ## rename | |
| flights%>% rename(tail= TailNum) | |
| ## Sort dep_delay in each group | |
| flights%>% | |
| group_by(Month, DayofMonth) %>% | |
| top_n(3, DepDelay)%>% | |
| arrange(desc(DepDelay)) | |
| #### Reshape2 | |
| library(reshape2) | |
| melt() | |
| dcast() | |
| #### ggplot2############################### | |
| ### Average age for each occupation | |
| ggplot(marketing, aes(job, age)) + | |
| geom_bar(stat = "summary", fun.y = "mean", color = "black",fill= "grey", width = 0.5) + | |
| theme_bw() + | |
| labs( y = "Age", | |
| title = "Age Distribution")+ | |
| theme(plot.title = element_text(hjust = 0.5), | |
| plot.subtitle = element_text(hjust = 0.5)) | |
| ##Average age for each occupation with y fill | |
| ggplot(marketing, aes(job, age, fill = y)) + | |
| geom_bar(stat = "summary", fun.y = "mean", width = 0.5) + | |
| theme_bw() + | |
| labs( y = "Age", | |
| title = "Age Distribution") | |
| ### density plot | |
| p <- ggplot(df, aes(x=weight)) + | |
| geom_density() + | |
| geom_vline(aes(xintercept=mean(weight)), # Mean line | |
| color="blue", linetype="dashed", size=1) | |
| # add density line by groups by color | |
| p<-ggplot(df, aes(x=weight, color=sex)) + | |
| geom_density()+ | |
| geom_vline(data=mu, aes(xintercept=mean, color=sex), # mu is mean for each group | |
| linetype="dashed") | |
| ## scatter plot | |
| # color with scatter plot | |
| ggplot(data, aes(a,b, color = a)) + | |
| geom_point(shape = 16, size = 5, show.legend = FALSE) + | |
| theme_minimal() | |
| # color and transparent alpha | |
| ggplot(d, aes(a, b, color = pc)) + | |
| geom_point(shape = 16, size = 5, show.legend = FALSE, alpha = .4) + | |
| theme_minimal() + | |
| scale_color_gradient(low = "#0091ff", high = "#f0650e") | |
| ## bar chart | |
| ggplot(marketing, aes(job, age, fill = y)) + | |
| geom_bar(stat = "summary", fun.y = "mean", width = 0.5) + | |
| facet_wrap( ~ marital) + | |
| theme_bw() + | |
| labs( y = "Age", | |
| title = "Age Distribution") | |
| # color with RColorBrewer for stacked percent bar | |
| ggplot(data, aes(fill = condition, y = value, x = specie))+ | |
| geom_bar(stat = "identity", position = "fill")+ | |
| scale_fill_brewer(palette = "Set1") | |
| # Grouped chart | |
| ggplot(data, aes(fill = condition, y=value, x=specie))+ | |
| geom_bar(position = "dodge", stat = "identity") | |
| # Faceting# | |
| ggplot(data, aes(y= value, x= specie, color = specie, fill = specie))+ | |
| geom_bar(stat = "identity") + | |
| facet_wrap(~condition) | |
全部评论 (0)
还没有任何评论哟~
