Advertisement

Datacamp: Introduction to Importing Data in R

阅读量:

Chapter 1

1. Introduction & read.csv

Defaults: (1) header = TRUE

(2) sep = ","

eg. (1) Use read.csv() to import "swimming_pools.csv" as a data frame with the name pools.(2) Print the structure of pools using str().

复制代码
 # Import swimming_pools.csv: pools

    
 pools <- read.csv("swimming_pools.csv")
    
  
    
 # Print the structure of pools
    
 str(pools)
    
    
    
    

stringsAsFactors: If seting stringsAsFactors to FALSE, the data frame columns corresponding to strings in your text file will be character. If seting stringsAsFactors to TRUE, you import strings as factors.

复制代码
 # Import swimming_pools.csv correctly: pools

    
 pools <- read.csv("swimming_pools.csv", stringsAsFactors = FALSE)
    
  
    
 # Check the structure of pools
    
 str(pools)
    
    
    
    

2. read.delim & read.table

(1) read.delim (读取.txt文件)

Default: (1) header = TRUE (the first row contains the field names).

(2) sep = "\t" (fields in a record are delimited by tabs)

eg1. (1) Import the data in "hotdogs.txt" with read.delim(). Call the resulting data frame hotdogs and the variable names are not on the first line. (2) Call summary() on hotdogs. This will print out some summary statistics about all variables in the data frame.

复制代码
 # Import hotdogs.txt: hotdogs

    
 hotdogs <- read.delim("hotdogs.txt", header = FALSE)
    
  
    
 # Summarize hotdogs
    
 summary(hotdogs)
    
    
    
    

(2) read.table (deal with more exotic flat file formats)

By default, the header argument defaults to FALSE and the sep argument is "" by default.

eg2. (1) The data is still hotdogs.txt. It has no column names in the first row, and the field separators are tabs. This time, though, the file is in the data folder inside your current working directory. A variable path with the location of this file is already coded for you. (2) Call head() on hotdogs; this will print the first 6 observations in the data frame.

复制代码
 # Path to the hotdogs.txt file: path

    
 path <- file.path("data", "hotdogs.txt")
    
  
    
 # Import the hotdogs.txt file: hotdogs
    
 hotdogs <- read.table(path, 
    
                   sep = "\t", 
    
                   col.names = c("type", "calories", "sodium"))
    
  
    
 # Call head() on hotdogs
    
 head(hotdogs)
    
    
    
    

eg3. Lily and Tom decide that they will have one each hot dog. Lily wants to have the one with the fewest calories while Tom wants to have the one with the most sodium.(Next to calories and sodium, the hotdogs have one more variable: type. This can be one of three things: Beef, Meat or Poultry, so a categorical variable: a factor is fine.)

复制代码
 # Finish the read.delim() call

    
 hotdogs <- read.delim("hotdogs.txt", header = FALSE, col.names = c("type", "calories", "sodium"))
    
  
    
 # hotdogs(结果输出)
    
 hotdogs
    
       type calories sodium
    
 1     Beef      186    495
    
 2     Beef      181    477
    
 3     Beef      176    425
    
 4     Beef      149    322
    
 5     Beef      184    482
    
 6     Beef      190    587
    
 7     Beef      158    370
    
 8     Beef      139    322
    
 9     Beef      175    479
    
 10    Beef      148    375
    
 11    Beef      152    330
    
 12    Beef      111    300
    
 13    Beef      141    386
    
 14    Beef      153    401
    
 15    Beef      190    645
    
 16    Beef      157    440
    
 17    Beef      131    317
    
 18    Beef      149    319
    
 19    Beef      135    298
    
 20    Beef      132    253
    
 21    Meat      173    458
    
 22    Meat      191    506
    
 23    Meat      182    473
    
 24    Meat      190    545
    
 25    Meat      172    496
    
 26    Meat      147    360
    
 27    Meat      146    387
    
 28    Meat      139    386
    
 29    Meat      175    507
    
 30    Meat      136    393
    
 31    Meat      179    405
    
 32    Meat      153    372
    
 33    Meat      107    144
    
 34    Meat      195    511
    
 35    Meat      135    405
    
 36    Meat      140    428
    
 37    Meat      138    339
    
 38 Poultry      129    430
    
 39 Poultry      132    375
    
 40 Poultry      102    396
    
 41 Poultry      106    383
    
 42 Poultry       94    387
    
 43 Poultry      102    542
    
 44 Poultry       87    359
    
 45 Poultry       99    357
    
 46 Poultry      107    528
    
 47 Poultry      113    513
    
 48 Poultry      135    426
    
 49 Poultry      142    513
    
 50 Poultry       86    358
    
 51 Poultry      143    581
    
 52 Poultry      152    588
    
 53 Poultry      146    522
    
 54 Poultry      144    545
    
  
    
 # Select the hot dog with the least calories: lily
    
 lily <- hotdogs[which.min(hotdogs$calories), ]
    
  
    
 # Select the observation with the most sodium: tom
    
 tom <- hotdogs[which.max(hotdogs$sodium), ]
    
    
    
    

3. Column classes

colClasses: specify the column types or column classes of the resulting data frame.

(1) useful if have some columns that should be factors and others that should be characters and do not have to use stringsAsFactors

(2) if a column is set to "NULL" in the colClasses vector, this column will be skipped and will not be loaded into the data frame

复制代码
 read.delim("my_file.txt",

    
        colClasses = c("character",
    
                       "numeric",
    
                       "logical"))
    
    
    
    

Chapter 2

1. readr: read_csv & read_ts

复制代码
 install.packages("readr")

    
 library(readr)
    
    
    
    

(1) read_csv

复制代码
 # Load the readr package

    
 library(readr)
    
  
    
 # Import potatoes.csv with read_csv(): potatoes
    
 potatoes <- read_csv("potatoes.csv")
    
    
    
    

(2) read_tsv

复制代码
 # readr is already loaded

    
  
    
 # Column names
    
 properties <- c("area", "temp", "size", "storage", "method",
    
             "texture", "flavor", "moistness")
    
  
    
 # Import potatoes.txt: potatoes
    
 potatoes <- read_tsv("potatoes.txt", col_names = properties)
    
    
    
    

2. readr: read_delim

read_delim() takes two mandatory arguments:

(1) file: the file that contains the data

(2) delim: the character that separates the values in the data file

复制代码
 # readr is already loaded

    
  
    
 # Column names
    
 properties <- c("area", "temp", "size", "storage", "method",
    
             "texture", "flavor", "moistness")
    
  
    
 # Import potatoes.txt using read_delim(): potatoes
    
 potatoes <- read_delim("potatoes.txt", delim = "\t", col_names = properties)
    
    
    
    

3. skip and n_max

例. You have a CSV file with 20 lines, and set skip = 2 and n_max = 3, you're only reading in lines 3, 4 and 5 of the file.

(PS: Once you skip some lines, you also skip the first line that can contain column names.)

4. col_types

col_types: 指定导入的数据框架中的列应具有哪些类型( _ skips the column as a whole)

(1) c: character

(2) d: double

(3) i: integer

(4) l: logical

5. col_types with collectors

(1) col_integer(): the column should be interpreted as an integer

(2) col_factor(levels, ordered = FALSE): the column should be interpreted as a factor with levels

复制代码
 # Edit the col_types argument to import the data correctly: hotdogs_factor

    
 hotdogs_factor <- read_tsv("hotdogs.txt",
    
                        col_names = c("type", "calories", "sodium"),
    
                        col_types = list(fac, int, int))
    
    
    
    

6. data.table: fread

复制代码
 install.packages("data.table")

    
 library(data.table)
    
  
    
 # Import potatoes.csv with fread(): potatoes
    
 potatoes <- fread("potatoes.csv")
    
    
    
    

Two arguments of the function: drop and select

例. You have a dataset that contains 5 variables and you want to keep the first and fifth variable, named "a" and "e". The following options will all do the trick:

复制代码
 fread("path/to/file.txt", drop = 2:4)

    
 fread("path/to/file.txt", select = c(1, 5))
    
 fread("path/to/file.txt", drop = c("b", "c", "d"))
    
 fread("path/to/file.txt", select = c("a", "e"))
    
    
    
    

eg1.

复制代码
 # fread is already loaded

    
  
    
 # Import columns 6 and 8 of potatoes.csv: potatoes
    
 potatoes <- fread("potatoes.csv", select = c(6,8))
    
  
    
 # Plot texture (x) and moistness (y) of potatoes
    
 plot(potatoes$texture, potatoes$moistness)
    
    
    
    

eg2. The class of the result of fread() is both data.table and data.frame. read_csv() creates an object with three classes: tbl_df, tb and data.frame.

Chapter 3

1. readxl(1)

excel_sheets(): list different sheets

read_excel(): actually import data into R

复制代码
 # Load the readxl package

    
 install.packages("readxl")
    
 library(readxl)
    
  
    
 # Print the names of all worksheets
    
 excel_sheets("urbanpop.xlsx")
    
  
    
 # 输出结果
    
 [1] "1960-1966" "1967-1974" "1975-2011"
    
  
    
 # Read the sheets, one by one
    
 pop_1 <- read_excel("urbanpop.xlsx", sheet = 1)
    
 pop_2 <- read_excel("urbanpop.xlsx", sheet = 2)
    
 pop_3 <- read_excel("urbanpop.xlsx", sheet = 3)
    
  
    
 # 方法一
    
 # Put pop_1, pop_2 and pop_3 in a list: pop_list
    
 pop_list <- list(pop_1, pop_2, pop_3)
    
 # 方法二
    
 # Read all Excel sheets with lapply(): pop_list
    
 pop_list <- lapply(excel_sheets("urbanpop.xlsx"), read_excel, path = "urbanpop.xlsx")
    
    
    
    

2. Reading a workbook

复制代码
 my_workbook <- lapply(excel_sheets("data.xlsx"),

    
                   read_excel,
    
                   path = "data.xlsx")
    
    
    
    

3. readxl(2)

复制代码
 read_excel(path, sheet = 1,

    
        col_names = TRUE,
    
        col_types = NULL,
    
        skip = 0)
    
    
    
    

col_names:

(1) col_names = FALSE: R assigns names itself

(2) col_names = character vector: manually specify

(3) 默认为TRUE,表示 Excel 工作表中的第一行是否包含列名

skip:

复制代码
 # the first 15 rows in the first sheet of "data.xlsx" are ignored

    
 read_excel("data.xlsx", skip = 15)    
    
    
    
    

4. gdata

(1) Entire suite of tools for data manipulation

(2) Supercharges basic R

(3) read.xls()

(4) Support for XLS

(5) Support for XLSX with additional driver

(6) No readxl::excel_sheets() equivalent

复制代码
 # Load the gdata package

    
 library(gdata)
    
  
    
 # Import the second sheet of urbanpop.xls: urban_pop
    
 urban_pop <- read.xls("urbanpop.xls", sheet = "1967-1974")
    
  
    
 # Print the first 11 observations using head()
    
 head(urban_pop, n = 11)
    
  
    
 # Add code to import data from all three sheets in urbanpop.xls
    
 path <- "urbanpop.xls"
    
 urban_sheet1 <- read.xls(path, sheet = 1, stringsAsFactors = FALSE)
    
 urban_sheet2 <- read.xls(path, sheet = 2, stringsAsFactors = FALSE)
    
 urban_sheet3 <- read.xls(path, sheet = 3, stringsAsFactors = FALSE)
    
  
    
 # Extend the cbind() call to include urban_sheet3: urban
    
 urban <- cbind(urban_sheet1, urban_sheet2[-1], urban_sheet3[-1])
    
  
    
 # Remove all rows with NAs from urban: urban_clean
    
 urban_clean <- na.omit(urban)
    
    
    
    

Chapter 4

1. Reading sheets

复制代码
    install.packages("XLConnect")
    
    

(1) loadWorkbook()

复制代码
 library("XLConnect")

    
 book <- loadWorkbook("cities.xlsx")
    
    
    
    

(2) getSheets(): list the sheets in an Excel file

复制代码
 getSheets(book)

    
  
    
 library(readxl)
    
 excel_sheets("cities.xlsx")
    
    
    
    

(3) readWorksheet(): import data from a sheet

Customize readWorksheet:

复制代码
 my_book <- loadWorkbook("urbanpop.xlsx")

    
 sheets <- getSheets(my_book)
    
 all <- lapply(sheets, readWorksheet, object = my_book)
    
 str(all)
    
    
    
    

2. Adapting sheets

复制代码
 pop_2010 <- ...    # truncated

    
 library(XLConnect)
    
 book <- LoadWorkbook("cities.xlsx")
    
 createSheet(book, name = "year_2010")
    
 writeWorksheet(book, pop_2010, sheet = "year_2010")
    
 saveWorkbook(book, file = "cities2.xlsx")
    
    
    
    

复制代码
 removeSheet(book, sheet = "Y2010")

    
 saveWorkbook(book, file = "cities4.xlsx")
    
    
    
    

3. Renaming sheets

eg. Use renameSheet() to rename the fourth sheet to "summary".

复制代码
 my_book

    
 # 输出结果
    
 [1] "1960-1966"    "1967-1974"    "1975-2011"    "data_summary"
    
  
    
 # Rename "data_summary" sheet to "summary"
    
 renameSheet(my_book, "data_summary", "summary")
    
    
    
    

全部评论 (0)

还没有任何评论哟~