Data Programming

11.4 R

  • R

    • To download R, choose a CRAN mirror closest to your geographic location.
    • In order to build R packages, you should also download the latest recommended version of Rtools. Currently, the latest recommended version is Rtools35.exe.
    • During installation, there should be an option to add Rtools to the system PATH. You should check this option.
    • Afterwards, input the following code into R. If the output is TRUE, then Rtools was installed properly.
# Install devtools and attach it so that find_rtools() is available.
install.packages("devtools")
library(devtools)
# An output of TRUE indicates that Rtools was installed properly.
find_rtools()
  • R Studio

    • R Studio is an integrated development environment (IDE) for R. After downloading R Studio, you should be able to type the following command at the console to download some common R packages for data analysis and visualization.
# Install some common packages for data analysis and visualization.
# "stats" is intentionally not listed: it is a base package that ships with R,
# so it cannot be installed or updated from CRAN (requesting it only warns).
install.packages(c("dplyr", "tidyr", "ggplot2", "esquisse", "xtable"))

11.5 Cleaning and Reshaping Data

library(reshape2)
library(tidyr)
library(xtable)
library(stringr)
library(knitr)
options(kableExtra.latex.load_packages = FALSE)
library(kableExtra)
library(pander)


# Original data is organized by id/trial, with one score column per location
# (two locations per entry). No seed is set in this snippet, so the simulated
# scores differ from run to run.
game <- data.frame(
  id = rep(c("X", "Y", "Z"), each = 3),
  trial = rep(c(1, 2, 3), 3),
  location_A = round(rnorm(9, mean = 0, sd = 1), 1),
  location_B = round(rnorm(9, mean = 0, sd = 1), 1)
)

# Reshape data from wide to long with reshape2 (each entry is unique by
# id/trial/location). Argument names are spelled out in full rather than
# relying on partial matching, and the melted column is named directly.
game_long <- melt(
  game,
  id.vars = c("id", "trial"),
  variable.name = "location",
  value.name = "score"
)
# Keep only the last character of "location_A"/"location_B", i.e. "A" or "B".
game_long$location <- str_sub(game_long$location, -1, -1)

# Reshape data back to wide: one entry per id/trial, as in the original data,
# but with the location columns now named A and B.
game_wide <- dcast(game_long, id + trial ~ location, value.var = "score")
# Reshape data into even wider form (one entry per id with 6 value columns:
# 2 locations X 3 trials).
game_wider <- dcast(game_long, id ~ location + trial, value.var = "score")

# Using tidyr and dplyr to reshape data.
# NOTE: gather()/spread() are superseded by pivot_longer()/pivot_wider() in
# current tidyr; they are kept here so the results match the reshape2 versions.
game_long2 <- game %>%
  gather(label, score, location_A, location_B) %>%
  separate(label, c("label_p1", "location"), sep = "_") %>%
  dplyr::select(-label_p1)

game_wide2 <- game_long2 %>%
  spread(location, value = score)

# unite() first creates the location X trial combinations in long format;
# spread() then reshapes them into wide format.
# Just like in game_wider, each entry in game_wider2 is unique by id.
game_wider2 <- game_long2 %>%
  unite(location_trial, location, trial) %>%
  spread(location_trial, value = score)

# Three alternative ways to print the wide data as a captioned table.
# The xtable and kable versions are left commented out; only the pander
# version below is actually run.

#xtable method
#print(xtable(game, caption = "Wide Data Listed by Person/Trial (Scores by Location)"), type="html")

#kable method
#kable(game, caption = "Wide Data Listed by Person/Trial (Scores by Location)", booktabs = TRUE) %>%
#    kable_styling(latex_options = c("hold_position"))

#pander method (most flexible)
# The "(\\#tab:wide)" prefix in the caption is presumably a cross-reference
# label consumed by the document renderer (it does not appear in the rendered
# caption) -- confirm against the build setup.
pandoc.table(game, caption = "(\\#tab:wide) Wide Data Listed by Person/Trial (Scores by Location)")
Table 11.1: Wide Data Listed by Person/Trial (Scores by Location)
id trial location_A location_B
X 1 1 0.1
X 2 -0.8 -0.4
X 3 0.6 0
Y 1 -0.4 -2.3
Y 2 -0.2 -0.4
Y 3 -0.8 -0.5
Z 1 0.6 -1.2
Z 2 -0.4 2.4
Z 3 -0.4 -1.5
# Print the wider data (one row per id, one column per location/trial pair).
pandoc.table(game_wider, caption = "(\\#tab:wider) Wider Data Listed by ID (Scores by Location/Trial)")
Table 11.2: Wider Data Listed by ID (Scores by Location/Trial)
id A_1 A_2 A_3 B_1 B_2 B_3
X 1 -0.8 0.6 0.1 -0.4 0
Y -0.4 -0.2 -0.8 -2.3 -0.4 -0.5
Z 0.6 -0.4 -0.4 -1.2 2.4 -1.5
# Print the long data (one row per id/trial/location combination).
pandoc.table(game_long, caption = "(\\#tab:long) Long Data")
Table 11.3: Long Data
id trial location score
X 1 A 1
X 2 A -0.8
X 3 A 0.6
Y 1 A -0.4
Y 2 A -0.2
Y 3 A -0.8
Z 1 A 0.6
Z 2 A -0.4
Z 3 A -0.4
X 1 B 0.1
X 2 B -0.4
X 3 B 0
Y 1 B -2.3
Y 2 B -0.4
Y 3 B -0.5
Z 1 B -1.2
Z 2 B 2.4
Z 3 B -1.5