Dataframes

Start from scratch

Random number vectors (Gaussian & Uniform Dists); N=10 items sampled from a random normal distribution (rnorm) and another 10 items sampled from a uniform distribution (runif).

# Reproducible two-column data frame: 10 normal draws (mean 50, sd 5)
# alongside 10 uniform draws.
set.seed(123)
dat <- as.data.frame(
  cbind(
    gaussian = rnorm(10, mean = 50, sd = 5),
    uniform = runif(10)
  )
)  # columns are bound with cbind, then converted to a data frame
print(dat)

##    gaussian   uniform
## 1  47.19762 0.8895393
## 2  48.84911 0.6928034
## 3  57.79354 0.6405068
## 4  50.35254 0.9942698
## 5  50.64644 0.6557058
## 6  58.57532 0.7085305
## 7  52.30458 0.5440660
## 8  43.67469 0.5941420
## 9  46.56574 0.2891597
## 10 47.77169 0.1471136

# Small two-row data frame of text columns, one row per school.
dat1 <- data.frame(
  School = c("temple", "penn"),
  SAT = c("normal", "crazy")
)
print(dat1)

##   School    SAT
## 1 temple normal
## 2   penn  crazy

Append data

Here’s another appended variable created with the sample function.
sample(x, size, replace = FALSE) where x is a range

# Append a `sampled` column: 10 integers drawn with replacement from 40..60.
set.seed(190)
dat[["sampled"]] <- sample(40:60, size = 10, replace = TRUE)
print(dat)

##    gaussian   uniform sampled
## 1  47.19762 0.8895393      52
## 2  48.84911 0.6928034      41
## 3  57.79354 0.6405068      48
## 4  50.35254 0.9942698      50
## 5  50.64644 0.6557058      44
## 6  58.57532 0.7085305      53
## 7  52.30458 0.5440660      57
## 8  43.67469 0.5941420      48
## 9  46.56574 0.2891597      52
## 10 47.77169 0.1471136      40

Append grouping variable

cats vs. dogs

# One-column data frame alternating "dogs"/"cats" for 10 rows.
letvec <- data.frame(
  letvec = rep(c("dogs", "cats"), times = 5)
)
print(letvec)

##    letvec
## 1    dogs
## 2    cats
## 3    dogs
## 4    cats
## 5    dogs
## 6    cats
## 7    dogs
## 8    cats
## 9    dogs
## 10   cats

# Attach the grouping column to the numeric columns, side by side.
both <- cbind(dat, letvec)
print(both)

##    gaussian   uniform sampled letvec
## 1  47.19762 0.8895393      52   dogs
## 2  48.84911 0.6928034      41   cats
## 3  57.79354 0.6405068      48   dogs
## 4  50.35254 0.9942698      50   cats
## 5  50.64644 0.6557058      44   dogs
## 6  58.57532 0.7085305      53   cats
## 7  52.30458 0.5440660      57   dogs
## 8  43.67469 0.5941420      48   cats
## 9  46.56574 0.2891597      52   dogs
## 10 47.77169 0.1471136      40   cats

Naming column variables

single column name from letvec to something else
Rename letvec in the both dataframe to “SomethingElse”. Here are two ways to rename a column:

# rename() takes new_name = old_name, so the new name comes first.
new <- rename(both, SomethingElse = letvec)
print(new)

##    gaussian   uniform sampled SomethingElse
## 1  47.19762 0.8895393      52          dogs
## 2  48.84911 0.6928034      41          cats
## 3  57.79354 0.6405068      48          dogs
## 4  50.35254 0.9942698      50          cats
## 5  50.64644 0.6557058      44          dogs
## 6  58.57532 0.7085305      53          cats
## 7  52.30458 0.5440660      57          dogs
## 8  43.67469 0.5941420      48          cats
## 9  46.56574 0.2891597      52          dogs
## 10 47.77169 0.1471136      40          cats

# Rename by position: column 1 of `both` becomes "animals".
# NOTE(review): position 1 is the `gaussian` column, not `letvec` (column 4) --
# confirm this is the column that was meant to be renamed.
colnames(both)[1] <- "animals"
both

animals	uniform	sampled	letvec
47.20	0.89	52	dogs
48.85	0.69	41	cats
57.79	0.64	48	dogs
50.35	0.99	50	cats
50.65	0.66	44	dogs
58.58	0.71	53	cats
52.30	0.54	57	dogs
43.67	0.59	48	cats
46.57	0.29	52	dogs
47.77	0.15	40	cats

Character strings

Here’s a thorny challenge – unlist a string that appears within other variable types in a dataframe

# Two-row data frame: `a` is a factor of city names, `b` holds a phrase per row.
dat2 <- data.frame(
  a = factor(c("Philadelphia", "Pittsburgh")),
  b = c("Always Sunny In", "Often Cloudy In")
)
print(dat2)

##              a               b
## 1 Philadelphia Always Sunny In
## 2   Pittsburgh Often Cloudy In

# Reshape to one row per token, repeating each row's `a` label alongside
# every word split out of its `b` string.
# as.character() keeps strsplit() working when `b` is stored as a factor
# (e.g. a data frame built with stringsAsFactors = TRUE); fixed = TRUE treats
# the separator as a literal space rather than a regular expression.
CarveString <- strsplit(as.character(dat2$b), split = " ", fixed = TRUE)
GetLengthString <- lengths(CarveString)  # tokens per original row (expansion size)
FlattenIt <- unlist(CarveString, use.names = FALSE)
NewDat2 <- data.frame(
  document = rep(dat2$a, times = GetLengthString),
  token = FlattenIt
)
print(NewDat2)

##       document  token
## 1 Philadelphia Always
## 2 Philadelphia  Sunny
## 3 Philadelphia     In
## 4   Pittsburgh  Often
## 5   Pittsburgh Cloudy
## 6   Pittsburgh     In

GetanID

# NOTE(review): getanID() is not defined in this file -- presumably
# splitstackshape::getanID(); confirm the package is loaded upstream.
# Called on `new` with id.vars = NULL; the result is stored in new1 and printed.
new1 <- getanID(new, id.vars = NULL)
print(new1)

##     gaussian   uniform sampled SomethingElse
##  1: 47.19762 0.8895393      52          dogs
##  2: 48.84911 0.6928034      41          cats
##  3: 57.79354 0.6405068      48          dogs
##  4: 50.35254 0.9942698      50          cats
##  5: 50.64644 0.6557058      44          dogs
##  6: 58.57532 0.7085305      53          cats
##  7: 52.30458 0.5440660      57          dogs
##  8: 43.67469 0.5941420      48          cats
##  9: 46.56574 0.2891597      52          dogs
## 10: 47.77169 0.1471136      40          cats

SeqAlong

# Add a sequential row identifier (1..n) based on the length of `gaussian`.
new2 <- mutate(new, id = seq_along(gaussian))
print(new2)

##    gaussian   uniform sampled SomethingElse id
## 1  47.19762 0.8895393      52          dogs  1
## 2  48.84911 0.6928034      41          cats  2
## 3  57.79354 0.6405068      48          dogs  3
## 4  50.35254 0.9942698      50          cats  4
## 5  50.64644 0.6557058      44          dogs  5
## 6  58.57532 0.7085305      53          cats  6
## 7  52.30458 0.5440660      57          dogs  7
## 8  43.67469 0.5941420      48          cats  8
## 9  46.56574 0.2891597      52          dogs  9
## 10 47.77169 0.1471136      40          cats 10

Wrangling wide-to-long

Pivot_longer

pivot_longer replaces melt: reshape the dataframe new from wide to long form.

# Keep the three measurement columns plus id, and store id as a factor.
new3 <- new2 %>%
  select(1:3, 5) %>%
  mutate(id = as.factor(id))
# Stack columns 1-3 into name/value pairs: one row per id and original column.
p <- pivot_longer(new3, cols = 1:3, names_to = "my-var", values_to = "my-val")
print(p)

## # A tibble: 30 x 3
##    id    `my-var` `my-val`
##    <fct> <chr>       <dbl>
##  1 1     gaussian   47.2  
##  2 1     uniform     0.890
##  3 1     sampled    52    
##  4 2     gaussian   48.8  
##  5 2     uniform     0.693
##  6 2     sampled    41    
##  7 3     gaussian   57.8  
##  8 3     uniform     0.641
##  9 3     sampled    48    
## 10 4     gaussian   50.4  
## # … with 20 more rows

Subsetting

Filter (rows)

Filter rows by a single level of a grouping variable. Generate a new dataframe isolating only rows corresponding to dogs (i.e., eliminate cats) using the filter command in dplyr.

# Drop every row whose SomethingElse value is "cats".
short <- filter(new, SomethingElse != "cats")
short

Select (columns)

generate a new dataframe from new that only retains the first two columns of ‘new’.

# Retain only the first two columns of `new`, selected by position.
smaller <- select(new, 1:2)
smaller

Merging dataframes

Base R

# Two single-column data frames that share no column names.
d1 <- data.frame(Xvar = c(1, 2, 3))
d2 <- data.frame(Yvar = c(7, 8, 9))
# With no key columns, merge() returns every pairing of rows (3 x 3 = 9 rows),
# which differs from cbind's row-by-row pairing below.
d3 <- merge(d1, d2, by = NULL)
# cbind() pairs the rows positionally and already returns a data frame.
d4 <- cbind(d1, d2)
print(d3)

##   Xvar Yvar
## 1    1    7
## 2    2    7
## 3    3    7
## 4    1    8
## 5    2    8
## 6    3    8
## 7    1    9
## 8    2    9
## 9    3    9

# Show the column-bound result (d4) for comparison with the merge() output.
print(d4)

##   Xvar Yvar
## 1    1    7
## 2    2    8
## 3    3    9

Matrices

Generate and populate a 3x3 matrix (set the seed for reproducibility). Name the rows and columns.

# 3x3 matrix of standard-normal draws, filled column by column, with
# rows labelled a-c and columns labelled d-f.
set.seed(1234)
mat <- matrix(
  rnorm(9),
  nrow = 3,
  dimnames = list(c("a", "b", "c"), c("d", "e", "f"))
)
print(mat)

##            d          e          f
## a -1.2070657 -2.3456977 -0.5747400
## b  0.2774292  0.4291247 -0.5466319
## c  1.0844412  0.5060559 -0.5644520

dataframes, vectors, matrices, lists

Jamie Reilly, Ph.D.

October 29, 2020