Random number vectors (Gaussian & Uniform Dists); N=10 items sampled from a normal distribution (rnorm, mean 50, SD 5) and another 10 items sampled from a uniform distribution (runif).
# Seed the RNG so the draws below are reproducible.
set.seed(123)
# Column-bind two length-10 draws into a matrix, then convert to a data frame.
dat <- as.data.frame(
  cbind(
    gaussian = rnorm(n = 10, mean = 50, sd = 5),
    uniform = runif(n = 10)
  )
) # using cbind
print(dat)
## gaussian uniform
## 1 47.19762 0.8895393
## 2 48.84911 0.6928034
## 3 57.79354 0.6405068
## 4 50.35254 0.9942698
## 5 50.64644 0.6557058
## 6 58.57532 0.7085305
## 7 52.30458 0.5440660
## 8 43.67469 0.5941420
## 9 46.56574 0.2891597
## 10 47.77169 0.1471136
# Two-row data frame of text labels, one column per vector.
dat1 <- data.frame(
  School = c("temple", "penn"),
  SAT = c("normal", "crazy")
)
print(dat1)
## School SAT
## 1 temple normal
## 2 penn crazy
Here’s another appended variable created with the sample function.
sample(x, size, replace = FALSE), where x is the vector of values to draw from (here the integer range 40:60) and size is the number of draws
# Fix the seed so the sampled column is reproducible.
set.seed(190)
# Append a column of 10 integers drawn with replacement from 40 through 60.
dat[["sampled"]] <- sample(x = 40:60, size = 10, replace = TRUE)
print(dat)
## gaussian uniform sampled
## 1 47.19762 0.8895393 52
## 2 48.84911 0.6928034 41
## 3 57.79354 0.6405068 48
## 4 50.35254 0.9942698 50
## 5 50.64644 0.6557058 44
## 6 58.57532 0.7085305 53
## 7 52.30458 0.5440660 57
## 8 43.67469 0.5941420 48
## 9 46.56574 0.2891597 52
## 10 47.77169 0.1471136 40
cats vs. dogs
# One-column data frame alternating "dogs" and "cats" over 10 rows.
letvec <- data.frame(letvec = rep(c("dogs", "cats"), times = 5))
print(letvec)
## letvec
## 1 dogs
## 2 cats
## 3 dogs
## 4 cats
## 5 dogs
## 6 cats
## 7 dogs
## 8 cats
## 9 dogs
## 10 cats
# Place the numeric columns and the label column side by side, row for row.
both <- cbind(dat, letvec)
print(both)
## gaussian uniform sampled letvec
## 1 47.19762 0.8895393 52 dogs
## 2 48.84911 0.6928034 41 cats
## 3 57.79354 0.6405068 48 dogs
## 4 50.35254 0.9942698 50 cats
## 5 50.64644 0.6557058 44 dogs
## 6 58.57532 0.7085305 53 cats
## 7 52.30458 0.5440660 57 dogs
## 8 43.67469 0.5941420 48 cats
## 9 46.56574 0.2891597 52 dogs
## 10 47.77169 0.1471136 40 cats
Rename a single column from letvec to something else
Rename letvec in the both dataframe to “SomethingElse”. Here are two ways:
# rename() takes new_name = old_name pairs.
new <- rename(both, SomethingElse = letvec) # the new name comes first
print(new)
## gaussian uniform sampled SomethingElse
## 1 47.19762 0.8895393 52 dogs
## 2 48.84911 0.6928034 41 cats
## 3 57.79354 0.6405068 48 dogs
## 4 50.35254 0.9942698 50 cats
## 5 50.64644 0.6557058 44 dogs
## 6 58.57532 0.7085305 53 cats
## 7 52.30458 0.5440660 57 dogs
## 8 43.67469 0.5941420 48 cats
## 9 46.56574 0.2891597 52 dogs
## 10 47.77169 0.1471136 40 cats
# Base-R alternative: assign into names() at a column position.
# NOTE(review): index 1 is the `gaussian` column, so this relabels the numeric
# column as "animals" instead of renaming `letvec` (column 4) as the heading
# describes -- confirm whether `names(both)[4] <- "SomethingElse"` was intended.
names(both)[1] <- "animals" # the index picks which column name is replaced
both
animals | uniform | sampled | letvec |
---|---|---|---|
47.20 | 0.89 | 52 | dogs |
48.85 | 0.69 | 41 | cats |
57.79 | 0.64 | 48 | dogs |
50.35 | 0.99 | 50 | cats |
50.65 | 0.66 | 44 | dogs |
58.58 | 0.71 | 53 | cats |
52.30 | 0.54 | 57 | dogs |
43.67 | 0.59 | 48 | cats |
46.57 | 0.29 | 52 | dogs |
47.77 | 0.15 | 40 | cats |
Here’s a thorny challenge – unlist a string that appears within other variable types in a dataframe
# Toy data frame: `a` is a factor of document labels, `b` holds text to split.
dat2 <- data.frame(
  a = as.factor(c("Philadelphia", "Pittsburgh")),
  b = c("Always Sunny In", "Often Cloudy In")
)
print(dat2)
## a b
## 1 Philadelphia Always Sunny In
## 2 Pittsburgh Often Cloudy In
# reshape retaining variable name for split strings
# strsplit() only accepts character input, so coerce explicitly in case `b`
# was stored as a factor; fixed = TRUE treats the space as a literal separator.
CarveString <- strsplit(as.character(dat2$b), split = " ", fixed = TRUE)
GetLengthString <- lengths(CarveString) ## expansion size
FlattenIt <- unlist(CarveString, use.names = FALSE)
# rep() repeats each label once per token of its row and keeps `a` a factor.
NewDat2 <- data.frame(
  document = rep(dat2$a, times = GetLengthString),
  token = FlattenIt
)
print(NewDat2)
## document token
## 1 Philadelphia Always
## 2 Philadelphia Sunny
## 3 Philadelphia In
## 4 Pittsburgh Often
## 5 Pittsburgh Cloudy
## 6 Pittsburgh In
# getanID() is not defined in the visible code; presumably it is
# splitstackshape::getanID -- TODO confirm which package supplies it.
# NOTE(review): with id.vars = NULL the printed result below has the same four
# columns as `new` (no id column appears) and prints with "1:"-style row labels.
new1 <- getanID(new, id.vars = NULL)
print(new1)
## gaussian uniform sampled SomethingElse
## 1: 47.19762 0.8895393 52 dogs
## 2: 48.84911 0.6928034 41 cats
## 3: 57.79354 0.6405068 48 dogs
## 4: 50.35254 0.9942698 50 cats
## 5: 50.64644 0.6557058 44 dogs
## 6: 58.57532 0.7085305 53 cats
## 7: 52.30458 0.5440660 57 dogs
## 8: 43.67469 0.5941420 48 cats
## 9: 46.56574 0.2891597 52 dogs
## 10: 47.77169 0.1471136 40 cats
# Add a 1..n row identifier, numbering along the gaussian column.
new2 <- mutate(new, id = seq_along(gaussian))
print(new2)
## gaussian uniform sampled SomethingElse id
## 1 47.19762 0.8895393 52 dogs 1
## 2 48.84911 0.6928034 41 cats 2
## 3 57.79354 0.6405068 48 dogs 3
## 4 50.35254 0.9942698 50 cats 4
## 5 50.64644 0.6557058 44 dogs 5
## 6 58.57532 0.7085305 53 cats 6
## 7 52.30458 0.5440660 57 dogs 7
## 8 43.67469 0.5941420 48 cats 8
## 9 46.56574 0.2891597 52 dogs 9
## 10 47.77169 0.1471136 40 cats 10
pivot_longer (replaces melt): reshape the dataframe new from wide to long form
# Keep the three numeric columns plus id, and store id as a factor.
new3 <- new2 %>%
  select(gaussian, uniform, sampled, id) %>%
  mutate(id = as.factor(id))
# Stack the three measurement columns into name/value pairs per id.
p <- new3 %>%
  pivot_longer(
    cols = c(gaussian, uniform, sampled),
    names_to = "my-var",
    values_to = "my-val"
  )
print(p)
## # A tibble: 30 x 3
## id `my-var` `my-val`
## <fct> <chr> <dbl>
## 1 1 gaussian 47.2
## 2 1 uniform 0.890
## 3 1 sampled 52
## 4 2 gaussian 48.8
## 5 2 uniform 0.693
## 6 2 sampled 41
## 7 3 gaussian 57.8
## 8 3 uniform 0.641
## 9 3 sampled 48
## 10 4 gaussian 50.4
## # … with 20 more rows
Filter observations (rows) by a single level of a grouping variable. Generate a new dataframe isolating only rows corresponding to dogs (i.e., eliminate cats) using the filter command in dplyr.
# Keep only the rows whose label is not "cats".
short <- filter(new, SomethingElse != "cats")
short
generate a new dataframe from new that only retains the first two columns of ‘new’.
# Positional selection: columns 1 and 2 of `new`.
smaller <- select(new, 1:2)
smaller
Base R
# Two single-column data frames that share no column names.
d1 <- data.frame(Xvar = c(1, 2, 3))
d2 <- data.frame(Yvar = c(7, 8, 9))
# merge() pairs every row of d1 with every row of d2 here (see the 9-row
# output below), which is different output than cbind.
d3 <- merge(x = d1, y = d2)
# cbind() on data frames already yields a data frame, matching rows by position.
d4 <- cbind(d1, d2)
print(d3)
## Xvar Yvar
## 1 1 7
## 2 2 7
## 3 3 7
## 4 1 8
## 5 2 8
## 6 3 8
## 7 1 9
## 8 2 9
## 9 3 9
print(d4)
## Xvar Yvar
## 1 1 7
## 2 2 8
## 3 3 9
Generate and populate a 3x3 matrix (set seed for reproducibility). Name the rows and columns.
# Seed first so the nine normal draws are reproducible.
set.seed(1234)
# Fill a 3x3 matrix column by column and label rows/columns at construction.
mat <- matrix(
  rnorm(9),
  nrow = 3,
  dimnames = list(c("a", "b", "c"), c("d", "e", "f"))
)
print(mat)
## d e f
## a -1.2070657 -2.3456977 -0.5747400
## b 0.2774292 0.4291247 -0.5466319
## c 1.0844412 0.5060559 -0.5644520