# Character values must be wrapped in quotes; a bare word would be parsed as code.
a <- "hello world!"
print(a)
## [1] "hello world!"
Generate a count of the characters in a string or vector
nchar(a, type = "chars") # number of characters in the string, spaces included
## [1] 12
Strings are sequences of one or more characters; a character vector can hold several strings.
# One-letter planet abbreviations stored as a character vector.
planets <- c("m", "v", "e", "m", "j", "s", "u", "n", "p")
print(planets) # every abbreviation is its own element
## [1] "m" "v" "e" "m" "j" "s" "u" "n" "p"
is.character(planets) # logical test: TRUE when the vector holds character data
## [1] TRUE
paste() converts its arguments to character and concatenates them, separating the pieces with the sep argument.
It takes this form paste(…, sep = " ", collapse = NULL) – where the default separator is a space.
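Here’s an example combining a statement with printing the system date. The collapse argument takes a string, not TRUE/FALSE: when it is supplied, the elements of the result are joined into one single string separated by that value (e.g., collapse = "" joins them with no delimiter at all).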
paste("Hi! today is", Sys.Date(), sep = " ") # default separator; collapse stays NULL
## [1] "Hi! today is 2020-09-17"
‘cat’ concatenates and prints a character vector
# Join the integers 2..12 with a chosen delimiter and write the result to the console.
cat(paste(2:12, collapse = "+")) # '+' is the delimiter here
## 2+3+4+5+6+7+8+9+10+11+12
cat(paste(2:12, collapse = "/"))
## 2/3/4/5/6/7/8/9/10/11/12
cat(paste(2:12, collapse = "a"))
## 2a3a4a5a6a7a8a9a10a11a12
# Mixed-case sample sentence used to demonstrate case conversion.
dat <- "ThE Day wAS LONG and TeRriBle."
dat
## [1] "ThE Day wAS LONG and TeRriBle."
# Call the base converters directly: a one-step transformation needs no pipe,
# and this keeps the snippet runnable without magrittr attached.
dat.low <- tolower(dat) # convert to lowercase
dat.up <- toupper(dat) # convert to uppercase
print(dat.low)
## [1] "the day was long and terrible."
print(dat.up)
## [1] "THE DAY WAS LONG AND TERRIBLE."
Sometimes you need to split the elements of a string (e.g., tokens) into separate units.
The strsplit(string, delimiter) command is useful for this. This takes the form:
newdat <- strsplit(olddat, "sep")
length(dat.low) # 1: the whole sentence is still a single string
## [1] 1
# Assign first, then print explicitly, so the assignment is not hidden behind a pipe.
splitup <- strsplit(dat.low, " ") # break the sentence apart at each space
print(splitup)
## [[1]]
## [1] "the" "day" "was" "long" "and" "terrible."
length(splitup) # still 1: the result is a list holding one vector of tokens
## [1] 1
# A second sentence to tokenize on spaces.
new <- "hi I'm not feeling well"
print(new)
## [1] "hi I'm not feeling well"
snew <- strsplit(new, split = " ")
print(snew)
## [[1]]
## [1] "hi" "I'm" "not" "feeling" "well"
Grep
regexpr(pattern, string) finds the starting position and length of the first match (see the other markdown document for details)
regmatches(string, regexpr(pattern, string)) extracts the first match, e.g. [1] "tam" "tim"
sub(pattern, replacement, string) replaces only the first match; gsub() does a global replace
We are going to work a bit with a built in character vector called colors().
head(colors(), 25L) # first 25 entries of the built-in vector of color names
## [1] "white" "aliceblue" "antiquewhite" "antiquewhite1"
## [5] "antiquewhite2" "antiquewhite3" "antiquewhite4" "aquamarine"
## [9] "aquamarine1" "aquamarine2" "aquamarine3" "aquamarine4"
## [13] "azure" "azure1" "azure2" "azure3"
## [17] "azure4" "beige" "bisque" "bisque1"
## [21] "bisque2" "bisque3" "bisque4" "black"
## [25] "blanchedalmond"
length(colors()) # how many color names there are in total
## [1] 657
typeof(colors()) # colors() returns a character vector
## [1] "character"
By default, base R’s grep() returns the indices of the elements that match – the [1] at the start of the printed output is just the position of the first value shown on that line
The grep function format involves pattern match where pattern is a character string
grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE, fixed = FALSE, useBytes = FALSE, invert = FALSE)
# Small demo vector for grep(). TRUE/FALSE are spelled out because the
# shorthands T and F are ordinary variables that can be reassigned.
try <- c("a", "b", "a", "c", "z", "b", "a")
grep("a", try, value = FALSE) # returns the indices of matching elements
## [1] 1 3 7
grep("a", try, value = TRUE) # returns the matching elements themselves
## [1] "a" "a" "a"
grep("a", try, value = TRUE, invert = TRUE) # inverse -- returns elements WITHOUT an 'a' in them
## [1] "b" "c" "z" "b"
Let’s try this with a much longer character vector – colors()
grep("green", colors(), value = FALSE) # indices of color names containing "green"
## [1] 81 85 86 87 88 89 102 103 104 105 106 139 254 255 256 257 258 259 393
## [20] 417 429 448 472 474 514 515 516 517 518 574 575 576 577 578 610 611 612 613
## [39] 614 657
Let’s inspect a few indices – maybe 85 to 88
colors()[seq(85, 88)] # look up the names stored at positions 85 through 88
## [1] "darkolivegreen" "darkolivegreen1" "darkolivegreen2" "darkolivegreen3"
How many shades of green are in the R color palette?
length(grep("green", colors(), value = FALSE)) # number of color names containing "green"
## [1] 40
Now let’s look for color names that have the words “green” or “white” in them (value = TRUE returns the names rather than their indices)
grep("green|white", colors(), value = TRUE) # "|" is regex alternation: matches either word
## [1] "white" "antiquewhite" "antiquewhite1"
## [4] "antiquewhite2" "antiquewhite3" "antiquewhite4"
## [7] "darkgreen" "darkolivegreen" "darkolivegreen1"
## [10] "darkolivegreen2" "darkolivegreen3" "darkolivegreen4"
## [13] "darkseagreen" "darkseagreen1" "darkseagreen2"
## [16] "darkseagreen3" "darkseagreen4" "floralwhite"
## [19] "forestgreen" "ghostwhite" "green"
## [22] "green1" "green2" "green3"
## [25] "green4" "greenyellow" "lawngreen"
## [28] "lightgreen" "lightseagreen" "limegreen"
## [31] "mediumseagreen" "mediumspringgreen" "navajowhite"
## [34] "navajowhite1" "navajowhite2" "navajowhite3"
## [37] "navajowhite4" "palegreen" "palegreen1"
## [40] "palegreen2" "palegreen3" "palegreen4"
## [43] "seagreen" "seagreen1" "seagreen2"
## [46] "seagreen3" "seagreen4" "springgreen"
## [49] "springgreen1" "springgreen2" "springgreen3"
## [52] "springgreen4" "whitesmoke" "yellowgreen"
Let’s return values instead of indices
grep("green", colors(), value = TRUE) # value = TRUE returns the matching names instead of their indices
## [1] "darkgreen" "darkolivegreen" "darkolivegreen1"
## [4] "darkolivegreen2" "darkolivegreen3" "darkolivegreen4"
## [7] "darkseagreen" "darkseagreen1" "darkseagreen2"
## [10] "darkseagreen3" "darkseagreen4" "forestgreen"
## [13] "green" "green1" "green2"
## [16] "green3" "green4" "greenyellow"
## [19] "lawngreen" "lightgreen" "lightseagreen"
## [22] "limegreen" "mediumseagreen" "mediumspringgreen"
## [25] "palegreen" "palegreen1" "palegreen2"
## [28] "palegreen3" "palegreen4" "seagreen"
## [31] "seagreen1" "seagreen2" "seagreen3"
## [34] "seagreen4" "springgreen" "springgreen1"
## [37] "springgreen2" "springgreen3" "springgreen4"
## [40] "yellowgreen"
returns a boolean true/false at each index
grepl(pattern = "green", x = colors()) # one TRUE/FALSE per color name
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [85] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [517] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [577] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [613] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [649] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
setdiff() contrasts two vectors: it returns the elements of the first vector that are not in the second
# Two city lists; `you` is `me` plus one extra city.
me <- c("philadelphia", "new york", "boston")
you <- c(me, "new orleans")
setdiff(x = me, y = you) # elements of me that are missing from you
## character(0)
setdiff(x = you, y = me) # elements of you that are missing from me
## [1] "new orleans"
sub() and gsub() substitute one string for another
# General form: sub(pattern = "x", replacement = "m", x = <vector name>)
you <- c("you make me sick!", "you really do.")
print(you)
# sub() swaps only the first match within each element
me <- sub(pattern = "you", replacement = "I", x = you)
print(me)
# gsub() is the global version: it swaps every match
globe <- gsub(pattern = "you", replacement = "I", x = you)
print(globe)
The removePunctuation() function in the tm package strips out punctuation
# Sample sentence littered with stray punctuation marks.
punked <- "Sometimes, I just; use . all the wrong ! punctuation:;,?"
print(punked)
## [1] "Sometimes, I just; use . all the wrong ! punctuation:;,?"
# Strip punctuation, asking the function to keep intra-word contractions.
cleanpunk <- removePunctuation(punked, preserve_intra_word_contractions = TRUE)
print(cleanpunk)
## [1] "Sometimes I just use all the wrong punctuation"
Here’s the first paragraph of Frankenstein by Mary Shelley
You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking.
# Store the opening paragraph of Frankenstein in `blockhead` as a data frame
# with one row and one column. The column is not given a name, so it is
# labelled with the text of the c(...) expression itself (visible in the
# str() output that follows).
blockhead <- as.data.frame(c("You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."))
str(blockhead) # inspect the structure: 1 observation of 1 character variable
## 'data.frame': 1 obs. of 1 variable:
## $ c("You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."): chr "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have rega"| __truncated__
split blockhead into a vector of individual words. (hint use: str_split with " " as the separator)
# Pull the paragraph out of the data frame's single column before splitting.
# Passing the data frame itself only works through implicit coercion because
# it is 1x1; a multi-row frame would be deparsed into "c(...)" text first.
bolthead <- str_split(blockhead[[1]], " ")
str(bolthead)
## List of 1
## $ : chr [1:50] "You" "will" "rejoice" "to" ...
bolthead2 <- unlist(bolthead) # flatten the one-element list into a plain character vector
Convert bolthead to all upper case characters and print the first ten rows
# Upper-case every word, then store the words one per row in a data frame.
bolt.up <- bolthead2 %>%
  toupper() %>%
  as.data.frame()
head(bolt.up, n = 10) # head() shows only 6 rows by default; ask for ten explicitly
How many words and characters are in the first para of Frankenstein
# Count characters word by word and add them up. Calling nchar() on the data
# frame itself measures the deparsed column text, i.e. c("YOU", "WILL", ...),
# so its total also counts the quotes, commas and "c()" wrapper.
sum(nchar(bolt.up[[1]])) # letters and punctuation only; the spaces were dropped by the split
## [1] 241
dim(bolt.up)[1L] # one row per word, so the row count is the total number of words
## [1] 50