This script details procedures for merging, rescaling, and exporting
the lookup database (lookup_db) used by ConversationAlign. We merge
several databases into a single master database spanning a set of
affective variables (from affectvec) along with various other
psycholinguistic dimensions. We rescale each variable using min/max
normalization, which linearly maps all of the original observations
onto a specified bounded range (here, 0 to 10).
https://stats.stackexchange.com/questions/281162/scale-a-number-between-a-range
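As a minimal sketch of that transform (rescale_manual is a hypothetical
helper; the script itself uses rescale() from the scales package):
# min/max normalization: x' = (x - min) / (max - min) * (hi - lo) + lo
rescale_manual <- function(x, to = c(0, 10)) {
    rng <- range(x, na.rm = TRUE)
    (x - rng[1]) / (rng[2] - rng[1]) * (to[2] - to[1]) + to[1]
}
# equivalent to scales::rescale(x, to = c(0, 10))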
For details on where each of the variables below was drawn from, see
the source notes accompanying each merge step below.
load(here("original_dbases", "lookup_db.rda"))
str(lookup_db)
## 'data.frame': 102682 obs. of 31 variables:
## $ word : chr "a" "aa" "aaa" "aaaa" ...
## $ aff_anger : num 2.87 2.54 2.07 1.94 2.55 ...
## $ aff_anxiety : num 2.27 1.69 2.27 1.8 2.29 ...
## $ aff_boredom : num 3 2.08 2.82 2.37 2.46 ...
## $ aff_closeness : num 2.77 1.95 1.29 1.62 1.76 ...
## $ aff_confusion : num 1.448 0.744 1.287 1.505 2.144 ...
## $ aff_dominance : num NA NA NA NA NA ...
## $ aff_doubt : num 2.59 1.05 1.34 1.49 1.49 ...
## $ aff_empathy : num 2.43 2.22 1.48 1.84 1.83 ...
## $ aff_encouragement : num 2.67 2.28 2.3 2.54 2.39 ...
## $ aff_excitement : num 2.83 3.19 2.63 3.1 2.78 ...
## $ aff_guilt : num 3.06 2.79 2.65 2.47 2.47 ...
## $ aff_happiness : num 3.21 3.62 2.99 3.22 3.39 ...
## $ aff_hope : num 3.3 3.31 2.96 3.36 2.64 ...
## $ aff_hostility : num 2.88 2.52 1.78 1.48 1.99 ...
## $ aff_politeness : num 2.33 2.63 2.84 2.29 3.05 ...
## $ aff_sadness : num 2.63 2.22 1.84 1.92 2.17 ...
## $ aff_stress : num 1.35 1.47 1.3 1.2 1.96 ...
## $ aff_surprise : num 2.85 2.54 2.7 3.31 3.22 ...
## $ aff_trust : num 3.06 3.55 3.85 3.59 3.07 ...
## $ aff_valence : num NA NA NA NA NA NA NA 4.79 NA NA ...
## $ lex_age_acquisition : num 0.561 NA NA NA NA ...
## $ lex_letter_count_raw : int 1 2 3 4 5 10 9 8 7 6 ...
## $ lex_morphemecount_raw: int 1 NA NA NA NA NA NA NA NA NA ...
## $ lex_prevalence : num 8.54 NA NA NA NA ...
## $ lex_senses_polysemy : num 0.933 NA NA NA NA ...
## $ lex_wordfreqlg10_raw : num 6.02 1.94 1.42 NA NA ...
## $ sem_arousal : num NA NA NA NA NA ...
## $ sem_concreteness : num 1.06 NA NA NA NA ...
## $ sem_diversity : num NA NA NA NA NA NA NA NA NA NA ...
## $ sem_neighbors : num NA NA NA NA NA NA NA NA NA NA ...
colnames(lookup_db)
## [1] "word" "aff_anger" "aff_anxiety"
## [4] "aff_boredom" "aff_closeness" "aff_confusion"
## [7] "aff_dominance" "aff_doubt" "aff_empathy"
## [10] "aff_encouragement" "aff_excitement" "aff_guilt"
## [13] "aff_happiness" "aff_hope" "aff_hostility"
## [16] "aff_politeness" "aff_sadness" "aff_stress"
## [19] "aff_surprise" "aff_trust" "aff_valence"
## [22] "lex_age_acquisition" "lex_letter_count_raw" "lex_morphemecount_raw"
## [25] "lex_prevalence" "lex_senses_polysemy" "lex_wordfreqlg10_raw"
## [28] "sem_arousal" "sem_concreteness" "sem_diversity"
## [31] "sem_neighbors"
# write.csv(lookup_db, 'lookup_db.csv')
Count complete (non-NA) cases by variable for the appendix/table.
# count non-missing observations in a vector
howmany <- function(x) {
    sum(!is.na(x))
}
# CountCompCases(lookup_db)
longlook <- lookup_db %>%
pivot_longer(2:31, names_to = "dimension", values_to = "salience")
tapply(longlook$salience, longlook$dimension, howmany)
## aff_anger aff_anxiety aff_boredom
## 76427 76427 76427
## aff_closeness aff_confusion aff_dominance
## 76427 76427 19971
## aff_doubt aff_empathy aff_encouragement
## 76427 76427 76427
## aff_excitement aff_guilt aff_happiness
## 76427 76427 76427
## aff_hope aff_hostility aff_politeness
## 76427 76427 76427
## aff_sadness aff_stress aff_surprise
## 76427 76427 76427
## aff_trust aff_valence lex_age_acquisition
## 76427 19971 31104
## lex_letter_count_raw lex_morphemecount_raw lex_prevalence
## 95462 51531 46237
## lex_senses_polysemy lex_wordfreqlg10_raw sem_arousal
## 36408 60384 19971
## sem_concreteness sem_diversity sem_neighbors
## 39576 29613 45871
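The same counts can be obtained without pivoting; a hedged dplyr
alternative (assumes dplyr 1.0+ is attached):
lookup_db %>%
    summarise(across(-word, ~ sum(!is.na(.x))))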
tapply(longlook$salience, longlook$dimension, summary)
## $aff_anger
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.409 2.729 2.754 3.062 10.000 26255
##
## $aff_anxiety
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.328 2.649 2.683 2.993 10.000 26255
##
## $aff_boredom
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.226 2.595 2.654 3.012 10.000 26255
##
## $aff_closeness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.767 2.113 2.150 2.490 10.000 26255
##
## $aff_confusion
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.733 2.086 2.139 2.480 10.000 26255
##
## $aff_dominance
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 3.49 4.62 4.81 6.08 10.00 82711
##
## $aff_doubt
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.623 1.951 1.977 2.298 10.000 26255
##
## $aff_empathy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.865 2.203 2.261 2.587 10.000 26255
##
## $aff_encouragement
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.953 2.314 2.344 2.684 10.000 26255
##
## $aff_excitement
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.738 3.069 3.071 3.401 10.000 26255
##
## $aff_guilt
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.345 2.683 2.731 3.059 10.000 26255
##
## $aff_happiness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.483 2.874 2.881 3.260 10.000 26255
##
## $aff_hope
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.230 2.622 2.630 3.016 10.000 26255
##
## $aff_hostility
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.238 2.569 2.598 2.913 10.000 26255
##
## $aff_politeness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.203 2.516 2.538 2.846 10.000 26255
##
## $aff_sadness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.197 2.518 2.566 2.867 10.000 26255
##
## $aff_stress
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.790 2.139 2.161 2.506 10.000 26255
##
## $aff_surprise
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.186 2.561 2.589 2.964 10.000 26255
##
## $aff_trust
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.292 2.648 2.688 3.035 10.000 26255
##
## $aff_valence
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 3.47 5.10 5.00 6.53 10.00 82711
##
## $lex_age_acquisition
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 3.14 4.09 4.02 4.96 10.00 71578
##
## $lex_letter_count_raw
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 6.000 7.000 7.642 9.000 34.000 7220
##
## $lex_morphemecount_raw
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 2.00 2.00 2.16 3.00 6.00 51151
##
## $lex_prevalence
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 6.54 7.94 7.63 9.10 10.00 56445
##
## $lex_senses_polysemy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.13 0.27 0.45 0.53 10.00 66274
##
## $lex_wordfreqlg10_raw
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.48 0.78 1.23 1.39 1.83 6.33 42298
##
## $sem_arousal
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 3.49 4.60 4.81 6.00 10.00 82711
##
## $sem_concreteness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 2.73 4.60 5.02 7.17 10.00 63106
##
## $sem_diversity
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 5.14 6.24 6.10 7.22 10.00 73069
##
## $sem_neighbors
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 0.81 0.00 10.00 56811
# isolate all vars on a 0:10 scale first
justto10 <- lookup_db %>%
select(1:22)
long10 <- justto10 %>%
pivot_longer(2:22, names_to = "dimension", values_to = "salience")
ggplot(long10, aes(x = salience)) +
    geom_histogram(binwidth = 0.1, fill = "green", color = "black") +
    ylab("Count") + jamie.theme + facet_wrap(~dimension, ncol = 1)
print(longlook)
## # A tibble: 3,080,460 × 3
## word dimension salience
## <chr> <chr> <dbl>
## 1 a aff_anger 2.87
## 2 a aff_anxiety 2.27
## 3 a aff_boredom 3.00
## 4 a aff_closeness 2.77
## 5 a aff_confusion 1.45
## 6 a aff_dominance NA
## 7 a aff_doubt 2.59
## 8 a aff_empathy 2.43
## 9 a aff_encouragement 2.67
## 10 a aff_excitement 2.83
## # ℹ 3,080,450 more rows
The original affectvec scale is -1 to 1. Convert all affectvec values to a 0-to-10 scale using the rescale() function from the scales package.
base <- read.csv(here("original_dbases", "db_affectvec.csv"))
affectsmall <- base %>%
select(word, surprise, trust, anger, sadness, anxiety, boredom, confusion, excitement,
encouragement, stress, empathy, hostility, closeness, doubt, happiness, guilt,
hope, politeness)
word <- tolower(affectsmall$word)
affectsmall <- affectsmall %>%
select(!word)
# prepend the aff_ prefix to all variable names
affectsmall <- affectsmall %>%
rename_with(.fn = function(.x) {
paste0("aff_", .x)
})
# str(affectsmall)  # inspect renamed columns
affectsmall[] <- lapply(affectsmall, function(x) rescale(x, to = c(0, 10)))
lookup_db <- cbind(word, affectsmall)
str(lookup_db)
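As a quick sanity check (a hedged sketch), rescale() pins a vector's
observed minimum and maximum to the new endpoints; note it uses each
column's observed range, not the theoretical -1 to 1 bounds:
scales::rescale(c(-1, 0, 1), to = c(0, 10))
## [1]  0  5 10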
Add the Kuperman age-of-acquisition norms to the lookup database; join on word and rescale to 0 to 10.
kup <- read.csv(here("original_dbases", "db_kuperman_aoa.csv"))
kup_aoa <- kup %>%
select(word, kup_aoa) %>%
na.omit #retain complete cases
word <- kup_aoa$word
kup_aoa <- kup_aoa %>%
select(!word)
dat <- kup_aoa %>%
mutate(lex_age_acquisition = rescale(kup_aoa, to = c(0, 10))) %>%
select(!kup_aoa)
dat <- cbind(word, dat)
lookup_db <- lookup_db %>%
full_join(dat, by = "word")
lookup_db <- lookup_db %>%
distinct(word, .keep_all = TRUE)
str(lookup_db)
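A quick check (hedged sketch) that the full_join plus distinct() left a
clean join key:
sum(duplicated(lookup_db$word)) # should be 0 after distinct()
anyNA(lookup_db$word) # full_join can carry over NA keys worth inspecting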
NRC VAD Lexicon. Add valence, arousal, and dominance ratings and rescale to 0 to 10.
https://saifmohammad.com/WebPages/nrc-vad.html
nrc <- read.csv(here("original_dbases", "db_nrc_vad.csv"))
nrc[2:ncol(nrc)] <- lapply(nrc[2:ncol(nrc)], function(x) rescale(x, to = c(0, 10)))
word <- tolower(nrc$word) #make sure all entries lowercase
nrc <- nrc %>%
    rename(aff_valence = "valence", aff_dominance = "dominance",
        sem_arousal = "arousal")
nrc <- nrc %>%
select(!word)
nrc <- cbind(word, nrc)
# no need to select specific columns or omit NAs in this one
lookup_db <- lookup_db %>%
full_join(nrc, by = "word")
lookup_db <- lookup_db %>%
distinct(word, .keep_all = TRUE) #keeps first entry
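Optionally verify that the NRC variables landed on the intended
0-to-10 scale (a hedged sketch; rescale() maps the observed min and
max onto the endpoints):
range(lookup_db$aff_valence, na.rm = TRUE) # expect 0 and 10
range(lookup_db$sem_arousal, na.rm = TRUE)
range(lookup_db$aff_dominance, na.rm = TRUE)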
Rescale concreteness ratings from the Brysbaert norms.
cnc_raw <- read.csv(here("original_dbases", "db_brysbaert_cnc.csv")) #read in raw data
cnc_small <- cnc_raw %>%
select(word, brys_concreteness) # select word and concreteness rating
cnc_rescale <- cnc_small %>%
mutate(sem_concreteness = rescale(brys_concreteness, to = c(0, 10))) %>%
select(!brys_concreteness)
cnc_rescale <- cnc_rescale %>%
distinct(word, .keep_all = TRUE) #retain only first entry
word <- tolower(cnc_rescale$word) #make sure all entries lowercase in separate vector
cnc_rescale <- cnc_rescale %>%
select(!word) # drop the original word column
dat2 <- cbind(word, cnc_rescale) #bind word column back in
str(dat2)
lookup_db <- lookup_db %>%
full_join(dat2, by = "word")
Letter count (raw) — UNSCALED
# use str_length to generate letter count per word as new var called n_letters
lookup_db <- lookup_db %>%
mutate(lex_letter_count_raw = stringr::str_length(word))
SUBTLEX-US — UNSCALED log10 word frequency
subtlex <- read.csv(here("original_dbases", "db_subtlex_clean.csv"))
dat <- subtlex %>%
select(word, lg10wf)
dat <- dat %>%
distinct(word, .keep_all = TRUE) #retain only first entry
word <- tolower(dat$word) #make sure all entries lowercase in separate vector
dat <- dat %>%
rename(lex_wordfreqlg10_raw = "lg10wf")
dat <- dat %>%
select(!word)
dat <- cbind(word, dat) #bind word column back in
lookup_db <- lookup_db %>%
full_join(dat, by = "word")
lookup_db <- lookup_db %>%
select(order(colnames(lookup_db)))
lookup_db <- lookup_db %>%
distinct(word, .keep_all = TRUE)
Merge the following values from the South Carolina Psycholinguistic Metabase (SCOPE). Morpheme counts in SCOPE are drawn from the MorphoLex database.
# database is already converted to lowercase all
scope <- read.csv(here("original_dbases", "db_scope_aug23rev.csv"))
dat <- scope %>%
rename(lex_morphemecount_raw = "morpheme_count")
dat <- dat %>%
distinct(word, .keep_all = TRUE)
Semantic neighbors from Shaoul and Westbury (https://www.psych.ualberta.ca/~westburylab/downloads/westburylab.arcs.ncounts.html): the number of semantic neighbors based on distance in co-occurrence space.
dat <- dat %>%
mutate(sem_neighbors = rescale(semantic_neighbors, to = c(0, 10))) %>%
select(!semantic_neighbors)
Number of senses (polysemy) from the WordNet database: https://wordnet.princeton.edu/
dat <- dat %>%
mutate(lex_senses_polysemy = rescale(nsenses, to = c(0, 10))) %>%
select(!nsenses)
Rescaled semantic diversity from Hoffman et al.
(https://link.springer.com/article/10.3758/s13428-012-0278-x#SecESM1):
the degree to which the different contexts associated with a word vary
in their meaning.
dat <- dat %>%
mutate(sem_diversity = rescale(semantic_diversity, to = c(0, 10))) %>%
select(!semantic_diversity)
Word prevalence from Brysbaert et al. (a measure of how many people know a word).
dat <- dat %>%
mutate(lex_prevalence = rescale(prevalence, to = c(0, 10))) %>%
select(!prevalence)
Sort columns alphabetically, save as .rda, and export with the system date appended to the filename.
# check for duplicates, then reorder so word is first
lookup_db <- lookup_db %>%
full_join(dat, by = "word")
lookup_db <- lookup_db %>%
select(order(colnames(lookup_db)))
lookup_db <- lookup_db %>%
distinct(word, .keep_all = TRUE)
colnames(lookup_db)
lookup_db <- lookup_db[, c(31, 1:30)] #reorder so word is first
currentDate <- Sys.Date()
save(lookup_db, file = paste0("lookup_db_", currentDate, ".rda"))
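To confirm the export, a hedged usage sketch (outfile is a hypothetical
name for the dated file):
outfile <- paste0("lookup_db_", currentDate, ".rda") # mirrors the save() call above
file.exists(outfile)
# load(outfile) # reload to verify the contents round-trip
# str(lookup_db)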