This script details procedures for merging, rescaling, and exporting
the lookup database (lookup_db) used by ConversationAlign. We merge
several databases into a single master database spanning a set of
affective variables (from affectvec) along with various other
psycholinguistic dimensions. We rescale each variable using min/max
normalization, which linearly maps all of the original observations
onto a specified bounded range (here, 0 to 10).
https://stats.stackexchange.com/questions/281162/scale-a-number-between-a-range
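As a minimal sketch of that transform (rescale_manual is a hypothetical
helper; the script itself uses rescale() from the scales package):
# min/max normalization: x' = (x - min) / (max - min) * (hi - lo) + lo
rescale_manual <- function(x, to = c(0, 10)) {
    rng <- range(x, na.rm = TRUE)
    (x - rng[1]) / (rng[2] - rng[1]) * (to[2] - to[1]) + to[1]
}
# equivalent to scales::rescale(x, to = c(0, 10))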
For details on where each of the variables below was drawn from, see
the source notes accompanying each merge step below.
load(here("original_dbases", "lookup_db.rda"))
str(lookup_db)
## 'data.frame': 102682 obs. of 31 variables:
## $ word : chr "a" "aa" "aaa" "aaaa" ...
## $ aff_anger : num 2.87 2.54 2.07 1.94 2.55 ...
## $ aff_anxiety : num 2.27 1.69 2.27 1.8 2.29 ...
## $ aff_boredom : num 3 2.08 2.82 2.37 2.46 ...
## $ aff_closeness : num 2.77 1.95 1.29 1.62 1.76 ...
## $ aff_confusion : num 1.448 0.744 1.287 1.505 2.144 ...
## $ aff_dominance : num NA NA NA NA NA ...
## $ aff_doubt : num 2.59 1.05 1.34 1.49 1.49 ...
## $ aff_empathy : num 2.43 2.22 1.48 1.84 1.83 ...
## $ aff_encouragement : num 2.67 2.28 2.3 2.54 2.39 ...
## $ aff_excitement : num 2.83 3.19 2.63 3.1 2.78 ...
## $ aff_guilt : num 3.06 2.79 2.65 2.47 2.47 ...
## $ aff_happiness : num 3.21 3.62 2.99 3.22 3.39 ...
## $ aff_hope : num 3.3 3.31 2.96 3.36 2.64 ...
## $ aff_hostility : num 2.88 2.52 1.78 1.48 1.99 ...
## $ aff_politeness : num 2.33 2.63 2.84 2.29 3.05 ...
## $ aff_sadness : num 2.63 2.22 1.84 1.92 2.17 ...
## $ aff_stress : num 1.35 1.47 1.3 1.2 1.96 ...
## $ aff_surprise : num 2.85 2.54 2.7 3.31 3.22 ...
## $ aff_trust : num 3.06 3.55 3.85 3.59 3.07 ...
## $ aff_valence : num NA NA NA NA NA NA NA 4.79 NA NA ...
## $ lex_age_acquisition : num 0.561 NA NA NA NA ...
## $ lex_letter_count_raw : int 1 2 3 4 5 10 9 8 7 6 ...
## $ lex_morphemecount_raw: int 1 NA NA NA NA NA NA NA NA NA ...
## $ lex_prevalence : num 8.54 NA NA NA NA ...
## $ lex_senses_polysemy : num 0.933 NA NA NA NA ...
## $ lex_wordfreqlg10_raw : num 6.02 1.94 1.42 NA NA ...
## $ sem_arousal : num NA NA NA NA NA ...
## $ sem_concreteness : num 1.06 NA NA NA NA ...
## $ sem_diversity : num NA NA NA NA NA NA NA NA NA NA ...
## $ sem_neighbors : num NA NA NA NA NA NA NA NA NA NA ...
colnames(lookup_db)
## [1] "word" "aff_anger" "aff_anxiety"
## [4] "aff_boredom" "aff_closeness" "aff_confusion"
## [7] "aff_dominance" "aff_doubt" "aff_empathy"
## [10] "aff_encouragement" "aff_excitement" "aff_guilt"
## [13] "aff_happiness" "aff_hope" "aff_hostility"
## [16] "aff_politeness" "aff_sadness" "aff_stress"
## [19] "aff_surprise" "aff_trust" "aff_valence"
## [22] "lex_age_acquisition" "lex_letter_count_raw" "lex_morphemecount_raw"
## [25] "lex_prevalence" "lex_senses_polysemy" "lex_wordfreqlg10_raw"
## [28] "sem_arousal" "sem_concreteness" "sem_diversity"
## [31] "sem_neighbors"
# write.csv(lookup_db, 'lookup_db.csv')
Count complete (non-NA) cases by variable for the appendix/table.
# count non-missing observations in a vector
howmany <- function(x) {
    sum(!is.na(x))
}
# CountCompCases(lookup_db)
longlook <- lookup_db %>%
pivot_longer(2:31, names_to = "dimension", values_to = "salience")
tapply(longlook$salience, longlook$dimension, howmany)
## aff_anger aff_anxiety aff_boredom
## 76427 76427 76427
## aff_closeness aff_confusion aff_dominance
## 76427 76427 19971
## aff_doubt aff_empathy aff_encouragement
## 76427 76427 76427
## aff_excitement aff_guilt aff_happiness
## 76427 76427 76427
## aff_hope aff_hostility aff_politeness
## 76427 76427 76427
## aff_sadness aff_stress aff_surprise
## 76427 76427 76427
## aff_trust aff_valence lex_age_acquisition
## 76427 19971 31104
## lex_letter_count_raw lex_morphemecount_raw lex_prevalence
## 95462 51531 46237
## lex_senses_polysemy lex_wordfreqlg10_raw sem_arousal
## 36408 60384 19971
## sem_concreteness sem_diversity sem_neighbors
## 39576 29613 45871
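The same counts can be obtained without pivoting; a hedged dplyr
alternative (assumes dplyr 1.0+ is attached):
lookup_db %>%
    summarise(across(-word, ~ sum(!is.na(.x))))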
tapply(longlook$salience, longlook$dimension, summary)
## $aff_anger
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.409 2.729 2.754 3.062 10.000 26255
##
## $aff_anxiety
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.328 2.649 2.683 2.993 10.000 26255
##
## $aff_boredom
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.226 2.595 2.654 3.012 10.000 26255
##
## $aff_closeness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.767 2.113 2.150 2.490 10.000 26255
##
## $aff_confusion
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.733 2.086 2.139 2.480 10.000 26255
##
## $aff_dominance
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 3.49 4.62 4.81 6.08 10.00 82711
##
## $aff_doubt
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.623 1.951 1.977 2.298 10.000 26255
##
## $aff_empathy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.865 2.203 2.261 2.587 10.000 26255
##
## $aff_encouragement
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.953 2.314 2.344 2.684 10.000 26255
##
## $aff_excitement
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.738 3.069 3.071 3.401 10.000 26255
##
## $aff_guilt
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.345 2.683 2.731 3.059 10.000 26255
##
## $aff_happiness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.483 2.874 2.881 3.260 10.000 26255
##
## $aff_hope
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.230 2.622 2.630 3.016 10.000 26255
##
## $aff_hostility
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.238 2.569 2.598 2.913 10.000 26255
##
## $aff_politeness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.203 2.516 2.538 2.846 10.000 26255
##
## $aff_sadness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.197 2.518 2.566 2.867 10.000 26255
##
## $aff_stress
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.790 2.139 2.161 2.506 10.000 26255
##
## $aff_surprise
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.186 2.561 2.589 2.964 10.000 26255
##
## $aff_trust
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.292 2.648 2.688 3.035 10.000 26255
##
## $aff_valence
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 3.47 5.10 5.00 6.53 10.00 82711
##
## $lex_age_acquisition
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 3.14 4.09 4.02 4.96 10.00 71578
##
## $lex_letter_count_raw
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 6.000 7.000 7.642 9.000 34.000 7220
##
## $lex_morphemecount_raw
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 2.00 2.00 2.16 3.00 6.00 51151
##
## $lex_prevalence
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 6.54 7.94 7.63 9.10 10.00 56445
##
## $lex_senses_polysemy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.13 0.27 0.45 0.53 10.00 66274
##
## $lex_wordfreqlg10_raw
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.48 0.78 1.23 1.39 1.83 6.33 42298
##
## $sem_arousal
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 3.49 4.60 4.81 6.00 10.00 82711
##
## $sem_concreteness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 2.73 4.60 5.02 7.17 10.00 63106
##
## $sem_diversity
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 5.14 6.24 6.10 7.22 10.00 73069
##
## $sem_neighbors
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 0.81 0.00 10.00 56811
# isolate all vars on a 0:10 scale first
justto10 <- lookup_db %>%
select(1:22)
long10 <- justto10 %>%
pivot_longer(2:22, names_to = "dimension", values_to = "salience")
ggplot(long10, aes(x = salience)) +
    geom_histogram(binwidth = 0.1, fill = "green", color = "black") +
    ylab("Count") + jamie.theme + facet_wrap(~dimension, ncol = 1)
print(longlook)
## # A tibble: 3,080,460 × 3
## word dimension salience
## <chr> <chr> <dbl>
## 1 a aff_anger 2.87
## 2 a aff_anxiety 2.27
## 3 a aff_boredom 3.00
## 4 a aff_closeness 2.77
## 5 a aff_confusion 1.45
## 6 a aff_dominance NA
## 7 a aff_doubt 2.59
## 8 a aff_empathy 2.43
## 9 a aff_encouragement 2.67
## 10 a aff_excitement 2.83
## # ℹ 3,080,450 more rows
The original affectvec scale is -1 to 1. Convert all affectvec values to a 0-to-10 scale using the rescale() function from the scales package.
base <- read.csv(here("original_dbases", "db_affectvec.csv"))
affectsmall <- base %>%
select(word, surprise, trust, anger, sadness, anxiety, boredom, confusion, excitement,
encouragement, stress, empathy, hostility, closeness, doubt, happiness, guilt,
hope, politeness)
word <- tolower(affectsmall$word)
affectsmall <- affectsmall %>%
select(!word)
# prepend the aff_ prefix to all variable names
affectsmall <- affectsmall %>%
rename_with(.fn = function(.x) {
paste0("aff_", .x)
})
# str(affectsmall)  # inspect renamed columns
affectsmall[] <- lapply(affectsmall, function(x) rescale(x, to = c(0, 10)))
lookup_db <- cbind(word, affectsmall)
str(lookup_db)
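As a quick sanity check (a hedged sketch), rescale() pins a vector's
observed minimum and maximum to the new endpoints; note it uses each
column's observed range, not the theoretical -1 to 1 bounds:
scales::rescale(c(-1, 0, 1), to = c(0, 10))
## [1]  0  5 10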
Add the Kuperman age-of-acquisition norms to the lookup database; join on word and rescale to 0 to 10.
kup <- read.csv(here("original_dbases", "db_kuperman_aoa.csv"))
kup_aoa <- kup %>%
select(word, kup_aoa) %>%
na.omit #retain complete cases
word <- kup_aoa$word
kup_aoa <- kup_aoa %>%
select(!word)
dat <- kup_aoa %>%
mutate(lex_age_acquisition = rescale(kup_aoa, to = c(0, 10))) %>%
select(!kup_aoa)
dat <- cbind(word, dat)
lookup_db <- lookup_db %>%
full_join(dat, by = "word")
lookup_db <- lookup_db %>%
distinct(word, .keep_all = TRUE)
str(lookup_db)
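A quick check (hedged sketch) that the full_join plus distinct() left a
clean join key:
sum(duplicated(lookup_db$word)) # should be 0 after distinct()
anyNA(lookup_db$word) # full_join can carry over NA keys worth inspecting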
NRC VAD Lexicon. Add valence, arousal, and dominance ratings and rescale to 0 to 10.
https://saifmohammad.com/WebPages/nrc-vad.html
nrc <- read.csv(here("original_dbases", "db_nrc_vad.csv"))
nrc[2:ncol(nrc)] <- lapply(nrc[2:ncol(nrc)], function(x) rescale(x, to = c(0, 10)))
word <- tolower(nrc$word) #make sure all entries lowercase
nrc <- nrc %>%
    rename(aff_valence = "valence", aff_dominance = "dominance",
        sem_arousal = "arousal")
nrc <- nrc %>%
select(!word)
nrc <- cbind(word, nrc)
# no need to select specific columns or omit NAs in this one
lookup_db <- lookup_db %>%
full_join(nrc, by = "word")
lookup_db <- lookup_db %>%
distinct(word, .keep_all = TRUE) #keeps first entry
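Optionally verify that the NRC variables landed on the intended
0-to-10 scale (a hedged sketch; rescale() maps the observed min and
max onto the endpoints):
range(lookup_db$aff_valence, na.rm = TRUE) # expect 0 and 10
range(lookup_db$sem_arousal, na.rm = TRUE)
range(lookup_db$aff_dominance, na.rm = TRUE)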
Rescale concreteness ratings from the Brysbaert norms.
cnc_raw <- read.csv(here("original_dbases", "db_brysbaert_cnc.csv")) #read in raw data
cnc_small <- cnc_raw %>%
select(word, brys_concreteness) # select word and concreteness rating
cnc_rescale <- cnc_small %>%
mutate(sem_concreteness = rescale(brys_concreteness, to = c(0, 10))) %>%
select(!brys_concreteness)
cnc_rescale <- cnc_rescale %>%
distinct(word, .keep_all = TRUE) #retain only first entry
word <- tolower(cnc_rescale$word) #make sure all entries lowercase in separate vector
cnc_rescale <- cnc_rescale %>%
select(!word) # drop the original word column
dat2 <- cbind(word, cnc_rescale) #bind word column back in
str(dat2)
lookup_db <- lookup_db %>%
full_join(dat2, by = "word")
Letter count (raw) — UNSCALED
# use str_length to generate letter count per word as new var called n_letters
lookup_db <- lookup_db %>%
mutate(lex_letter_count_raw = stringr::str_length(word))
SUBTLEX-US — UNSCALED log10 word frequency
subtlex <- read.csv(here("original_dbases", "db_subtlex_clean.csv"))
dat <- subtlex %>%
select(word, lg10wf)
dat <- dat %>%
distinct(word, .keep_all = TRUE) #retain only first entry
word <- tolower(dat$word) #make sure all entries lowercase in separate vector
dat <- dat %>%
rename(lex_wordfreqlg10_raw = "lg10wf")
dat <- dat %>%
select(!word)
dat <- cbind(word, dat) #bind word column back in
lookup_db <- lookup_db %>%
full_join(dat, by = "word")
lookup_db <- lookup_db %>%
select(order(colnames(lookup_db)))
lookup_db <- lookup_db %>%
distinct(word, .keep_all = TRUE)
Merge the following values from the South Carolina Psycholinguistic Metabase (SCOPE). Morpheme counts in SCOPE are drawn from the MorphoLex database.
# database is already converted to lowercase all
scope <- read.csv(here("original_dbases", "db_scope_aug23rev.csv"))
dat <- scope %>%
rename(lex_morphemecount_raw = "morpheme_count")
dat <- dat %>%
distinct(word, .keep_all = TRUE)
Semantic neighbors from Shaoul and Westbury (https://www.psych.ualberta.ca/~westburylab/downloads/westburylab.arcs.ncounts.html): the number of semantic neighbors based on distance in co-occurrence space.
dat <- dat %>%
mutate(sem_neighbors = rescale(semantic_neighbors, to = c(0, 10))) %>%
select(!semantic_neighbors)
Number of senses (polysemy) from the WordNet database: https://wordnet.princeton.edu/
dat <- dat %>%
mutate(lex_senses_polysemy = rescale(nsenses, to = c(0, 10))) %>%
select(!nsenses)
Rescaled semantic diversity from Hoffman et al.
(https://link.springer.com/article/10.3758/s13428-012-0278-x#SecESM1):
the degree to which the different contexts associated with a word vary
in their meaning.
dat <- dat %>%
mutate(sem_diversity = rescale(semantic_diversity, to = c(0, 10))) %>%
select(!semantic_diversity)
Word prevalence from Brysbaert et al. (a measure of how many people know a word).
dat <- dat %>%
mutate(lex_prevalence = rescale(prevalence, to = c(0, 10))) %>%
select(!prevalence)
Sort columns alphabetically, save as .rda, and export with the system date appended to the filename.
# check for duplicates, then reorder so word is first
lookup_db <- lookup_db %>%
full_join(dat, by = "word")
lookup_db <- lookup_db %>%
select(order(colnames(lookup_db)))
lookup_db <- lookup_db %>%
distinct(word, .keep_all = TRUE)
colnames(lookup_db)
lookup_db <- lookup_db[, c(31, 1:30)] #reorder so word is first
currentDate <- Sys.Date()
save(lookup_db, file = paste0("lookup_db_", currentDate, ".rda"))
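To confirm the export, a hedged usage sketch (outfile is a hypothetical
name for the dated file):
outfile <- paste0("lookup_db_", currentDate, ".rda") # mirrors the save() call above
file.exists(outfile)
# load(outfile) # reload to verify the contents round-trip
# str(lookup_db)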