Changes to lookup_25

  1. Added scaled vars (changed all scales to 0-9 from prior 0-10)
  2. omitted: stress, politeness, empathy, prevalence, closeness, encouragement, hope, doubt, hostility, surprise from lookup_db
  3. Changed names: lex_AoA (age of acquisition),
  4. Added emo_intensity,
  5. Added ‘phon_’ for phonological factors (word length, nsyllables, etc), added sem_visual, sem_auditory reflecting auditory and visual salience from Lancaster Sensorimotor Norms

This script details procedures for merging, rescaling, and exporting the lookup database (lookup_25) used by ConversationAlign. We will merge several databases to create a single master database spanning a range of affective variables (from AffectVec) along with various other psycholinguistic dimensions. Each variable will appear in its raw form (unscaled from its original source) and a rescaled form on a 0 to 9 range using min/max normalization — EXCEPT for vars where rescaling does not make sense (e.g., syllable count, letter count)

https://stats.stackexchange.com/questions/281162/scale-a-number-between-a-range

For details on where each of the variables below was drawn from, visit:
https://reilly-lab.github.io/ConversationAlign_LookupDatabaseCreation.html

Load prepped (mostly raw) databases

These are lightly processed versions of their original sources with:
1. appended prefix with source to variable name
2. transformed all words to lowercase
3. homogenized var names so every database has ‘word’ column
4. removed duplicate strings using dplyr::distinct() to prevent merge errors

# Original load('original_dbases/lookup_db.rda')

# Revised July25
load("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/lookup_Jul25.rda")
colnames(lookup_Jul25)
##  [1] "word"                     "aff_anger"               
##  [3] "aff_anger_rescale"        "aff_anxiety"             
##  [5] "aff_anxiety_rescale"      "aff_arousal_b24"         
##  [7] "aff_arousal_b24_rescale"  "aff_boredom"             
##  [9] "aff_boredom_rescale"      "aff_confusion"           
## [11] "aff_confusion_rescale"    "aff_doubt"               
## [13] "aff_doubt_rescale"        "aff_emo_intensity"       
## [15] "aff_emo_intensity_recale" "aff_excitement"          
## [17] "aff_excitement_rescale"   "aff_guilt"               
## [19] "aff_guilt_rescale"        "aff_happiness"           
## [21] "aff_happiness_rescale"    "aff_sadness"             
## [23] "aff_sadness_rescale"      "aff_surprise"            
## [25] "aff_surprise_rescale"     "aff_trust"               
## [27] "aff_trust_rescale"        "aff_valence_b24"         
## [29] "aff_valence_b24_rescale"  "lex_AoA"                 
## [31] "lex_AoA_rescale"          "lex_freqlg10"            
## [33] "lex_freqlg10_rescale"     "lex_n_morphemes"         
## [35] "lex_n_senses"             "lex_n_senses_rescale"    
## [37] "phon_n_lett"              "phon_nsyll"              
## [39] "sem_auditory"             "sem_auditory_rescale"    
## [41] "sem_cnc_b24"              "sem_cnc_b24_rescale"     
## [43] "sem_cnc_v2013"            "sem_cnc_v2013_rescale"   
## [45] "sem_diversity"            "sem_diversity_rescale"   
## [47] "sem_neighbors"            "sem_neighbors_rescale"   
## [49] "sem_visual"               "sem_visual_rescale"

Affectvec

Original scale is -1 to 1. Convert all AffectVec values to a 0-9 scale; stress, politeness, and empathy were removed.

# Read raw AffectVec ratings (each affective value lies in [-1, 1]).
base <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/original_databases/db_affectvec.csv")
# Keep only the affective dimensions retained in lookup_25
# (stress, politeness, empathy, etc. are intentionally dropped).
affectsmall <- base %>%
    select(word, trust, anger, sadness, anxiety, boredom, confusion, excitement,
        happiness, guilt)
# Lowercase the word key and prepend the 'emo_' PREFIX to every rating column.
# (The previous comment said "append aff- suffix"; the code actually prepends
# "emo_". The old version also detached 'word' into a loose global vector and
# cbind()-ed it back; mutate()/rename_with(.cols = -word) does the same job
# without leaving stray objects in the workspace.)
affvec_prepped <- affectsmall %>%
    mutate(word = tolower(word)) %>%
    rename_with(.fn = ~ paste0("emo_", .x), .cols = -word) %>%
    # Min/max rescale each rating to 0-9, keeping the raw column alongside.
    mutate(across(.cols = starts_with("emo_"), .fns = ~ rescale(.x, to = c(0, 9)),
        .names = "{.col}_rescale")) %>%
    # 'word' first, all other columns alphabetized.
    select(word, sort(names(.)[names(.) != "word"]))
# Sanity check: expect 0 duplicated rows.
sum(duplicated(affvec_prepped))
## [1] 0
str(affvec_prepped)
## 'data.frame':    76427 obs. of  19 variables:
##  $ word                  : chr  "a" "aa" "aaa" "aaaa" ...
##  $ emo_anger             : num  0.019 -0.0256 -0.0901 -0.1093 -0.0249 ...
##  $ emo_anger_rescale     : num  2.58 2.29 1.87 1.74 2.29 ...
##  $ emo_anxiety           : num  -0.0491 -0.1282 -0.0491 -0.1137 -0.0472 ...
##  $ emo_anxiety_rescale   : num  2.05 1.52 2.05 1.62 2.06 ...
##  $ emo_boredom           : num  0.0424 -0.0838 0.0168 -0.0445 -0.0321 -0.0142 0.0068 -0.0594 -0.0417 -0.0092 ...
##  $ emo_boredom_rescale   : num  2.7 1.87 2.53 2.13 2.21 ...
##  $ emo_confusion         : num  -0.0686 -0.1565 -0.0887 -0.0615 0.0184 ...
##  $ emo_confusion_rescale : num  1.3 0.67 1.16 1.35 1.93 ...
##  $ emo_excitement        : num  -0.0381 0.0135 -0.0681 0.0001 -0.0459 -0.0592 0.0362 -0.0007 0.0471 0.0606 ...
##  $ emo_excitement_rescale: num  2.55 2.87 2.36 2.79 2.5 ...
##  $ emo_guilt             : num  0.0634 0.028 0.0089 -0.0154 -0.016 0.0053 0.0215 0.0194 0.0447 -0.0286 ...
##  $ emo_guilt_rescale     : num  2.75 2.52 2.39 2.23 2.22 ...
##  $ emo_happiness         : num  0.0399 0.0981 0.0086 0.0413 0.0645 -0.0505 -0.0488 -0.0406 -0.0191 0.0143 ...
##  $ emo_happiness_rescale : num  2.89 3.26 2.69 2.9 3.05 ...
##  $ emo_sadness           : num  0.0066 -0.0492 -0.0993 -0.0886 -0.0552 0.0126 -0.0057 -0.0081 -0.0272 0.0064 ...
##  $ emo_sadness_rescale   : num  2.37 1.99 1.66 1.73 1.95 ...
##  $ emo_trust             : num  0.0363 0.1046 0.1462 0.1095 0.0376 ...
##  $ emo_trust_rescale     : num  2.75 3.2 3.47 3.23 2.76 ...

Age of acquisition

Add the Kuperman norms to the lookup database, Yoke and rescale to 0 to 9

# Kuperman age-of-acquisition norms: dedupe on word, drop incomplete rows,
# rename to lookup_25 conventions, add a 0-9 rescaled AoA, and order columns
# with 'word' first and the rest alphabetical.
kup_aoa <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_kuperman_aoa.csv")
aoa_prepped <- kup_aoa %>%
    distinct(word, .keep_all = TRUE) %>%
    filter(complete.cases(.)) %>%
    select(word, kup_aoa, kup_nsyll) %>%
    dplyr::rename(lex_AoA = "kup_aoa", phon_nsyll = "kup_nsyll") %>%
    mutate(lex_AoA_rescale = rescale(lex_AoA, to = c(0, 9))) %>%
    select(word, sort(names(.)[names(.) != "word"]))
# Confirm the word key is unique after deduplication (expect 0).
sum(duplicated(aoa_prepped$word))
## [1] 0
str(aoa_prepped)
## 'data.frame':    31104 obs. of  4 variables:
##  $ word           : chr  "a" "aardvark" "abacus" "abalone" ...
##  $ lex_AoA        : num  2.89 9.89 8.69 12.23 8.32 ...
##  $ lex_AoA_rescale: num  0.505 3.193 2.732 4.093 2.59 ...
##  $ phon_nsyll     : int  1 2 3 4 3 4 4 2 3 2 ...

Brysbaert gpt Arousal, valence, dominance, add emo intensity

Brysbaert gpt ratings (2024)
https://link.springer.com/article/10.3758/s13428-024-02515-z?fromPaywallRec=false

# Brysbaert et al. (2024) GPT-derived norms: valence, arousal, concreteness.
# Rename to lookup_25 conventions, then dedupe and drop incomplete rows.
brys_2024 <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_brysbaert_gpt.csv") %>%
    dplyr::rename(emo_valence_b24 = "valence", emo_arousal_b24 = "arousal",
        sem_cnc_b24 = "cnc") %>%
    distinct(word, .keep_all = TRUE) %>%
    filter(complete.cases(.))
str(brys_2024)
## 'data.frame':    126392 obs. of  4 variables:
##  $ word           : chr  "a" "aa" "aaaaaaah" "aaaah" ...
##  $ sem_cnc_b24    : num  1 1.78 1.28 2.28 1.32 2.22 4.99 4.94 5 2.77 ...
##  $ emo_valence_b24: num  4.99 4.98 5.07 5.94 7.01 5.29 5.01 5.04 5.01 5.2 ...
##  $ emo_arousal_b24: num  1.01 3.02 5.11 2.77 5.03 3.08 1.1 1.08 1.69 1.32 ...
# Rescale the three Brysbaert 2024 norms to 0-9 and alphabetize columns
# ('word' stays first).
brys_2024_prepped <- brys_2024 %>%
    mutate(emo_valence_b24_rescale = rescale(emo_valence_b24, to = c(0, 9)),
        emo_arousal_b24_rescale = rescale(emo_arousal_b24, to = c(0, 9)),
        sem_cnc_b24_rescale = rescale(sem_cnc_b24, to = c(0, 9))) %>%
    select(word, sort(names(.)[names(.) != "word"]))
# Emotional intensity = |z-scored valence| (distance from neutral valence).
# FIX 1: scale() returns a 1-column matrix carrying "scaled:center"/
# "scaled:scale" attributes, which previously leaked into the exported
# database (visible in str()); as.numeric() strips them to a plain vector.
# FIX 2: renamed 'emo_intensity_recale' -> 'emo_intensity_rescale' (typo) so
# the name matches every other *_rescale column in the database.
brys_2024_prepped <- brys_2024_prepped %>%
    mutate(emo_intensity = abs(as.numeric(scale(emo_valence_b24))),
        emo_intensity_rescale = rescale(emo_intensity, to = c(0, 9)))
str(brys_2024_prepped)
## 'data.frame':    126392 obs. of  9 variables:
##  $ word                   : chr  "a" "aa" "aaaaaaah" "aaaah" ...
##  $ emo_arousal_b24        : num  1.01 3.02 5.11 2.77 5.03 3.08 1.1 1.08 1.69 1.32 ...
##  $ emo_arousal_b24_rescale: num  0.133 2.364 4.683 2.086 4.594 ...
##  $ emo_valence_b24        : num  4.99 4.98 5.07 5.94 7.01 5.29 5.01 5.04 5.01 5.2 ...
##  $ emo_valence_b24_rescale: num  4.51 4.49 4.6 5.57 6.77 ...
##  $ sem_cnc_b24            : num  1 1.78 1.28 2.28 1.32 2.22 4.99 4.94 5 2.77 ...
##  $ sem_cnc_b24_rescale    : num  0.176 1.897 0.794 3 0.882 ...
##  $ emo_intensity          : num [1:126392, 1] 0.0412 0.0335 0.1032 0.7772 1.6062 ...
##   ..- attr(*, "scaled:center")= num 4.94
##   ..- attr(*, "scaled:scale")= num 1.29
##  $ emo_intensity_recale   : num [1:126392, 1] 0.1108 0.0887 0.2882 2.2167 4.5887 ...
##   ..- attr(*, "scaled:center")= num 4.94
##   ..- attr(*, "scaled:scale")= num 1.29
sum(duplicated(brys_2024_prepped$word))
## [1] 0

Concreteness v2013 (sem_cnc_v2013, sem_cnc_v2013_rescale)

Brysbaert Concreteness Norms from 2013 human crowdsourced, original scale 1-5; rescale from 0 to 9. https://link.springer.com/article/10.3758/s13428-013-0403-5

# Brysbaert 2013 human-rated concreteness norms (original scale 1-5):
# dedupe, keep complete cases, rename, then add a 0-9 rescaled copy.
brys_cnc <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/original_databases/db_brysbaert_cnc.csv") %>%
    dplyr::distinct(word, .keep_all = TRUE) %>%
    filter(complete.cases(.)) %>%
    dplyr::rename(sem_cnc_v2013 = brys_concreteness)
brys_2013_prepped <- brys_cnc %>%
    mutate(sem_cnc_v2013_rescale = rescale(sem_cnc_v2013, to = c(0, 9)))
str(brys_2013_prepped)
## 'data.frame':    39576 obs. of  3 variables:
##  $ word                 : chr  "a" "acappella" "aardvark" "aback" ...
##  $ sem_cnc_v2013        : num  1.46 2.92 4.68 1.65 4.52 2.54 2.52 2.92 2.5 2.54 ...
##  $ sem_cnc_v2013_rescale: num  0.955 4.273 8.273 1.386 7.909 ...
sum(duplicated(brys_2013_prepped$word))
## [1] 0

Word frequency

US Subtlex — UNSCALED Lg frequency

# SUBTLEX-US log10 word frequency, kept raw and 0-9 rescaled.
subtlex <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_subtlex.csv")
freq_prepped <- subtlex %>%
    select(word, lg10wf) %>%
    mutate(word = tolower(word)) %>%
    distinct(word, .keep_all = TRUE) %>%
    filter(complete.cases(.)) %>%
    dplyr::rename(lex_freqlg10 = "lg10wf") %>%
    mutate(lex_freqlg10_rescale = rescale(lex_freqlg10, to = c(0, 9)))
str(freq_prepped)
## 'data.frame':    60384 obs. of  3 variables:
##  $ word                : chr  "the" "to" "a" "you" ...
##  $ lex_freqlg10        : num  6.18 6.06 6.02 6.33 5.83 ...
##  $ lex_freqlg10_rescale: num  8.77 8.59 8.52 9 8.24 ...
sum(duplicated(freq_prepped))
## [1] 0

Vars from SCOPE norms

Merge the following values from the South Carolina Psycholinguistic Metabase (SCOPE): 1) lex_n_morphemes: morphemes per word derived from the MorphoLex database 2) sem_neighbors: number of semantic neighbors based on distance in co-occurrence space, from Shaoul and Westbury https://www.psych.ualberta.ca/~westburylab/downloads/westburylab.arcs.ncounts.html 3) lex_n_senses: number of senses from the WordNet database https://wordnet.princeton.edu/ 4) sem_diversity: semantic diversity from Hoffman et al. — the degree to which different contexts associated with a word vary in their meaning https://link.springer.com/article/10.3758/s13428-012-0278-x#SecESM1

# SCOPE metabase: n senses (WordNet), semantic neighbors, semantic diversity,
# n morphemes. Source file is already all-lowercase. NOTE: unlike the other
# sources, NAs are retained here (no complete-cases filter), so the rescaled
# columns inherit the same NA pattern.
scope <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_scope.csv") %>%
    distinct(word, .keep_all = TRUE)
scope_prepped <- scope %>%
    mutate(
        lex_n_senses_rescale = rescale(lex_n_senses, to = c(0, 9)),
        sem_neighbors_rescale = rescale(sem_neighbors, to = c(0, 9)),
        sem_diversity_rescale = rescale(sem_diversity, to = c(0, 9))
    )
str(scope_prepped)
## 'data.frame':    68137 obs. of  8 variables:
##  $ word                 : chr  "a" "aa" "aah" "aardvark" ...
##  $ lex_n_senses         : int  7 NA 1 NA 2 NA 2 2 NA NA ...
##  $ sem_diversity        : num  NA NA 1.07 NA NA ...
##  $ sem_neighbors        : int  NA NA 0 NA NA NA 0 0 0 NA ...
##  $ lex_n_morphemes      : int  1 NA NA NA 1 NA 1 1 NA NA ...
##  $ lex_n_senses_rescale : num  0.84 NA 0.12 NA 0.24 NA 0.24 0.24 NA NA ...
##  $ sem_neighbors_rescale: num  NA NA 0 NA NA NA 0 0 0 NA ...
##  $ sem_diversity_rescale: num  NA NA 3.64 NA NA ...
sum(duplicated(scope_prepped))
## [1] 0

Vars from LANCASTER sensorimotor norms

auditory, visual sensorimotor salience

# Lancaster Sensorimotor Norms: auditory and visual salience ratings,
# kept raw and 0-9 rescaled.
lancaster <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_lancaster.csv") %>%
    distinct(word, .keep_all = TRUE)
lancaster_prepped <- lancaster %>%
    mutate(
        sem_auditory_rescale = rescale(sem_auditory, to = c(0, 9)),
        sem_visual_rescale = rescale(sem_visual, to = c(0, 9))
    )
str(lancaster_prepped)
## 'data.frame':    39329 obs. of  5 variables:
##  $ word                : chr  "a" "acappella" "aardvark" "aback" ...
##  $ sem_auditory        : num  2.21 4.33 1.62 1.29 1.56 ...
##  $ sem_visual          : num  2.43 1.67 4.12 2.82 3.94 ...
##  $ sem_auditory_rescale: num  3.99 7.8 2.93 2.33 2.8 ...
##  $ sem_visual_rescale  : num  4.37 3 7.42 5.08 7.1 ...
sum(duplicated(lancaster_prepped))
## [1] 0

MERGE

Sort columns alphabetically, save as .rda, and export a .csv copy.

# Full (outer) join every prepped source on the lowercase 'word' key, keeping
# any word that appears in at least one source. Join order matches the
# original chain: affvec, aoa, scope, brys_2024, brys_2013, lancaster, freq.
merge_sources <- list(aoa_prepped, scope_prepped, brys_2024_prepped,
    brys_2013_prepped, lancaster_prepped, freq_prepped)
lookup_Jul25 <- Reduce(
    function(acc, nxt) full_join(acc, nxt, by = "word"),
    merge_sources,
    init = affvec_prepped
)

Word length (Letter Count)

nletter count raw — UNSCALED

# Use stringr::str_length to add raw letter count per word as 'phon_n_lett'
# (the earlier comment's "n_letters" name was stale). Left UNSCALED by
# design: min/max rescaling a letter count is not meaningful.
lookup_Jul25 <- lookup_Jul25 %>%
    mutate(phon_n_lett = stringr::str_length(word))

SORT and EXPORT

# Reorder columns: 'word' first, every remaining variable alphabetical.
other_cols <- sort(setdiff(names(lookup_Jul25), "word"))
lookup_Jul25 <- lookup_Jul25 %>%
    select(all_of(c("word", other_cols)))
str(lookup_Jul25)
## 'data.frame':    156203 obs. of  46 variables:
##  $ word                   : chr  "a" "aa" "aaa" "aaaa" ...
##  $ emo_anger              : num  0.019 -0.0256 -0.0901 -0.1093 -0.0249 ...
##  $ emo_anger_rescale      : num  2.58 2.29 1.87 1.74 2.29 ...
##  $ emo_anxiety            : num  -0.0491 -0.1282 -0.0491 -0.1137 -0.0472 ...
##  $ emo_anxiety_rescale    : num  2.05 1.52 2.05 1.62 2.06 ...
##  $ emo_arousal_b24        : num  1.01 3.02 NA NA NA NA NA 5.11 NA NA ...
##  $ emo_arousal_b24_rescale: num  0.133 2.364 NA NA NA ...
##  $ emo_boredom            : num  0.0424 -0.0838 0.0168 -0.0445 -0.0321 -0.0142 0.0068 -0.0594 -0.0417 -0.0092 ...
##  $ emo_boredom_rescale    : num  2.7 1.87 2.53 2.13 2.21 ...
##  $ emo_confusion          : num  -0.0686 -0.1565 -0.0887 -0.0615 0.0184 ...
##  $ emo_confusion_rescale  : num  1.3 0.67 1.16 1.35 1.93 ...
##  $ emo_excitement         : num  -0.0381 0.0135 -0.0681 0.0001 -0.0459 -0.0592 0.0362 -0.0007 0.0471 0.0606 ...
##  $ emo_excitement_rescale : num  2.55 2.87 2.36 2.79 2.5 ...
##  $ emo_guilt              : num  0.0634 0.028 0.0089 -0.0154 -0.016 0.0053 0.0215 0.0194 0.0447 -0.0286 ...
##  $ emo_guilt_rescale      : num  2.75 2.52 2.39 2.23 2.22 ...
##  $ emo_happiness          : num  0.0399 0.0981 0.0086 0.0413 0.0645 -0.0505 -0.0488 -0.0406 -0.0191 0.0143 ...
##  $ emo_happiness_rescale  : num  2.89 3.26 2.69 2.9 3.05 ...
##  $ emo_intensity          : num [1:156203, 1] 0.0412 0.0335 NA NA NA ...
##   ..- attr(*, "scaled:center")= num 4.94
##   ..- attr(*, "scaled:scale")= num 1.29
##  $ emo_intensity_recale   : num [1:156203, 1] 0.1108 0.0887 NA NA NA ...
##   ..- attr(*, "scaled:center")= num 4.94
##   ..- attr(*, "scaled:scale")= num 1.29
##  $ emo_sadness            : num  0.0066 -0.0492 -0.0993 -0.0886 -0.0552 0.0126 -0.0057 -0.0081 -0.0272 0.0064 ...
##  $ emo_sadness_rescale    : num  2.37 1.99 1.66 1.73 1.95 ...
##  $ emo_trust              : num  0.0363 0.1046 0.1462 0.1095 0.0376 ...
##  $ emo_trust_rescale      : num  2.75 3.2 3.47 3.23 2.76 ...
##  $ emo_valence_b24        : num  4.99 4.98 NA NA NA NA NA 5.07 NA NA ...
##  $ emo_valence_b24_rescale: num  4.51 4.49 NA NA NA ...
##  $ lex_AoA                : num  2.89 NA NA NA NA ...
##  $ lex_AoA_rescale        : num  0.505 NA NA NA NA ...
##  $ lex_freqlg10           : num  6.02 1.94 1.42 NA NA ...
##  $ lex_freqlg10_rescale   : num  8.52 2.26 1.44 NA NA ...
##  $ lex_n_morphemes        : int  1 NA NA NA NA NA NA NA NA NA ...
##  $ lex_n_senses           : int  7 NA NA NA NA NA NA NA NA NA ...
##  $ lex_n_senses_rescale   : num  0.84 NA NA NA NA NA NA NA NA NA ...
##  $ phon_n_lett            : int  1 2 3 4 5 10 9 8 7 6 ...
##  $ phon_nsyll             : int  1 NA NA NA NA NA NA NA NA NA ...
##  $ sem_auditory           : num  2.21 NA NA NA NA ...
##  $ sem_auditory_rescale   : num  3.99 NA NA NA NA ...
##  $ sem_cnc_b24            : num  1 1.78 NA NA NA NA NA 1.28 NA NA ...
##  $ sem_cnc_b24_rescale    : num  0.176 1.897 NA NA NA ...
##  $ sem_cnc_v2013          : num  1.46 NA NA NA NA NA NA NA NA NA ...
##  $ sem_cnc_v2013_rescale  : num  0.955 NA NA NA NA ...
##  $ sem_diversity          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sem_diversity_rescale  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sem_neighbors          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ sem_neighbors_rescale  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sem_visual             : num  2.43 NA NA NA NA ...
##  $ sem_visual_rescale     : num  4.37 NA NA NA NA ...
# Persist the merged database as .rda for ConversationAlign.
save(lookup_Jul25, file = "original_dbases/lookup_Jul25.rda")
# BUG FIX: checkRdaFiles was pointed at "lookup_Jul25.rda" in the working
# directory while the file was saved under original_dbases/, so every field
# came back NA. Check the path actually written.
tools::checkRdaFiles("original_dbases/lookup_Jul25.rda")
size ASCII compress version
lookup_Jul25.rda NA NA NA NA
write.csv(lookup_Jul25, file = "original_dbases/lookup_Jul25.rda")