This script details procedures for merging, rescaling, and exporting
the lookup database (lookup_25) used for ConversationAlign. We will
merge several databases to create a single master database spanning a
range of affective variables (from AffectVec) along with various other
psycholinguistic dimensions. Each variable will appear in its raw form
(unscaled, from its original source) and a rescaled form on a 0 to 9
range using min/max normalization - EXCEPT for variables where rescaling
does not make sense (e.g., syllable count, letter count).
https://stats.stackexchange.com/questions/281162/scale-a-number-between-a-range
For details on where each of the variables below was drawn from,
visit:
https://reilly-lab.github.io/ConversationAlign_LookupDatabaseCreation.html
These are lightly processed versions of their original sources with:
1. appended a source prefix to each variable name
2. transformed all words to lowercase
3. homogenized variable names so every database has a 'word' column
4. removed duplicate strings using dplyr::distinct() to prevent merge errors
# Load the current master lookup database and list its columns for inspection.
# Original load('original_dbases/lookup_db.rda')
# Revised July25
# NOTE(review): absolute OneDrive path -- only resolves on this user's machine
load("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/lookup_Jul25.rda")
# Print variable names to confirm the expected word-level measures are present
colnames(lookup_Jul25)
## [1] "word" "aff_anger"
## [3] "aff_anger_rescale" "aff_anxiety"
## [5] "aff_anxiety_rescale" "aff_arousal_b24"
## [7] "aff_arousal_b24_rescale" "aff_boredom"
## [9] "aff_boredom_rescale" "aff_confusion"
## [11] "aff_confusion_rescale" "aff_doubt"
## [13] "aff_doubt_rescale" "aff_emo_intensity"
## [15] "aff_emo_intensity_recale" "aff_excitement"
## [17] "aff_excitement_rescale" "aff_guilt"
## [19] "aff_guilt_rescale" "aff_happiness"
## [21] "aff_happiness_rescale" "aff_sadness"
## [23] "aff_sadness_rescale" "aff_surprise"
## [25] "aff_surprise_rescale" "aff_trust"
## [27] "aff_trust_rescale" "aff_valence_b24"
## [29] "aff_valence_b24_rescale" "lex_AoA"
## [31] "lex_AoA_rescale" "lex_freqlg10"
## [33] "lex_freqlg10_rescale" "lex_n_morphemes"
## [35] "lex_n_senses" "lex_n_senses_rescale"
## [37] "phon_n_lett" "phon_nsyll"
## [39] "sem_auditory" "sem_auditory_rescale"
## [41] "sem_cnc_b24" "sem_cnc_b24_rescale"
## [43] "sem_cnc_v2013" "sem_cnc_v2013_rescale"
## [45] "sem_diversity" "sem_diversity_rescale"
## [47] "sem_neighbors" "sem_neighbors_rescale"
## [49] "sem_visual" "sem_visual_rescale"
Original scale is -1 to 1; convert all AffectVec values to a 0-9 scale. Removed stress, politeness, and empathy.
# AffectVec emotion norms: keep a subset of emotion dimensions, lowercase the
# word key, prepend an "emo_" PREFIX to every rating variable (the original
# comment said "append aff- suffix", but the code has always prepended emo_),
# and yoke a 0-9 min/max rescaled copy of each rating.
# FIX: lowercasing now happens in-pipeline via mutate(), instead of detaching
# word into a fragile global vector and cbind()-ing it back afterward.
base <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/original_databases/db_affectvec.csv")
affvec_prepped <- base %>%
    select(word, trust, anger, sadness, anxiety, boredom, confusion, excitement,
        happiness, guilt) %>%
    mutate(word = tolower(word)) %>%
    # prefix every rating column (everything except word) with emo_
    rename_with(.fn = ~ paste0("emo_", .x), .cols = !word) %>%
    # yoke a rescaled (0-9) copy alongside each raw rating
    mutate(across(.cols = starts_with("emo_"), .fns = ~rescale(.x, to = c(0, 9)),
        .names = "{.col}_rescale")) %>%
    # word first, remaining columns alphabetized
    select(word, sort(names(.)[names(.) != "word"]))
# Sanity check: no duplicate rows before merging
sum(duplicated(affvec_prepped))
## [1] 0
str(affvec_prepped)
## 'data.frame': 76427 obs. of 19 variables:
## $ word : chr "a" "aa" "aaa" "aaaa" ...
## $ emo_anger : num 0.019 -0.0256 -0.0901 -0.1093 -0.0249 ...
## $ emo_anger_rescale : num 2.58 2.29 1.87 1.74 2.29 ...
## $ emo_anxiety : num -0.0491 -0.1282 -0.0491 -0.1137 -0.0472 ...
## $ emo_anxiety_rescale : num 2.05 1.52 2.05 1.62 2.06 ...
## $ emo_boredom : num 0.0424 -0.0838 0.0168 -0.0445 -0.0321 -0.0142 0.0068 -0.0594 -0.0417 -0.0092 ...
## $ emo_boredom_rescale : num 2.7 1.87 2.53 2.13 2.21 ...
## $ emo_confusion : num -0.0686 -0.1565 -0.0887 -0.0615 0.0184 ...
## $ emo_confusion_rescale : num 1.3 0.67 1.16 1.35 1.93 ...
## $ emo_excitement : num -0.0381 0.0135 -0.0681 0.0001 -0.0459 -0.0592 0.0362 -0.0007 0.0471 0.0606 ...
## $ emo_excitement_rescale: num 2.55 2.87 2.36 2.79 2.5 ...
## $ emo_guilt : num 0.0634 0.028 0.0089 -0.0154 -0.016 0.0053 0.0215 0.0194 0.0447 -0.0286 ...
## $ emo_guilt_rescale : num 2.75 2.52 2.39 2.23 2.22 ...
## $ emo_happiness : num 0.0399 0.0981 0.0086 0.0413 0.0645 -0.0505 -0.0488 -0.0406 -0.0191 0.0143 ...
## $ emo_happiness_rescale : num 2.89 3.26 2.69 2.9 3.05 ...
## $ emo_sadness : num 0.0066 -0.0492 -0.0993 -0.0886 -0.0552 0.0126 -0.0057 -0.0081 -0.0272 0.0064 ...
## $ emo_sadness_rescale : num 2.37 1.99 1.66 1.73 1.95 ...
## $ emo_trust : num 0.0363 0.1046 0.1462 0.1095 0.0376 ...
## $ emo_trust_rescale : num 2.75 3.2 3.47 3.23 2.76 ...
Add the Kuperman norms to the lookup database; yoke and rescale from 0 to 9.
# Kuperman age-of-acquisition norms: deduplicate on word, drop incomplete
# rows, keep AoA + syllable count, then yoke a 0-9 rescaled AoA copy.
kup_aoa <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_kuperman_aoa.csv")
kup_aoa <- kup_aoa %>%
    distinct(word, .keep_all = TRUE) %>%
    filter(complete.cases(.)) %>%
    select(word, kup_aoa, kup_nsyll) %>%
    dplyr::rename(lex_AoA = "kup_aoa", phon_nsyll = "kup_nsyll")
aoa_prepped <- kup_aoa %>%
    mutate(lex_AoA_rescale = rescale(lex_AoA, to = c(0, 9))) %>%
    # word first, remaining columns alphabetized
    select(word, sort(setdiff(names(.), "word")))
# Sanity check: word must be a unique key before merging
sum(duplicated(aoa_prepped$word))
## [1] 0
str(aoa_prepped)
## 'data.frame': 31104 obs. of 4 variables:
## $ word : chr "a" "aardvark" "abacus" "abalone" ...
## $ lex_AoA : num 2.89 9.89 8.69 12.23 8.32 ...
## $ lex_AoA_rescale: num 0.505 3.193 2.732 4.093 2.59 ...
## $ phon_nsyll : int 1 2 3 4 3 4 4 2 3 2 ...
Brysbaert gpt ratings (2024)
https://link.springer.com/article/10.3758/s13428-024-02515-z?fromPaywallRec=false
# Brysbaert GPT norms (2024): valence, arousal, concreteness.
# Rename to the lookup's prefixed convention, then deduplicate and
# drop any rows with missing ratings.
brys_2024 <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_brysbaert_gpt.csv")
brys_2024 <- brys_2024 %>%
    dplyr::rename(emo_valence_b24 = "valence", emo_arousal_b24 = "arousal",
        sem_cnc_b24 = "cnc") %>%
    distinct(word, .keep_all = TRUE) %>%
    filter(complete.cases(.))
str(brys_2024)
## 'data.frame': 126392 obs. of 4 variables:
## $ word : chr "a" "aa" "aaaaaaah" "aaaah" ...
## $ sem_cnc_b24 : num 1 1.78 1.28 2.28 1.32 2.22 4.99 4.94 5 2.77 ...
## $ emo_valence_b24: num 4.99 4.98 5.07 5.94 7.01 5.29 5.01 5.04 5.01 5.2 ...
## $ emo_arousal_b24: num 1.01 3.02 5.11 2.77 5.03 3.08 1.1 1.08 1.69 1.32 ...
# Yoke 0-9 rescaled copies of the Brysbaert 2024 ratings, then derive
# emotional intensity as the absolute z-score of valence.
brys_2024_prepped <- brys_2024 %>%
    mutate(emo_valence_b24_rescale = rescale(emo_valence_b24, to = c(0, 9)),
        emo_arousal_b24_rescale = rescale(emo_arousal_b24, to = c(0, 9)),
        sem_cnc_b24_rescale = rescale(sem_cnc_b24, to = c(0, 9))) %>%
    select(word, sort(names(.)[names(.) != "word"]))
# Add emotional intensity (absval, valence z-score).
# FIX: scale() returns a 1-column matrix carrying scaled:center/scaled:scale
# attributes, which previously leaked into the data frame (visible in the
# str() output below); as.numeric() strips them so the column is a plain
# numeric vector like every other variable.
brys_2024_prepped <- brys_2024_prepped %>%
    mutate(emo_intensity = as.numeric(abs(scale(emo_valence_b24))))
# NOTE(review): "recale" is a typo for "rescale", but the shipped lookup
# database already uses this column name, so it is kept for backward
# compatibility with downstream consumers.
brys_2024_prepped <- brys_2024_prepped %>%
    mutate(emo_intensity_recale = rescale(emo_intensity, to = c(0, 9)))
str(brys_2024_prepped)
## 'data.frame': 126392 obs. of 9 variables:
## $ word : chr "a" "aa" "aaaaaaah" "aaaah" ...
## $ emo_arousal_b24 : num 1.01 3.02 5.11 2.77 5.03 3.08 1.1 1.08 1.69 1.32 ...
## $ emo_arousal_b24_rescale: num 0.133 2.364 4.683 2.086 4.594 ...
## $ emo_valence_b24 : num 4.99 4.98 5.07 5.94 7.01 5.29 5.01 5.04 5.01 5.2 ...
## $ emo_valence_b24_rescale: num 4.51 4.49 4.6 5.57 6.77 ...
## $ sem_cnc_b24 : num 1 1.78 1.28 2.28 1.32 2.22 4.99 4.94 5 2.77 ...
## $ sem_cnc_b24_rescale : num 0.176 1.897 0.794 3 0.882 ...
## $ emo_intensity : num [1:126392, 1] 0.0412 0.0335 0.1032 0.7772 1.6062 ...
## ..- attr(*, "scaled:center")= num 4.94
## ..- attr(*, "scaled:scale")= num 1.29
## $ emo_intensity_recale : num [1:126392, 1] 0.1108 0.0887 0.2882 2.2167 4.5887 ...
## ..- attr(*, "scaled:center")= num 4.94
## ..- attr(*, "scaled:scale")= num 1.29
sum(duplicated(brys_2024_prepped$word))
## [1] 0
Brysbaert Concreteness Norms from 2013 human crowdsourced, original
scale 1-5; rescale from 0 to 9. https://link.springer.com/article/10.3758/s13428-013-0403-5
# Brysbaert 2013 human crowdsourced concreteness norms (original scale 1-5):
# deduplicate, drop incomplete rows, rename to lookup convention, and yoke
# a 0-9 rescaled copy.
brys_cnc <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/original_databases/db_brysbaert_cnc.csv")
brys_cnc <- brys_cnc %>%
    dplyr::distinct(word, .keep_all = TRUE) %>%
    filter(complete.cases(.)) %>%
    dplyr::rename(sem_cnc_v2013 = brys_concreteness)
brys_2013_prepped <- brys_cnc %>%
    mutate(sem_cnc_v2013_rescale = rescale(sem_cnc_v2013, to = c(0, 9)))
str(brys_2013_prepped)
## 'data.frame': 39576 obs. of 3 variables:
## $ word : chr "a" "acappella" "aardvark" "aback" ...
## $ sem_cnc_v2013 : num 1.46 2.92 4.68 1.65 4.52 2.54 2.52 2.92 2.5 2.54 ...
## $ sem_cnc_v2013_rescale: num 0.955 4.273 8.273 1.386 7.909 ...
# Sanity check: word must be a unique key before merging
sum(duplicated(brys_2013_prepped$word))
## [1] 0
US Subtlex — UNSCALED Lg frequency
# SUBTLEX-US log10 word frequency: lowercase the key, deduplicate, drop
# incomplete rows, rename, and yoke a 0-9 rescaled copy.
subtlex <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_subtlex.csv")
subtlex <- subtlex %>%
    select(word, lg10wf)
freq_prepped <- subtlex %>%
    mutate(word = tolower(word)) %>%
    distinct(word, .keep_all = TRUE) %>%
    filter(complete.cases(.)) %>%
    dplyr::rename(lex_freqlg10 = "lg10wf") %>%
    mutate(lex_freqlg10_rescale = rescale(lex_freqlg10, to = c(0, 9)))
str(freq_prepped)
## 'data.frame': 60384 obs. of 3 variables:
## $ word : chr "the" "to" "a" "you" ...
## $ lex_freqlg10 : num 6.18 6.06 6.02 6.33 5.83 ...
## $ lex_freqlg10_rescale: num 8.77 8.59 8.52 9 8.24 ...
# Sanity check: no duplicate rows before merging
sum(duplicated(freq_prepped))
## [1] 0
Merge the following values from the South Carolina (SCOPE) metabase into the lookup: 1) lex_n_morphemes: morphemes per word derived from the MorphoLex database 2) sem_neighbors: number of semantic neighbors based on distance in co-occurrence space, from Shaoul and Westbury https://www.psych.ualberta.ca/~westburylab/downloads/westburylab.arcs.ncounts.html 3) lex_n_senses: number of senses from the WordNet database https://wordnet.princeton.edu/ 4) sem_diversity: semantic diversity from Hoffman et al., the degree to which the different contexts associated with a word vary in their meaning https://link.springer.com/article/10.3758/s13428-012-0278-x#SecESM1
# database is already converted to lowercase all
# SCOPE metabase: words are already lowercase in the source file.
# Deduplicate on word, then yoke 0-9 rescaled copies of the three
# continuous measures (morpheme count is left unscaled).
scope <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_scope.csv")
scope <- scope %>%
    distinct(word, .keep_all = TRUE)
scope_prepped <- scope %>%
    mutate(lex_n_senses_rescale = rescale(lex_n_senses, to = c(0, 9)),
        sem_neighbors_rescale = rescale(sem_neighbors, to = c(0, 9)),
        sem_diversity_rescale = rescale(sem_diversity, to = c(0, 9)))
str(scope_prepped)
## 'data.frame': 68137 obs. of 8 variables:
## $ word : chr "a" "aa" "aah" "aardvark" ...
## $ lex_n_senses : int 7 NA 1 NA 2 NA 2 2 NA NA ...
## $ sem_diversity : num NA NA 1.07 NA NA ...
## $ sem_neighbors : int NA NA 0 NA NA NA 0 0 0 NA ...
## $ lex_n_morphemes : int 1 NA NA NA 1 NA 1 1 NA NA ...
## $ lex_n_senses_rescale : num 0.84 NA 0.12 NA 0.24 NA 0.24 0.24 NA NA ...
## $ sem_neighbors_rescale: num NA NA 0 NA NA NA 0 0 0 NA ...
## $ sem_diversity_rescale: num NA NA 3.64 NA NA ...
# Sanity check: no duplicate rows before merging
sum(duplicated(scope_prepped))
## [1] 0
Auditory and visual sensorimotor salience (Lancaster norms)
# Lancaster sensorimotor norms: deduplicate on word, then yoke 0-9 rescaled
# copies of the auditory and visual salience ratings.
lancaster <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/ConversationAlign (dev)/original_dbases/db_lancaster.csv")
lancaster <- lancaster %>%
    distinct(word, .keep_all = TRUE)
lancaster_prepped <- lancaster %>%
    mutate(sem_auditory_rescale = rescale(sem_auditory, to = c(0, 9)),
        sem_visual_rescale = rescale(sem_visual, to = c(0, 9)))
str(lancaster_prepped)
## 'data.frame': 39329 obs. of 5 variables:
## $ word : chr "a" "acappella" "aardvark" "aback" ...
## $ sem_auditory : num 2.21 4.33 1.62 1.29 1.56 ...
## $ sem_visual : num 2.43 1.67 4.12 2.82 3.94 ...
## $ sem_auditory_rescale: num 3.99 7.8 2.93 2.33 2.8 ...
## $ sem_visual_rescale : num 4.37 3 7.42 5.08 7.1 ...
# Sanity check: no duplicate rows before merging
sum(duplicated(lancaster_prepped))
## [1] 0
Sort columns alphabetically, save as rda, and export with the system date appended to the filename.
# originals: affvec_prepped, aoa_prepped, brys_2024_prepped, brys_2013_prepped,
# lancaster_prepped, scope_prepped, freq_prepped
# Full (outer) join every prepped database on the shared word key.
# Fold over the list so adding a new source is a one-line change.
lookup_Jul25 <- Reduce(
    function(left, right) full_join(left, right, by = "word"),
    list(aoa_prepped, scope_prepped, brys_2024_prepped,
        brys_2013_prepped, lancaster_prepped, freq_prepped),
    init = affvec_prepped
)
Letter count (raw) — UNSCALED
# Use str_length to generate a letter count per word (phon_n_lett), then
# reorder the data frame: word first, all other columns alphabetized.
lookup_Jul25 <- lookup_Jul25 %>%
    mutate(phon_n_lett = stringr::str_length(word)) %>%
    select(word, sort(setdiff(names(.), "word")))
str(lookup_Jul25)
## 'data.frame': 156203 obs. of 46 variables:
## $ word : chr "a" "aa" "aaa" "aaaa" ...
## $ emo_anger : num 0.019 -0.0256 -0.0901 -0.1093 -0.0249 ...
## $ emo_anger_rescale : num 2.58 2.29 1.87 1.74 2.29 ...
## $ emo_anxiety : num -0.0491 -0.1282 -0.0491 -0.1137 -0.0472 ...
## $ emo_anxiety_rescale : num 2.05 1.52 2.05 1.62 2.06 ...
## $ emo_arousal_b24 : num 1.01 3.02 NA NA NA NA NA 5.11 NA NA ...
## $ emo_arousal_b24_rescale: num 0.133 2.364 NA NA NA ...
## $ emo_boredom : num 0.0424 -0.0838 0.0168 -0.0445 -0.0321 -0.0142 0.0068 -0.0594 -0.0417 -0.0092 ...
## $ emo_boredom_rescale : num 2.7 1.87 2.53 2.13 2.21 ...
## $ emo_confusion : num -0.0686 -0.1565 -0.0887 -0.0615 0.0184 ...
## $ emo_confusion_rescale : num 1.3 0.67 1.16 1.35 1.93 ...
## $ emo_excitement : num -0.0381 0.0135 -0.0681 0.0001 -0.0459 -0.0592 0.0362 -0.0007 0.0471 0.0606 ...
## $ emo_excitement_rescale : num 2.55 2.87 2.36 2.79 2.5 ...
## $ emo_guilt : num 0.0634 0.028 0.0089 -0.0154 -0.016 0.0053 0.0215 0.0194 0.0447 -0.0286 ...
## $ emo_guilt_rescale : num 2.75 2.52 2.39 2.23 2.22 ...
## $ emo_happiness : num 0.0399 0.0981 0.0086 0.0413 0.0645 -0.0505 -0.0488 -0.0406 -0.0191 0.0143 ...
## $ emo_happiness_rescale : num 2.89 3.26 2.69 2.9 3.05 ...
## $ emo_intensity : num [1:156203, 1] 0.0412 0.0335 NA NA NA ...
## ..- attr(*, "scaled:center")= num 4.94
## ..- attr(*, "scaled:scale")= num 1.29
## $ emo_intensity_recale : num [1:156203, 1] 0.1108 0.0887 NA NA NA ...
## ..- attr(*, "scaled:center")= num 4.94
## ..- attr(*, "scaled:scale")= num 1.29
## $ emo_sadness : num 0.0066 -0.0492 -0.0993 -0.0886 -0.0552 0.0126 -0.0057 -0.0081 -0.0272 0.0064 ...
## $ emo_sadness_rescale : num 2.37 1.99 1.66 1.73 1.95 ...
## $ emo_trust : num 0.0363 0.1046 0.1462 0.1095 0.0376 ...
## $ emo_trust_rescale : num 2.75 3.2 3.47 3.23 2.76 ...
## $ emo_valence_b24 : num 4.99 4.98 NA NA NA NA NA 5.07 NA NA ...
## $ emo_valence_b24_rescale: num 4.51 4.49 NA NA NA ...
## $ lex_AoA : num 2.89 NA NA NA NA ...
## $ lex_AoA_rescale : num 0.505 NA NA NA NA ...
## $ lex_freqlg10 : num 6.02 1.94 1.42 NA NA ...
## $ lex_freqlg10_rescale : num 8.52 2.26 1.44 NA NA ...
## $ lex_n_morphemes : int 1 NA NA NA NA NA NA NA NA NA ...
## $ lex_n_senses : int 7 NA NA NA NA NA NA NA NA NA ...
## $ lex_n_senses_rescale : num 0.84 NA NA NA NA NA NA NA NA NA ...
## $ phon_n_lett : int 1 2 3 4 5 10 9 8 7 6 ...
## $ phon_nsyll : int 1 NA NA NA NA NA NA NA NA NA ...
## $ sem_auditory : num 2.21 NA NA NA NA ...
## $ sem_auditory_rescale : num 3.99 NA NA NA NA ...
## $ sem_cnc_b24 : num 1 1.78 NA NA NA NA NA 1.28 NA NA ...
## $ sem_cnc_b24_rescale : num 0.176 1.897 NA NA NA ...
## $ sem_cnc_v2013 : num 1.46 NA NA NA NA NA NA NA NA NA ...
## $ sem_cnc_v2013_rescale : num 0.955 NA NA NA NA ...
## $ sem_diversity : num NA NA NA NA NA NA NA NA NA NA ...
## $ sem_diversity_rescale : num NA NA NA NA NA NA NA NA NA NA ...
## $ sem_neighbors : int NA NA NA NA NA NA NA NA NA NA ...
## $ sem_neighbors_rescale : num NA NA NA NA NA NA NA NA NA NA ...
## $ sem_visual : num 2.43 NA NA NA NA ...
## $ sem_visual_rescale : num 4.37 NA NA NA NA ...
# Save the master lookup database as an .rda, then verify the file.
# FIX: checkRdaFiles previously pointed at bare "lookup_Jul25.rda" (working
# directory) while save() wrote to original_dbases/, so the check returned
# all NAs (see the rendered table below); check the path actually written.
save(lookup_Jul25, file = "original_dbases/lookup_Jul25.rda")
tools::checkRdaFiles("original_dbases/lookup_Jul25.rda")
| size | ASCII | compress | version | |
|---|---|---|---|---|
| lookup_Jul25.rda | NA | NA | NA | NA |
write.csv(lookup_Jul25, file = "original_dbases/lookup_Jul25.rda")