Overview

Our goal is to create a large lookup database by aggregating a number of smaller lexical lookup databases. This takes some thought. Some databases have multiple entries for each word reflecting different parts of speech. There are also other features that hinder term aggregation (e.g. upper/lowercase). I prepped each of the original databases a bit, shaping them from their raw forms.

Load prepped (mostly raw) databases

Here are some of the steps I applied in prepping the raw original databases:
1. appended prefix with source to variable name
2. transformed all words to lowercase
3. homogenized var names so every database has ‘word’ column
3. removed duplicate strings using dplyr::distinct() to prevent merge errors

We will load each of the original prepped databases and check for duplicates and other weird problems before joining.

brys_cnc: concreteness

Brysbaert Concreteness Norms, original scale 1-5
https://link.springer.com/article/10.3758/s13428-013-0403-5

# Brysbaert cnc norms
brys_cnc <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/dbases_forjoin/db_brysbaert_cnc.csv")
brys_cnc <- brys_cnc %>%
    dplyr::distinct(word, .keep_all = TRUE)
sum(duplicated(brys_cnc$word))
## [1] 0
colnames(brys_cnc)
## [1] "word"              "brys_concreteness"


brys_freq: frequency

Brysbaert Word Freq Norms SUBTLEX-US
https://link.springer.com/article/10.3758/BRM.41.4.977

# Brysbaert freq norms from SUBTLEX
brys_freq <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/dbases_forjoin/db_brysbaert_freq.csv")
brys_freq <- brys_freq %>%
    dplyr::distinct(word, .keep_all = TRUE)
sum(duplicated(brys_freq$word))
## [1] 0
colnames(brys_freq)
## [1] "word"          "brys_freq"     "brys_lg10freq"


elp_vars

# English Lexicon Project Naming Lex Decision RTs
elp <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/dbases_forjoin/db_elp.csv")
elp <- elp %>%
    dplyr::distinct(word, .keep_all = TRUE)
sum(duplicated(elp$word))
## [1] 0
colnames(elp)
## [1] "word"             "elp_freqHAL"      "elp_SemDensity"   "elp_SemNeighbors"
## [5] "elp_SemDiversity" "elp_LexDecRt"     "elp_NamingRT"


kup aoa

Kuperman et al age of acquisition norms

# Kuperman age acquisition norms
kup <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/dbases_forjoin/db_kuperman_aoa.csv")
sum(duplicated(kup$word))
## [1] 0
colnames(kup)
## [1] "word"      "kup_pos"   "kup_nphon" "kup_nsyll" "kup_aoa"


Lancaster Sensorimotor Norms

lanc <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/dbases_forjoin/db_lancaster.csv")
lanc <- lanc %>%
    dplyr::distinct(word, .keep_all = TRUE)
sum(duplicated(lanc$word))
## [1] 0
colnames(lanc)
##  [1] "word"              "lnc_Auditory"      "lnc_Gustatory"    
##  [4] "lnc_Haptic"        "lnc_Interoceptive" "lnc_olfactory"    
##  [7] "lnc_visual"        "lnc_footleg"       "lnc_handarm"      
## [10] "lnc_head"          "lnc_mouth"         "lnc_torso"        
## [13] "lnc_domsense"      "lnc_AgePerceptual"


Warriner Valence Norms

war <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/dbases_forjoin/db_warriner.csv")
sum(duplicated(war$word))
## [1] 4
colnames(war)
## [1] "word"          "war_valence"   "war_arousal"   "war_dominance"


VAD NRC

NRC Valence, Arousal, and Dominance (NRC-VAD) Lexicon
http://saifmohammad.com/WebPages/nrc-vad.html The NRC Valence, Arousal, and Dominance (VAD) Lexicon includes a list of more than 20,000 English words and their valence, arousal, and dominance scores. For a given word and a dimension (V/A/D), the scores range from 0 (lowest V/A/D) to 1 (highest V/A/D).”

vad <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Lookup_Database_Creation/dbases_forjoin/db_nrc_vad.csv")
sum(duplicated(vad$word))
## [1] 0
colnames(vad)
## [1] "word"          "vad_valence"   "vad_arousal"   "vad_dominance"



Join databases

None of the original databases should have duplicate words, so there should be no merge errors.

dat <- brys_cnc %>%
    full_join(brys_freq) %>%
    full_join(elp) %>%
    full_join(kup) %>%
    full_join(vad) %>%
    full_join(war)
str(dat)
## 'data.frame':    82261 obs. of  20 variables:
##  $ word             : chr  "a" "acappella" "aardvark" "aback" ...
##  $ brys_concreteness: num  1.46 2.92 4.68 1.65 4.52 2.54 2.52 2.92 2.5 2.54 ...
##  $ brys_freq        : num  20415.27 NA 0.41 0.29 0.24 ...
##  $ brys_lg10freq    : num  6.02 NA 1.34 1.2 1.11 ...
##  $ elp_freqHAL      : chr  "10,610,626" NA NA "387" ...
##  $ elp_SemDensity   : num  NA NA NA 0.305 0.298 0.59 0.664 NA NA 0.547 ...
##  $ elp_SemNeighbors : chr  "" NA NA "0" ...
##  $ elp_SemDiversity : num  NA NA NA 1.59 1.64 ...
##  $ elp_LexDecRt     : chr  "798.917" NA NA "796.267" ...
##  $ elp_NamingRT     : chr  "662.087" NA NA "596.538" ...
##  $ kup_pos          : chr  "Article" NA "Noun" NA ...
##  $ kup_nphon        : int  1 NA 7 NA 6 7 8 NA 8 11 ...
##  $ kup_nsyll        : int  1 NA 2 NA 3 3 3 NA 4 4 ...
##  $ kup_aoa          : num  2.89 NA 9.89 NA 8.69 ...
##  $ vad_valence      : num  NA NA 0.427 0.385 0.51 0.052 0.046 NA NA 0.128 ...
##  $ vad_arousal      : num  NA NA 0.49 0.407 0.276 0.519 0.481 NA NA 0.43 ...
##  $ vad_dominance    : num  NA NA 0.437 0.288 0.485 0.245 0.13 NA NA 0.202 ...
##  $ war_valence      : num  NA NA 6.26 NA NA 2.84 NA NA NA 2.63 ...
##  $ war_arousal      : num  NA NA 2.41 NA NA 3.73 NA NA NA 4.95 ...
##  $ war_dominance    : num  NA NA 4.27 NA NA 3.32 NA NA NA 2.64 ...

Append nletts and order dat

Using base R count letters in each word, order lookup word first, then order cols alphabetical by prefix

# compute new var and order it second in the dataframe
dat <- dat %>%
    dplyr::mutate(base_nlett = nchar(word)) %>%
    relocate(base_nlett, .after = 1)
colnames(dat)
##  [1] "word"              "base_nlett"        "brys_concreteness"
##  [4] "brys_freq"         "brys_lg10freq"     "elp_freqHAL"      
##  [7] "elp_SemDensity"    "elp_SemNeighbors"  "elp_SemDiversity" 
## [10] "elp_LexDecRt"      "elp_NamingRT"      "kup_pos"          
## [13] "kup_nphon"         "kup_nsyll"         "kup_aoa"          
## [16] "vad_valence"       "vad_arousal"       "vad_dominance"    
## [19] "war_valence"       "war_arousal"       "war_dominance"
lex_lookup25 <- dat
str(lex_lookup25)
## 'data.frame':    82261 obs. of  21 variables:
##  $ word             : chr  "a" "acappella" "aardvark" "aback" ...
##  $ base_nlett       : int  1 9 8 5 6 7 9 9 9 11 ...
##  $ brys_concreteness: num  1.46 2.92 4.68 1.65 4.52 2.54 2.52 2.92 2.5 2.54 ...
##  $ brys_freq        : num  20415.27 NA 0.41 0.29 0.24 ...
##  $ brys_lg10freq    : num  6.02 NA 1.34 1.2 1.11 ...
##  $ elp_freqHAL      : chr  "10,610,626" NA NA "387" ...
##  $ elp_SemDensity   : num  NA NA NA 0.305 0.298 0.59 0.664 NA NA 0.547 ...
##  $ elp_SemNeighbors : chr  "" NA NA "0" ...
##  $ elp_SemDiversity : num  NA NA NA 1.59 1.64 ...
##  $ elp_LexDecRt     : chr  "798.917" NA NA "796.267" ...
##  $ elp_NamingRT     : chr  "662.087" NA NA "596.538" ...
##  $ kup_pos          : chr  "Article" NA "Noun" NA ...
##  $ kup_nphon        : int  1 NA 7 NA 6 7 8 NA 8 11 ...
##  $ kup_nsyll        : int  1 NA 2 NA 3 3 3 NA 4 4 ...
##  $ kup_aoa          : num  2.89 NA 9.89 NA 8.69 ...
##  $ vad_valence      : num  NA NA 0.427 0.385 0.51 0.052 0.046 NA NA 0.128 ...
##  $ vad_arousal      : num  NA NA 0.49 0.407 0.276 0.519 0.481 NA NA 0.43 ...
##  $ vad_dominance    : num  NA NA 0.437 0.288 0.485 0.245 0.13 NA NA 0.202 ...
##  $ war_valence      : num  NA NA 6.26 NA NA 2.84 NA NA NA 2.63 ...
##  $ war_arousal      : num  NA NA 2.41 NA NA 3.73 NA NA NA 4.95 ...
##  $ war_dominance    : num  NA NA 4.27 NA NA 3.32 NA NA NA 2.64 ...

Save output as rda and cvs for posting

write.csv(lex_lookup25, file = "lookup_databases/lex_lookup25.csv")
save(lex_lookup25, file = "lookup_databases/lex_lookup25.rda")