# library(devtools)
# devtools::install_github('Reilly-ConceptsCognitionLab/SemanticDistance')
library(SemanticDistance)
library(dplyr)
library(quanteda)
library(viridis)
library(corrplot)
The input is a podcast transcript: punctuation stripped, the text split one word per row, and contractions recoded to NA outside of the package.
pod_dist_aug25 <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Emily Podcast Collab 2025/Podcast_Recode_Aug25.csv")
We will ignore talker information and split the text one word per row, undifferentiated by speaker. We will omit stopwords but will NOT lemmatize strings.
Pod_Clean <- SemanticDistance::clean_monologue_or_list(pod_dist_aug25, wordcol = "text_prep",
    omit_stops = TRUE, lemmatize = FALSE)
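Before computing distances, it helps to eyeball the cleaned frame; a minimal check (id_row_orig and word_clean are the columns used downstream):
# quick sanity check on the cleaned output: one word per row, stopwords
# dropped, contractions already recoded to NA upstream
head(Pod_Clean[, c("id_row_orig", "word_clean")], 10)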
# select vars for merge
dists1 <- dist_ngram2word(Pod_Clean, ngram = 1) %>%
    select(id_row_orig, word_clean, CosDist_1gram_glo)
dists2 <- dist_ngram2word(Pod_Clean, ngram = 2) %>%
    select(id_row_orig, word_clean, CosDist_2gram_glo)
dists5 <- dist_ngram2word(Pod_Clean, ngram = 5) %>%
    select(id_row_orig, word_clean, CosDist_5gram_glo)
dists10 <- dist_ngram2word(Pod_Clean, ngram = 10) %>%
    select(id_row_orig, word_clean, CosDist_10gram_glo)
# Merge Different Ngram Distances
together <- dists1 %>%
    dplyr::left_join(dists2, by = c("id_row_orig", "word_clean")) %>%
    dplyr::left_join(dists5, by = c("id_row_orig", "word_clean")) %>%
    dplyr::left_join(dists10, by = c("id_row_orig", "word_clean"))
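The compute-select-join pattern above repeats once per window size. A compact alternative sketch with purrr (assuming the CosDist_<n>gram_glo column naming shown above) produces the same merged frame:
# hypothetical refactor: compute and merge all four window sizes in one pass
library(purrr)
ngram_sizes <- c(1, 2, 5, 10)
together <- ngram_sizes %>%
    map(function(n) {
        dist_ngram2word(Pod_Clean, ngram = n) %>%
            select(id_row_orig, word_clean, all_of(paste0("CosDist_", n, "gram_glo")))
    }) %>%
    reduce(left_join, by = c("id_row_orig", "word_clean"))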
head(together, n = 12)
| id_row_orig | word_clean | CosDist_1gram_glo | CosDist_2gram_glo | CosDist_5gram_glo | CosDist_10gram_glo |
|---|---|---|---|---|---|
| 1 | fresh | NA | NA | NA | NA |
| 2 | air | 0.55 | NA | NA | NA |
| 3 | NA | NA | NA | NA | NA |
| 4 | tgross | NA | NA | NA | NA |
| 5 | tgross | NA | NA | NA | NA |
| 6 | more | 0.42 | 0.36 | NA | NA |
| 7 | more | 0.00 | 0.11 | NA | NA |
| 8 | people | 0.13 | 0.13 | NA | NA |
| 9 | looking | 0.25 | 0.22 | 0.22 | NA |
| 10 | genetics | 0.84 | 0.79 | 0.81 | NA |
| 11 | understand | 0.65 | 0.33 | 0.21 | NA |
| 12 | where | 0.31 | 0.45 | 0.17 | NA |
We use quanteda to compute surprisal values for each word in the corpus. Steps:
1) isolate all content words (no NAs after stopword removal)
2) convert to a corpus
3) convert the corpus to a document-feature matrix
4) compute surprisal for each word
5) join the surprisal values to the original dataframe
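Surprisal here is corpus-relative information content: for a word w with count c(w) in a corpus of N tokens, surprisal(w) = -log2(c(w)/N), measured in bits. A toy check of the formula used in step 4:
# a word making up 1/4 of all tokens carries 2 bits of surprisal;
# rarer words carry more
-log2(1/4)     # 2 bits
-log2(1/1024)  # 10 bits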
# drop NAs and empty strings from word_clean, keeping a one-column dataframe
# of words only for the dfm
clean_df <- together %>%
    select(word_clean) %>%
    filter(!is.na(word_clean), word_clean != "")
# concatenate all words into a single string so corpus() treats the podcast
# as one document rather than one document per word
single_string <- paste(clean_df$word_clean, collapse = " ")
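To see why the collapse matters, compare document counts (ndoc() is quanteda's document counter):
# corpus() treats each element of a character vector as its own document
quanteda::ndoc(quanteda::corpus(clean_df$word_clean))  # one document per word
quanteda::ndoc(quanteda::corpus(single_string))        # a single document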
# convert the text to a corpus, then to a document-feature matrix (no further
# stopword removal or lemmatization at this step)
pod_corpus <- quanteda::corpus(single_string)
pod_dfm <- quanteda::dfm(tokens(pod_corpus))
# convert document feature matrix to transposed dataframe (word plus count)
pod_dat <- convert(pod_dfm, to = "data.frame")
pod_dat <- pod_dat %>%
select(-1)
# transpose columns to rows, coerce to dataframe, move rownames to a column
# variable, delete rownames
pod_dat <- t(pod_dat)
pod_dat <- as.data.frame(pod_dat, stringsAsFactors = FALSE)
pod_dat$word_clean <- rownames(pod_dat)
rownames(pod_dat) <- NULL
# change varname V1 to 'featcount'
pod_dat <- pod_dat %>%
dplyr::rename(featcount = V1)
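The same word-count table can come straight from quanteda.textstats, skipping the manual transpose; a sketch, assuming that package is installed:
# alternative: textstat_frequency() returns feature counts directly
library(quanteda.textstats)
pod_dat_alt <- textstat_frequency(pod_dfm) %>%
    select(word_clean = feature, featcount = frequency)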
# Function for computing surprisal based on relative probabilities in the
# corpus, thanks to Ken Benoit (quanteda package maintainer)
add_surprisal <- function(dat) {
# Calculate total count
total_featcount <- sum(dat$featcount, na.rm = TRUE)
# Compute surprisal values
dat$surprisal_vals <- -log2(dat$featcount/total_featcount)
return(dat)
}
# Run custom surprisal fn
pod_surprisal <- add_surprisal(pod_dat)
head(pod_surprisal, n = 30)
| featcount | word_clean | surprisal_vals |
|---|---|---|
| 2 | fresh | 8.62 |
| 2 | air | 8.62 |
| 2 | tgross | 8.62 |
| 6 | more | 7.04 |
| 8 | people | 6.62 |
| 2 | looking | 8.62 |
| 4 | genetics | 7.62 |
| 1 | understand | 9.62 |
| 5 | where | 7.30 |
| 1 | come | 9.62 |
| 2 | future | 8.62 |
| 5 | genetic | 7.30 |
| 1 | sequencing | 9.62 |
| 3 | tell | 8.04 |
| 17 | about | 5.53 |
| 4 | ancestry | 7.62 |
| 1 | warn | 9.62 |
| 1 | risks | 9.62 |
| 4 | inherited | 7.62 |
| 4 | certain | 7.62 |
| 3 | diseases | 8.04 |
| 1 | conditions | 9.62 |
| 1 | meanwhile | 9.62 |
| 6 | scientists | 7.04 |
| 1 | exploring | 9.62 |
| 1 | ways | 9.62 |
| 1 | altering | 9.62 |
| 12 | genes | 6.04 |
| 1 | prevent | 9.62 |
| 1 | cure | 9.62 |
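A quick way to validate the output: 2^(-surprisal) recovers each word's relative frequency, and relative frequencies must sum to 1 over the vocabulary.
# sanity check on the surprisal computation; should return 1
sum(2^(-pod_surprisal$surprisal_vals))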
# join surprisal values to each word in the podcast; left_join repeats the
# value for every instance of a word, not just the first
pod_plus_surprisal <- together %>%
left_join(pod_surprisal, by = "word_clean")
colnames(pod_plus_surprisal)
## [1] "id_row_orig" "word_clean" "CosDist_1gram_glo"
## [4] "CosDist_2gram_glo" "CosDist_5gram_glo" "CosDist_10gram_glo"
## [7] "featcount" "surprisal_vals"
For the lexical-norm joins, we want concreteness (sem_cnc_v2013), semantic neighborhood density (sem_neighbors, from the HiDex norms), log10 word frequency (lex_freqlg10), and age of acquisition (lex_AoA).
# load lexical lookup database
load(url("https://raw.githubusercontent.com/Reilly-ConceptsCognitionLab/reillylab_publicdata/main/lookup_Jul25.rda"))
lookup <- lookup_Jul25 %>%
select(word, sem_cnc_v2013, sem_neighbors, lex_freqlg10, lex_AoA)
# join vals for psycholinguistic vars
pod_plus_lexnorms <- pod_plus_surprisal %>%
left_join(lookup, by = c(word_clean = "word"))
colnames(pod_plus_lexnorms)
## [1] "id_row_orig" "word_clean" "CosDist_1gram_glo"
## [4] "CosDist_2gram_glo" "CosDist_5gram_glo" "CosDist_10gram_glo"
## [7] "featcount" "surprisal_vals" "sem_cnc_v2013"
## [10] "sem_neighbors" "lex_freqlg10" "lex_AoA"
# rename columns
pod_streamlined <- pod_plus_lexnorms %>%
dplyr::rename(cnc = sem_cnc_v2013, word = word_clean, dist1 = CosDist_1gram_glo,
row_id = id_row_orig, dist2 = CosDist_2gram_glo, dist5 = CosDist_5gram_glo,
dist10 = CosDist_10gram_glo, surprisal = surprisal_vals, freq_lg10 = lex_freqlg10,
neighbors = sem_neighbors, AoA = lex_AoA)
colnames(pod_streamlined)
## [1] "row_id" "word" "dist1" "dist2" "dist5" "dist10"
## [7] "featcount" "surprisal" "cnc" "neighbors" "freq_lg10" "AoA"
# structure dataframe for correlations only
pod_4corr <- pod_streamlined %>%
select(-c(row_id, word, featcount))
# build correlation matrix
pod_corrmat <- cor(pod_4corr, method = "pearson", use = "complete.obs")
print(pod_corrmat)
## dist1 dist2 dist5 dist10 surprisal cnc
## dist1 1.0000000 0.8157753 0.6130246 0.5457754 0.2204887 0.16051301
## dist2 0.8157753 1.0000000 0.8064824 0.7357347 0.2836952 0.23455256
## dist5 0.6130246 0.8064824 1.0000000 0.9552517 0.4515640 0.38802829
## dist10 0.5457754 0.7357347 0.9552517 1.0000000 0.4868330 0.42812557
## surprisal 0.2204887 0.2836952 0.4515640 0.4868330 1.0000000 0.39252709
## cnc 0.1605130 0.2345526 0.3880283 0.4281256 0.3925271 1.00000000
## neighbors -0.4322180 -0.5972473 -0.7889160 -0.8339813 -0.3931009 -0.29030864
## freq_lg10 -0.4666049 -0.6313242 -0.8461264 -0.8970843 -0.5958300 -0.38039147
## AoA 0.4340826 0.5521391 0.6830700 0.7176922 0.3300312 0.09137448
## neighbors freq_lg10 AoA
## dist1 -0.4322180 -0.4666049 0.43408263
## dist2 -0.5972473 -0.6313242 0.55213907
## dist5 -0.7889160 -0.8461264 0.68307002
## dist10 -0.8339813 -0.8970843 0.71769224
## surprisal -0.3931009 -0.5958300 0.33003120
## cnc -0.2903086 -0.3803915 0.09137448
## neighbors 1.0000000 0.7814631 -0.65101699
## freq_lg10 0.7814631 1.0000000 -0.82213255
## AoA -0.6510170 -0.8221325 1.00000000
# store p-values in a new matrix to pass to p.mat; corr.test needs the raw
# data (not the correlation matrix), and we take unadjusted p-values because
# we set a corrected significance threshold manually below
corr_p <- psych::corr.test(pod_4corr, use = "complete", adjust = "none")$p
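With 9 variables there are choose(9, 2) = 36 unique pairwise tests, so a Bonferroni-style cutoff lands near .001:
# 0.05 / 36 is roughly 0.0014, motivating the .001 threshold below
0.05 / choose(9, 2)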
# Generate viridis colors
viridis_colors <- viridis(200)
# BUILD CORRPLOT: set sig threshold at .001 to correct for 36 pairwise contrasts
pdf(file = "Figs/Podcast_Corrplot_Aug25.pdf")
pod_corrplot <- corrplot(pod_corrmat, method = "color", type = "upper", tl.srt = 45,
    tl.cex = 1, col = viridis_colors, tl.col = "black", diag = FALSE, number.digits = 2,
    order = "hclust", p.mat = corr_p, sig.level = 0.001, insig = "blank",
    addgrid.col = "gray", addCoef.col = "white", number.cex = 8/ncol(pod_corrmat))
dev.off()
## quartz_off_screen
## 2
corrplot(pod_corrmat, method = "color", type = "upper", tl.srt = 45, tl.cex = 1,
    col = viridis_colors, tl.col = "black", diag = FALSE, number.digits = 2,
    order = "hclust", p.mat = corr_p, sig.level = 0.05, insig = "blank",
    addgrid.col = "gray", addCoef.col = "white", number.cex = 8/ncol(pod_corrmat))
# write to csv and inspect the join
write.csv(pod_streamlined, "Mechtenberg_Podcast_Annotated_Aug25.csv", row.names = FALSE)
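As a final check, read the annotated file back in and confirm the row count survived the round trip:
# round-trip check on the exported file
chk <- read.csv("Mechtenberg_Podcast_Annotated_Aug25.csv", stringsAsFactors = FALSE)
stopifnot(nrow(chk) == nrow(pod_streamlined))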