# library(devtools)
# devtools::install_github('Reilly-ConceptsCognitionLab/SemanticDistance')
library(SemanticDistance)
library(dplyr)      # pipes, select, joins, rename
library(quanteda)   # corpus, tokens, dfm, convert
library(viridis)    # color palette for the corrplot
library(corrplot)   # correlation matrix plot

Load pre-split transcript

The podcast transcript was preprocessed outside the package: punctuation was removed, the text was split one word per row, and contractions were recoded to NA.
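That pre-splitting happened outside this script. A minimal sketch of the idea, on toy input; the only column name taken from the real file is text_prep, the rest are illustrative:

# Hypothetical sketch of the outside-package preprocessing: split one word
# per row, recode contractions to NA, strip remaining punctuation
library(dplyr)
library(tidyr)

raw <- data.frame(text = c("We're looking at genetics.", "Fresh air!"))
prepped <- raw %>%
    tidyr::separate_rows(text, sep = "\\s+") %>%          # one word per row
    mutate(text_prep = ifelse(grepl("'", text), NA,       # contractions -> NA
        tolower(gsub("[[:punct:]]", "", text))))          # punctuation out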

pod_dist_aug25 <- read.csv("~/Library/CloudStorage/OneDrive-TempleUniversity/Reilly_RData/Emily Podcast Collab 2025/Podcast_Recode_Aug25.csv")

Clean podcast and compute distances

We will ignore speaker information and split the text one word per row, undifferentiated by talker. We will omit stopwords but will NOT lemmatize strings.

Pod_Clean <- SemanticDistance::clean_monologue_or_list(pod_dist_aug25, wordcol = "text_prep",
    omit_stops = T, lemmatize = F)

# compute distances at each ngram window size; keep only the vars needed for
# the merge
dists1 <- dist_ngram2word(Pod_Clean, ngram = 1)
dists1 <- dists1 %>%
    select(id_row_orig, word_clean, CosDist_1gram_glo)

dists2 <- dist_ngram2word(Pod_Clean, ngram = 2)
dists2 <- dists2 %>%
    select(id_row_orig, word_clean, CosDist_2gram_glo)

dists5 <- dist_ngram2word(Pod_Clean, ngram = 5)
dists5 <- dists5 %>%
    select(id_row_orig, word_clean, CosDist_5gram_glo)

dists10 <- dist_ngram2word(Pod_Clean, ngram = 10)
dists10 <- dists10 %>%
    select(id_row_orig, word_clean, CosDist_10gram_glo)

# Merge Different Ngram Distances
together <- dists1 %>%
    dplyr::left_join(dists2, by = c("id_row_orig", "word_clean"))
together <- together %>%
    dplyr::left_join(dists5, by = c("id_row_orig", "word_clean"))
together <- together %>%
    dplyr::left_join(dists10, by = c("id_row_orig", "word_clean"))
head(together, n = 12)
##    id_row_orig word_clean CosDist_1gram_glo CosDist_2gram_glo CosDist_5gram_glo CosDist_10gram_glo
## 1            1      fresh                NA                NA                NA                 NA
## 2            2        air              0.55                NA                NA                 NA
## 3            3       <NA>                NA                NA                NA                 NA
## 4            4     tgross                NA                NA                NA                 NA
## 5            5     tgross                NA                NA                NA                 NA
## 6            6       more              0.42              0.36                NA                 NA
## 7            7       more              0.00              0.11                NA                 NA
## 8            8     people              0.13              0.13                NA                 NA
## 9            9    looking              0.25              0.22              0.22                NA
## 10          10   genetics              0.84              0.79              0.81                NA
## 11          11 understand              0.65              0.33              0.21                NA
## 12          12      where              0.31              0.45              0.17                NA


Append psycholinguistic norms

Surprisal: Each Content Word

Use quanteda to compute surprisal values for each word in the corpus.
Steps:
1) isolate all content words (no NAs after stopword removal)
2) convert to corpus
3) convert to document feature matrix
4) compute surprisal for each word
5) join surprisal vals to original dataframe

clean <- together %>%
    select(word_clean)

# omit NAs and empty strings from word_clean before building the dfm; return
# a shorter dataframe of words only
clean <- data.frame(na.omit(clean))
clean <- clean[rowSums(clean != "") != 0, ]
clean_df <- data.frame(word_clean = clean, stringsAsFactors = FALSE)

# concatenate all words into a single string so that corpus() treats the
# transcript as one document rather than one document per word
single_string <- paste(clean_df$word_clean, collapse = " ")

# convert text to corpus, then to a document-feature matrix (no further
# stopword removal or lemmatization applied at this stage)
pod_corpus <- quanteda::corpus(single_string)
pod_dfm <- quanteda::dfm(tokens(pod_corpus))

# convert document-feature matrix to a dataframe and drop the doc_id column
pod_dat <- convert(pod_dfm, to = "data.frame")
pod_dat <- pod_dat %>%
    select(-1)

# transpose columns to rows, coerce to dataframe, move rownames to a column
# variable, then delete rownames
pod_dat <- t(pod_dat)
pod_dat <- as.data.frame(pod_dat, stringsAsFactors = FALSE)
pod_dat$word_clean <- rownames(pod_dat)
rownames(pod_dat) <- NULL

# change varname V1 to 'featcount'
pod_dat <- pod_dat %>%
    dplyr::rename(featcount = V1)

# Function for computing surprisal from relative word probabilities in the
# corpus; thanks to Ken Benoit (quanteda package maintainer)
add_surprisal <- function(dat) {
    # Calculate total count
    total_featcount <- sum(dat$featcount, na.rm = TRUE)
    # Compute surprisal values
    dat$surprisal_vals <- -log2(dat$featcount/total_featcount)
    return(dat)
}
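Surprisal here is just negative log2 relative frequency: surprisal(w) = -log2(count_w / N), where N is the total token count. A quick sanity check on a toy table (made-up counts, not the podcast data):

# toy check: with N = 18 total tokens, a word seen once carries
# -log2(1/18) ~ 4.17 bits of surprisal; a word seen 17 times carries
# -log2(17/18) ~ 0.08 bits
toy <- data.frame(word_clean = c("rare", "common"), featcount = c(1, 17))
add_surprisal(toy)

The values printed below are consistent with a content-word total of roughly 787 tokens (e.g., -log2(2/787) ≈ 8.62 for the words that appear twice).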

# Run custom surprisal fn
pod_surprisal <- add_surprisal(pod_dat)
head(pod_surprisal, n = 30)
##    featcount word_clean surprisal_vals
## 1          2      fresh           8.62
## 2          2        air           8.62
## 3          2     tgross           8.62
## 4          6       more           7.04
## 5          8     people           6.62
## 6          2    looking           8.62
## 7          4   genetics           7.62
## 8          1 understand           9.62
## 9          5      where           7.30
## 10         1       come           9.62
## 11         2     future           8.62
## 12         5    genetic           7.30
## 13         1 sequencing           9.62
## 14         3       tell           8.04
## 15        17      about           5.53
## 16         4   ancestry           7.62
## 17         1       warn           9.62
## 18         1      risks           9.62
## 19         4  inherited           7.62
## 20         4    certain           7.62
## 21         3   diseases           8.04
## 22         1 conditions           9.62
## 23         1  meanwhile           9.62
## 24         6 scientists           7.04
## 25         1  exploring           9.62
## 26         1       ways           9.62
## 27         1   altering           9.62
## 28        12      genes           6.04
## 29         1    prevent           9.62
## 30         1       cure           9.62
# join surprisal values to each word in the podcast; left_join matches every
# instance, not just the first!
pod_plus_surprisal <- together %>%
    left_join(pod_surprisal, by = "word_clean")
colnames(pod_plus_surprisal)
## [1] "id_row_orig"        "word_clean"         "CosDist_1gram_glo" 
## [4] "CosDist_2gram_glo"  "CosDist_5gram_glo"  "CosDist_10gram_glo"
## [7] "featcount"          "surprisal_vals"

Load lexical lookup database and join norms to podcast

For the joins, we want concreteness (sem_cnc_v2013), semantic neighborhood density (sem_neighbors, from the HiDEx norms), log10 lexical frequency (lex_freqlg10), and age of acquisition (lex_AoA).

# load lexical lookup database
load(url("https://raw.githubusercontent.com/Reilly-ConceptsCognitionLab/reillylab_publicdata/main/lookup_Jul25.rda"))

lookup <- lookup_Jul25 %>%
    select(word, sem_cnc_v2013, sem_neighbors, lex_freqlg10, lex_AoA)

# join vals for psycholinguistic vars
pod_plus_lexnorms <- pod_plus_surprisal %>%
    left_join(lookup, by = c(word_clean = "word"))
colnames(pod_plus_lexnorms)
##  [1] "id_row_orig"        "word_clean"         "CosDist_1gram_glo" 
##  [4] "CosDist_2gram_glo"  "CosDist_5gram_glo"  "CosDist_10gram_glo"
##  [7] "featcount"          "surprisal_vals"     "sem_cnc_v2013"     
## [10] "sem_neighbors"      "lex_freqlg10"       "lex_AoA"
# rename columns
pod_streamlined <- pod_plus_lexnorms %>%
    dplyr::rename(cnc = sem_cnc_v2013, word = word_clean, dist1 = CosDist_1gram_glo,
        row_id = id_row_orig, dist2 = CosDist_2gram_glo, dist5 = CosDist_5gram_glo,
        dist10 = CosDist_10gram_glo, surprisal = surprisal_vals, freq_lg10 = lex_freqlg10,
        neighbors = sem_neighbors, AoA = lex_AoA)
colnames(pod_streamlined)
##  [1] "row_id"    "word"      "dist1"     "dist2"     "dist5"     "dist10"   
##  [7] "featcount" "surprisal" "cnc"       "neighbors" "freq_lg10" "AoA"
# structure dataframe for correlations only
pod_4corr <- pod_streamlined %>%
    select(-c(row_id, word, featcount))

# build correlation matrix
pod_corrmat <- cor(pod_4corr, method = "pearson", use = "complete.obs")
print(pod_corrmat)
##                dist1      dist2      dist5     dist10  surprisal         cnc
## dist1      1.0000000  0.8157753  0.6130246  0.5457754  0.2204887  0.16051301
## dist2      0.8157753  1.0000000  0.8064824  0.7357347  0.2836952  0.23455256
## dist5      0.6130246  0.8064824  1.0000000  0.9552517  0.4515640  0.38802829
## dist10     0.5457754  0.7357347  0.9552517  1.0000000  0.4868330  0.42812557
## surprisal  0.2204887  0.2836952  0.4515640  0.4868330  1.0000000  0.39252709
## cnc        0.1605130  0.2345526  0.3880283  0.4281256  0.3925271  1.00000000
## neighbors -0.4322180 -0.5972473 -0.7889160 -0.8339813 -0.3931009 -0.29030864
## freq_lg10 -0.4666049 -0.6313242 -0.8461264 -0.8970843 -0.5958300 -0.38039147
## AoA        0.4340826  0.5521391  0.6830700  0.7176922  0.3300312  0.09137448
##            neighbors  freq_lg10         AoA
## dist1     -0.4322180 -0.4666049  0.43408263
## dist2     -0.5972473 -0.6313242  0.55213907
## dist5     -0.7889160 -0.8461264  0.68307002
## dist10    -0.8339813 -0.8970843  0.71769224
## surprisal -0.3931009 -0.5958300  0.33003120
## cnc       -0.2903086 -0.3803915  0.09137448
## neighbors  1.0000000  0.7814631 -0.65101699
## freq_lg10  0.7814631  1.0000000 -0.82213255
## AoA       -0.6510170 -0.8221325  1.00000000
# store p-values in a matrix to pass to p.mat; corr.test() needs the raw
# data (pod_4corr), not the correlation matrix
corr_p <- psych::corr.test(pod_4corr)$p

# Generate viridis colors
viridis_colors <- viridis(200)

# BUILD CORRPLOT: 9 variables give choose(9, 2) = 36 pairwise contrasts, so
# Bonferroni .05/36 ~ .0014; set sig threshold at .001
pdf(file = "Figs/Podcast_Corrplot_Aug25.pdf")
pod_corrplot <- corrplot(pod_corrmat, method = "color", type = "upper", tl.srt = 45,
    tl.cex = 1, col = viridis_colors, tl.col = "black", diag = F, number.digits = 2,
    order = "hclust", p.mat = corr_p, sig.level = 0.001, insig = "blank",
    addgrid.col = "gray", addCoef.col = "white", number.cex = 8/ncol(pod_corrmat))
dev.off()
## quartz_off_screen 
##                 2
# render the same plot in-document at the same .001 threshold
corrplot(pod_corrmat, method = "color", type = "upper", tl.srt = 45, tl.cex = 1,
    col = viridis_colors, tl.col = "black", diag = F, number.digits = 2, order = "hclust",
    p.mat = corr_p, sig.level = 0.001, insig = "blank", addgrid.col = "gray",
    addCoef.col = "white", number.cex = 8/ncol(pod_corrmat))

Export data to OSF

# write annotated dataframe to CSV and inspect the join
write.csv(pod_streamlined, "Mechtenberg_Podcast_Annotated_Aug25.csv", row.names = F)