Appendix II: Dataframe Making Codes

Load Libraries:

Load the necessary libraries for text processing, data manipulation, and analysis.

library(stringr)
library(tidyverse)
library(tm)
library(quanteda)
library(textstem)
library(udpipe)

Read the files containing commencement speeches and store them in a list.

# Root folder of the corpus; the raw transcripts live in its "txt/" subfolder.
file_path <- "C:/D/R/TAWR2/data/text/Commencements/"
all_titles <- list.files(paste0(file_path, "txt/"))
# Read every transcript into a character vector, split on newlines.
# (paste0() has no `sep` argument, so the stray `sep = ""` was dropped.)
speeches_l <- lapply(all_titles, function(x) scan(paste0(file_path,
    "txt/", x), what = "character", sep = "\n"))
# Strip only a literal, trailing ".txt". The previous pattern ".txt" treated
# "." as a wildcard and could also delete "<any char>txt" inside a file name.
all_titles <- gsub("\\.txt$", "", all_titles)
names(speeches_l) <- all_titles

Define Custom Stopwords:

Create a custom list of stopwords, which includes standard English stopwords and additional words specific to the dataset.

# Words to drop during preprocessing: the list returned by
# stopwords("english"), every single lowercase letter, and transcript
# artefacts that occur in this corpus.
custom.stopwords <- c(
    stopwords("english"),
    letters,
    "(laughter.)",
    "(applause.)",
    "authenticity certified text version below transcribed directly from audio"
)

Preprocessing Function:

Define a function to preprocess and lemmatize text. The function converts the text to lowercase, removes stopwords, punctuation, and numbers, collapses extra whitespace, and lemmatizes the remaining words.

# Clean a character vector of raw text and lemmatize what is left.
#
# text: character vector to clean.
# Returns the result of lemmatize_strings() applied to the cleaned text.
# Relies on the global `custom.stopwords` vector.
preprocess_and_lemmatize <- function(text) {
    cleaned <- tolower(text)
    # First stopword pass, while punctuation is still present.
    cleaned <- removeWords(cleaned, custom.stopwords)
    # Two punctuation passes: one with ucp = FALSE, one with ucp = TRUE.
    cleaned <- removePunctuation(cleaned, preserve_intra_word_dashes = TRUE,
        ucp = FALSE)
    cleaned <- removePunctuation(cleaned, preserve_intra_word_dashes = TRUE,
        ucp = TRUE)
    cleaned <- removeNumbers(cleaned)
    # Second stopword pass, now that punctuation and digits are gone.
    cleaned <- removeWords(cleaned, custom.stopwords)
    cleaned <- stripWhitespace(cleaned)
    lemmatize_strings(cleaned)
}

Construct a data frame that contains the document names and the text content.

# One row per transcript line, labelled with the document it came from.
doc_names <- names(speeches_l)
all_texts <- unlist(speeches_l)
# lengths() always returns an integer vector, whereas sapply(x, length)
# returns list() when speeches_l is empty and then breaks rep().
docName <- rep(doc_names, lengths(speeches_l))
my_df <- data.frame(doc.name = docName, document = all_texts,
    row.names = NULL)

Add sequence numbers and percentages for each document in the data frame.

# Number the lines within each document and express each line's position as
# a percentage of the document length. The result stays grouped by doc.name.
# row_number()/n() replace 1:length(document), which yields c(1, 0) when it
# is evaluated on zero-length input.
my_df <- my_df %>%
    group_by(doc.name) %>%
    mutate(seq_num = row_number(), percent = seq_num/n() * 100)

Clean and Preprocess Document Text:

Clean and preprocess the text for each document in the data frame.

# Run the cleaning and lemmatizing function over every raw line.
my_df$text <- preprocess_and_lemmatize(my_df$document)
# Keep only lowercase ASCII letters, spaces and underscores. gsub() is
# already vectorised; wrapping it in sapply() attached each text as its own
# element name and returned a list for empty input.
my_df$text <- gsub("[^ _a-z]", "", my_df$text)

Tokenize Text and Extract N-grams:

Tokenize the text and extract bigrams and trigrams using a custom tokenizer function.

# Build the n-grams of a text and return them as one space-separated string.
#
# x: text passed to quanteda::tokens().
# n: n-gram size, passed to tokens_ngrams() as `n`.
# s: skip distance, passed to tokens_ngrams() as `skip`.
# Words inside an n-gram are joined with "_".
my_tokenizer <- function(x, n, s) {
    word_tokens <- quanteda::tokens(x, what = "word")
    ngram_tokens <- quanteda::tokens_ngrams(word_tokens, n = n, skip = s,
        concatenator = "_")
    paste(as.character(ngram_tokens), collapse = " ")
}
# One n-gram string per row. vapply() guarantees a character vector even for
# empty input, and USE.NAMES = FALSE avoids storing every text a second time
# as an element name, which sapply() did.
my_df$bigrams <- vapply(my_df$text, function(x) my_tokenizer(x,
    2, 0), character(1), USE.NAMES = FALSE)
my_df$trigrams <- vapply(my_df$text, function(x) my_tokenizer(x,
    3, 0), character(1), USE.NAMES = FALSE)

Extract Years, Decades, and Speaker Details:

Extract years, decades, locations, speaker names, and titles from the data frame. The “decade” column is included because it can be useful in larger datasets.

# Derive per-speech metadata and renumber the remaining lines per speaker.
# Expects my_df to still be grouped by doc.name so that fill() stays inside
# each document.
my_df <- my_df %>%
    # Line 3 of each transcript supplies the year (first 4-digit number) and
    # the location (the same line with 4-digit numbers removed). str_squish()
    # trims and collapses whitespace so that "Harvard " and "Harvard  " are
    # no longer treated as two different locations.
    mutate(year = if_else(seq_num == 3, as.integer(str_extract(document,
        "\\b\\d{4}\\b")), NA_integer_), location = if_else(seq_num ==
        3, str_squish(gsub("\\b\\d{4}\\b", "", document)), NA_character_)) %>%
    fill(year, location, .direction = "down") %>%
    # name and title come from the file name (doc.name): the part before the
    # first "-" and the part after "- ". The former intermediate values taken
    # from document lines 1 and 2 were always overwritten by these, so they
    # were removed.
    # NOTE(review): the "\\.csv" removal looks like a no-op because the files
    # are .txt and that suffix is stripped earlier -- confirm.
    mutate(decade = (year%/%10) * 10, name = str_extract(str_remove(doc.name,
        "\\.csv"), "^[^-]+"), title = str_extract(str_remove(doc.name,
        "\\.csv"), "(?<=-\\s)[^\\s].*")) %>%
    # Drop the three header lines of every transcript.
    filter(!(seq_num %in% c(1, 2, 3))) %>%
    # speaker is the last word of the trimmed name.
    mutate(name = str_trim(name), speaker = str_extract(name,
        "\\w+$")) %>%
    group_by(speaker) %>%
    # Renumber the remaining lines within each speaker and build a row id.
    mutate(seq_num = row_number(), doc_id = paste0(speaker,
        "_", seq_num), percent = seq_num/n() * 100)

Assign Professions to Speakers:

Assign professions to the speakers in the dataset. Create a data frame containing speaker names and their professions.

# The set of valid profession labels.
professions <- c("Business", "Arts and Literature", "Entertainment",
    "Academia", "Law and Politics")
speaker <- unique(my_df$speaker)
# Professions are matched to speakers purely by position, in the order in
# which speakers appear in unique(my_df$speaker). The checks below stop the
# script when the corpus no longer matches the hand-written list; without
# them data.frame() can silently recycle the shorter vector.
speakerprofessions <- local({
    labels <- c("Law and Politics",
        "Academia", "Law and Politics", "Business", "Arts and Literature",
        "Entertainment", "Arts and Literature", "Entertainment",
        "Business", "Law and Politics", "Academia", "Arts and Literature",
        "Law and Politics", "Business", "Law and Politics", "Arts and Literature",
        "Entertainment", "Business", "Law and Politics", "Law and Politics",
        "Business", "Entertainment", "Arts and Literature", "Entertainment",
        "Business", "Law and Politics", "Academia", "Business", "Entertainment",
        "Business", "Entertainment", "Business")
    stopifnot(length(labels) == length(speaker), all(labels %in%
        professions))
    data.frame(speaker = speaker, profession = labels)
})

Merge Speaker Professions with Main Data Frame:

Merge the speakerprofessions data frame with the main data frame, my_df.

# Attach each speaker's profession. all.x = TRUE keeps every row of my_df,
# including speakers that have no entry in speakerprofessions.
my_df <- merge(x = my_df, y = speakerprofessions, by = "speaker",
    all.x = TRUE)

Count Words and Filter Rows:

Count the number of words in each document and filter the rows to keep only those that have more than 2 and fewer than 1000 words.

# Count words as runs of non-whitespace characters. The previous
# txt_count(text, pattern = " ") counted spaces, which is one less than the
# word count for clean text and wrong whenever there are leading, trailing
# or doubled spaces.
my_df <- my_df %>%
    mutate(nwords = str_count(text, "\\S+"))
# Keep rows with more than 2 and fewer than 1000 words.
my_df <- subset(my_df, nwords < 1000 & nwords > 2)

Export Clean Data Frame:

Export the clean data frame to a CSV file for future use.

# Save the cleaned, lemmatized data frame without a row-name column.
write.csv(
    x = my_df,
    file = paste0(file_path, "/df/df_cln_lem.csv"),
    row.names = FALSE
)

Compute Profession Statistics:

Create a summary of the number of speakers per profession.

# Reload the exported data frame so the statistics below can be run on their
# own, without repeating the preprocessing steps.
file_path <- "C:/D/R/TAWR2/data/text/Commencements/"
df <- read.csv(file = paste0(file_path, "/df/df_cln_lem.csv"))
# professions_stat: number of distinct speakers per profession, largest
# group first.
professions_stat <- df %>%
    group_by(profession) %>%
    dplyr::summarize(same_pro = n_distinct(speaker)) %>%
    arrange(desc(same_pro))
professions_stat

## # A tibble: 5 × 2
##   profession          same_pro
##   <chr>                  <int>
## 1 Business                   9
## 2 Law and Politics           8
## 3 Entertainment              7
## 4 Arts and Literature        5
## 5 Academia                   3

Compute Location Statistics:

Create a summary of the number of speakers per location.

# location_stat: number of distinct speakers per location, largest group
# first.
location_stat <- df %>%
    group_by(location) %>%
    dplyr::summarize(same_loc = n_distinct(speaker)) %>%
    arrange(desc(same_loc))
location_stat

## # A tibble: 13 × 2
##    location                  same_loc
##    <chr>                        <int>
##  1 "Harvard "                      12
##  2 "Stanford "                      9
##  3 "BERKELEY "                      1
##  4 "Caltech "                       1
##  5 "Dartmouth "                     1
##  6 "Harvard  "                      1
##  7 "Johns Hopkins "                 1
##  8 "Kenyon "                        1
##  9 "Notre Dame "                    1
## 10 "Oberlin "                       1
## 11 "Princeton "                     1
## 12 "Tulane "                        1
## 13 "University of the Arts "        1

Compute Speech Length Statistics:

Create a summary of the total speech length (in words) for each speaker.

# speech_length_stat: total of nwords over all rows of each speaker, longest
# speech first.
speech_length_stat <- df %>%
    group_by(speaker) %>%
    dplyr::summarize(speech_length = sum(nwords)) %>%
    arrange(desc(speech_length))
speech_length_stat

Create a Summary Data Frame for Speeches:

Create a summary data frame containing speaker, location, profession, year, title, name, and total speech length.

# One row per distinct speaker/location/profession/year/title/name
# combination, with the speaker's total speech length attached.
# The length is joined on `speaker`. Copying the column by position was
# wrong: speech_length_stat is sorted by descending length, so its row order
# does not match the row order of the distinct() result.
summary_df_speeches <- distinct(df[, c("speaker", "location",
    "profession", "year", "title", "name")]) %>%
    left_join(rename(speech_length_stat, SpeechLength = speech_length),
        by = "speaker")

Appendix I: Speeches Information

Appendix III: The Complete List of the Packages in R