Add files via upload

rcurty · web-flow · commit ca51d9e2449c · 2025-11-13T16:39:11.000-08:00
diff --git a/scripts/workshop_package/scripts_sentiment_analysis_ak.qmd b/scripts/workshop_package/scripts_sentiment_analysis_ak.qmd
@@ -0,0 +1,169 @@
+---
+title: "Sentiment and Emotion Analysis of Comments"
+editor: visual
+---
+
+## Setup: Installing and Loading Packages
+
+```{r}
+# Install sentimentr/syuzhet only when missing (safe to re-run); uncomment the others if needed
+if (!requireNamespace("sentimentr", quietly = TRUE)) install.packages("sentimentr")
+if (!requireNamespace("syuzhet", quietly = TRUE)) install.packages("syuzhet")
+# install.packages("dplyr")
+# install.packages("tidyr")
+# install.packages("readr")
+# install.packages("ggplot2")
+# install.packages("RColorBrewer")
+# install.packages("stringr")
+```
+
+```{r}
+# Load all packages
+library(sentimentr)
+library(syuzhet)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(ggplot2)
+library(RColorBrewer)
+library(stringr)
+```
+
+### Polarity Analysis
+
+```{r}
+# Read the preprocessed comments
+comments <- readr::read_csv("./data/comments_preprocessed.csv")
+
+# Average sentiment score for each row of the comments column
+sentiment_scores <- sentiment_by(comments$comments)
+
+# Attach the score and a three-way label to the original rows
+polarity <- mutate(
+  comments,
+  score = sentiment_scores$ave_sentiment,
+  sentiment_label = case_when(
+    score < -0.1 ~ "negative",
+    score > 0.1  ~ "positive",
+    TRUE         ~ "neutral"
+  ))
+
+# Preview the first rows of the labelled data
+head(polarity)
+# Number of rows per label
+table(polarity$sentiment_label)
+```
+
+#### Plotting
+
+```{r}
+# Histogram of the polarity scores
+ggplot(polarity, aes(x = score)) +
+  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "white") +
+  labs(title = "Sentiment Score Distribution", x = "Average Sentiment", y = "Count") +
+  theme_minimal()
+
+# Pull the season tag (s1, s2) out of the id into its own column
+polarity_seasons <- polarity %>%
+  mutate(season = str_extract(id, "s\\d+"))
+
+# Side-by-side histograms per season, drawn on a density scale
+ggplot(polarity_seasons, aes(x = score, fill = season)) +
+  geom_histogram(aes(y = after_stat(density)),
+                 color = "white",
+                 position = "dodge",
+                 binwidth = 0.1) +
+  scale_fill_brewer(palette = "Set1") +
+  labs(title = "Sentiment Score Distribution by Season (Normalized)",
+       x = "Average Sentiment Score (Polarity)",
+       y = "Density (Proportion of Comments)") +
+  theme_minimal()
+
+# Write the labelled polarity table to disk
+write_csv(polarity, "output/polarity_results.csv")
+```
+
+### Emotion Detection with Syuzhet's NRC Lexicon
+
+```{r}
+# Split the text into sentences; syuzhet:: is spelled out because sentimentr also exports get_sentences()
+sentences <- syuzhet::get_sentences(comments$comments)
+
+# Assign NRC emotion scores (anger, joy, etc.) + positive/negative to each sentence
+emotion_score <- get_nrc_sentiment(sentences)
+
+# Review Summary of Emotion Scores
+summary(emotion_score)
+
+# Regroup with original comments/IDs; rows only line up with one sentence per comment
+stopifnot("sentence count must equal comment count" = length(sentences) == nrow(comments))
+comments$comments <- sentences
+emotion_data <- bind_cols(comments, emotion_score)
+
+# Summarize overall emotion counts, largest first
+emotion_summary <- emotion_data %>%
+  select(anger:trust) %>%              # only emotion columns
+  summarise(across(everything(), sum)) %>%
+  pivot_longer(cols = everything(), names_to = "emotion", values_to = "count") %>%
+  arrange(desc(count))
+```
+
+### Plotting Emotion Results
+
+```{r}
+# Plot overall emotion totals; reorder() sorts bars by count (a character axis is alphabetical otherwise)
+ggplot(emotion_summary, aes(x = reorder(emotion, count), y = count, fill = emotion)) +
+  geom_col(show.legend = FALSE) +
+  geom_text(aes(label = count), hjust = -0.2, size = 2) +
+  scale_fill_manual(values = brewer.pal(10, "Paired")) +
+  theme_minimal(base_size = 12) +
+  labs(title = "Overall Emotion Distribution", x = "Emotion", y = "Total Count") +
+  coord_flip()
+
+# Add "Season" Variable and Summarize by Season
+# Season comes from the id prefix: "s1_" -> "s1", "s2_" -> "s2", anything else -> NA
+emotion_seasons <- emotion_data %>%
+  mutate(season = ifelse(grepl("^s1_", id), "s1",
+                         ifelse(grepl("^s2_", id), "s2", NA)))
+
+# Aggregate emotion counts per season
+emotion_by_season <- emotion_seasons %>%
+  group_by(season) %>%
+  summarise(
+    across(anger:positive, ~sum(., na.rm = TRUE))
+  )
+
+# Compare emotions by season (long format: one row per season/emotion pair)
+emotion_long <- emotion_by_season %>%
+  pivot_longer(cols = anger:positive, names_to = "emotion", values_to = "count")
+# Labels get the same dodge as the bars; otherwise every label sits at the centre of its bar group
+ggplot(emotion_long, aes(x = reorder(emotion, -count), y = count, fill = season)) +
+  geom_col(position = "dodge") +
+  geom_text(aes(label = count), position = position_dodge(width = 0.9), hjust = -0.2, size = 2) +
+  scale_fill_brewer(palette = "Set2") +
+  theme_minimal(base_size = 12) +
+  labs(title = "Emotion Distribution by Season", x = "Emotion", y = "Total Count", fill = "Season") +
+  coord_flip()
+
+# Emotion Co-occurrence Heatmap
+# Compute pairwise Pearson correlations between the emotion columns
+emotion_matrix <- emotion_data %>% select(anger:trust)
+co_occurrence <- cor(emotion_matrix, method = "pearson")
+diag(co_occurrence) <- NA  # remove self-correlations
+
+# Convert to long format for plotting
+co_occurrence_long <- as.data.frame(as.table(co_occurrence))
+colnames(co_occurrence_long) <- c("emotion1", "emotion2", "correlation")
+
+# Plot heatmap; limits span -1..1 so negative correlations are drawn instead of turning into NA
+ggplot(co_occurrence_long, aes(x = emotion1, y = emotion2, fill = correlation)) +
+  geom_tile(color = "white") +
+  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0,
+                       limits = c(-1, 1), na.value = "grey95", name = "Correlation") +
+  theme_minimal(base_size = 12) +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+  labs(title = "Emotion Co-occurrence Heatmap", x = "Emotion", y = "Emotion")
+
+# Save Results
+write_csv(emotion_data, "output/sentiment_emotion_results.csv")
+```
diff --git a/scripts/workshop_package/scripts_sentiment_analysis_wc.qmd b/scripts/workshop_package/scripts_sentiment_analysis_wc.qmd
@@ -0,0 +1,168 @@
+---
+title: "Sentiment and Emotion Analysis of Comments"
+editor: visual
+---
+
+## Setup: Installing and Loading Packages
+
+```{r}
+# Install sentimentr/syuzhet only when missing (safe to re-run); uncomment the others if needed
+if (!requireNamespace("sentimentr", quietly = TRUE)) install.packages("sentimentr")
+if (!requireNamespace("syuzhet", quietly = TRUE)) install.packages("syuzhet")
+# install.packages("dplyr")
+# install.packages("tidyr")
+# install.packages("readr")
+# install.packages("ggplot2")
+# install.packages("RColorBrewer")
+# install.packages("stringr")
+```
+
+```{r}
+# Load all packages
+library(sentimentr)
+library(syuzhet)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(ggplot2)
+library(RColorBrewer)
+library(stringr)
+```
+
+### Polarity Analysis
+
+```{r}
+# Load the preprocessed comments from the data folder
+comments <- readr::read_csv("./data/comments_preprocessed.csv")
+
+# Compute sentiment per row/case -- define `sentiment_scores` here (the mutate() below reads it)
+
+# Add scores and labels to original dataset (> 0.1 positive, < -0.1 negative, otherwise neutral)
+polarity <- comments %>%
+  mutate(score = sentiment_scores$ave_sentiment,
+         sentiment_label = case_when(
+           score > 0.1  ~ "positive",
+           score < -0.1 ~ "negative",
+           TRUE         ~ "neutral"
+         ))
+
+# Check first rows with results
+
+
+# Scores per label
+
+```
+
+#### Plotting
+
+```{r}
+# Histogram of the polarity scores
+ggplot(polarity, aes(x = score)) +
+  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "white") +
+  labs(title = "Sentiment Score Distribution", x = "Average Sentiment", y = "Count") +
+  theme_minimal()
+
+# Pull the season tag (s1, s2) out of the id into its own column
+polarity_seasons <- polarity %>%
+  mutate(season = str_extract(id, "s\\d+"))
+
+# Side-by-side histograms per season, drawn on a density scale
+ggplot(polarity_seasons, aes(x = score, fill = season)) +
+  geom_histogram(aes(y = after_stat(density)),
+                 color = "white",
+                 position = "dodge",
+                 binwidth = 0.1) +
+  scale_fill_brewer(palette = "Set1") +
+  labs(title = "Sentiment Score Distribution by Season (Normalized)",
+       x = "Average Sentiment Score (Polarity)",
+       y = "Density (Proportion of Comments)") +
+  theme_minimal()
+
+# Write the labelled polarity table to disk
+write_csv(polarity, "output/polarity_results.csv")
+```
+
+### Emotion Detection with Syuzhet's NRC Lexicon
+
+```{r}
+# Detecting Emotions per Comment/Sentence -- define `sentences` here (assigned into comments$comments below)
+
+
+# Compute Emotion Scores per Sentence
+# Assign NRC emotion scores (anger, joy, etc.) + positive/negative -- define `emotion_score` here
+
+
+# Review Summary of Emotion Scores
+
+
+# Regroup with Original Comments/IDs (overwrites the comments column with `sentences`)
+comments$comments <- sentences
+emotion_data <- bind_cols(comments, emotion_score)
+
+# Summarize Overall Emotion Counts: sum each emotion column, reshape to long, sort descending
+emotion_summary <- emotion_data %>%
+  select(anger:trust) %>%              # only emotion columns
+  summarise(across(everything(), sum)) %>% 
+  pivot_longer(cols = everything(), names_to = "emotion", values_to = "count") %>%
+  arrange(desc(count))
+```
+
+### Plotting Emotion Results
+
+```{r}
+# Plot overall emotion totals; reorder() sorts bars by count (a character axis is alphabetical otherwise)
+ggplot(emotion_summary, aes(x = reorder(emotion, count), y = count, fill = emotion)) +
+  geom_col(show.legend = FALSE) +
+  geom_text(aes(label = count), hjust = -0.2, size = 2) +
+  scale_fill_manual(values = brewer.pal(10, "Paired")) +
+  theme_minimal(base_size = 12) +
+  labs(title = "Overall Emotion Distribution", x = "Emotion", y = "Total Count") +
+  coord_flip()
+
+# Add "Season" Variable and Summarize by Season
+# Season comes from the id prefix: "s1_" -> "s1", "s2_" -> "s2", anything else -> NA
+emotion_seasons <- emotion_data %>%
+  mutate(season = ifelse(grepl("^s1_", id), "s1",
+                         ifelse(grepl("^s2_", id), "s2", NA)))
+
+# Aggregate emotion counts per season
+emotion_by_season <- emotion_seasons %>%
+  group_by(season) %>%
+  summarise(
+    across(anger:positive, ~sum(., na.rm = TRUE))
+  )
+
+# Compare emotions by season (long format: one row per season/emotion pair)
+emotion_long <- emotion_by_season %>%
+  pivot_longer(cols = anger:positive, names_to = "emotion", values_to = "count")
+# Labels get the same dodge as the bars; otherwise every label sits at the centre of its bar group
+ggplot(emotion_long, aes(x = reorder(emotion, -count), y = count, fill = season)) +
+  geom_col(position = "dodge") +
+  geom_text(aes(label = count), position = position_dodge(width = 0.9), hjust = -0.2, size = 2) +
+  scale_fill_brewer(palette = "Set2") +
+  theme_minimal(base_size = 12) +
+  labs(title = "Emotion Distribution by Season", x = "Emotion", y = "Total Count", fill = "Season") +
+  coord_flip()
+
+# Emotion Co-occurrence Heatmap
+# Compute pairwise Pearson correlations between the emotion columns
+emotion_matrix <- emotion_data %>% select(anger:trust)
+co_occurrence <- cor(emotion_matrix, method = "pearson")
+diag(co_occurrence) <- NA  # remove self-correlations
+
+# Convert to long format for plotting
+co_occurrence_long <- as.data.frame(as.table(co_occurrence))
+colnames(co_occurrence_long) <- c("emotion1", "emotion2", "correlation")
+
+# Plot heatmap; limits span -1..1 so negative correlations are drawn instead of turning into NA
+ggplot(co_occurrence_long, aes(x = emotion1, y = emotion2, fill = correlation)) +
+  geom_tile(color = "white") +
+  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0,
+                       limits = c(-1, 1), na.value = "grey95", name = "Correlation") +
+  theme_minimal(base_size = 12) +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+  labs(title = "Emotion Co-occurrence Heatmap", x = "Emotion", y = "Emotion")
+
+# Save Results
+write_csv(emotion_data, "output/sentiment_emotion_results.csv")
+```