Skip to content

Instantly share code, notes, and snippets.

@akbertram
Created February 7, 2018 08:24
Show Gist options
  • Save akbertram/b418f480bd372e74fce0464d5dfc1d2e to your computer and use it in GitHub Desktop.
Save akbertram/b418f480bd372e74fce0464d5dfc1d2e to your computer and use it in GitHub Desktop.
library(tm)
library(SnowballC)
library(stringr)
library(ggplot2)
## RQ: What are the most commonly used words & the term frequencies in the 'Protestants-Christelijk' schools and 'Openbaar' schools?
# Read in our school metadata
schools <- read.csv("schools.csv",
sep = ";",
stringsAsFactors = FALSE)
### PDF corpus ###
gids <- readRDS("/opt/duo/schoolgids2017v1_500.rds")
gids_id <- substring(names(gids), 1, 6)
### Document pre-processing
TurnSpace <- content_transformer(function(x, pattern) {return (gsub(pattern, " ", x))})
gids <- tm_map(gids, TurnSpace, "/|”|@|//|$|:|:)|*|&|!|?|_|-|‐|•|·|#|…+")
gids <- tm_map(gids, content_transformer(tolower))
gids <- tm_map(gids, stripWhitespace)
gids <- tm_map(gids, removePunctuation)
gids <- tm_map(gids, function(x) stemDocument(x, language = "dutch"))
gids <- tm_map(gids, removeNumbers)
gids <- tm_map(gids, removeWords, tm::stopwords("dutch"))
## Create a DocumentTermMatrix
dtm <- DocumentTermMatrix(gids)
ddtm <- as.matrix(removeSparseTerms(dtm, sparse = 0.90))
## Match the gids to rows in our schools table
gids_row <- match(gids_id, schools$VESTIGINGSNUMMER)
gids_denominatie <- schools$DENOMINATIE[gids_row]
## Compute a dummy variable indicating chrisitian schools
christian <- as.integer(gids_denominatie %in% c("Protestants-Christelijk", "Rooms-Katholiek"))
# Normalize by frequency per 1000 words
ndtm <- ddtm / rowSums(ddtm) * 1000
empty <- rowSums(ddtm) == 0
# Compute a correlation matrix between our dummy variable and the term frequency
cor_matrix <- cor(gids_score[!empty], ndtm[!empty, ], use="complete.obs")
cor_df <- data.frame(term = colnames(cor_matrix), cor = cor_matrix[1,],
row.names = NULL, stringsAsFactors = FALSE)
# Order by absolute correlation
cor_df <- cor_df[ order(-abs(cor_df$cor)), ]
# Take top 30 positive correlations
pos_cor <- cor_df[ cor_df$cor > 0, ][1:40, ]
ggplot(pos_cor, aes(reorder(term, cor), cor)) +
geom_col() +
coord_flip() +
xlab("Terms") + ylab("Terms with Positive Correlation with CHRISTIAN")
# ... And top 30 negative correlations
neg_cor <- cor_df[ cor_df$cor < 0, ][1:40, ]
ggplot(neg_cor, aes(reorder(term, cor), cor)) +
geom_col() +
coord_flip() +
xlab("Terms") + ylab("Terms with Negative Correlation with CORRELATION")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment