Created
February 7, 2018 08:24
-
-
Save akbertram/b418f480bd372e74fce0464d5dfc1d2e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tm) | |
library(SnowballC) | |
library(stringr) | |
library(ggplot2) | |
## RQ: What are the most commonly used words & the term frequencies in the 'Protestants-Christelijk' schools and 'Openbaar' schools? | |
# Read in our school metadata | |
schools <- read.csv("schools.csv", | |
sep = ";", | |
stringsAsFactors = FALSE) | |
### PDF corpus ### | |
gids <- readRDS("/opt/duo/schoolgids2017v1_500.rds") | |
gids_id <- substring(names(gids), 1, 6) | |
### Document pre-processing | |
TurnSpace <- content_transformer(function(x, pattern) {return (gsub(pattern, " ", x))}) | |
gids <- tm_map(gids, TurnSpace, "/|â€|@|//|$|:|:)|*|&|!|?|_|-|â€|•|·|#|…+") | |
gids <- tm_map(gids, content_transformer(tolower)) | |
gids <- tm_map(gids, stripWhitespace) | |
gids <- tm_map(gids, removePunctuation) | |
gids <- tm_map(gids, function(x) stemDocument(x, language = "dutch")) | |
gids <- tm_map(gids, removeNumbers) | |
gids <- tm_map(gids, removeWords, tm::stopwords("dutch")) | |
## Create a DocumentTermMatrix | |
dtm <- DocumentTermMatrix(gids) | |
ddtm <- as.matrix(removeSparseTerms(dtm, sparse = 0.90)) | |
## Match the gids to rows in our schools table | |
gids_row <- match(gids_id, schools$VESTIGINGSNUMMER) | |
gids_denominatie <- schools$DENOMINATIE[gids_row] | |
## Compute a dummy variable indicating chrisitian schools | |
christian <- as.integer(gids_denominatie %in% c("Protestants-Christelijk", "Rooms-Katholiek")) | |
# Normalize by frequency per 1000 words | |
ndtm <- ddtm / rowSums(ddtm) * 1000 | |
empty <- rowSums(ddtm) == 0 | |
# Compute a correlation matrix between our dummy variable and the term frequency | |
cor_matrix <- cor(gids_score[!empty], ndtm[!empty, ], use="complete.obs") | |
cor_df <- data.frame(term = colnames(cor_matrix), cor = cor_matrix[1,], | |
row.names = NULL, stringsAsFactors = FALSE) | |
# Order by absolute correlation | |
cor_df <- cor_df[ order(-abs(cor_df$cor)), ] | |
# Take top 30 positive correlations | |
pos_cor <- cor_df[ cor_df$cor > 0, ][1:40, ] | |
ggplot(pos_cor, aes(reorder(term, cor), cor)) + | |
geom_col() + | |
coord_flip() + | |
xlab("Terms") + ylab("Terms with Positive Correlation with CHRISTIAN") | |
# ... And top 30 negative correlations | |
neg_cor <- cor_df[ cor_df$cor < 0, ][1:40, ] | |
ggplot(neg_cor, aes(reorder(term, cor), cor)) + | |
geom_col() + | |
coord_flip() + | |
xlab("Terms") + ylab("Terms with Negative Correlation with CORRELATION") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment