Skip to content

Instantly share code, notes, and snippets.

@sjorsvanheuveln
Last active October 30, 2017 22:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sjorsvanheuveln/d8a0160e7bd6afea914cb1b457062508 to your computer and use it in GitHub Desktop.
Save sjorsvanheuveln/d8a0160e7bd6afea914cb1b457062508 to your computer and use it in GitHub Desktop.
#Analysis of Dutch Language Vocabulary
#Sjors van Heuveln 28-10-2017
#Counts how often each character triplet (three-letter substring) occurs in Dutch words. To analyse another vocabulary, point the fread() call in the Setup section at a different word list.
#Functions
# Strip the '/ID' suffix from a raw dictionary entry.
#   rawWord: a dictionary entry such as 'word/ID'.
# Returns the text in front of the first '/'; an entry without a '/' is
# returned unchanged.
trimWord <- function(rawWord) {
  pieces <- unlist(strsplit(rawWord, '/'))
  pieces[1]  # everything after the first '/' is the ID and is dropped
}
# Append every substring of `size` consecutive characters of `word` to a text
# file, one substring per line.
#   word:       a single character string. A word that is NA, contains a
#               space, or is shorter than `size` is skipped: nothing is written.
#   size:       length of the substrings. Defaults to 3 (triplets) so that a
#               caller passing only the word keeps working.
#   outputFile: path of the file to append to. Defaults to the global
#               `outputLocation`, which is only looked up when no path is given.
# Returns NULL invisibly; the function is called for its side effect.
wordTriplets <- function(word, size = 3, outputFile = outputLocation) {
  if (is.na(word) || grepl(' ', word)) {
    return(invisible(NULL))
  }
  lastStart <- nchar(word) - size + 1
  if (lastStart < 1) {
    return(invisible(NULL))
  }
  starts <- seq_len(lastStart)
  triplets <- substring(word, starts, starts + size - 1)
  # A single write() per word produces the same lines as one write() per
  # triplet, but opens the file only once.
  write(triplets, outputFile, append = TRUE)
  invisible(NULL)
}
# Run wordTriplets() on every word of a dictionary, printing progress.
#   dictionary: character vector of words.
#   callback:   expression to run once all words have been processed, e.g.
#               beep(23). R evaluates arguments lazily, so the expression does
#               not run until it is forced at the end of this function.
#   size:       substring length handed to wordTriplets(); defaults to 3.
# Returns the value of `callback`.
iterateDictionary <- function(dictionary, callback, size = 3) {
  print('This may take a while!')
  print('Please wait ...')
  # seq_along() gives an empty loop for an empty dictionary, whereas
  # 1:length(dictionary) would iterate over 1 and 0.
  for (i in seq_along(dictionary)) {
    if (i %% 1000 == 0) {
      print(i)  # progress report every 1000 words
    }
    wordTriplets(dictionary[i], size)
  }
  force(callback)
}
#Installation: if the packages below are not installed yet, uncomment the following lines (remove the leading '#') and run them once.
#library(devtools);
#install_github("sjorsvanheuveln/beepr2");
#install.packages('dplyr');
#install.packages('data.table');
#Setup
library("beepr2");
library(dplyr);
library(data.table)
# File that collects the triplets, one per line.
outputLocation <- '~/Desktop/output.txt'
# wordTriplets() appends to this file, so remove whatever an earlier run left
# behind; otherwise the old triplets would be counted again.
unlink(outputLocation)

# Download the word list; the entries are taken from its first column, V1.
rawEntries <- fread('https://raw.githubusercontent.com/titoBouzout/Dictionaries/master/Dutch.dic')
rawEntries <- as.character(rawEntries$V1)

# Data Cleanup
# vapply() always yields a character vector, also for an empty word list.
dataTrimmed <- vapply(rawEntries, trimWord, character(1), USE.NAMES = FALSE)
rm(rawEntries)
gc()

# Main Triplet Creation Process
# beep(23) is passed as the callback and runs once every word is processed.
iterateDictionary(dataTrimmed, beep(23))
# Triplet Loading
rm(dataTrimmed)
gc()
# readLines() returns each line of the file verbatim. read.table() would treat
# ' and " as quote characters, # as the start of a comment and the string "NA"
# as a missing value, which can corrupt or drop triplets.
tripletData <- readLines(outputLocation)
tripletCounts <- table(tripletData)

# Triplet Top100 Analysis
wordNames <- names(tripletCounts)
frequencies <- as.numeric(tripletCounts)
mostFrequentFirst <- order(frequencies, decreasing = TRUE)
# head() prints at most 100 triplets and does not pad with NA when there are
# fewer than 100 distinct ones.
cat(head(wordNames[mostFrequentFirst], 100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment