Last active
October 30, 2017 22:00
-
-
Save sjorsvanheuveln/d8a0160e7bd6afea914cb1b457062508 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Analysis of Dutch Language Vocabulary
# Sjors van Heuveln, 28-10-2017
# Analyzes triplet occurrences in the Dutch language. This script can easily source another vocab file to run a new analysis.
# Functions
# Trim the ID suffix off a raw dictionary entry.
#
# Everything from the first '/' onwards is removed, e.g. "fiets/AB" becomes
# "fiets". Entries without a '/' are returned unchanged.
#
# The function is vectorised over `rawWord`. An empty string stays an empty
# string; a strsplit()-based lookup of the first piece would yield NA there,
# which breaks later nchar() comparisons.
#
# rawWord: character vector of raw dictionary entries.
# Returns: character vector of the same length as `rawWord`.
trimWord <- function(rawWord) {
  sub("/.*", "", rawWord)
}
# Append every substring of `size` consecutive characters of `word` to a file,
# one substring per line.
#
# Nothing is written when `word` is NA, contains a space, or is shorter than
# `size`.
#
# word:       a single character string.
# size:       length of each substring; defaults to 3 (triplets) so the
#             function can be called with just a word.
# outputFile: path of the file to append to. Defaults to the global
#             `outputLocation`, which is only looked up when no path is given.
# Returns:    NULL, invisibly; the function is called for its side effect.
wordTriplets <- function(word, size = 3, outputFile = outputLocation) {
  if (is.na(word) || grepl(" ", word, fixed = TRUE)) {
    return(invisible(NULL))
  }
  wordLength <- nchar(word)
  if (wordLength < size) {
    return(invisible(NULL))
  }

  # Build all substrings at once and write them in a single call instead of
  # re-opening the output file for every substring.
  starts <- seq_len(wordLength - size + 1)
  triplets <- substring(word, starts, starts + size - 1)
  write(triplets, outputFile, append = TRUE)
  invisible(NULL)
}
# Run wordTriplets() on every entry of `dictionary`, printing the running
# index after every 1000 entries as a progress indicator.
#
# dictionary: character vector of words.
# callback:   an expression (not a function) to evaluate once all words have
#             been processed. R evaluates arguments lazily, so an argument
#             such as beep(23) does not run until it is forced after the loop.
# size:       substring length handed to wordTriplets(); defaults to 3.
# Returns:    the value of `callback`.
iterateDictionary <- function(dictionary, callback, size = 3) {
  print("This may take a while!")
  print("Please wait ...")

  # seq_along() yields an empty sequence for an empty dictionary, whereas
  # 1:length(x) would iterate over c(1, 0).
  for (i in seq_along(dictionary)) {
    if (i %% 1000 == 0) {
      print(i)
    }
    wordTriplets(dictionary[i], size)
  }

  force(callback)
}
# Installation: install the packages below if you haven't already done so. Remove the leading '#' and run the lines.
#library(devtools);
#install_github("sjorsvanheuveln/beepr2");
#install.packages('dplyr');
#install.packages('data.table');
# Setup
library("beepr2")
library(dplyr)
library(data.table)

outputLocation <- "~/Desktop/output.txt"

# wordTriplets() only ever appends to outputLocation, so remove any file left
# over from an earlier run; otherwise its triplets would be counted again.
unlink(outputLocation)

rawEntries <- fread("https://raw.githubusercontent.com/titoBouzout/Dictionaries/master/Dutch.dic")
rawEntries <- as.character(rawEntries$V1)

# Data cleanup
# vapply() guarantees a character vector, even for an empty dictionary.
dataTrimmed <- vapply(rawEntries, trimWord, character(1), USE.NAMES = FALSE)
rm(rawEntries)
gc()

# Main triplet creation process
# beep(23) is passed unevaluated and only runs once the loop has finished.
iterateDictionary(dataTrimmed, beep(23))

# Triplet loading
rm(dataTrimmed)
gc()
# Read the file line by line: read.table() would apply quote, comment ('#')
# and NA parsing to the raw triplets and corrupt any triplet containing
# ', " or #, or equal to "NA".
tripletData <- readLines(outputLocation)
tripletTable <- table(tripletData)

# Triplet top-100 analysis
wordNames <- names(tripletTable)
frequencies <- as.numeric(tripletTable)
# head() avoids padding the output with NA when fewer than 100 distinct
# triplets exist.
cat(head(wordNames[order(frequencies, decreasing = TRUE)], 100))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment