@josefslerka
Created February 18, 2013 15:33
An R application for Corpus Viewer
library(shiny)
library(RCurl)
library(RJSONIO)
library(textcat)
library(tm)
library(tau)
library(corrplot)
# Define server logic
shinyServer(function(input, output) {
# load the corpus from job postings (sample set from Havel's speeches)
# prace-full.csv must contain a POZNAMKA column holding the document texts; only the first 1000 rows are used
obj.temp <- read.csv("prace-full.csv")
corpus.prace <- as.vector(obj.temp$POZNAMKA[1:1000])
corpus.prace <- Corpus(VectorSource(corpus.prace))
corpus.prace <- tm_map(corpus.prace, tolower)
corpus.prace <- tm_map(corpus.prace, removePunctuation)
corpus.prace <- tm_map(corpus.prace, removeNumbers)
dtm.prace <- DocumentTermMatrix(corpus.prace)
# create reactive function for switching between datasets
datasetInput <- reactive(function() {
corpus.prace
})
# create reactive function for switching between document-term matrices
termMatrixInput <- reactive(function() {
dtm.prace
})
# create reactive function that removes sparse terms from the document-term matrix
dtmInput <- reactive(function() {
dtm <- termMatrixInput()
removeSparseTerms(dtm, input$sparse)
})
# quick summary for whole corpus
output$numDocuments <- reactivePrint(function() {
dataset <- datasetInput()
summary(dataset)
})
# calculate N-grams for whole corpus
output$viewNgram <- reactivePrint(function() {
dataset <- datasetInput()
# count word n-grams of length input$numngrams and keep the 100 most frequent
kindofngrams <- textcnt(dataset, method = "string", n = input$numngrams)
data.frame(count = sort(kindofngrams, decreasing = TRUE)[1:100])
})
# quick summary for filtered subcorpus
output$numDocumentsFilter <- reactivePrint(function() {
dataset <- datasetInput()
# keep only documents whose full text contains the query word
subcorpus <- tm_filter(dataset, FUN = searchFullText, input$word)
summary(subcorpus)
})
# calculate N-grams for filtered subset
output$viewNgramFilter <- reactivePrint(function() {
dataset <- datasetInput()
subcorpus <- tm_filter(dataset, FUN = searchFullText, input$word)
kindofngramsFilter <- textcnt(subcorpus, method = "string", n = input$numngrams)
data.frame(count = sort(kindofngramsFilter, decreasing = TRUE)[1:100])
})
# calculate and plot correlation matrix
output$corrPlot <- reactivePlot(function() {
dtm <- dtmInput()
# pick the terms most strongly associated with the query word (correlation >= input$prob),
# skipping the first entry, and compute their correlation matrix across documents
count <- input$count + 1
words <- names(findAssocs(dtm, input$word, input$prob)[2:count])
oi <- as.matrix(dtm)
find <- colnames(oi) %in% words
corr <- cor(oi[, find])
corrplot(corr)
})
# calculate and plot MDS
output$mds <- reactivePlot(function() {
dtm <- dtmInput()
count <- input$count + 1
words <- names(findAssocs(dtm, input$word, input$prob)[2:count])
oi <- as.matrix(dtm)
find <- colnames(oi) %in% words
corr <- cor(oi[, find])
d <- dist(corr) # euclidean distances between the rows
fit <- cmdscale(d,eig=TRUE, k=2) # k is the number of dim
x <- fit$points[,1]
y <- fit$points[,2]
plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2", type="n")
text(x, y, labels = row.names(corr))
})
# calculate and print summary of the factor analysis
output$viewFactor <- reactivePrint(function() {
dtm <- dtmInput()
count <- input$count + 1
words <- names(findAssocs(dtm, input$word, input$prob)[2:count])
oi <- as.matrix(dtm)
find <- colnames(oi) %in% words
corr <- cor(oi[, find])
fit <- factanal(covmat = corr, factors = input$numfactors, rotation = "varimax")
fit
})
# calculate and plot the factor analysis (first two loadings)
output$factor <- reactivePlot(function() {
dtm <- dtmInput()
count <- input$count + 1
words <- names(findAssocs(dtm, input$word, input$prob)[2:count])
oi <- as.matrix(dtm)
find <- colnames(oi) %in% words
corr <- cor(oi[, find])
fit <- factanal(covmat = corr, factors = input$numfactors, rotation = "varimax")
load <- fit$loadings[,1:2]
plot(load,type="n") # set up plot
text(load,labels=row.names(corr))
})
# calculate and plot hierarchical clustering
output$corrClust <- reactivePlot(function() {
dtm <- dtmInput()
count <- input$count + 1
words <- names(findAssocs(dtm, input$word, input$prob)[2:count])
oi <- as.matrix(dtm)
find <- colnames(oi) %in% words
d <- dist(cor(oi[, find]), method = "euclidean")
fit <- hclust(d, method="ward")
plot(fit)
})
})
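
The gist contains only the server script; the inputs it references (sparse, word, numngrams, count, prob, numfactors) have to be defined in a companion ui.R. The following is a minimal, hypothetical sketch of such a file using the 2013-era Shiny layout functions: only the input and output ids come from the server code above, while the labels, default values, ranges and panel arrangement are assumptions.

# ui.R -- hypothetical companion file, not part of the original gist
library(shiny)
shinyUI(pageWithSidebar(
  headerPanel("Corpus Viewer"),
  sidebarPanel(
    # "word" is the query term used by tm_filter and findAssocs on the server side
    textInput("word", "Query word:", value = "praha"),
    # "sparse" feeds removeSparseTerms; values closer to 1 keep more terms
    sliderInput("sparse", "Maximum sparsity of retained terms:", min = 0.5, max = 1, value = 0.99),
    numericInput("numngrams", "N-gram length:", value = 2, min = 1, max = 5),
    numericInput("count", "Number of associated terms:", value = 20, min = 2),
    numericInput("prob", "Minimum correlation for findAssocs:", value = 0.1, min = 0, max = 1, step = 0.05),
    numericInput("numfactors", "Number of factors:", value = 2, min = 1)
  ),
  mainPanel(tabsetPanel(
    tabPanel("Corpus", verbatimTextOutput("numDocuments"), verbatimTextOutput("viewNgram")),
    tabPanel("Filtered corpus", verbatimTextOutput("numDocumentsFilter"), verbatimTextOutput("viewNgramFilter")),
    tabPanel("Correlations", plotOutput("corrPlot")),
    tabPanel("MDS", plotOutput("mds")),
    tabPanel("Factor analysis", verbatimTextOutput("viewFactor"), plotOutput("factor")),
    tabPanel("Clustering", plotOutput("corrClust"))
  ))
))

With server.R and ui.R saved in the same directory next to prace-full.csv, the application can be started with shiny::runApp("path/to/that/directory").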