Skip to content

Instantly share code, notes, and snippets.

@tomastitera
Last active May 2, 2018 19:32
Show Gist options
  • Save tomastitera/a301a135e185b6d162a05e7d4cd40da6 to your computer and use it in GitHub Desktop.
Save tomastitera/a301a135e185b6d162a05e7d4cd40da6 to your computer and use it in GitHub Desktop.
# Lemma frequency, RAKE keyword and co-occurrence analysis of page posts,
# annotated with a Czech UDPipe model.
# Loosely follows this blog post: https://bnosac.github.io/udpipe/docs/doc5.html
library(udpipe)
library(lattice)

# Input locations ----
# Paths are built explicitly instead of calling setwd(), so running the script
# does not change the working directory of the calling session.
data_dir <- path.expand("~/R/snmupdpipe")
model_path <- file.path(data_dir, "czech-ud-2.0-170801.udpipe")
page_path <- file.path(
  data_dir, "page_298789466930469_2018_05_02_10_38_53.tab"
)

# Load and annotate ----
udmodel_czech <- udpipe_load_model(file = model_path)
page <- read.csv(page_path, sep = "\t", encoding = "UTF-8")
# Fail early with a clear message if the export lacks the text column.
stopifnot("post_message" %in% names(page))
# as.character() also covers the case where the column was read as a factor.
corpus_raw <- as.character(page$post_message)
annotation <- udpipe_annotate(udmodel_czech, x = corpus_raw)
# `x` is the token-level data frame that every analysis below works on.
x <- as.data.frame(annotation)

# Token selection ----
# Variant A: drop parts of speech that are not content words.
tokens_content <- subset(
  x, !(upos %in% c("PROPN", "AUX", "PUNCT", "ADP", "DET"))
)
# Variant B: keep only nouns and adjectives.
tokens_noun_adj <- subset(x, upos %in% c("NOUN", "ADJ"))
# The frequency chart below uses variant B; assign `tokens_content` here
# instead to chart variant A.
tokens <- tokens_noun_adj
# Exclude the lemma "KSČM".
tokens <- subset(tokens, !(lemma %in% c("KSČM")))

# Lemma frequencies (variant B, without KSČM) ----
lemma_freq <- txt_freq(x = tokens$lemma)
# Reverse the factor levels so that the rows listed first in the table are
# drawn at the top of the horizontal bar chart.
lemma_freq$key <- factor(lemma_freq$key, levels = rev(lemma_freq$key))
# print() is required for lattice plots to be drawn when the script is run
# through source() rather than typed at the console.
print(barchart(
  key ~ freq,
  data = head(lemma_freq, 30),
  col = "cadetblue",
  main = "Most occurring nouns -KSČM",
  xlab = "Freq"
))

# Variant C: keywords ranked by the RAKE method ----
rake_keywords <- keywords_rake(
  x = x,
  term = "lemma",
  group = "doc_id",
  relevant = x$upos %in% c("NOUN", "ADJ")
)
rake_keywords$key <- factor(
  rake_keywords$keyword,
  levels = rev(rake_keywords$keyword)
)
# Only keywords with freq > 3 are charted, and at most the first 20 of those.
print(barchart(
  key ~ rake,
  data = head(subset(rake_keywords, freq > 3), 20),
  col = "cadetblue",
  main = "Keywords identified by RAKE",
  xlab = "Rake"
))

# Variant D: co-occurrences of noun/adjective lemmas within a sentence ----
cooc <- cooccurrence(
  x = subset(x, upos %in% c("NOUN", "ADJ")),
  term = "lemma",
  group = c("doc_id", "paragraph_id", "sentence_id")
)
# Explicit print() so the preview also appears under source().
print(head(cooc))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment