Last active
May 2, 2018 19:32
-
-
Save tomastitera/a301a135e185b6d162a05e7d4cd40da6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Roughly follows this blog post: https://bnosac.github.io/udpipe/docs/doc5.html
# NOTE(review): setwd() changes the working directory of the whole session;
# the two relative file names below are resolved against it.
setwd("~/R/snmupdpipe")
library(udpipe)

# Load the UDPipe model file from the working directory.
udmodel_czech <- udpipe_load_model(file = "czech-ud-2.0-170801.udpipe")

# Read the tab-separated page export.
page <- read.csv(
  "page_298789466930469_2018_05_02_10_38_53.tab",
  sep = "\t",
  encoding = "UTF-8"
)

# Raw corpus: one element per row of the export. as.character() yields a plain
# character vector whether the column was read as character or as factor.
korpus.raw <- as.character(page$post_message)

# Annotate the corpus with the loaded model and flatten the result into a
# data frame; later sections use its upos, lemma, doc_id, paragraph_id and
# sentence_id columns.
x <- as.data.frame(udpipe_annotate(udmodel_czech, x = korpus.raw))
# Choose which part-of-speech filter feeds the frequency table:
#   "A" - drop the listed non-content tags (PROPN, AUX, PUNCT, ADP, DET)
#   "B" - keep only nouns and adjectives
# Previously both filters ran in sequence and B silently overwrote A, so "B"
# is the default to keep the result of a top-to-bottom run unchanged.
pos_variant <- "B"
stats <- switch(
  pos_variant,
  A = subset(x, !(upos %in% c("PROPN", "AUX", "PUNCT", "ADP", "DET"))),
  B = subset(x, upos %in% c("NOUN", "ADJ")),
  stop("Unknown pos_variant: ", pos_variant, call. = FALSE)
)

# Exclude the lemma "KSČM" itself.
stats <- subset(stats, !(lemma %in% c("KSČM")))

# Replace `stats` with the frequency table of the remaining lemmas.
stats <- txt_freq(x = stats$lemma)
# Bar chart of the first 30 rows of the lemma frequency table `stats`.
library(lattice)

# Fix the factor levels to the reversed row order of the table, so the first
# row of `stats` becomes the last level.
stats$key <- factor(stats$key, levels = rev(stats$key))

# A lattice plot is only drawn when its object is printed; the explicit
# print() makes the chart appear when the script is run through source() too.
print(
  barchart(
    key ~ freq,
    data = head(stats, 30),
    col = "cadetblue",
    main = "Most occurring nouns -KSČM",
    xlab = "Freq"
  )
)
# Variant C: bar chart comparing keywords by their RAKE score.
library(lattice)

# Run RAKE on lemmas grouped by document; only tokens tagged NOUN or ADJ are
# marked as relevant.
stats <- keywords_rake(
  x = x,
  term = "lemma",
  group = "doc_id",
  relevant = x$upos %in% c("NOUN", "ADJ")
)

# Fix the factor levels to the reversed row order of the keyword table.
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))

# Plot the first 20 keywords whose freq is above 3. The explicit print() is
# required for the lattice plot to be drawn when the script is source()d.
print(
  barchart(
    key ~ rake,
    data = head(subset(stats, freq > 3), 20),
    col = "cadetblue",
    main = "Keywords identified by RAKE",
    xlab = "Rake"
  )
)
# Variant C: co-occurrences of noun and adjective lemmas, grouped by
# document, paragraph and sentence id.
cooc <- cooccurrence(
  x = subset(x, upos %in% c("NOUN", "ADJ")),
  term = "lemma",
  group = c("doc_id", "paragraph_id", "sentence_id")
)

# Show the first rows; print() keeps the output visible under source() as well.
print(head(cooc))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment