tomastitera/udpipe_fbposts_rake.R

## udpipe_fbposts_rake.R
# This script roughly follows this blog post: https://bnosac.github.io/udpipe/docs/doc5.html

# Work from the project directory so the relative file names below resolve.
setwd("~/R/snmupdpipe")
library(udpipe)

# Load the Czech UDPipe model from disk.
udmodel_czech <- udpipe_load_model(file = "czech-ud-2.0-170801.udpipe")

# Read the tab-separated file and keep the post texts as a plain vector.
page <- read.csv(
  "page_298789466930469_2018_05_02_10_38_53.tab",
  sep = "\t",
  encoding = "UTF-8"
)
korpus.raw <- as.vector(page$post_message)

# Annotate the post texts and convert the annotation into a data frame.
x <- as.data.frame(udpipe_annotate(udmodel_czech, x = korpus.raw))

# Variant A: drop the parts of speech that are not content words.
# NOTE: when the script is run top to bottom, this result is replaced by
# Variant B right below; skip the Variant B assignment to keep Variant A.
stats <- subset(x, !(upos %in% c("PROPN", "AUX", "PUNCT", "ADP", "DET")))

# End of Variant A
# Variant B: keep only nouns and adjectives.
stats <- subset(x, upos %in% c("NOUN", "ADJ"))

# Exclude the lemma "KSČM".
stats <- subset(stats, !(lemma %in% c("KSČM")))

# Replace `stats` with the frequency table of the remaining lemmas.
stats <- txt_freq(x = stats$lemma)

# Bar chart built from the first 30 rows of the frequency table.
library(lattice)
# Reverse the level order so the first row of the table becomes the top bar.
stats$key <- factor(stats$key, levels = rev(stats$key))
# lattice functions only return a trellis object; it is drawn when printed.
# Printing explicitly makes the chart appear under source() as well, not
# only when the line is typed at the console.
print(
  barchart(
    key ~ freq,
    data = head(stats, 30),
    col = "cadetblue",
    main = "Most occurring nouns -KSČM",
    xlab = "Freq"
  )
)

# Variant C: bar chart comparing keywords scored by the RAKE method.
library(lattice)
# Score keywords on lemmas per document, considering nouns and adjectives only.
stats <- keywords_rake(
  x = x,
  term = "lemma",
  group = "doc_id",
  relevant = x$upos %in% c("NOUN", "ADJ")
)
# Reverse the level order so the first row of the table becomes the top bar.
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
# Keep keywords seen more than 3 times and chart the first 20 of them.
# lattice functions only return a trellis object; print it explicitly so the
# chart is also drawn when the script is run through source().
print(
  barchart(
    key ~ rake,
    data = head(subset(stats, freq > 3), 20),
    col = "cadetblue",
    main = "Keywords identified by RAKE",
    xlab = "Rake"
  )
)

# Variant C (cont.): co-occurrences of noun/adjective lemmas, grouped by
# document, paragraph and sentence.
cooc <- cooccurrence(
  x = subset(x, upos %in% c("NOUN", "ADJ")),
  term = "lemma",
  group = c("doc_id", "paragraph_id", "sentence_id")
)
# Print explicitly: a bare head() call shows nothing when the script is run
# through source() with its default echo = FALSE.
print(head(cooc))
	# This script roughly follows this blog post: https://bnosac.github.io/udpipe/docs/doc5.html

	# NOTE(review): this repeats the setup that already appears earlier in the
	# file, so running the script end to end loads the model and annotates the
	# corpus a second time.
	# Work from the project directory so the relative file names below resolve.
	setwd("~/R/snmupdpipe")
	library(udpipe)

	# Load the Czech UDPipe model from disk.
	udmodel_czech <- udpipe_load_model(file = "czech-ud-2.0-170801.udpipe")

	# Read the tab-separated file and keep the post texts as a plain vector.
	page <- read.csv(
	  "page_298789466930469_2018_05_02_10_38_53.tab",
	  sep = "\t",
	  encoding = "UTF-8"
	)
	korpus.raw <- as.vector(page$post_message)

	# Annotate the post texts and convert the annotation into a data frame.
	x <- as.data.frame(udpipe_annotate(udmodel_czech, x = korpus.raw))

	# Variant A: drop the parts of speech that are not content words.
	# NOTE: when the script is run top to bottom, this result is replaced by
	# Variant B right below; skip the Variant B assignment to keep Variant A.
	stats <- subset(x, !(upos %in% c("PROPN", "AUX", "PUNCT", "ADP", "DET")))

	# End of Variant A
	# Variant B: keep only nouns and adjectives.
	stats <- subset(x, upos %in% c("NOUN", "ADJ"))

	# Exclude the lemma "KSČM".
	stats <- subset(stats, !(lemma %in% c("KSČM")))

	# Replace `stats` with the frequency table of the remaining lemmas.
	stats <- txt_freq(x = stats$lemma)

	# Bar chart built from the first 30 rows of the frequency table.
	library(lattice)
	# Reverse the level order so the first row of the table becomes the top bar.
	stats$key <- factor(stats$key, levels = rev(stats$key))
	# lattice functions only return a trellis object; it is drawn when printed.
	# Printing explicitly makes the chart appear under source() as well, not
	# only when the line is typed at the console.
	print(
	  barchart(
	    key ~ freq,
	    data = head(stats, 30),
	    col = "cadetblue",
	    main = "Most occurring nouns -KSČM",
	    xlab = "Freq"
	  )
	)

	# Variant C: bar chart comparing keywords scored by the RAKE method.
	library(lattice)
	# Score keywords on lemmas per document, considering nouns and adjectives only.
	stats <- keywords_rake(
	  x = x,
	  term = "lemma",
	  group = "doc_id",
	  relevant = x$upos %in% c("NOUN", "ADJ")
	)
	# Reverse the level order so the first row of the table becomes the top bar.
	stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
	# Keep keywords seen more than 3 times and chart the first 20 of them.
	# lattice functions only return a trellis object; print it explicitly so the
	# chart is also drawn when the script is run through source().
	print(
	  barchart(
	    key ~ rake,
	    data = head(subset(stats, freq > 3), 20),
	    col = "cadetblue",
	    main = "Keywords identified by RAKE",
	    xlab = "Rake"
	  )
	)

	# Variant C (cont.): co-occurrences of noun/adjective lemmas, grouped by
	# document, paragraph and sentence.
	cooc <- cooccurrence(
	  x = subset(x, upos %in% c("NOUN", "ADJ")),
	  term = "lemma",
	  group = c("doc_id", "paragraph_id", "sentence_id")
	)
	# Print explicitly: a bare head() call shows nothing when the script is run
	# through source() with its default echo = FALSE.
	print(head(cooc))