TBMMGenelKurulu Twitter
acaba
altı
ama
ancak
artık
asla
aslında
az
bana
bazen
bazı
bazıları
bazısı
belki
ben
beni
benim
beş
bile
bir
birçoğu
birçok
birçokları
biri
birisi
birkaç
birkaçı
birşey
birşeyi
biz
bize
bizi
bizim
böyle
böylece
bu
buna
bunda
bundan
bunu
bunun
burada
bütün
çoğu
çoğuna
çoğunu
çok
çünkü
da
daha
de
değil
demek
diğer
diğeri
diğerleri
diye
dokuz
dolayı
dört
elbette
en
fakat
falan
felan
filan
gene
gibi
hâlâ
hangi
hangisi
hani
hatta
hem
henüz
hep
hepsi
hepsine
hepsini
her
her biri
herkes
herkese
herkesi
hiç
hiç kimse
hiçbiri
hiçbirine
hiçbirini
için
içinde
iki
ile
ise
işte
kaç
kadar
kendi
kendine
kendini
ki
kim
kime
kimi
kimin
kimisi
madem
mi
mu
nasıl
ne
ne kadar
ne zaman
neden
nedir
nerde
nerede
nereden
nereye
nesi
neyse
niçin
niye
on
ona
ondan
onlar
onlara
onlardan
onların
onu
onun
orada
oysa
oysaki
öbürü
ön
önce
ötürü
öyle
rağmen
sana
sekiz
sen
senden
seni
senin
siz
sizden
size
sizi
sizin
son
sonra
şayet
şey
şeyden
şeye
şeyi
şeyler
şimdi
şöyle
şu
şuna
şunda
şundan
şunlar
şunu
şunun
tabi
tamam
tüm
tümü
üç
üzere
var
ve
veya
veyahut
ya
ya da
yani
yedi
yerine
yine
yoksa
zaten
zira
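# ==============================================================================
# Term-frequency and term-correlation analysis of @TBMMGenelKurulu tweets.
# The Turkish stop-word list above is the contents of stop-words-turkish.txt,
# which this script reads from the working directory further below.
# ==============================================================================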
rm(list=ls())
# Load required libraries
library(RCurl)
library(stringr)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(twitteR)
library(streamR)
library(grid)
library(ggplot2)
library(wesanderson)
# Load credentials ============================================================= IMPORTANT
# SEE:http://thinktostart.com/twitter-authentification-with-r/
# UNCOMMENT LINES BELOW ======================================================== IMPORTANT
#load("")
#registerTwitterOAuth(my_oauth)
# Load credentials ============================================================= IMPORTANT
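# A rough sketch of how my_oauth can be created in the first place, following
# the tutorial linked above. The consumer key/secret are placeholders from your
# own Twitter app settings; this block is illustrative and left commented out:
#
# library(ROAuth)
# my_oauth <- OAuthFactory$new(consumerKey    = "YOUR_CONSUMER_KEY",
#                              consumerSecret = "YOUR_CONSUMER_SECRET",
#                              requestURL     = "https://api.twitter.com/oauth/request_token",
#                              accessURL      = "https://api.twitter.com/oauth/access_token",
#                              authURL        = "https://api.twitter.com/oauth/authorize")
# my_oauth$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))
# save(my_oauth, file = "my_oauth.Rdata")  # then load() and registerTwitterOAuth() above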
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
# Set seed user
user <- "TBMMGenelKurulu"
# getting data for seed user
seed <- getUser(user)
(seed.n <- seed$screenName)
# Get the timeline
ut <- userTimeline(user, n=3200, includeRts = FALSE, encoding="utf-8")
# Extract tweets
tweets.text <- sapply(ut, function(x) x$getText())
head(tweets.text)
tweets.text[42:50]
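# Optional: twitteR's twListToDF() flattens the list of status objects into a
# data frame, handy for inspecting dates, retweet counts, etc. (not used below):
# tweets.df <- twListToDF(ut)
# str(tweets.df)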
# Remove @usernames (before stripping punctuation, so the whole handle goes)
tweets.text <- gsub("@\\w+", "", tweets.text)
# Remove links
tweets.text <- gsub("http\\S+", "", tweets.text)
# Remove everything except (Turkish) letters and spaces; this also drops
# numbers and any remaining punctuation
tweets.text <- gsub("[^a-zA-ZğüşöçıİĞÜŞÖÇ ]", "", tweets.text)
# Convert all text to lower case
tweets.text <- tolower(tweets.text)
# Collapse runs of spaces and tabs into a single space
tweets.text <- gsub("[ \t]{2,}", " ", tweets.text)
# Remove blank spaces at the beginning and end
tweets.text <- gsub("^ +| +$", "", tweets.text)
# Replace "ak parti" with "akp" so the party name is counted as a single token
tweets.text <- gsub("ak parti", "akp", tweets.text)
# Create corpus
tweets.text.corpus <- Corpus(VectorSource(tweets.text))
# Clean up by removing stop words
#tweets.text.corpus <- tm_map(tweets.text.corpus, function(x)removeWords(x,stopwords()))
# Turkish stopwords ============================================================ IMPORTANT
# Save stop-words-turkish.txt into your working directory
turkish <- read.table("stop-words-turkish.txt", sep = "\n",
                      stringsAsFactors = FALSE, fileEncoding = "UTF-8")
turkish_stop <- unlist(turkish)
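# Quick sanity check that the file was read as one word per row:
length(turkish_stop)   # number of stop words loaded
head(turkish_stop)     # should start with "acaba", "altı", "ama", ...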
# ============================================================================== IMPORTANT
# Create term-document matrix applying some transformations
# To be safe, clean the text again via the control options
tdm <- TermDocumentMatrix(
  tweets.text.corpus,
  control = list(
    removePunctuation = TRUE,
    # Combine the Turkish stop-word file with a few extra tokens seen in the data
    stopwords = c(turkish_stop,
                  "bir", "gibi", "ama", "daha", "yok", "http", "ben", "belki",
                  "hiçbir", "sen", "var", "neden", "nasi", "ile", "nasıl",
                  "kadar", "kim", "için", "inci", "uncu"),
    removeNumbers = TRUE,
    tolower = TRUE))
# Create document-term matrix with the same transformations
dtm <- DocumentTermMatrix(
  tweets.text.corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c(turkish_stop,
                  "bir", "gibi", "ama", "daha", "yok", "http", "ben", "belki",
                  "hiçbir", "sen", "var", "neden", "nasi", "ile", "nasıl",
                  "kadar", "kim", "için", "inci", "uncu"),
    removeNumbers = TRUE,
    tolower = TRUE))
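# A quick way to eyeball whether the cleaning and stop-word removal worked:
# findFreqTerms() lists every term appearing at least `lowfreq` times
# (the threshold of 10 is an arbitrary choice here).
findFreqTerms(dtm, lowfreq = 10)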
# Associations
findAssocs(dtm, "akp", corlimit=0.15)
findAssocs(dtm, "bdp", corlimit=0.15)
findAssocs(dtm, "chp", corlimit=0.15)
findAssocs(dtm, "hdp", corlimit=0.15)
findAssocs(dtm, "mhp", corlimit=0.15)
# AKP Correlation
toi <- "akp"       # term of interest
corlimit <- 0.15   # lower correlation bound
akp_assoc <- findAssocs(dtm, toi, corlimit)
akp_corr <- data.frame(corr  = akp_assoc[, 1],
                       terms = row.names(akp_assoc))
akp_corr$terms <- factor(akp_corr$terms, levels = akp_corr$terms)
# Plot and save the image in png format
png("akp.png", width = 9, height = 9, units = "in", res = 500)
ggplot(akp_corr, aes(x = corr, y = terms)) +
  geom_point(aes(size = corr)) +
  scale_size(range = c(3, 15)) +
  ylab("") +
  xlab(paste0("Correlation with the term ", "\"", toi, "\""))
dev.off()
# BDP Correlation
toi <- "bdp"       # term of interest
corlimit <- 0.15   # lower correlation bound
bdp_assoc <- findAssocs(dtm, toi, corlimit)
bdp_corr <- data.frame(corr  = bdp_assoc[, 1],
                       terms = row.names(bdp_assoc))
bdp_corr$terms <- factor(bdp_corr$terms, levels = bdp_corr$terms)
# Plot and save the image in png format
png("bdp.png", width = 9, height = 9, units = "in", res = 500)
ggplot(bdp_corr, aes(x = corr, y = terms)) +
  geom_point(aes(size = corr)) +
  scale_size(range = c(3, 15)) +
  ylab("") +
  xlab(paste0("Correlation with the term ", "\"", toi, "\""))
dev.off()
# CHP Correlation
toi <- "chp"       # term of interest
corlimit <- 0.15   # lower correlation bound
chp_assoc <- findAssocs(dtm, toi, corlimit)
chp_corr <- data.frame(corr  = chp_assoc[, 1],
                       terms = row.names(chp_assoc))
chp_corr$terms <- factor(chp_corr$terms, levels = chp_corr$terms)
# Plot and save the image in png format
png("chp.png", width = 9, height = 9, units = "in", res = 500)
ggplot(chp_corr, aes(x = corr, y = terms)) +
  geom_point(aes(size = corr)) +
  scale_size(range = c(3, 15)) +
  ylab("") +
  xlab(paste0("Correlation with the term ", "\"", toi, "\""))
dev.off()
# HDP Correlation
toi <- "hdp"       # term of interest
corlimit <- 0.15   # lower correlation bound
hdp_assoc <- findAssocs(dtm, toi, corlimit)
hdp_corr <- data.frame(corr  = hdp_assoc[, 1],
                       terms = row.names(hdp_assoc))
hdp_corr$terms <- factor(hdp_corr$terms, levels = hdp_corr$terms)
# Plot and save the image in png format
png("hdp.png", width = 9, height = 9, units = "in", res = 500)
ggplot(hdp_corr, aes(x = corr, y = terms)) +
  geom_point(aes(size = corr)) +
  scale_size(range = c(3, 15)) +
  ylab("") +
  xlab(paste0("Correlation with the term ", "\"", toi, "\""))
dev.off()
# MHP Correlation
toi <- "mhp"       # term of interest
corlimit <- 0.15   # lower correlation bound
mhp_assoc <- findAssocs(dtm, toi, corlimit)
mhp_corr <- data.frame(corr  = mhp_assoc[, 1],
                       terms = row.names(mhp_assoc))
mhp_corr$terms <- factor(mhp_corr$terms, levels = mhp_corr$terms)
# Plot and save the image in png format
png("mhp.png", width = 9, height = 9, units = "in", res = 500)
ggplot(mhp_corr, aes(x = corr, y = terms)) +
  geom_point(aes(size = corr)) +
  scale_size(range = c(3, 15)) +
  ylab("") +
  xlab(paste0("Correlation with the term ", "\"", toi, "\""))
dev.off()
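# The five blocks above differ only in the term of interest and the output file
# name, so they could be collapsed into a helper like the sketch below
# (functionally equivalent, left commented out and kept only as an illustration):
#
# plot_assocs <- function(toi, corlimit = 0.15) {
#   assoc <- findAssocs(dtm, toi, corlimit)
#   df <- data.frame(corr = assoc[, 1], terms = row.names(assoc))
#   df$terms <- factor(df$terms, levels = df$terms)
#   png(paste0(toi, ".png"), width = 9, height = 9, units = "in", res = 500)
#   print(ggplot(df, aes(x = corr, y = terms)) +
#           geom_point(aes(size = corr)) +
#           scale_size(range = c(3, 15)) +
#           ylab("") +
#           xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
#   dev.off()
# }
# for (p in c("akp", "bdp", "chp", "hdp", "mhp")) plot_assocs(p)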
# WORDCLOUD
# Define tdm as matrix
m = as.matrix(tdm)
# Get word counts in decreasing order
word_freqs = sort(rowSums(m), decreasing=TRUE)
# Create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)
# Color
la_cont <- wes_palette(name = "Zissou", type = "continuous")
# Plot and save the image in png format
png("tbmm.png", width=9, height=9, units="in", res=500)
wordcloud(dm$word, dm$freq, random.order=FALSE, min.freq = 2,scale=c(4,0.5), max.words = 100, colors=la_cont)
dev.off()
# Save workspace
save.image(file = "tbmm.RData")