csiu/emoji-data-science-twitter.Rmd

## emoji-data-science-twitter.Rmd
---
title: "Download twitter data"
author: "csiu"
output:
  html_document:
    keep_md: yes
---

```{r setup, include=FALSE}
# Global chunk defaults: echo the code, evaluate it, and cache chunk results.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  cache = TRUE
)
```

- Primary reference: [Emoji data science in R: A tutorial (Hamdan Azhar, 2017)](https://prismoji.com/2017/02/06/emoji-data-science-in-r-tutorial/#part1)
- Secondary reference: [Twimoji: Identifying Emoji in Tweets (Chris Tufts, 2015)](http://miningthedetails.com/blog/r/IdentifyEmojiInTweets/)

## Connect to twitter

```{r load-lib, message=FALSE}
# install.packages("twitteR")
library(twitteR)
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
```

```{r twitter-connect}
# Placeholder credentials; create real API keys at https://apps.twitter.com
api_key <- "XXX"
api_secret <- "XXX"
access_token <- "XXX"
access_token_secret <- "XXX"

# Sourced after the placeholders so the true values replace them.
source("twitter_api_key.R")

# Authenticate this R session with the four credentials above.
setup_twitter_oauth(
  api_key,
  api_secret,
  access_token,
  access_token_secret
)
```

## Pull tweets

```{r}
set.seed(20170202)

# Hashtag used as the search query
search_string <- "#BCVotes2017"

# Pull tweets matching the hashtag within the given date window
tweets.raw <- searchTwitter(
  search_string,
  n = 10000,            # max number of tweets to return
  lang = "en",          # restrict tweets to the given language
  since = "2017-05-09",
  until = "2017-05-10"
)

# Two steps, innermost first:
#   1. remove retweets from the list of tweets
#   2. convert the twitteR list to a data frame
df <- twListToDF(
  strip_retweets(tweets.raw, strip_manual = TRUE, strip_mt = TRUE)
)

# Preview the first rows of the result
head(df)
```

## Tidy data frame

```{r}
df_tidy <- df %>%
  mutate(
    # Record the search hashtag and build a link to each tweet
    hashtag = search_string,
    url = paste0("https://twitter.com/", screenName, "/status/", id),

    # Re-encode the text as ASCII; with sub = "byte", bytes that cannot be
    # converted are written out as <xx> hex escapes
    text = iconv(text, from = "latin1", to = "ASCII", sub = "byte"),

    # Parse the creation timestamp as a UTC date-time
    created = lubridate::ymd_hms(created, tz = "UTC")
  ) %>%
  # Keep only the columns of interest, renaming retweetCount on the way
  select(
    text, created, url, latitude, longitude,
    retweets = retweetCount,
    hashtag, screenName
  )

# Print the first rows of the tidied data frame
head(df_tidy)
```

Number of tweets:

```{r}
# One row per tweet, so the row count of df_tidy is the number of tweets
nrow(df_tidy)
```

## Load emoji dictionary

The emoji dictionary is obtained from [GitHub: today-is-a-good-day/emojis](https://github.com/today-is-a-good-day/emojis/blob/master/emDict.csv).

```{r warning=FALSE, message=FALSE}
# Read the semicolon-delimited dictionary, skipping its first line and
# supplying our own column names.
emoticons <- readr::read_delim(
  "emDict.csv",
  delim = ";",
  col_names = c("description", "native", "bytes", "r_encoding"),
  skip = 1
)

# Lower-case the descriptions
emoticons <- mutate(emoticons, description = tolower(description))

# Preview the first rows
head(emoticons)

# Number of emojis
nrow(emoticons)
```


```{r}
library(rvest)

# Build a code-point -> name lookup table by scraping the emoji tables page.
# Every <tr> inside a <tbody> is treated as one table row.
my_html <-
  read_html("http://apps.timwhitlock.info/emoji/tables/unicode#emoji-modal") %>%
  html_nodes("tbody tr")

# html_node() (singular) yields exactly one result per row, NA when the row
# has no match. Using it for BOTH columns keeps the two vectors the same
# length and aligned row by row; mixing it with html_nodes() (which flattens
# all matches) can misalign or break data.frame() when a row has zero or
# several code links. Only the first code link of a row is kept.
emoji_codes <-
  my_html %>%
  html_node("td.code a") %>%
  html_text()

emoji_names <-
  my_html %>%
  html_node("td.name") %>%
  html_text()

# Sanity check: both lengths equal the number of scraped rows
length(emoji_codes)
length(emoji_names)

unicode_dict <-
  data.frame(
    codes = emoji_codes,
    names = emoji_names,
    stringsAsFactors = FALSE
  ) %>%
  as_tibble() %>%
  # Rows without a code link carry no usable key
  filter(!is.na(codes)) %>%
  mutate(
    # Strip the "U+" prefix and normalise case
    codes = tolower(sub("U\\+", "", codes)),
    names = tolower(names)
  )

# Remove the temporaries; the locals avoid masking base::names()
rm(emoji_codes, emoji_names, my_html)
```

## Count emojis

```{r}
# Helper: count how many times a pattern occurs in each tweet.
#
# Arguments:
#   e    - pattern passed to str_count() (here: an emoji's encoded string)
#   text - character vector to search; defaults to the tweet text in df_tidy,
#          so existing one-argument calls behave as before
#
# Returns a data.frame with one row per element of `text`:
#   counts   - number of matches of `e` in that element
#   tweet_id - position of the element within `text`
count_emojis <- function(e, text = df_tidy$text) {
  counts <- str_count(text, e)
  data.frame(
    counts,
    # seq_along() gives an empty sequence for empty input, whereas
    # 1:length(counts) would give c(1, 0) and make data.frame() fail
    tweet_id = seq_along(counts)
  )
}
# Count every dictionary emoji in every tweet. Each emoji gets a nested data
# frame of per-tweet counts, which unnest() expands to one row per
# (emoji, tweet) pair.
emoji_counts <-
  emoticons %>%
  select(description, native, r_encoding) %>%
  mutate(counts = purrr::map(r_encoding, count_emojis)) %>%
  tidyr::unnest(counts)

# Summarize the counts per emoji, most frequent first
emoji_counts_summary <-
  emoji_counts %>%
  filter(counts != 0) %>%
  group_by(description, native) %>%
  summarise(count = sum(counts)) %>%
  # summarise() only drops the last grouping level; ungroup explicitly so
  # later steps do not silently operate per description
  ungroup() %>%
  arrange(desc(count))

emoji_counts_summary
```

## Visualize

```{r}
library(emojifont)
library(ggplot2)

# Register the emoji font; OpenSansEmoji.ttf has to be downloaded beforehand
load.emojifont("OpenSansEmoji.ttf")
```

```{r}
library(cowplot)
# Bar chart of emojis that occur more than once, ordered by decreasing count,
# with each emoji glyph drawn at the height of its bar.
emoji_plot <-
  emoji_counts_summary %>%
  filter(count > 1) %>%
  ggplot(aes(
    x = reorder(description, -count),
    y = count,
    label = native
  )) +
  geom_bar(stat = "identity", fill = "grey88") +
  geom_text(family = "OpenSansEmoji", size = 5) +
  xlab("") +
  ggtitle(sprintf("%s during Election Day (May 9th, 2017)", search_string)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))

# ggsave() opens and closes the PDF device itself, so the device cannot be
# left open if drawing fails (unlike a manual pdf() ... dev.off() pair).
# 7 x 7 inches matches the default size of pdf().
ggsave(
  path.expand("~/Desktop/test.pdf"),
  plot = emoji_plot,
  width = 7,
  height = 7
)
```
	---
	title: "Download twitter data"
	author: "csiu"
	output:
	html_document:
	keep_md: yes
	---

	```{r setup, include=FALSE}
	# Global chunk defaults: echo the code, evaluate it, and cache chunk results.
	knitr::opts_chunk$set(
	  echo = TRUE,
	  eval = TRUE,
	  cache = TRUE
	)
	```

	- Primary reference: [Emoji data science in R: A tutorial (Hamdan Azhar, 2017)](https://prismoji.com/2017/02/06/emoji-data-science-in-r-tutorial/#part1)
	- Secondary reference: [Twimoji: Identifying Emoji in Tweets (Chris Tufts, 2015)](http://miningthedetails.com/blog/r/IdentifyEmojiInTweets/)

	## Connect to twitter

	```{r load-lib, message=FALSE}
	# install.packages("twitteR")
	library(twitteR)
	library(dplyr)
	library(readr)
	library(stringr)
	library(lubridate)
	```

	```{r twitter-connect}
	# Placeholder credentials; create real API keys at https://apps.twitter.com
	api_key <- "XXX"
	api_secret <- "XXX"
	access_token <- "XXX"
	access_token_secret <- "XXX"

	# Sourced after the placeholders so the true values replace them.
	source("twitter_api_key.R")

	# Authenticate this R session with the four credentials above.
	setup_twitter_oauth(
	  api_key,
	  api_secret,
	  access_token,
	  access_token_secret
	)
	```

	## Pull tweets

	```{r}
	set.seed(20170202)

	# Hashtag used as the search query
	search_string <- "#BCVotes2017"

	# Pull tweets matching the hashtag within the given date window
	tweets.raw <- searchTwitter(
	  search_string,
	  n = 10000,            # max number of tweets to return
	  lang = "en",          # restrict tweets to the given language
	  since = "2017-05-09",
	  until = "2017-05-10"
	)

	# Two steps, innermost first:
	#   1. remove retweets from the list of tweets
	#   2. convert the twitteR list to a data frame
	df <- twListToDF(
	  strip_retweets(tweets.raw, strip_manual = TRUE, strip_mt = TRUE)
	)

	# Preview the first rows of the result
	head(df)
	```

	## Tidy data frame

	```{r}
	df_tidy <- df %>%
	  mutate(
	    # Record the search hashtag and build a link to each tweet
	    hashtag = search_string,
	    url = paste0("https://twitter.com/", screenName, "/status/", id),

	    # Re-encode the text as ASCII; with sub = "byte", bytes that cannot be
	    # converted are written out as <xx> hex escapes
	    text = iconv(text, from = "latin1", to = "ASCII", sub = "byte"),

	    # Parse the creation timestamp as a UTC date-time
	    created = lubridate::ymd_hms(created, tz = "UTC")
	  ) %>%
	  # Keep only the columns of interest, renaming retweetCount on the way
	  select(
	    text, created, url, latitude, longitude,
	    retweets = retweetCount,
	    hashtag, screenName
	  )

	# Print the first rows of the tidied data frame
	head(df_tidy)
	```

	Number of tweets:

	```{r}
	# One row per tweet, so the row count of df_tidy is the number of tweets
	nrow(df_tidy)
	```

	## Load emoji dictionary

	The emoji dictionary is obtained from [GitHub: today-is-a-good-day/emojis](https://github.com/today-is-a-good-day/emojis/blob/master/emDict.csv).

	```{r warning=FALSE, message=FALSE}
	# Read the semicolon-delimited dictionary, skipping its first line and
	# supplying our own column names.
	emoticons <- readr::read_delim(
	  "emDict.csv",
	  delim = ";",
	  col_names = c("description", "native", "bytes", "r_encoding"),
	  skip = 1
	)

	# Lower-case the descriptions
	emoticons <- mutate(emoticons, description = tolower(description))

	# Preview the first rows
	head(emoticons)

	# Number of emojis
	nrow(emoticons)
	```


	```{r}
	library(rvest)

	# Build a code-point -> name lookup table by scraping the emoji tables page.
	# Every <tr> inside a <tbody> is treated as one table row.
	my_html <-
	  read_html("http://apps.timwhitlock.info/emoji/tables/unicode#emoji-modal") %>%
	  html_nodes("tbody tr")

	# html_node() (singular) yields exactly one result per row, NA when the row
	# has no match. Using it for BOTH columns keeps the two vectors the same
	# length and aligned row by row; mixing it with html_nodes() (which flattens
	# all matches) can misalign or break data.frame() when a row has zero or
	# several code links. Only the first code link of a row is kept.
	emoji_codes <-
	  my_html %>%
	  html_node("td.code a") %>%
	  html_text()

	emoji_names <-
	  my_html %>%
	  html_node("td.name") %>%
	  html_text()

	# Sanity check: both lengths equal the number of scraped rows
	length(emoji_codes)
	length(emoji_names)

	unicode_dict <-
	  data.frame(
	    codes = emoji_codes,
	    names = emoji_names,
	    stringsAsFactors = FALSE
	  ) %>%
	  as_tibble() %>%
	  # Rows without a code link carry no usable key
	  filter(!is.na(codes)) %>%
	  mutate(
	    # Strip the "U+" prefix and normalise case
	    codes = tolower(sub("U\\+", "", codes)),
	    names = tolower(names)
	  )

	# Remove the temporaries; the locals avoid masking base::names()
	rm(emoji_codes, emoji_names, my_html)
	```

	## Count emojis

	```{r}
	# Helper: count how many times a pattern occurs in each tweet.
	#
	# Arguments:
	#   e    - pattern passed to str_count() (here: an emoji's encoded string)
	#   text - character vector to search; defaults to the tweet text in df_tidy,
	#          so existing one-argument calls behave as before
	#
	# Returns a data.frame with one row per element of `text`:
	#   counts   - number of matches of `e` in that element
	#   tweet_id - position of the element within `text`
	count_emojis <- function(e, text = df_tidy$text) {
	  counts <- str_count(text, e)
	  data.frame(
	    counts,
	    # seq_along() gives an empty sequence for empty input, whereas
	    # 1:length(counts) would give c(1, 0) and make data.frame() fail
	    tweet_id = seq_along(counts)
	  )
	}
	# Count every dictionary emoji in every tweet. Each emoji gets a nested data
	# frame of per-tweet counts, which unnest() expands to one row per
	# (emoji, tweet) pair.
	emoji_counts <-
	  emoticons %>%
	  select(description, native, r_encoding) %>%
	  mutate(counts = purrr::map(r_encoding, count_emojis)) %>%
	  tidyr::unnest(counts)

	# Summarize the counts per emoji, most frequent first
	emoji_counts_summary <-
	  emoji_counts %>%
	  filter(counts != 0) %>%
	  group_by(description, native) %>%
	  summarise(count = sum(counts)) %>%
	  # summarise() only drops the last grouping level; ungroup explicitly so
	  # later steps do not silently operate per description
	  ungroup() %>%
	  arrange(desc(count))

	emoji_counts_summary
	```

	## Visualize

	```{r}
	library(emojifont)
	library(ggplot2)

	# Register the emoji font; OpenSansEmoji.ttf has to be downloaded beforehand
	load.emojifont("OpenSansEmoji.ttf")
	```

	```{r}
	library(cowplot)
	# Bar chart of emojis that occur more than once, ordered by decreasing count,
	# with each emoji glyph drawn at the height of its bar.
	emoji_plot <-
	  emoji_counts_summary %>%
	  filter(count > 1) %>%
	  ggplot(aes(
	    x = reorder(description, -count),
	    y = count,
	    label = native
	  )) +
	  geom_bar(stat = "identity", fill = "grey88") +
	  geom_text(family = "OpenSansEmoji", size = 5) +
	  xlab("") +
	  ggtitle(sprintf("%s during Election Day (May 9th, 2017)", search_string)) +
	  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))

	# ggsave() opens and closes the PDF device itself, so the device cannot be
	# left open if drawing fails (unlike a manual pdf() ... dev.off() pair).
	# 7 x 7 inches matches the default size of pdf().
	ggsave(
	  path.expand("~/Desktop/test.pdf"),
	  plot = emoji_plot,
	  width = 7,
	  height = 7
	)
	```