Skip to content

Instantly share code, notes, and snippets.

@csiu
Created May 11, 2017 07:52
Show Gist options
  • Save csiu/d14c5c66cd84f3192ef7311a356d9ab0 to your computer and use it in GitHub Desktop.
Save csiu/d14c5c66cd84f3192ef7311a356d9ab0 to your computer and use it in GitHub Desktop.
Day 75: emoji count during bc 2017 elections
---
title: "Download twitter data"
author: "csiu"
output:
  html_document:
    keep_md: yes
---
```{r setup, include=FALSE}
# Global knitr chunk options: echo the code, evaluate every chunk, and cache
# chunk results between knits.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  cache = TRUE
)
```
- Primary reference: [Emoji data science in R: A tutorial (Hamdan Azhar, 2017)](https://prismoji.com/2017/02/06/emoji-data-science-in-r-tutorial/#part1)
- Secondary reference: [Twimoji: Identifying Emoji in Tweets (Chris Tufts, 2015)](http://miningthedetails.com/blog/r/IdentifyEmojiInTweets/)

## Connect to Twitter
```{r load-lib, message=FALSE}
# install.packages("twitteR")
library(twitteR)
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
```
```{r twitter-connect}
# Twitter API credentials.
# Create API keys from https://apps.twitter.com
# The "XXX" values are placeholders only; twitter_api_key.R is expected to
# overwrite all four variables with the real values.
api_key <- "XXX"
api_secret <- "XXX"
access_token <- "XXX"
access_token_secret <- "XXX"

# Fail with an actionable message when the credentials file is absent, rather
# than with the generic "cannot open file" error raised by source().
if (!file.exists("twitter_api_key.R")) {
  stop(
    "Credentials file 'twitter_api_key.R' not found. It must assign ",
    "api_key, api_secret, access_token and access_token_secret.",
    call. = FALSE
  )
}
source("twitter_api_key.R") # To load true values

# Stop before contacting Twitter if any placeholder survived; otherwise the
# failure only shows up as an opaque OAuth error. `%in%` never yields NA.
if ("XXX" %in% c(api_key, api_secret, access_token, access_token_secret)) {
  stop(
    "twitter_api_key.R did not replace every placeholder credential ('XXX').",
    call. = FALSE
  )
}

# Authenticate this R session against the Twitter API (twitteR).
setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)
```
## Pull tweets
```{r}
set.seed(20170202)

# Hashtag to search for; reused later when tidying and when titling the plot.
search_string <- "#BCVotes2017"

# Query the Twitter search API for tweets carrying the hashtag.
tweets.raw <- searchTwitter(
  search_string,
  n = 10000,            # Max number of tweets to return
  lang = "en",          # Restrict tweets to given language
  since = "2017-05-09",
  until = "2017-05-10"
)

# Drop retweets (including manual "RT" and "MT" style ones), then convert the
# remaining twitteR status list into a data frame.
df <- twListToDF(
  strip_retweets(tweets.raw, strip_manual = TRUE, strip_mt = TRUE)
)

# Preview the first rows of the result.
head(df)
```
## Tidy data frame
```{r}
# Reshape the raw tweet data frame into the columns used downstream.
df_tidy <- df %>%
  mutate(
    # Record which hashtag was searched and build a link to each tweet
    hashtag = search_string,
    url = paste0("https://twitter.com/", screenName, "/status/", id),
    # Re-encode the text to ASCII; with sub = "byte", every byte that cannot
    # be converted is replaced by its hex code in the form "<xx>"
    # (presumably so emoji can be matched against the dictionary's
    # r_encoding column later -- confirm).
    text = iconv(text, from = "latin1", to = "ASCII", sub = "byte"),
    # Parse the creation timestamp as a UTC date-time
    created = lubridate::ymd_hms(created, tz = "UTC")
  ) %>%
  select(
    text, created, url, latitude, longitude,
    retweets = retweetCount, # rename while selecting
    hashtag, screenName
  )

# Preview the first rows of the tidy data frame (head() shows 6 by default)
head(df_tidy)
```
Number of tweets:
```{r}
# Number of rows in df_tidy, i.e. tweets left after retweets were stripped
nrow(df_tidy)
```
## Load emoji dictionary
The emoji dictionary is obtained from [GitHub: today-is-a-good-day/emojis](https://github.com/today-is-a-good-day/emojis/blob/master/emDict.csv).
```{r warning=FALSE, message=FALSE}
# Read the semicolon-delimited emoji dictionary. The file's own header row is
# skipped and replaced by the column names given here; descriptions are
# lower-cased.
emoticons <- mutate(
  readr::read_delim(
    "emDict.csv",
    delim = ";",
    col_names = c("description", "native", "bytes", "r_encoding"),
    skip = 1
  ),
  description = tolower(description)
)

# Preview the first rows of the dictionary
head(emoticons)

# Number of emojis
nrow(emoticons)
```
```{r}
library(rvest)

# Scrape an emoji table (code point + name per row) from the web page below.
emoji_rows <-
  read_html("http://apps.timwhitlock.info/emoji/tables/unicode#emoji-modal") %>%
  html_nodes("tbody tr")

# NOTE(review): codes are collected with html_nodes() (every matching anchor
# across all rows) while names use html_node() (exactly one result per row).
# Rows with zero or several code anchors would therefore misalign the two
# vectors; the length check below guards against that.
emoji_codes <-
  emoji_rows %>%
  html_nodes("td.code a") %>%
  html_text()
emoji_names <-
  emoji_rows %>%
  html_node("td.name") %>%
  html_text()

length(emoji_codes)
length(emoji_names)

# data.frame() silently recycles the shorter vector when one length divides
# the other, which would pair codes with the wrong names. Fail loudly instead.
if (length(emoji_codes) != length(emoji_names)) {
  stop(
    "Scraped ", length(emoji_codes), " emoji codes but ",
    length(emoji_names), " emoji names; cannot pair them row by row.",
    call. = FALSE
  )
}

# Build the lookup table: strip the "U+" prefix and lower-case both columns.
# tibble::as_tibble() replaces the deprecated dplyr::tbl_df().
unicode_dict <-
  data.frame(
    codes = emoji_codes,
    names = emoji_names,
    stringsAsFactors = FALSE
  ) %>%
  tibble::as_tibble() %>%
  mutate(
    codes = tolower(sub("U\\+", "", codes)),
    names = tolower(names)
  )

# Remove the intermediates from the workspace
rm(emoji_codes, emoji_names, emoji_rows)
```
## Count emojis
```{r}
#' Count how often one emoji occurs in each tweet
#'
#' @param e Character scalar: the emoji's encoding as it appears in the tweet
#'   text. It is matched literally (via `fixed()`), not as a regular
#'   expression, so characters such as `*` or `#` in an encoding cannot be
#'   misread as regex syntax.
#' @param text Character vector of tweet texts to search. Defaults to
#'   `df_tidy$text`, which keeps existing one-argument calls working.
#' @return A data.frame with one row per element of `text`: `counts` (number
#'   of occurrences of `e`) and `tweet_id` (position of the tweet in `text`).
count_emojis <- function(e, text = df_tidy$text) {
  counts <- str_count(text, fixed(e))
  data.frame(
    counts,
    # seq_along() gives an empty id vector for zero tweets, whereas
    # 1:length(counts) would give c(1, 0) and break data.frame()
    tweet_id = seq_along(counts)
  )
}

# Do the counting of emojis for each tweet:
# one row per (emoji, tweet) pair after unnesting
emoji_counts <-
  emoticons %>%
  select(description, native, r_encoding) %>%
  mutate(
    counts = purrr::map(r_encoding, count_emojis)
  ) %>%
  tidyr::unnest(counts)

# Summarize the counts per emoji, most frequent first. The outer parentheses
# print the result as well as assigning it.
(emoji_counts_summary <-
  emoji_counts %>%
  filter(counts != 0) %>%
  group_by(description, native) %>%
  summarise(count = sum(counts)) %>%
  # drop the leftover grouping so later verbs operate on the whole table
  ungroup() %>%
  arrange(desc(count))
)
```
## Visualize
```{r}
library(emojifont)
library(ggplot2)
## Load the emoji font file 'OpenSansEmoji.ttf' via emojifont.
## Author's note: you need to download this file.
## NOTE(review): where the file has to be located is not shown here -- confirm
## against the emojifont documentation.
load.emojifont('OpenSansEmoji.ttf')
```
```{r}
library(cowplot)

# Draw the bar chart into a PDF file: open the device, render the plot, then
# close the device so the file is written out.
pdf("~/Desktop/test.pdf")
print(
  ggplot(
    # Keep only emojis that occur more than once
    filter(emoji_counts_summary, count > 1),
    aes(
      x = reorder(description, -count), # bars ordered by descending count
      y = count,
      label = native
    )
  ) +
    geom_bar(stat = "identity", fill = "grey88") +
    # Draw each emoji glyph at the top of its bar
    geom_text(family = "OpenSansEmoji", size = 5) +
    xlab("") +
    ggtitle(sprintf("%s during Election Day (May 9th, 2017)", search_string)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))
)
dev.off()
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment