Created
May 11, 2017 07:52
-
-
Save csiu/d14c5c66cd84f3192ef7311a356d9ab0 to your computer and use it in GitHub Desktop.
Day 75: emoji count during bc 2017 elections
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
title: "Download twitter data"
author: "csiu"
output:
  html_document:
    keep_md: yes
---
```{r setup, include=FALSE} | |
# Chunk defaults for the whole document: echo the source, evaluate every
# chunk, and cache chunk results.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  cache = TRUE
)
``` | |
- Primary reference: [Emoji data science in R: A tutorial (Hamdan Azhar, 2017)](https://prismoji.com/2017/02/06/emoji-data-science-in-r-tutorial/#part1)
- Secondary reference: [Twimoji: Identifying Emoji in Tweets (Chris Tufts, 2015)](http://miningthedetails.com/blog/r/IdentifyEmojiInTweets/)

## Connect to Twitter
```{r load-lib, message=FALSE} | |
# install.packages("twitteR") | |
library(twitteR) | |
library(dplyr) | |
library(readr) | |
library(stringr) | |
library(lubridate) | |
``` | |
```{r twitter-connect} | |
# Placeholder credentials; create real API keys at https://apps.twitter.com
api_key <- "XXX"
api_secret <- "XXX"
access_token <- "XXX"
access_token_secret <- "XXX"

# Load the true values from a separate file
source("twitter_api_key.R")

# Authenticate this R session with the Twitter API
setup_twitter_oauth(
  consumer_key = api_key,
  consumer_secret = api_secret,
  access_token = access_token,
  access_secret = access_token_secret
)
``` | |
## Pull tweets
```{r} | |
set.seed(20170202)

# Hashtag used as the search query
search_string <- "#BCVotes2017"

# Pull tweets matching the hashtag
tweets.raw <- searchTwitter(
  search_string,
  n = 10000,             # Max number of tweets to return
  lang = "en",           # Restrict tweets to given language
  since = "2017-05-09",
  until = "2017-05-10"
)

# Remove retweets, then convert the twitteR list into a data frame
df <- twListToDF(
  strip_retweets(tweets.raw, strip_manual = TRUE, strip_mt = TRUE)
)

# Preview the first rows
head(df)
``` | |
## Tidy data frame
```{r} | |
df_tidy <-
  df %>%
  mutate(
    # Tag each row with the searched hashtag and a link to the tweet
    hashtag = search_string,
    url = paste0("https://twitter.com/", screenName, "/status/", id),
    # Re-encode the text to ASCII; bytes that cannot be converted are
    # substituted with their <xx> hex form (sub = "byte")
    text = iconv(text, from = "latin1", to = "ASCII", sub = "byte"),
    # Parse the creation time as a UTC date-time
    created = lubridate::ymd_hms(created, tz = "UTC")
  ) %>%
  # Keep the columns of interest, renaming retweetCount on the way
  select(
    text, created, url, latitude, longitude,
    retweets = retweetCount,
    hashtag, screenName
  )

# Show the first rows of the tidied data frame
head(df_tidy)
``` | |
Number of tweets:
```{r} | |
# Row count of the tidied data frame (one row per tweet)
nrow(df_tidy)
``` | |
## Load emoji dictionary

The emoji dictionary is obtained from [GitHub: today-is-a-good-day/emojis](https://github.com/today-is-a-good-day/emojis/blob/master/emDict.csv).
```{r warning=FALSE, message=FALSE} | |
# Read the semicolon-delimited dictionary, skipping the file's first line and
# supplying our own column names.
emoticons <- readr::read_delim(
  "emDict.csv",
  delim = ";",
  col_names = c("description", "native", "bytes", "r_encoding"),
  skip = 1
)

# Lower-case the descriptions
emoticons <- mutate(emoticons, description = tolower(description))

# Preview the first rows
head(emoticons)

# Number of emojis
nrow(emoticons)
``` | |
```{r} | |
library(rvest)

# Fetch the emoji tables and keep one node per table-body row
my_html <-
  read_html("http://apps.timwhitlock.info/emoji/tables/unicode#emoji-modal") %>%
  html_nodes("tbody tr")

# Use html_node() (singular) for BOTH columns: it yields exactly one result
# per row (NA when the row has no match), so codes and names stay aligned
# row-for-row. html_nodes() would return every match and could produce a
# vector of a different length. Only the first code link in a row is kept.
codes <-
  my_html %>%
  html_node("td.code a") %>%
  html_text()

names <-
  my_html %>%
  html_node("td.name") %>%
  html_text()

# Sanity check: both lengths equal the number of scraped rows
length(codes)
length(names)

unicode_dict <-
  data.frame(codes, names, stringsAsFactors = FALSE) %>%
  tibble::as_tibble() %>%  # tbl_df() is deprecated
  # Drop rows that carry no code-point link
  filter(!is.na(codes)) %>%
  mutate(
    # Strip the "U+" prefix and normalise to lower case
    codes = sub("U\\+", "", codes) %>% tolower(),
    names = tolower(names)
  )

# Remove the scraping temporaries from the workspace
rm(codes, names, my_html)
``` | |
## Count emojis
```{r} | |
#' Count how many times one emoji encoding occurs in each tweet
#'
#' @param e Character scalar: the emoji's encoding as it appears in the tweet
#'   text. It is matched literally, not as a regular expression.
#' @param text Character vector of tweet texts to search. Defaults to
#'   `df_tidy$text`, so existing one-argument calls keep working.
#' @return A data frame with one row per element of `text`: `counts` (number
#'   of matches in that element) and `tweet_id` (its position in `text`).
count_emojis <- function(e, text = df_tidy$text) {
  # fixed() avoids regex interpretation of characters such as "*" or "#"
  counts <- str_count(text, stringr::fixed(e))
  data.frame(
    counts,
    # seq_along() is safe for empty input, where 1:length(x) gives c(1, 0)
    tweet_id = seq_along(counts)
  )
}
# Count every dictionary emoji in every tweet: after unnesting there is one
# row per (emoji, tweet) pair
emoji_counts <-
  emoticons %>%
  select(description, native, r_encoding) %>%
  mutate(
    counts = purrr::map(r_encoding, count_emojis)
  ) %>%
  tidyr::unnest(counts)

# Total occurrences per emoji, most frequent first
(emoji_counts_summary <-
  emoji_counts %>%
  filter(counts != 0) %>%
  group_by(description, native) %>%
  summarise(count = sum(counts)) %>%
  # summarise() only peels off the last grouping level; drop the rest so
  # later verbs operate on the whole table rather than per description
  ungroup() %>%
  arrange(desc(count))
)
``` | |
## Visualize
```{r} | |
library(emojifont)
library(ggplot2)

# Load the emoji font used for the plot labels.
# The font file 'OpenSansEmoji.ttf' must be downloaded beforehand.
load.emojifont("OpenSansEmoji.ttf")
``` | |
```{r} | |
library(cowplot)

# Send the chart to a PDF file
pdf("~/Desktop/test.pdf")

# Bar chart of emojis seen more than once, ordered by descending count, with
# each emoji's glyph drawn at its count value
emoji_counts_summary %>%
  filter(count > 1) %>%
  ggplot(aes(
    x = reorder(description, -count),
    y = count,
    label = native
  )) +
  geom_col(fill = "grey88") +
  geom_text(family = "OpenSansEmoji", size = 5) +
  labs(x = "") +
  ggtitle(sprintf("%s during Election Day (May 9th, 2017)", search_string)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Close the PDF device so the file is written out
dev.off()
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment