Skip to content

Instantly share code, notes, and snippets.

@bearloga
Last active February 23, 2016 21:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bearloga/255073053a9be8236bb2 to your computer and use it in GitHub Desktop.
Save bearloga/255073053a9be8236bb2 to your computer and use it in GitHub Desktop.
Scrapes some basic info from my Twitter followers' bios. I used this to get an approximate lower bound on how many data science-y folks follow me.
# Twitter handle whose follower list is scraped.
your_handle <- "bearloga"
# Note: must be logged in to twitter to view your own or anyone's list of followers
library(magrittr) # install.packages('magrittr')
library(rvest) # install.packages('rvest')
library(RSelenium) # install.packages('RSelenium')
# Scrolling solution by NicE (http://stackoverflow.com/a/29965233/1091835):
# NOTE(review): checkForServer()/startServer() appear to be defunct in newer
# RSelenium releases (rsDriver() is the suggested replacement) -- confirm
# against the installed version.
checkForServer()
startServer()
remote_driver <- remoteDriver$new()
remote_driver$open()
# Fetch the fully scrolled page inside tryCatch() so that the browser session
# is closed even when navigation or scrolling fails part-way through.
page_source <- tryCatch(
  {
    remote_driver$navigate(
      paste0("https://twitter.com/", your_handle, "/followers")
    )
    for (i in seq_len(25L)) {
      # Scroll down N times, waiting for the page to load each time.
      # Integer arithmetic + "%d" keeps the offset a plain integer literal
      # (paste() would render 100000 as "1e+05").
      remote_driver$executeScript(sprintf("scroll(0, %d);", i * 10000L))
      Sys.sleep(3)
    }
    rm(i)
    remote_driver$getPageSource()
  },
  finally = remote_driver$close()
)
# Test: page_source <- read_html("~/Desktop/followers.html")
# getPageSource() returns a list; its first element is parsed as the HTML.
html <- read_html(page_source[[1]])
# Trimmed text of every bio paragraph found inside the follower timeline.
bios <- html_text(
  html_nodes(
    html,
    'div[data-test-selector="ProfileTimeline"] p.ProfileCard-bio'
  ),
  trim = TRUE
)
# One logical column per topic: TRUE when the follower's bio matches the
# topic's pattern. All patterns are case-insensitive except the standalone
# letter "R", which must be upper-case to count.
stats_keywords <- data.frame(
  Biostatistics = grepl("biostat", bios, ignore.case = TRUE),
  Statistics = grepl("stats|statist", bios, ignore.case = TRUE),
  Data = grepl("data", bios, ignore.case = TRUE),
  R = grepl("\\bR\\b", bios) | grepl("rstats", bios, ignore.case = TRUE),
  Datavis = grepl("datavi[sz]|visualiz", bios, ignore.case = TRUE),
  ML = grepl("\\bML\\b|machine\\s?learning", bios, ignore.case = TRUE),
  Ecology = grepl("ecolo|marine life", bios, ignore.case = TRUE),
  # Matches "math", "maths", "mathematics", "mathematician", ...
  Math = grepl("math", bios, ignore.case = TRUE),
  Analytics = grepl("analyst|analytic", bios, ignore.case = TRUE)
)
# Summary flag: TRUE when at least one of the topic columns above is TRUE.
# Reduce() ORs the columns element-wise, so no matrix coercion is needed.
stats_keywords[["Data Science-y\n(Any)"]] <- Reduce(`|`, stats_keywords)
# Screen names from the follower timeline, each prefixed with "@".
handles <- paste0(
  "@",
  html_text(
    html_nodes(
      html,
      'div[data-test-selector="ProfileTimeline"] span.u-linkComplex-target'
    ),
    trim = TRUE
  )
)
# Display names taken from the profile-name links in the same timeline.
display_names <- html_text(
  html_nodes(
    html,
    'div[data-test-selector="ProfileTimeline"] a.ProfileNameTruncated-link'
  ),
  trim = TRUE
)
# handles, display_names and bios come from three independent selectors, so
# their lengths are checked before binding: cbind() would otherwise recycle a
# shorter vector silently and pair bios with the wrong accounts.
stopifnot(
  length(handles) == nrow(stats_keywords),
  length(display_names) == nrow(stats_keywords),
  length(bios) == nrow(stats_keywords)
)
# One row per follower: handle, name, keyword flags, then the raw bio.
followers <- cbind(
  handle = handles,
  name = display_names,
  stats_keywords,
  bio = bios
)
# Put followers with at least one keyword match first.
followers <- followers[
  order(followers[["Data Science-y\n(Any)"]], decreasing = TRUE),
]
# View(followers)
# Share of followers with / without any keyword match.
prop.table(table(followers[["Data Science-y\n(Any)"]])) # ~36.9%
# Next step requires tidyr, dplyr, ggplot2 and scales to be installed.
# Every call is namespace-qualified, so none of them has to be attached.
#
# Bar chart of the proportion of followers whose bio matches each keyword,
# bars ordered from most to least common and labelled with the percentage.
followers %>%
  # Long format: one row per (follower, keyword) pair.
  tidyr::gather(
    "keyword", "indicator",
    Biostatistics:`Data Science-y\n(Any)`
  ) %>%
  dplyr::group_by(keyword) %>%
  # The mean of a logical vector is the proportion of TRUE values.
  dplyr::summarize(prop = mean(indicator)) %>%
  ggplot2::ggplot(
    data = .,
    ggplot2::aes(y = prop, x = reorder(keyword, -prop))
  ) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::scale_y_continuous(
    "Proportion of total followers",
    labels = scales::percent_format()
  ) +
  ggplot2::xlab("Keyword") +
  ggplot2::ggtitle("Keywords found in followers' Twitter bios") +
  # Percentage label placed slightly above the top of each bar.
  ggplot2::geom_text(
    ggplot2::aes(label = sprintf("%.1f%%", 100 * prop), y = prop + 0.01),
    position = ggplot2::position_dodge(width = 1)
  )
@bearloga
Copy link
Author

Note: remoteDriver$new() might have problems if you use Chrome, or at least it did for me when I specified browserName = "chrome" inside of new(). Using Firefox solved the issue for me.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment