Skip to content

Instantly share code, notes, and snippets.

@smach
Created July 2, 2021 01:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save smach/4860d2b5d7a69a24353af450e486cc39 to your computer and use it in GitHub Desktop.
Save smach/4860d2b5d7a69a24353af450e486cc39 to your computer and use it in GitHub Desktop.
For Dr. Emma Hodcroft
library(rvest)
library(rio)
library(dplyr)
library(xml2)
# Input: a spreadsheet file "data.xlsx" containing a column called
# "submitting_lab". `data` ends up as the distinct values of that column.
data <- unique(
  dplyr::pull(rio::import("data.xlsx"), submitting_lab)
)
# Look up the top three candidate Twitter handles for one submitter by running
# a Google search restricted to twitter.com.
#
# Args:
#   submitter: a single character string (the name to search for).
#
# Returns:
#   A one-row data.frame with columns Submitter, Google1, Google2, Google3.
#   Each GoogleN is "@" followed by the cleaned text of the N-th matching node
#   in the result page, or NA when the page had fewer than N matching nodes.
#
# Side effects:
#   Performs one HTTP request to Google, then sleeps for 3 seconds so that
#   mapping this function over many submitters does not pound Google.
get_google_top_links <- function(submitter) {
  # Percent-encode the search term on its own. Calling URLencode() on the
  # finished URL is a no-op here: the URL already contains "%3A", and with the
  # default repeated = FALSE URLencode() returns such "apparently encoded"
  # input unchanged, so "&", "#", "+" etc. in a name would corrupt the query.
  submitter_encoded <- URLencode(submitter, reserved = TRUE, repeated = TRUE)
  # Keep "+" as the word separator, as in the original query format.
  submitter_encoded <- gsub("%20", "+", submitter_encoded, fixed = TRUE)
  url <- paste0(
    "https://www.google.com/search?q=site%3Atwitter.com+",
    submitter_encoded
  )

  my_html <- xml2::read_html(url)
  result_nodes <- rvest::html_nodes(
    my_html,
    xpath = "//div/div/div/a/div[not(div)]"
  )
  titles <- rvest::html_text(result_nodes)

  # Indexing 1:3 pads with NA when fewer than three nodes matched.
  titles <- titles[1:3]
  titles <- gsub("twitter.com › ", "", titles, fixed = TRUE)
  titles <- gsub(" › status.*?$", "", titles)

  # Leave missing results as NA instead of producing the bogus handle "@NA".
  handles <- ifelse(is.na(titles), NA_character_, paste0("@", titles))

  mydf <- data.frame(
    Submitter = submitter,
    Google1 = handles[1],
    Google2 = handles[2],
    Google3 = handles[3]
  )
  Sys.sleep(3)
  mydf
}
# Trial run on at most the first 10 submitters. head() is used rather than
# data[1:10] because the latter pads with NA when there are fewer than 10
# names, which would trigger Google searches for the literal text "NA".
my_results <- purrr::map_df(head(data, 10), get_google_top_links)
# OPTIONAL: Get HTML table of results so you can click to see whether you want those Twitter handles
library(DT)
library(htmltools)
# Build an HTML anchor that links a Twitter handle to its profile page.
#
# Args:
#   handle: character vector of handles, with or without "@".
#
# Returns:
#   A character vector of the same length: "<a href='...'>handle</a>" for each
#   handle, or NA where the handle is NA.
#
# The handle text comes from scraped web content and the result is rendered
# with escaping turned off, so HTML-special characters are escaped here to
# keep stray markup from breaking (or injecting into) the table.
get_clickable_url <- function(handle) {
  escape_html <- function(x) {
    # "&" must be replaced first so later entities are not double-escaped.
    x <- gsub("&", "&amp;", x, fixed = TRUE)
    x <- gsub("<", "&lt;", x, fixed = TRUE)
    x <- gsub(">", "&gt;", x, fixed = TRUE)
    x <- gsub("\"", "&quot;", x, fixed = TRUE)
    gsub("'", "&#39;", x, fixed = TRUE)
  }

  handle1 <- gsub("@", "", handle, fixed = TRUE)
  url <- paste0(
    "<a href='https://twitter.com/", escape_html(handle1), "'>",
    escape_html(handle), "</a>"
  )
  # paste0() would turn a missing handle into a link to ".../NA".
  url[is.na(handle)] <- NA_character_
  url
}
# Turn each candidate-handle column into clickable profile links.
Google1links <- purrr::map_chr(my_results$Google1, get_clickable_url)
Google2links <- purrr::map_chr(my_results$Google2, get_clickable_url)
Google3links <- purrr::map_chr(my_results$Google3, get_clickable_url)

# Replace the plain handles with their link versions. Google3 must use
# Google3links (it was previously filled with the Google2 links by mistake).
my_results_table_data <- my_results %>%
  mutate(
    Google1 = Google1links,
    Google2 = Google2links,
    Google3 = Google3links
  )

# escape = FALSE lets the <a> tags render as real links in the widget.
DT::datatable(my_results_table_data, escape = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment