Skip to content

Instantly share code, notes, and snippets.

@hemprichbennett
Forked from noamross/get_bat_wiki_pages.R
Last active April 9, 2018 10:10
Show Gist options
  • Save hemprichbennett/62e4facca777d1ea0b0287698d337786 to your computer and use it in GitHub Desktop.
Save hemprichbennett/62e4facca777d1ea0b0287698d337786 to your computer and use it in GitHub Desktop.
This script mines a wikipedia page for bat facts and (if they meet a basic QC) tweets them out. Then it waits 3 hours, before doing it again
##Wikipedia querying from Noam Ross https://gist.github.com/noamross/88a51bb880f18da88e4b259eefdefe87
library(tidyverse)
library(xml2)
library(rvest)
library(WikipediR)
library(urltools)
library(stringr)
library(tokenizers)
library(rtweet)
library(here)
setwd(here())
wait_in_r <- TRUE
wait_duration <- 211*60 #Number of seconds to wait
base_url <- "https://en.wikipedia.org/wiki/List_of_bats"
hashtag <- "#bats" #If you don't want a hashtag just assign this an empty character string
##twitter token was generated with the instructions here(http://rtweet.info/articles/auth.html), but I found it easier to just load the token rather than making it an environment variable
twitter_token <- readRDS('/Users/davehemprichbennett/twitter_token.rds')
#####Noam's wikipedia querying ####
#First, we get some basic information from wikipedia
A <- FALSE
while(A==FALSE){ #I schedule this by using an infinite loop with Sys.sleep used. This is a bad way of doing it, but works. To be ironed out later
# Get all speceies-level page titles from the Wikipedia list of bats
bat_titles <- read_html(base_url) %>%
html_nodes(xpath="//ul/li[contains(., 'Genus')]/ul/li/a[starts-with(@href, '/wiki/')]") %>%
xml_attr("href") %>%
basename() %>%
url_decode()
#####And now we're on Dave's far less elegant code #####
#Now we select a random bat-page, then some random sequential sentences from it, then if they pass some
#QC steps, we output it
tweetable <- FALSE
while(tweetable==FALSE){
bat_info <- map_df(bat_titles[sample(seq(1,length(bat_titles)),1)], function(x) {
return <- page_content(language="en", project="wikipedia", page_name=x)
data_frame(title = return$parse$title,
content = return$parse$text$`*`)
})#Get the info from a random page
# Extract just the text from the HTML
bat_text <- bat_info %>%
mutate(content = map_chr(content, ~html_text(read_html(.))))
row <- sample(seq(1,nrow(bat_text)), 1) #Chose a random entry
sp_name <- bat_text[row,1]$title #Get the bat species' name
section_names <- str_extract_all(bat_text[row,2], ".+\\[edit\\]")[[1]] #Break the wall of text up into names and items
section_names <- gsub('\\[edit\\]', '', section_names)
sections <- str_split(bat_text[row,2], ".+\\[edit\\]")[[1]]
sections <- sections[-1] # I don't want the first entry, its a bit dull and gets in the way of the next line
names(sections) <- section_names
sections <- gsub('\\\n', '', sections) #Clean out all the newline characters
sections <- gsub('\\[.\\]', '', sections) #Clean out all the references, they'll make no sense out of context
sections <- gsub('\\\\', '', sections)
if(length(sections)<2){#Some pages only contain references (length == 1) or are completely blank (length ==0). These are garbage and should be skipped
next()
}
if('References' %in% names(sections)){
ref_pos <- which(names(sections)=='References') #Find where the references section is, get rid of it as it would make for a terrible tweet
sections <- sections[-ref_pos]
}
if('Sources' %in% names(sections)){
source_pos <- which(names(sections)=='Sources') #Ditto for sources
sections <- sections[-source_pos]
}
if('Footnotes' %in% names(sections)){
source_pos <- which(names(sections)=='Footnotes') #Ditto for Footnotes
sections <- sections[-source_pos]
}
if(length(sections)==0){
next()
}
if(length(grep('easurements', names(sections)))>0){ #Measurement based tweets would be very boring, delete the section now
measurement_position <- grep('easurements', names(sections))
sections <- sections[-measurement_position]
}
if(length(sections)==0){
next()
}
section_choice <- sample(seq(1,length(sections)),1)#Choose a random section to tweet from
sentences <- tokenize_sentences(sections[section_choice])[[1]] #Convert the block of text into a vector wher each item is a sentence
if(length(sentences)<2){ #Skip if the section is empty of tiny
next()
}
n_sentences<- length(sentences) #Get how many sentences there are, required for the next two lines
start_point <- sample(seq(1,n_sentences-1), 1) #The position of
end_point <- start_point+ sample(c(1,2),1) #Where will we end our text chunk
outstring <- paste(sentences[c(start_point, end_point)], collapse = ' ') #Make a string out of this
####Now its wikimedia time to get an image and its creator ####
photo_details <- str_split(bat_info[1,2], pattern = '\" src')[[1]][1]
photo_details <- str_split(photo_details, pattern = "<img alt=\\\"")[[1]][2] #This is the NAME of the image, to be queried on wikimedia
photo_details <- gsub(' ', '_', photo_details)
if(nchar(photo_details)==0){#Unable to get a decent photo (another good photo may be available in the page in a different position but the code isn't complex enough to search for it) so skipping
next()
}
photo_credit_url <- paste('https://commons.wikimedia.org/wiki/File:', photo_details, sep ='') #Becuase wikipedia doesn't include creditation for images from wikimedia on the wikipedia page in question, we instead have to query wikimedia for the creditation instead *eyeroll emoji*
wikimedia_text <- html_text(read_html(photo_credit_url))
author <- str_split(pattern='Author', wikimedia_text)[[1]][2]
author <- str_split(pattern='\n', author)[[1]][2]
if(is.na(author)){ #If we were unable to get any info for the author, skip it
next()
}
if(nchar(author)==0){ #If we were unable to get any info for the author, skip it
next()
}
#Now to begin getting the bat image
bat_wiki <- read_html(photo_credit_url)
bat_media <- html_nodes(bat_wiki, ".internal")
bat_media_inf <- html_attrs(bat_media)[[1]]
photo_url <- bat_media_inf[1]
if(length(photo_url)!=1){ #If theres no url available for the image, or our regex gets confused and gives us too many potential images, skip
next()
}
####Now we put all the text together to make the tweet string####
outstring <- gsub("\\s*\\[[^\\)]+\\]","",outstring) #Kill any references that have made it through, as nobody wants [11] in their tweet
outstring <- gsub(".*\\] ","",outstring) #Also occasionally your tweet will begin with half a reference e.g. '"16] Nothing is known about the diet...' This sorts that
outstring <- gsub(".*\\]","",outstring) #Also occasionally your tweet will begin with half a reference e.g. '"16]Nothing is known about the diet...' This sorts that
outstring <- gsub("\\[","",outstring) #Or sometimes ends like ' Leaves from Balanites species and several insects may also be eaten.['
if(grepl("NA", outstring)){ #If we've selected empty space, skip to a new iteration of the while loop
next()
}
outstring <- paste(sp_name, ': ', outstring, sep ='') #Start making the tweet
#print(outstring)
n_chars <- nchar(outstring)
extra_length <- 24 + nchar(hashtag) + nchar(paste('Image by ', author, sep = '')) #all urls take up 23 characters, then one for a space after it, then more for image attribution
if(n_chars <240-extra_length & n_chars >120){ #If the string is a tweetable length and long enough to be interesting, tweet it
page <- page_info("en", "wikipedia", page=sp_name , as_wikitext=TRUE)
url <- page$query$pages[[1]]$fullurl
outstring <- paste(outstring, 'Image by', author, url, hashtag, sep =' ')
print(outstring)
tweetable <- TRUE
}
#outstring <- paste(outstring, 'put_url_here', '#bats', sep =' ')
#print(outstring)
#cat('outstring is ', n_chars, ' long')
#readline(prompt="Press [enter] to continue")
}
download.file(photo_url, 'temp.jpg', mode = 'wb') #This is done near the bottom so we don't regularly download images for potential tweets that don't pass our QC
#####twitter things ####
##Vignette of instructions for using it here http://rtweet.info/articles/auth.html
#now we just tweet the output
post_tweet(status = outstring, token = twitter_token,
in_reply_to_status_id = NULL, media = './temp.jpg')
file.remove('temp.jpg')
print(Sys.time())
if(wait_in_r==TRUE){
Sys.sleep(wait_duration) #The number of seconds to sleep for
}else{
A <- TRUE
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment