# This script mines a Wikipedia page for bat facts and, if they pass some basic QC, tweets them out. It then waits `wait_duration` seconds (about three and a half hours as configured) before doing it all again.
##Wikipedia querying from Noam Ross
# ---- Settings ----
# Flag kept from the original script; nothing in the visible part of this
# file reads it.
wait_in_r <- TRUE

# Number of seconds to wait between tweets (211 minutes)
wait_duration <- 60 * 211

# Page that is scraped for the list of bat species pages.
# NOTE(review): this is an empty string in this file - the real URL needs to
# be filled in before the script can run.
base_url <- ""

# Hashtag appended to every tweet. If you don't want a hashtag just assign
# this an empty character string
hashtag <- "#bats"

# The twitter token was generated beforehand (the link to the instructions is
# missing from this comment). I found it easier to just load the saved token
# rather than making it an environment variable.
twitter_token <- readRDS(file = "/Users/davehemprichbennett/twitter_token.rds")
#####Noam's wikipedia querying ####
#First, we get some basic information from wikipedia
# Main bot loop.
#
# Each pass picks a random bat page, builds a candidate tweet (species name, a
# few consecutive sentences, image credit, page URL and hashtag) and runs it
# through a series of QC checks. A candidate that fails any check is abandoned
# with `next` and a new page is tried straight away. A candidate that passes
# is tweeted together with its image, after which the loop sleeps for
# `wait_duration` seconds.
#
# I schedule this by using an infinite loop with Sys.sleep used. This is a bad
# way of doing it, but works. To be ironed out later.
#
# Reads from the enclosing script: base_url, hashtag, twitter_token,
# wait_duration.
#
# NOTE(review): the previous version of this loop did not parse (undefined
# loop variable, QC checks without `next`/closing braces, a dangling pipe, a
# bare `if({`). The control flow below is a reconstruction from the comments
# on each check - please confirm it matches the intended behaviour.

bat_titles <- NULL # Cached list of species page titles, refreshed after each tweet

repeat {
  ##### Noam's wikipedia querying #####
  # Get all species-level page titles from the Wikipedia list of bats. This is
  # only re-scraped once per tweet, not once per rejected candidate.
  if (is.null(bat_titles)) {
    # NOTE(review): the pipeline used to end on a trailing `%>%`, so a final
    # step (e.g. URL-decoding of the titles) may have been lost - confirm.
    bat_titles <- read_html(base_url) %>%
      html_nodes(xpath = "//ul/li[contains(., 'Genus')]/ul/li/a[starts-with(@href, '/wiki/')]") %>%
      xml_attr("href") %>%
      basename()
    if (length(bat_titles) == 0) {
      stop("No bat page titles found at base_url", call. = FALSE)
    }
  }

  ##### And now we're on Dave's far less elegant code #####
  # Select a random bat page and fetch its parsed content
  page_title <- sample(bat_titles, 1)
  page <- page_content(language = "en", project = "wikipedia", page_name = page_title)
  sp_name <- page$parse$title      # The bat species' name
  page_html <- page$parse$text$`*` # Raw HTML of the page body
  page_text <- html_text(read_html(page_html)) # Just the text from the HTML

  # Break the wall of text up into named sections, using the "[edit]" marker
  # that follows every section heading
  section_pattern <- ".+\\[edit\\]"
  section_names <- str_extract_all(page_text, section_pattern)[[1]]
  section_names <- gsub("\\[edit\\]", "", section_names)
  sections <- str_split(page_text, section_pattern)[[1]]
  sections <- sections[-1] # Drop the text before the first heading
  names(sections) <- section_names

  sections <- gsub('\\\n', '', sections) # Clean out all the newline characters
  # Clean out all the references ([1], [12], [a], ...), they'll make no sense
  # out of context. `]` placed first in the bracket expression is a literal.
  sections <- gsub("\\[[^]]*\\]", "", sections)
  sections <- gsub('\\\\', '', sections) # Clean out stray backslashes

  # Some pages only contain references (length == 1) or are completely blank
  # (length == 0). These are garbage and should be skipped
  if (length(sections) < 2) next

  # References, sources and footnotes would make for a terrible tweet, and
  # measurement based tweets would be very boring: delete those sections now
  boring <- names(sections) %in% c("References", "Sources", "Footnotes") |
    grepl("easurements", names(sections))
  sections <- sections[!boring]
  if (length(sections) == 0) next # Nothing tweetable left on this page

  # Choose a random section and convert the block of text into a vector where
  # each item is a sentence
  section_choice <- sample(seq_along(sections), 1)
  sentences <- tokenize_sentences(sections[section_choice])[[1]]
  if (length(sentences) < 2) next # Skip if the section is empty or tiny

  # Take two or three consecutive sentences starting at a random position.
  # end_point is clamped so we can never index past the last sentence.
  n_sentences <- length(sentences)
  start_point <- sample.int(n_sentences - 1, 1)
  end_point <- min(start_point + sample(c(1, 2), 1), n_sentences)
  outstring <- paste(sentences[start_point:end_point], collapse = " ")

  #### Now its wikimedia time to get an image and its creator ####
  # The alt text of the first image on the page is the NAME of the image, to
  # be queried on wikimedia
  photo_details <- str_split(page_html, pattern = '\" src')[[1]][1]
  photo_details <- str_split(photo_details, pattern = "<img alt=\\\"")[[1]][2]
  photo_details <- gsub(" ", "_", photo_details)
  # Unable to get a decent photo (another good photo may be available in the
  # page in a different position but the code isn't complex enough to search
  # for it) so skipping. is.na() covers pages with no <img> tag at all.
  if (is.na(photo_details) || nchar(photo_details) == 0) next

  # Because wikipedia doesn't include credit for wikimedia images on the
  # wikipedia page itself, we have to query wikimedia for the credit instead.
  # NOTE(review): the URL prefix is an empty string in this file - the
  # wikimedia file-page prefix needs to be filled in before this can work.
  photo_credit_url <- paste0("", photo_details)
  bat_wiki <- read_html(photo_credit_url) # Fetched once, used for credit and image
  wikimedia_text <- html_text(bat_wiki)
  author <- str_split(wikimedia_text, pattern = "Author")[[1]][2]
  author <- str_split(author, pattern = "\n")[[1]][2]
  # If we were unable to get any info for the author, skip it
  if (is.na(author) || nchar(author) == 0) next

  # Now to begin getting the bat image
  bat_media <- html_nodes(bat_wiki, ".internal")
  if (length(bat_media) == 0) next # No image link on the page
  photo_url <- html_attrs(bat_media)[[1]][1]
  # If there's no url available for the image, skip
  if (length(photo_url) != 1 || is.na(photo_url)) next

  #### Now we put all the text together to make the tweet string ####
  # Kill any references that have made it through, as nobody wants [11] in
  # their tweet. The match stops at the first `]` so text between two
  # references is kept.
  outstring <- gsub("\\s*\\[[^]]+\\]", "", outstring)
  # Occasionally the text begins with half a reference, e.g.
  # '16] Nothing is known about the diet...' or '16]Nothing is known...'
  outstring <- gsub(".*\\] ", "", outstring)
  outstring <- gsub(".*\\]", "", outstring)
  # Or sometimes ends like '...several insects may also be eaten.['
  outstring <- gsub("\\[", "", outstring)
  # No check for a literal "NA" is needed here: end_point is clamped above, so
  # a missing sentence can no longer be pasted in, and such a check would also
  # throw away real text containing e.g. "DNA".

  outstring <- paste0(sp_name, ": ", outstring) # Start making the tweet
  n_chars <- nchar(outstring)
  credit <- paste0("Image by ", author)
  # All urls take up 23 characters, then one for a space after it, then more
  # for the hashtag and the image attribution
  extra_length <- 24 + nchar(hashtag) + nchar(credit)
  # Only tweet if the string is a tweetable length and long enough to be
  # interesting
  if (n_chars >= 240 - extra_length || n_chars <= 120) next

  page_meta <- page_info("en", "wikipedia", page = sp_name, as_wikitext = TRUE)
  url <- page_meta$query$pages[[1]]$fullurl
  outstring <- paste(outstring, credit, url, hashtag, sep = " ")

  # The download is done near the bottom so we don't regularly download images
  # for potential tweets that don't pass our QC
  download.file(photo_url, "temp.jpg", mode = "wb")

  ##### twitter things #####
  # Now we just tweet the output
  post_tweet(
    status = outstring, token = twitter_token,
    in_reply_to_status_id = NULL, media = "./temp.jpg"
  )

  Sys.sleep(wait_duration) # The number of seconds to sleep for
  bat_titles <- NULL       # Force a fresh species list for the next tweet
}
