Generate BibTeX from the list of Tweets you (or another user) have favorited ('liked'), tweeted, or RT'd
#! /usr/bin/env python3
"""
faves2bibtex.py
Author: Scott Hawley
Scrapes URLs contained in tweets you (or another user) have favorited ('liked') for DOIs & other bibliographic info,
and tries to generate a set of BibTeX entries to stdout.
Status messages go to stderr.
Sample usage:
./faves2bibtex.py drscotthawley | tee refs.bib
Status:
- It will generate BibTeX if it finds a DOI immediately, or if the reference is on arXiv.org but too new to have a DOI
- Otherwise, it tries to cobble together some kind of @misc entry by searching for 'common' meta tags, but often fails :'(
- Added book ISBN-to-BibTeX functionality, but it's currently not being used. TODO: scrape pages for ISBNs.
Tested on: Mac OS X 10.12.6, Python 3.5 (anaconda)
Aside: this project has been eye-opening regarding the number of ways that, even if your HTTP request succeeds, various library routines may still crash your code.
"""
import tweepy
import sys
import re
import requests
#import urllib3
import http.client as client
import urllib.request, urllib.error, urllib.parse  # plain 'import urllib' does not make these submodules available
from bs4 import BeautifulSoup
import os
import time
# You need to supply your own Twitter API developer keys here
# Some instructions here: https://www.digitalocean.com/community/tutorials/how-to-authenticate-a-python-application-with-twitter-using-tweepy-on-ubuntu-14-04
consumer_key = '****'
consumer_secret = '****'
access_token = '****'
access_token_secret = '****'
def eprint(*args, **kwargs): # print to stderr
print(*args, file=sys.stderr, **kwargs)
def doi2bib(doi):
"""
Return a bibTeX string of metadata for a given DOI.
Based on https://gist.github.com/jrsmith3/5513926
"""
if ("" == doi):
return ""
bibtext = ''
url = "http://dx.doi.org/" + doi
headers = {"accept": "application/x-bibtex"}
r = requests.get(url, headers = headers)
if ('This DOI cannot be found in the DOI System' not in r.text):
bibtext = r.text
else:
eprint("Warning: Attempt to convert DOI",doi,"failed.")
return bibtext
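# Illustrative usage sketch for doi2bib() (the DOI below is made up; any valid DOI works):
# doi.org supports content negotiation, so requesting "application/x-bibtex" returns a
# ready-made BibTeX record for the DOI.
#   example_doi = '10.1234/example.5678'   # hypothetical DOI, for illustration only
#   bib = doi2bib(example_doi)             # a single '@...{...}' entry on success, '' on failure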
def slurp_url(url): # just read in an entire webpage and return the source as text
# from https://stackoverflow.com/questions/13303449/urllib2-httperror-http-error-403-forbidden
html = ''
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,}
request=urllib.request.Request(url,None,headers)
try:
response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e: # e.g. 404
        eprint(" slurp_url: http error ", e)
        return ''
    raw = response.read() # read the body only once; calling read() a second time would return empty bytes
    try: # to decode
        html = raw.decode("utf-8") # sometimes this raises an 'invalid continuation byte' error
    except UnicodeDecodeError:
        html = raw.decode("latin-1") # ...in which case latin-1 always succeeds (every byte maps to a character)
    return html
'''
def slurp_url_old(url):
html = ''
try: # requests is tempermental https://github.com/requests/requests/issues/3840
html = requests.get(url).text
except requests.exceptions.ContentDecodingError: # huffington post generates these regularly: malformed gzipped encoding
eprint(" ContentDecodingError for url = ",url)
eprint(" Skipping this url. ") # tired of this %&$$%$#
return html
'''
def expand_url(url):
# requests is nice in that it follows multiple links, but can crash too
actual_url = ''
    try: # requests is temperamental https://github.com/requests/requests/issues/3840
r = requests.get(url)
actual_url = r.url
except requests.exceptions.ContentDecodingError: # huffington post generates these regularly
        # below code from https://stackoverflow.com/questions/4201062/how-can-i-unshorten-a-url
        # this 'old school' approach won't follow a chain of redirects, but is otherwise robust
        parsed = urllib.parse.urlparse(url)
        conn_class = client.HTTPSConnection if parsed.scheme == 'https' else client.HTTPConnection
        h = conn_class(parsed.netloc)
        h.request('HEAD', parsed.path or '/')
        response = h.getresponse()
        if response.status//100 == 3 and response.getheader('Location'):
            actual_url = response.getheader('Location')
        else: # not a redirect after all; fall back to the original url
            actual_url = url
    return actual_url
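# Illustrative usage sketch for expand_url() (the short link below is made up): Twitter wraps
# every link in a t.co shortener, so the raw tweet text never contains the real destination.
#   actual = expand_url('https://t.co/abcd1234')   # hypothetical t.co link
# should come back as the final landing URL after following redirects, or fall back to the
# original url (or '') if something goes wrong along the way.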
'''
def expand_url_old(url): # Twitters tc.o link expression needs expanding
return old_skool(url)
actual_url = ''
query_url = 'https://unshorten.me/s/'+url # unshorten.me is great but only allows 10 new evals per hour!
actual_url = requests.get(query_url).text
eprint(" expand_url: actual_url =",actual_url)
if actual_url.find("Usage") != -1:
#if ('Usage' in actual_url):
eprint(" Hey")
actual_url = ''
try: # requests is tempermental https://github.com/requests/requests/issues/3840
r = requests.get(url)
eprint(" r = ",r)
actual_url = r.url
except requests.exceptions.ContentDecodingError: # huffington post generates these regularly
eprint(" ContentDecodingError for url = ",url)
eprint(" Trying urllib instead: ")
parsed = urllib.parse.urlparse(url)
h = client.HTTPConnection(parsed.netloc)
h.request('HEAD', parsed.path)
response = h.getresponse()
if response.status//100 == 3 and response.getheader('Location'):
return response.getheader('Location')
else:
return url
else:
eprint(" Nope")
return actual_url
'''
def extract_doi(url, html): # searches webpage text for the first string matching the DOI format
doi = "" # blank DOI string doubles as error/fail message
    if ('doi' in url): # a couple of easy special cases
        url = url.replace('http://','')
        url = url.replace('https://','')
        doi = url.replace('aapt.scitation.org/doi/','')
        doi = doi.replace('dx.doi.org/','') # strip 'dx.doi.org/' before 'doi.org/' so no stray 'dx.' is left behind
        doi = doi.replace('doi.org/','')
else:
doi_re = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b") # doi pattern regexp
matchObj = doi_re.search(html)
if matchObj:
doi = matchObj.group(0) # grab the first thing in the page that fits the doi format
return doi
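# Example of what extract_doi() picks up (made-up values, for illustration only): the regex above
# matches tokens of the form 10.<registrant>/<suffix>, so a page containing something like
#   <meta name="citation_doi" content="10.1234/example.5678">
# would yield the DOI '10.1234/example.5678' -- whatever matches first in the page source wins.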
def arxiv_without_doi(url, html): # if the arxiv entry is so new that it doesn't contain a DOI
    bibtext = '' # blank bibtext serves as both an initialization and an error code
# first a check: is this a pdf arxiv link? If so, can we get a DOI if we search the 'abs' url?
if ('pdf' in url):
new_url = url.replace('.pdf','')
new_url = new_url.replace('pdf','abs')
new_html = slurp_url(new_url)
doi = extract_doi(new_url, new_html)
if ("" != doi): # if webpage contains a DOI, we're finished
eprint(" Found DOI = ",doi)
bibtext = doi2bib(doi)
if ('' == bibtext):
# bibsonomy.org does a GREAT job of formatting, but sets a limit on how frequently it can be accessed
query_url = 'http://scraper.bibsonomy.org/service?format=bibtex&selection=&url='+url
eprint(" query_url = ",query_url)
attempts, maxattempts = 0, 10
while ((attempts < maxattempts) and ('' == bibtext)):
attempts += 1
r = requests.get(query_url)
if ('You sent too many requests' in r.text):
eprint(" Bibsonomy says we're using it too much. (Attempt",attempts,"of",maxattempts,").",end="")
if (attempts < maxattempts):
nsecs = 60
eprint(" Waiting",nsecs,"seconds before trying again...")
time.sleep(nsecs)
else: # success!
bibtext = r.text
eprint("")
if ('' == bibtext): # Try a different method
# arxiv2bibtex.org: no frequency limits but isn't formatted as nicely IMHO
        arxiv_val = re.sub(r'https?://arxiv\.org/(?:abs|pdf)/', '', url).replace('.pdf','') # get only the arXiv index number (handles /abs/ and /pdf/ links)
query_url = 'https://arxiv2bibtex.org/?q='+ arxiv_val +'&format=bibtex'
r = requests.get(query_url)
soup = BeautifulSoup(''.join(r.text), "html.parser")
textarea = soup.find('textarea') # the first textarea from arxiv2bibtex is the BibTeX output
if (textarea):
bibtext = textarea.getText()
return bibtext
def generic_web_page(url, html):
# For now, we're going to largely rely on common meta tags, e.g. facebook
# So far, if it can't find an author, then it doesn't produce anything.
# TODO: This is horrible and I will gladly replace this
bibtext = ''
if ('https://twitter.com/' in url): # url is un-shortened of course
eprint(" generic_web_page: skipping 'mere tweet'")
return '' # have yet to find any bibtex-able info in a mere tweet
soup = BeautifulSoup(''.join(html), "html.parser")
    author = soup.find("meta", attrs={"name": "author"}) # note: find(name="author") would look for an <author> tag, not <meta name="author">
if not author:
author = soup.find(property="og:author")
eprint(" generic_web_page: author =",author)
if (author):
author = author.get("content")
bibtext += '@misc{'+author+',\n'
bibtext += ' Author = {'+author+'},\n'
else:
eprint(" skipping.")
title = soup.find(property="og:title")
if (author and title):
title = title.get("content")
bibtext += ' Title = {'+title+'},\n'
    date = soup.find(itemprop="datePublished")
    if (author and date):
        date = date.get("content")
        bibtext += ' Date = {'+date+'},\n' # 'Date' is a biblatex field; plain BibTeX users may prefer 'Year'
    website_name = soup.find(property="og:site_name")
    if (author and website_name):
        website_name = website_name.get("content")
        bibtext += ' Howpublished = {'+website_name+'},\n'
if ('' != bibtext):
bibtext += ' URL = {'+url+'},\n'
last_access = time.strftime("%b %d %Y")
bibtext += ' Note = {Last accessed '+last_access+'},\n'
bibtext += '}'
return bibtext
def scrape_for_isbn(actual_url, html):
isbn = None
# TODO: put something here!
return isbn
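# One possible sketch for the TODO above (untested, and the helper name is made up): grab the
# first ISBN-10/13-looking token that follows an 'ISBN' label in the page source, and strip
# hyphens/spaces so the result can be handed straight to isbn_to_bibtex().
def scrape_for_isbn_sketch(actual_url, html):
    m = re.search(r'ISBN(?:-1[03])?[:\s]*([0-9][0-9 -]{8,16}[0-9Xx])', html)
    if m:
        return re.sub(r'[ -]', '', m.group(1)) # e.g. 'ISBN: 0-7546-6691-3' -> '0754666913'
    return None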
def isbn_to_bibtex(isbn):
# source: borrows from https://gist.github.com/wcaleb/5178632
bibtext = ''
query_url = 'http://www.ottobib.com/isbn/'+isbn+'/bibtex'
html = slurp_url(query_url)
# Use BS4 to get the formatted citation returned by OttoBib
soup = BeautifulSoup(''.join(html), "html.parser")
for br in soup.find_all(name='br'):
br.decompose()
result = soup.find("div", class_="nine columns")
if (result):
bibtext = result.text
return bibtext
def limit_handled(cursor): # limits API calls so Twitter won't block the bot
while True:
try:
yield cursor.next()
        except tweepy.RateLimitError:
            mins = 15
            eprint(' Hit the Twitter API rate limit. Waiting',mins,'minutes')
            time.sleep(mins * 60) # wait 15 minutes before trying again
        except StopIteration: # cursor exhausted; end the generator cleanly (needed on Python 3.7+)
            return
def tweet_to_bibtex(tweet, bibcount):
bibtext = ""
    # get the list of urls
urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet.full_text)
for url in urls: # all the urls in the Tweet (usually just one) get followed
bibtext = ""
eprint(" Trying url = ",url)
actual_url = expand_url(url)
eprint(" actual_url = ",actual_url)
if ('' != actual_url):
html = slurp_url(actual_url) # full text of web page
doi = extract_doi(actual_url, html)
if ("" != doi): # if webpage contains a DOI, we're finished
eprint(" Found DOI = ",doi)
bibtext = doi2bib(doi)
elif ("arxiv.org" in actual_url): # if the url is for an arxiv post (which doesn't contain a DOI)
bibtext = arxiv_without_doi(actual_url, html)
elif ('ISBN' in html): # somewhere in the linked page may be a book ISBN id
isbn = scrape_for_isbn(actual_url, html)
if (isbn):
bibtext = isbn_to_bibtex(isbn)
else: # let's try to generate an entry for the linked webpage itself
bibtext = generic_web_page(actual_url, html)
if ("" != bibtext):
bibcount += 1
print(bibtext,'\n',flush=True)
return bibtext, bibcount
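# To recap the flow in tweet_to_bibtex() above: each url in the tweet is expanded and its page
# fetched, then the lookup falls through in order: a DOI found on the page -> doi2bib(),
# an arxiv.org page without a DOI -> arxiv_without_doi(), a page mentioning 'ISBN' ->
# scrape_for_isbn()/isbn_to_bibtex(), and anything else -> a generic_web_page() @misc attempt.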
def scrape_faves(user_id):
"""
This is the main routine.
"""
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
favecount, bibcount = 0, 0
for tweet in limit_handled( tweepy.Cursor(api.favorites, screen_name = user_id, include_entities = True, tweet_mode='extended').items() ):
#for tweet in api.favorites(screen_name = user_id, include_entities = True, tweet_mode='extended'): # only does 20 at a time
favecount += 1
bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
eprint("-----") # just a divider between tweets
# Things the user has tweeted or re-tweeted are as bib-worthy as things they've faved
for tweet in limit_handled( tweepy.Cursor(api.user_timeline, screen_name = user_id, include_entities = True, tweet_mode='extended').items() ):
favecount += 1
bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
eprint("-----") # just a divider between tweets
eprint(favecount,"favorites, tweets & RTs scraped")
eprint(bibcount,"BibTeX entries generated.")
if __name__ == '__main__':
if (False): # quick-testing block
eprint(isbn_to_bibtex('0754666913')) # testing for now; brent waters' book
url = 'https://t.co/Wf9U9fuPoI' # problem url from huffpo
url = 'https://fb.me/1sfK1HGSE' # problem url from fb
eprint(" trying url = ",url)
actual_url = expand_url(url)
eprint(" actual url = ",actual_url)
html = slurp_url(actual_url)
eprint(" html = ",html)
if len(sys.argv) == 2:
user_id = sys.argv[1]
scrape_faves(user_id)
else:
eprint("Usage: ",sys.argv[0]," <user_id>",sep="")