Generate BibTeX from the list of Tweets you (or another user) have favorited ('liked'), tweeted, or RT'd

#! /usr/bin/env python3
"""
faves2bibtex.py
Author: Scott Hawley

Scrapes URLs contained in tweets you (or another user) favorited ('liked') for DOI & other bibliographic info,
and tries to generate a set of BibTeX entries to stdout.
Status messages go to stderr.

Sample usage:
    ./faves2bibtex.py drscotthawley | tee refs.bib

Status:
- It will generate BibTeX if it finds a DOI immediately, or if the reference is on arXiv.org but too new to have a DOI
- Otherwise, it tries to cobble together some kind of @misc entry by searching for 'common' meta tags, but often fails :'(
- Added a book ISBN-to-BibTeX function, but it's currently not being used. TODO: scrape for ISBNs.

Tested on: Mac OS X 10.12.6, Python 3.5 (anaconda)

Aside: this project has been eye-opening regarding the number of ways that, even if your HTTP request succeeds,
various library routines may still crash your code.
"""

import tweepy
import sys
import re
import requests
import http.client as client
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import os
import time

# You need to supply your own Twitter API developer keys here.
# Some instructions here: https://www.digitalocean.com/community/tutorials/how-to-authenticate-a-python-application-with-twitter-using-tweepy-on-ubuntu-14-04
consumer_key = '****'
consumer_secret = '****'
access_token = '****'
access_token_secret = '****'

def eprint(*args, **kwargs):   # print to stderr
    print(*args, file=sys.stderr, **kwargs)

def doi2bib(doi):
    """
    Return a BibTeX string of metadata for a given DOI.
    Based on https://gist.github.com/jrsmith3/5513926
    """
    if ("" == doi):
        return ""
    bibtext = ''
    url = "http://dx.doi.org/" + doi
    headers = {"accept": "application/x-bibtex"}
    r = requests.get(url, headers=headers)
    if ('This DOI cannot be found in the DOI System' not in r.text):
        bibtext = r.text
    else:
        eprint("Warning: Attempt to convert DOI", doi, "failed.")
    return bibtext
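
# Example usage (illustrative, not from the original gist): a call like
#     doi2bib("10.1000/xyz123")    # hypothetical DOI
# issues a GET to http://dx.doi.org/<doi> with the "accept: application/x-bibtex"
# header, and the resolver (if the DOI exists) responds with a ready-made BibTeX entry.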

def slurp_url(url):   # just read in an entire webpage and return the source as text
    # from https://stackoverflow.com/questions/13303449/urllib2-httperror-http-error-403-forbidden
    html = ''
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent, }
    request = urllib.request.Request(url, None, headers)
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:   # e.g. 404
        eprint(" slurp_url: http error ", e)
        return ''
    raw = response.read()          # read the bytes once, then try to decode
    try:
        html = raw.decode("utf-8")     # sometimes this gives an 'invalid continuation byte' error...
    except UnicodeDecodeError:
        html = raw.decode("latin-1")   # ...in which case latin-1 usually succeeds
    return html

'''
def slurp_url_old(url):
    html = ''
    try:    # requests is temperamental: https://github.com/requests/requests/issues/3840
        html = requests.get(url).text
    except requests.exceptions.ContentDecodingError:   # Huffington Post generates these regularly: malformed gzipped encoding
        eprint(" ContentDecodingError for url = ", url)
        eprint(" Skipping this url. ")   # tired of this %&$$%$#
    return html
'''

def expand_url(url):
    # requests is nice in that it follows multiple redirects, but it can crash too
    actual_url = ''
    try:    # requests is temperamental: https://github.com/requests/requests/issues/3840
        r = requests.get(url)
        actual_url = r.url
    except requests.exceptions.ContentDecodingError:   # Huffington Post generates these regularly
        # below code from https://stackoverflow.com/questions/4201062/how-can-i-unshorten-a-url
        # this 'old school' approach won't follow multiple redirects, but is otherwise robust
        parsed = urllib.parse.urlparse(url)
        h = client.HTTPConnection(parsed.netloc)
        h.request('HEAD', parsed.path)
        response = h.getresponse()
        if response.status // 100 == 3 and response.getheader('Location'):
            actual_url = response.getheader('Location')
    return actual_url

'''
def expand_url_old(url):   # Twitter's shortened t.co links need expanding
    return old_skool(url)
    actual_url = ''
    query_url = 'https://unshorten.me/s/' + url   # unshorten.me is great but only allows 10 new evals per hour!
    actual_url = requests.get(query_url).text
    eprint(" expand_url: actual_url =", actual_url)
    if actual_url.find("Usage") != -1:
        # if ('Usage' in actual_url):
        eprint(" Hey")
        actual_url = ''
        try:    # requests is temperamental: https://github.com/requests/requests/issues/3840
            r = requests.get(url)
            eprint(" r = ", r)
            actual_url = r.url
        except requests.exceptions.ContentDecodingError:   # Huffington Post generates these regularly
            eprint(" ContentDecodingError for url = ", url)
            eprint(" Trying urllib instead: ")
            parsed = urllib.parse.urlparse(url)
            h = client.HTTPConnection(parsed.netloc)
            h.request('HEAD', parsed.path)
            response = h.getresponse()
            if response.status // 100 == 3 and response.getheader('Location'):
                return response.getheader('Location')
            else:
                return url
    else:
        eprint(" Nope")
    return actual_url
'''

def extract_doi(url, html):   # searches webpage text for the first string matching the DOI format
    doi = ""                  # blank DOI string doubles as error/fail message
    if ('doi' in url):        # a couple of easy special cases
        url = url.replace('http://', '')
        url = url.replace('https://', '')
        doi = url.replace('aapt.scitation.org/doi/', '')
        doi = doi.replace('doi.org/', '')
    else:
        doi_re = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b")   # DOI pattern regexp
        matchObj = doi_re.search(html)
        if matchObj:
            doi = matchObj.group(0)   # grab the first thing in the page that fits the DOI format
    return doi
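
# Illustrative note (not from the original gist): given page HTML containing a string
# such as "10.1000/xyz123" (a hypothetical DOI), extract_doi() should return that
# string, since it fits the 10.NNNN/suffix pattern matched by the regexp above.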

def arxiv_without_doi(url, html):   # if the arXiv entry is so new that it doesn't contain a DOI
    bibtext = ''                    # blank bibtext serves as an initialization and error code
    # first a check: is this a pdf arxiv link? If so, can we get a DOI if we search the 'abs' url?
    if ('pdf' in url):
        new_url = url.replace('.pdf', '')
        new_url = new_url.replace('pdf', 'abs')
        new_html = slurp_url(new_url)
        doi = extract_doi(new_url, new_html)
        if ("" != doi):   # if the webpage contains a DOI, we're finished
            eprint(" Found DOI = ", doi)
            bibtext = doi2bib(doi)
    if ('' == bibtext):
        # bibsonomy.org does a GREAT job of formatting, but sets a limit on how frequently it can be accessed
        query_url = 'http://scraper.bibsonomy.org/service?format=bibtex&selection=&url=' + url
        eprint(" query_url = ", query_url)
        attempts, maxattempts = 0, 10
        while ((attempts < maxattempts) and ('' == bibtext)):
            attempts += 1
            r = requests.get(query_url)
            if ('You sent too many requests' in r.text):
                eprint(" Bibsonomy says we're using it too much. (Attempt", attempts, "of", maxattempts, ").", end="")
                if (attempts < maxattempts):
                    nsecs = 60
                    eprint(" Waiting", nsecs, "seconds before trying again...")
                    time.sleep(nsecs)
            else:   # success!
                bibtext = r.text
        eprint("")
    if ('' == bibtext):   # Try a different method
        # arxiv2bibtex.org: no frequency limits, but the output isn't formatted as nicely IMHO
        arxiv_val = url.replace('https://arxiv.org/abs/', '')   # get only the arXiv index number
        query_url = 'https://arxiv2bibtex.org/?q=' + arxiv_val + '&format=bibtex'
        r = requests.get(query_url)
        soup = BeautifulSoup(''.join(r.text), "html.parser")
        textarea = soup.find('textarea')   # the first textarea from arxiv2bibtex is the BibTeX output
        if (textarea):
            bibtext = textarea.getText()
    return bibtext

def generic_web_page(url, html):
    # For now, we're going to rely largely on common meta tags, e.g. Facebook's Open Graph tags.
    # So far, if it can't find an author, then it doesn't produce anything.
    # TODO: This is horrible and I will gladly replace this
    bibtext = ''
    if ('https://twitter.com/' in url):   # url is un-shortened of course
        eprint(" generic_web_page: skipping 'mere tweet'")
        return ''   # have yet to find any bibtex-able info in a mere tweet
    soup = BeautifulSoup(''.join(html), "html.parser")
    # note: soup.find(name="author") would look for an <author> *tag*; to match
    # <meta name="author" content="..."> we need to filter on the 'name' attribute
    author = soup.find("meta", attrs={"name": "author"})
    if not author:
        author = soup.find(property="og:author")
    eprint(" generic_web_page: author =", author)
    if (author):
        author = author.get("content")
        bibtext += '@misc{' + author + ',\n'
        bibtext += ' Author = {' + author + '},\n'
    else:
        eprint(" skipping.")
    title = soup.find(property="og:title")
    if (author and title):
        title = title.get("content")
        bibtext += ' Title = {' + title + '},\n'
    date = soup.find(itemprop="datePublished")
    if (date):
        date = date.get("content")
    website_name = soup.find(property="og:site_name")
    if (website_name):
        website_name = website_name.get("content")
    if ('' != bibtext):
        bibtext += ' URL = {' + url + '},\n'
        last_access = time.strftime("%b %d %Y")
        bibtext += ' Note = {Last accessed ' + last_access + '},\n'
        bibtext += '}'
    return bibtext

def scrape_for_isbn(actual_url, html):
    isbn = None
    # TODO: put something here!
    return isbn
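
# A minimal sketch of the ISBN scraping mentioned in the TODO above (not part of the
# original gist, and not wired into scrape_for_isbn): look for an "ISBN" label in the
# page text followed by a 10- or 13-digit identifier. Real pages vary a lot, so treat
# this as an illustrative starting point rather than a robust implementation.
def scrape_for_isbn_sketch(html):
    # allow an optional "-10"/"-13" suffix and colon after "ISBN", then digits/hyphens/spaces,
    # ending in a digit or the check character 'X'
    m = re.search(r'ISBN(?:-1[03])?:?\s*([\d][\d\- ]{8,15}[\dXx])', html)
    if m:
        return m.group(1).replace('-', '').replace(' ', '')   # strip separators
    return None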

def isbn_to_bibtex(isbn):
    # source: borrows from https://gist.github.com/wcaleb/5178632
    bibtext = ''
    query_url = 'http://www.ottobib.com/isbn/' + isbn + '/bibtex'
    html = slurp_url(query_url)
    # Use BS4 to get the formatted citation returned by OttoBib
    soup = BeautifulSoup(''.join(html), "html.parser")
    for br in soup.find_all(name='br'):
        br.decompose()
    result = soup.find("div", class_="nine columns")
    if (result):
        bibtext = result.text
    return bibtext

def limit_handled(cursor):   # limits API calls so Twitter won't block the bot
    while True:
        try:
            yield cursor.next()
        except tweepy.RateLimitError:
            mins = 15
            eprint(' Hit the Twitter API rate limit. Waiting', mins, 'minutes')
            time.sleep(mins * 60)   # wait 15 minutes before trying again

def tweet_to_bibtex(tweet, bibcount):
    bibtext = ""
    # get the list of urls in the tweet text
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet.full_text)
    for url in urls:   # all the urls in the Tweet (usually just one) get followed
        bibtext = ""
        eprint(" Trying url = ", url)
        actual_url = expand_url(url)
        eprint(" actual_url = ", actual_url)
        if ('' != actual_url):
            html = slurp_url(actual_url)   # full text of web page
            doi = extract_doi(actual_url, html)
            if ("" != doi):   # if the webpage contains a DOI, we're finished
                eprint(" Found DOI = ", doi)
                bibtext = doi2bib(doi)
            elif ("arxiv.org" in actual_url):   # if the url is for an arXiv post (which doesn't contain a DOI)
                bibtext = arxiv_without_doi(actual_url, html)
            elif ('ISBN' in html):   # somewhere in the linked page there may be a book ISBN id
                isbn = scrape_for_isbn(actual_url, html)
                if (isbn):
                    bibtext = isbn_to_bibtex(isbn)
            else:   # let's try to generate an entry for the linked webpage itself
                bibtext = generic_web_page(actual_url, html)
        if ("" != bibtext):
            bibcount += 1
            print(bibtext, '\n', flush=True)
    return bibtext, bibcount

def scrape_faves(user_id):
    """
    This is the main routine.
    """
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    favecount, bibcount = 0, 0
    # for tweet in api.favorites(screen_name=user_id, include_entities=True, tweet_mode='extended'):  # only does 20 at a time
    for tweet in limit_handled(tweepy.Cursor(api.favorites, screen_name=user_id, include_entities=True, tweet_mode='extended').items()):
        favecount += 1
        bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
        eprint("-----")   # just a divider between tweets
    # Things the user has tweeted or re-tweeted are as bib-worthy as things they've faved
    for tweet in limit_handled(tweepy.Cursor(api.user_timeline, screen_name=user_id, include_entities=True, tweet_mode='extended').items()):
        favecount += 1
        bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
        eprint("-----")   # just a divider between tweets
    eprint(favecount, "favorites, tweets & RTs scraped")
    eprint(bibcount, "BibTeX entries generated.")

if __name__ == '__main__':
    if (False):   # quick-testing block
        eprint(isbn_to_bibtex('0754666913'))   # testing for now; Brent Waters' book
        url = 'https://t.co/Wf9U9fuPoI'    # problem url from huffpo
        url = 'https://fb.me/1sfK1HGSE'    # problem url from fb
        eprint(" trying url = ", url)
        actual_url = expand_url(url)
        eprint(" actual url = ", actual_url)
        html = slurp_url(actual_url)
        eprint(" html = ", html)
    if len(sys.argv) == 2:
        user_id = sys.argv[1]
        scrape_faves(user_id)
    else:
        eprint("Usage: ", sys.argv[0], " <user_id>", sep="")