drscotthawley/faves2bibtex.py

## faves2bibtex.py
#! /usr/bin/env python3
"""
faves2bibtex.py
Author: Scott Hawley

Scrapes URLs contained in tweets you (or another user) favorited ('liked') for DOI & other bibliographic info,
and tries to generate a set of BibTex entries to stdout.
Status messages go to stderr

Sample usage:
./faves2bibtex.py drscotthawley | tee refs.bib

Status:
  - It will generate BibTeX if it finds a DOI immediately, or if the reference is on arXiv.org but too new to have a DOI
  - Otherwise, it tries to cobble together some kind of @misc entry by searching for 'common' meta tags, but often fails :'(
  - Added a book ISBN-to-BibTeX functionality but it's currently not being used. TODO: scrape for ISBNs.

Tested on: Mac OS X 10.12.6, Python 3.5 (anaconda)

Aside: this project has been eye-opening re. the # of ways that, even if your HTTP request succeeds, various library routines may crash your code
"""

import tweepy
import sys
import re
import requests
#import urllib3
import http.client as client
import urllib

from bs4 import BeautifulSoup
import os
import time

# You need to supply your own Twitter API developer keys here
# Some instructions here: https://www.digitalocean.com/community/tutorials/how-to-authenticate-a-python-application-with-twitter-using-tweepy-on-ubuntu-14-04
consumer_key = '****'
consumer_secret = '****'
access_token = '****'
access_token_secret = '****'


def eprint(*args, **kwargs):         # print to stderr
    print(*args, file=sys.stderr, **kwargs)


def doi2bib(doi):
  """
  Return a bibTeX string of metadata for a given DOI.
  Based on https://gist.github.com/jrsmith3/5513926
  """
  if ("" == doi):
      return ""
  bibtext = ''
  url = "http://dx.doi.org/" + doi
  headers = {"accept": "application/x-bibtex"}
  r = requests.get(url, headers = headers)
  if ('This DOI cannot be found in the DOI System' not in r.text):
      bibtext = r.text
  else:
      eprint("Warning: Attempt to convert DOI",doi,"failed.")
  return bibtext


def slurp_url(url):   # just read in an entire webpage and return the source as text
    # from https://stackoverflow.com/questions/13303449/urllib2-httperror-http-error-403-forbidden
    html = ''
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers={'User-Agent':user_agent,}
    request=urllib.request.Request(url,None,headers)
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError:                          # e.g. 404
        eprint("    slurp_url:  http error ",urllib.error.HTTPError)
        return ''
    try:    # to decode
        html = response.read().decode("utf-8")     # sometimes this gives you 'bad continuation bit' error
    except:
        html = response.read().decode("latin-1")    # ...in which case latin-1 usually succeeds
    return html

'''
def slurp_url_old(url):
    html = ''
    try:                            # requests is tempermental https://github.com/requests/requests/issues/3840
        html = requests.get(url).text
    except requests.exceptions.ContentDecodingError:   # huffington post generates these regularly: malformed gzipped encoding
        eprint("   ContentDecodingError for url = ",url)
        eprint("   Skipping this url.   ")    # tired of this %&$$%$#
    return html
'''

def expand_url(url):
    # requests is nice in that it follows multiple links, but can crash too
    actual_url = ''
    try:                            # requests is tempermental https://github.com/requests/requests/issues/3840
        r = requests.get(url)
        actual_url = r.url
    except requests.exceptions.ContentDecodingError:    # huffington post generates these regularly
        # below code from https://stackoverflow.com/questions/4201062/how-can-i-unshorten-a-url
        # this 'old school' approach won't follow redirects, but is otherwise robust
        parsed = urllib.parse.urlparse(url)
        h = client.HTTPConnection(parsed.netloc)
        h.request('HEAD', parsed.path)
        response = h.getresponse()
        if response.status//100 == 3 and response.getheader('Location'):
            actual_url = response.getheader('Location')
    return actual_url

'''
def expand_url_old(url):            # Twitters tc.o link expression needs expanding
    return old_skool(url)
    actual_url = ''
    query_url = 'https://unshorten.me/s/'+url    # unshorten.me is great but only allows 10 new evals per hour!
    actual_url = requests.get(query_url).text
    eprint("    expand_url: actual_url =",actual_url)
    if actual_url.find("Usage") != -1:
    #if ('Usage' in actual_url):
        eprint("     Hey")
        actual_url = ''
        try:                            # requests is tempermental https://github.com/requests/requests/issues/3840
            r = requests.get(url)
            eprint("   r = ",r)
            actual_url = r.url
        except requests.exceptions.ContentDecodingError:    # huffington post generates these regularly
            eprint("    ContentDecodingError for url = ",url)
            eprint("    Trying urllib instead: ")
            parsed = urllib.parse.urlparse(url)
            h = client.HTTPConnection(parsed.netloc)
            h.request('HEAD', parsed.path)
            response = h.getresponse()
            if response.status//100 == 3 and response.getheader('Location'):
                return response.getheader('Location')
            else:
                return url
    else:
        eprint("      Nope")

    return actual_url
'''

def extract_doi(url, html):    # searches webpage text for the first string matching the DOI format
    doi = ""                     # blank DOI string doubles as error/fail message
    if ('doi' in url):          # a couple easy special cases
        url = url.replace('http://','')
        url = url.replace('https://','')
        doi = url.replace('aapt.scitation.org/doi/','')
        doi = doi.replace('doi.org/','')
    else:
        doi_re = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b")  # doi pattern regexp
        matchObj = doi_re.search(html)
        if matchObj:
           doi = matchObj.group(0)      # grab the first thing in the page that fits the doi format
    return doi


def arxiv_without_doi(url, html):     # if the arxiv entry is so new that it doesn't contain a DOI
    bibtext = ''    # blank bibtext serves an initialization and error code

    # first a check: is this a pdf arxiv link?  If so, can we get a DOI if we search the 'abs' url?
    if ('pdf' in url):
        new_url = url.replace('.pdf','')
        new_url = new_url.replace('pdf','abs')
        new_html = slurp_url(new_url)
        doi = extract_doi(new_url, new_html)
        if ("" != doi):                   # if webpage contains a DOI, we're finished
            eprint("     Found DOI = ",doi)
            bibtext = doi2bib(doi)

    if ('' == bibtext):
        # bibsonomy.org does a GREAT job of formatting, but sets a limit on how frequently it can be accessed
        query_url = 'http://scraper.bibsonomy.org/service?format=bibtex&selection=&url='+url
        eprint("   query_url = ",query_url)
        attempts, maxattempts = 0, 10
        while ((attempts < maxattempts) and ('' == bibtext)):
            attempts += 1
            r = requests.get(query_url)
            if ('You sent too many requests' in r.text):
                eprint("   Bibsonomy says we're using it too much. (Attempt",attempts,"of",maxattempts,").",end="")
                if (attempts < maxattempts):
                    nsecs = 60
                    eprint("  Waiting",nsecs,"seconds before trying again...")
                    time.sleep(nsecs)
            else:                           # success!
                bibtext = r.text
                eprint("")

    if ('' == bibtext):   # Try a different method
        #  arxiv2bibtex.org: no frequency limits but isn't formatted as nicely IMHO
        arxiv_val = url.replace('https://arxiv.org/abs/','')          # get only the arxiv index number
        query_url = 'https://arxiv2bibtex.org/?q='+ arxiv_val +'&format=bibtex'
        r = requests.get(query_url)
        soup = BeautifulSoup(''.join(r.text),  "html.parser")
        textarea = soup.find('textarea')        # the first textarea from arxiv2bibtex is the BibTeX output
        if (textarea):
            bibtext = textarea.getText()

    return bibtext


def generic_web_page(url, html):
    # For now, we're going to largely rely on common meta tags, e.g. facebook
    # So far, if it can't find an author, then it doesn't produce anything.
    # TODO: This is horrible and I will gladly replace this

    bibtext = ''

    if ('https://twitter.com/' in url):   # url is un-shortened of course
        eprint("   generic_web_page: skipping 'mere tweet'")
        return ''               # have yet to find any bibtex-able info in a mere tweet

    soup = BeautifulSoup(''.join(html),  "html.parser")

    author = soup.find(name="author")
    if not author:
        author = soup.find(property="og:author")
    eprint("   generic_web_page: author =",author)
    if (author):
        author = author.get("content")
        bibtext += '@misc{'+author+',\n'
        bibtext += '  Author = {'+author+'},\n'
    else:
        eprint("    skipping.")

    title = soup.find(property="og:title")
    if (author and title):
        title = title.get("content")
        bibtext += '  Title = {'+title+'},\n'

    date = soup.find(itemprop="datePublished")
    if (date):
        date = date.get("content")

    website_name = soup.find(property="og:site_name")
    if (website_name):
        website_name = website_name.get("content")

    if ('' != bibtext):
        bibtext += '  URL = {'+url+'},\n'
        last_access = time.strftime("%b %d %Y")
        bibtext += '  Note = {Last accessed '+last_access+'},\n'
        bibtext += '}'
    return bibtext


def scrape_for_isbn(actual_url, html):
    isbn = None
    # TODO: put something here!
    return isbn


def isbn_to_bibtex(isbn):
    # source: borrows from https://gist.github.com/wcaleb/5178632
    bibtext = ''
    query_url = 'http://www.ottobib.com/isbn/'+isbn+'/bibtex'
    html = slurp_url(query_url)
    # Use BS4 to get the formatted citation returned by OttoBib
    soup = BeautifulSoup(''.join(html), "html.parser")
    for br in soup.find_all(name='br'):
    	br.decompose()
    result = soup.find("div", class_="nine columns")
    if (result):
        bibtext = result.text
    return bibtext


def limit_handled(cursor):   # limits API calls so Twitter won't block the bot
    while True:
        try:
            yield cursor.next()
        except tweepy.RateLimitError:
            mins = 15
            eprint(' Hit the Twitter API rate limit.  Waiting',mins,'minutes')
            time.sleep(mins * 60)   # wait 15 minutes before trying again


def tweet_to_bibtex(tweet, bibcount):
    bibtext = ""
    # get list or urls
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet.full_text)
    for url in urls:   # all the urls in the Tweet (usually just one) get followed
        bibtext = ""
        eprint("   Trying url = ",url)
        actual_url = expand_url(url)
        eprint("   actual_url = ",actual_url)
        if ('' != actual_url):
            html = slurp_url(actual_url)   # full text of web page
            doi = extract_doi(actual_url, html)
            if ("" != doi):                   # if webpage contains a DOI, we're finished
                eprint("     Found DOI = ",doi)
                bibtext = doi2bib(doi)
            elif ("arxiv.org" in actual_url):  # if the url is for an arxiv post (which doesn't contain a DOI)
                bibtext = arxiv_without_doi(actual_url, html)
            elif ('ISBN' in html):             # somewhere in the linked page may be a book ISBN id
                isbn = scrape_for_isbn(actual_url, html)
                if (isbn):
                    bibtext = isbn_to_bibtex(isbn)
            else:                              # let's try to generate an entry for the linked webpage itself
                bibtext = generic_web_page(actual_url, html)

        if ("" != bibtext):
            bibcount += 1
            print(bibtext,'\n',flush=True)

    return bibtext, bibcount


def scrape_faves(user_id):
    """
    This is the main routine.
    """
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    favecount, bibcount = 0, 0
    for tweet in limit_handled( tweepy.Cursor(api.favorites, screen_name = user_id, include_entities = True, tweet_mode='extended').items() ):
    #for tweet in api.favorites(screen_name = user_id, include_entities = True, tweet_mode='extended'):  # only does 20 at a time
        favecount += 1
        bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
        eprint("-----")   # just a divider between tweets

    # Things the user has tweeted or re-tweeted are as bib-worthy as things they've faved
    for tweet in limit_handled( tweepy.Cursor(api.user_timeline, screen_name = user_id, include_entities = True, tweet_mode='extended').items() ):
        favecount += 1
        bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
        eprint("-----")   # just a divider between tweets

    eprint(favecount,"favorites, tweets & RTs scraped")
    eprint(bibcount,"BibTeX entries generated.")


if __name__ == '__main__':
    if (False):   # quick-testing block
        eprint(isbn_to_bibtex('0754666913'))  # testing for now; brent waters' book
        url = 'https://t.co/Wf9U9fuPoI'   # problem url from huffpo
        url = 'https://fb.me/1sfK1HGSE'  # problem url from fb
        eprint(" trying url = ",url)
        actual_url = expand_url(url)
        eprint(" actual url =  ",actual_url)
        html = slurp_url(actual_url)
        eprint(" html =  ",html)

    if len(sys.argv) == 2:
        user_id = sys.argv[1]
        scrape_faves(user_id)
    else:
        eprint("Usage: ",sys.argv[0]," <user_id>",sep="")
	#! /usr/bin/env python3
	"""
	faves2bibtex.py
	Author: Scott Hawley

	Scrapes URLs contained in tweets you (or another user) favorited ('liked') for DOI & other bibliographic info,
	and tries to generate a set of BibTex entries to stdout.
	Status messages go to stderr

	Sample usage:
	./faves2bibtex.py drscotthawley \| tee refs.bib

	Status:
	- It will generate BibTeX if it finds a DOI immediately, or if the reference is on arXiv.org but too new to have a DOI
	- Otherwise, it tries to cobble together some kind of @misc entry by searching for 'common' meta tags, but often fails :'(
	- Added a book ISBN-to-BibTeX functionality but it's currently not being used. TODO: scrape for ISBNs.

	Tested on: Mac OS X 10.12.6, Python 3.5 (anaconda)

	Aside: this project has been eye-opening re. the # of ways that, even if your HTTP request succeeds, various library routines may crash your code
	"""

	import tweepy
	import sys
	import re
	import requests
	#import urllib3
	import http.client as client
	import urllib

	from bs4 import BeautifulSoup
	import os
	import time

	# You need to supply your own Twitter API developer keys here
	# Some instructions here: https://www.digitalocean.com/community/tutorials/how-to-authenticate-a-python-application-with-twitter-using-tweepy-on-ubuntu-14-04
	consumer_key = '****'
	consumer_secret = '****'
	access_token = '****'
	access_token_secret = '****'


	def eprint(args, *kwargs): # print to stderr
	print(args, file=sys.stderr, *kwargs)


	def doi2bib(doi):
	"""
	Return a bibTeX string of metadata for a given DOI.
	Based on https://gist.github.com/jrsmith3/5513926
	"""
	if ("" == doi):
	return ""
	bibtext = ''
	url = "http://dx.doi.org/" + doi
	headers = {"accept": "application/x-bibtex"}
	r = requests.get(url, headers = headers)
	if ('This DOI cannot be found in the DOI System' not in r.text):
	bibtext = r.text
	else:
	eprint("Warning: Attempt to convert DOI",doi,"failed.")
	return bibtext


	def slurp_url(url): # just read in an entire webpage and return the source as text
	# from https://stackoverflow.com/questions/13303449/urllib2-httperror-http-error-403-forbidden
	html = ''
	user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
	headers={'User-Agent':user_agent,}
	request=urllib.request.Request(url,None,headers)
	try:
	response = urllib.request.urlopen(request)
	except urllib.error.HTTPError: # e.g. 404
	eprint(" slurp_url: http error ",urllib.error.HTTPError)
	return ''
	try: # to decode
	html = response.read().decode("utf-8") # sometimes this gives you 'bad continuation bit' error
	except:
	html = response.read().decode("latin-1") # ...in which case latin-1 usually succeeds
	return html

	'''
	def slurp_url_old(url):
	html = ''
	try: # requests is tempermental https://github.com/requests/requests/issues/3840
	html = requests.get(url).text
	except requests.exceptions.ContentDecodingError: # huffington post generates these regularly: malformed gzipped encoding
	eprint(" ContentDecodingError for url = ",url)
	eprint(" Skipping this url. ") # tired of this %&$$%$#
	return html
	'''

	def expand_url(url):
	# requests is nice in that it follows multiple links, but can crash too
	actual_url = ''
	try: # requests is tempermental https://github.com/requests/requests/issues/3840
	r = requests.get(url)
	actual_url = r.url
	except requests.exceptions.ContentDecodingError: # huffington post generates these regularly
	# below code from https://stackoverflow.com/questions/4201062/how-can-i-unshorten-a-url
	# this 'old school' approach won't follow redirects, but is otherwise robust
	parsed = urllib.parse.urlparse(url)
	h = client.HTTPConnection(parsed.netloc)
	h.request('HEAD', parsed.path)
	response = h.getresponse()
	if response.status//100 == 3 and response.getheader('Location'):
	actual_url = response.getheader('Location')
	return actual_url

	'''
	def expand_url_old(url): # Twitters tc.o link expression needs expanding
	return old_skool(url)
	actual_url = ''
	query_url = 'https://unshorten.me/s/'+url # unshorten.me is great but only allows 10 new evals per hour!
	actual_url = requests.get(query_url).text
	eprint(" expand_url: actual_url =",actual_url)
	if actual_url.find("Usage") != -1:
	#if ('Usage' in actual_url):
	eprint(" Hey")
	actual_url = ''
	try: # requests is tempermental https://github.com/requests/requests/issues/3840
	r = requests.get(url)
	eprint(" r = ",r)
	actual_url = r.url
	except requests.exceptions.ContentDecodingError: # huffington post generates these regularly
	eprint(" ContentDecodingError for url = ",url)
	eprint(" Trying urllib instead: ")
	parsed = urllib.parse.urlparse(url)
	h = client.HTTPConnection(parsed.netloc)
	h.request('HEAD', parsed.path)
	response = h.getresponse()
	if response.status//100 == 3 and response.getheader('Location'):
	return response.getheader('Location')
	else:
	return url
	else:
	eprint(" Nope")

	return actual_url
	'''

	def extract_doi(url, html): # searches webpage text for the first string matching the DOI format
	doi = "" # blank DOI string doubles as error/fail message
	if ('doi' in url): # a couple easy special cases
	url = url.replace('http://','')
	url = url.replace('https://','')
	doi = url.replace('aapt.scitation.org/doi/','')
	doi = doi.replace('doi.org/','')
	else:
	doi_re = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'])\S)+)\b") # doi pattern regexp
	matchObj = doi_re.search(html)
	if matchObj:
	doi = matchObj.group(0) # grab the first thing in the page that fits the doi format
	return doi


	def arxiv_without_doi(url, html): # if the arxiv entry is so new that it doesn't contain a DOI
	bibtext = '' # blank bibtext serves an initialization and error code

	# first a check: is this a pdf arxiv link? If so, can we get a DOI if we search the 'abs' url?
	if ('pdf' in url):
	new_url = url.replace('.pdf','')
	new_url = new_url.replace('pdf','abs')
	new_html = slurp_url(new_url)
	doi = extract_doi(new_url, new_html)
	if ("" != doi): # if webpage contains a DOI, we're finished
	eprint(" Found DOI = ",doi)
	bibtext = doi2bib(doi)

	if ('' == bibtext):
	# bibsonomy.org does a GREAT job of formatting, but sets a limit on how frequently it can be accessed
	query_url = 'http://scraper.bibsonomy.org/service?format=bibtex&selection=&url='+url
	eprint(" query_url = ",query_url)
	attempts, maxattempts = 0, 10
	while ((attempts < maxattempts) and ('' == bibtext)):
	attempts += 1
	r = requests.get(query_url)
	if ('You sent too many requests' in r.text):
	eprint(" Bibsonomy says we're using it too much. (Attempt",attempts,"of",maxattempts,").",end="")
	if (attempts < maxattempts):
	nsecs = 60
	eprint(" Waiting",nsecs,"seconds before trying again...")
	time.sleep(nsecs)
	else: # success!
	bibtext = r.text
	eprint("")

	if ('' == bibtext): # Try a different method
	# arxiv2bibtex.org: no frequency limits but isn't formatted as nicely IMHO
	arxiv_val = url.replace('https://arxiv.org/abs/','') # get only the arxiv index number
	query_url = 'https://arxiv2bibtex.org/?q='+ arxiv_val +'&format=bibtex'
	r = requests.get(query_url)
	soup = BeautifulSoup(''.join(r.text), "html.parser")
	textarea = soup.find('textarea') # the first textarea from arxiv2bibtex is the BibTeX output
	if (textarea):
	bibtext = textarea.getText()

	return bibtext


	def generic_web_page(url, html):
	# For now, we're going to largely rely on common meta tags, e.g. facebook
	# So far, if it can't find an author, then it doesn't produce anything.
	# TODO: This is horrible and I will gladly replace this

	bibtext = ''

	if ('https://twitter.com/' in url): # url is un-shortened of course
	eprint(" generic_web_page: skipping 'mere tweet'")
	return '' # have yet to find any bibtex-able info in a mere tweet

	soup = BeautifulSoup(''.join(html), "html.parser")

	author = soup.find(name="author")
	if not author:
	author = soup.find(property="og:author")
	eprint(" generic_web_page: author =",author)
	if (author):
	author = author.get("content")
	bibtext += '@misc{'+author+',\n'
	bibtext += ' Author = {'+author+'},\n'
	else:
	eprint(" skipping.")

	title = soup.find(property="og:title")
	if (author and title):
	title = title.get("content")
	bibtext += ' Title = {'+title+'},\n'

	date = soup.find(itemprop="datePublished")
	if (date):
	date = date.get("content")

	website_name = soup.find(property="og:site_name")
	if (website_name):
	website_name = website_name.get("content")

	if ('' != bibtext):
	bibtext += ' URL = {'+url+'},\n'
	last_access = time.strftime("%b %d %Y")
	bibtext += ' Note = {Last accessed '+last_access+'},\n'
	bibtext += '}'
	return bibtext


	def scrape_for_isbn(actual_url, html):
	isbn = None
	# TODO: put something here!
	return isbn


	def isbn_to_bibtex(isbn):
	# source: borrows from https://gist.github.com/wcaleb/5178632
	bibtext = ''
	query_url = 'http://www.ottobib.com/isbn/'+isbn+'/bibtex'
	html = slurp_url(query_url)
	# Use BS4 to get the formatted citation returned by OttoBib
	soup = BeautifulSoup(''.join(html), "html.parser")
	for br in soup.find_all(name='br'):
	br.decompose()
	result = soup.find("div", class_="nine columns")
	if (result):
	bibtext = result.text
	return bibtext


	def limit_handled(cursor): # limits API calls so Twitter won't block the bot
	while True:
	try:
	yield cursor.next()
	except tweepy.RateLimitError:
	mins = 15
	eprint(' Hit the Twitter API rate limit. Waiting',mins,'minutes')
	time.sleep(mins * 60) # wait 15 minutes before trying again


	def tweet_to_bibtex(tweet, bibcount):
	bibtext = ""
	# get list or urls
	urls = re.findall('http[s]?://(?:[a-zA-Z]\|[0-9]\|[$-_@.&+]\|[!*\(\),]\|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet.full_text)
	for url in urls: # all the urls in the Tweet (usually just one) get followed
	bibtext = ""
	eprint(" Trying url = ",url)
	actual_url = expand_url(url)
	eprint(" actual_url = ",actual_url)
	if ('' != actual_url):
	html = slurp_url(actual_url) # full text of web page
	doi = extract_doi(actual_url, html)
	if ("" != doi): # if webpage contains a DOI, we're finished
	eprint(" Found DOI = ",doi)
	bibtext = doi2bib(doi)
	elif ("arxiv.org" in actual_url): # if the url is for an arxiv post (which doesn't contain a DOI)
	bibtext = arxiv_without_doi(actual_url, html)
	elif ('ISBN' in html): # somewhere in the linked page may be a book ISBN id
	isbn = scrape_for_isbn(actual_url, html)
	if (isbn):
	bibtext = isbn_to_bibtex(isbn)
	else: # let's try to generate an entry for the linked webpage itself
	bibtext = generic_web_page(actual_url, html)

	if ("" != bibtext):
	bibcount += 1
	print(bibtext,'\n',flush=True)

	return bibtext, bibcount


	def scrape_faves(user_id):
	"""
	This is the main routine.
	"""
	auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
	auth.set_access_token(access_token, access_token_secret)
	api = tweepy.API(auth)

	favecount, bibcount = 0, 0
	for tweet in limit_handled( tweepy.Cursor(api.favorites, screen_name = user_id, include_entities = True, tweet_mode='extended').items() ):
	#for tweet in api.favorites(screen_name = user_id, include_entities = True, tweet_mode='extended'): # only does 20 at a time
	favecount += 1
	bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
	eprint("-----") # just a divider between tweets

	# Things the user has tweeted or re-tweeted are as bib-worthy as things they've faved
	for tweet in limit_handled( tweepy.Cursor(api.user_timeline, screen_name = user_id, include_entities = True, tweet_mode='extended').items() ):
	favecount += 1
	bibtext, bibcount = tweet_to_bibtex(tweet, bibcount)
	eprint("-----") # just a divider between tweets

	eprint(favecount,"favorites, tweets & RTs scraped")
	eprint(bibcount,"BibTeX entries generated.")


	if __name__ == '__main__':
	if (False): # quick-testing block
	eprint(isbn_to_bibtex('0754666913')) # testing for now; brent waters' book
	url = 'https://t.co/Wf9U9fuPoI' # problem url from huffpo
	url = 'https://fb.me/1sfK1HGSE' # problem url from fb
	eprint(" trying url = ",url)
	actual_url = expand_url(url)
	eprint(" actual url = ",actual_url)
	html = slurp_url(actual_url)
	eprint(" html = ",html)

	if len(sys.argv) == 2:
	user_id = sys.argv[1]
	scrape_faves(user_id)
	else:
	eprint("Usage: ",sys.argv[0]," <user_id>",sep="")