Skip to content

Instantly share code, notes, and snippets.

@ayman
Last active August 29, 2015 14:16
Show Gist options
  • Save ayman/72385154220bf5fb1d0a to your computer and use it in GitHub Desktop.
Save ayman/72385154220bf5fb1d0a to your computer and use it in GitHub Desktop.
Get the DOIs for an ACM DL Citation...a simple scraper.
### Get the DOIs for an ACM DL Citation.
###
### David A. Shamma - GNU 2.0 License
###
## Call like this:
## get_citations('1290082.1290120', True)
## or just the tail/acm id
## get_citations('1290120')
##
## Prints to the console:
## DOI_URL <tab> ACM REF.
## or just the ACM REF when full is left at its default of False.
##
## Setting full to true will make a secondary request for every
## citation...slow but will fetch the proper DOI for each citation.
def _fetch_page(opener, url):
    """Return the body of *url* as text, closing the response afterwards."""
    from contextlib import closing

    with closing(opener.open(url)) as response:
        body = response.read()
    # Python 3 hands back bytes; on Python 2 the body is already a ``str``,
    # which is what the regexes and ``print`` below expect, so decode only
    # when needed.  NOTE(review): UTF-8 is assumed for the page encoding.
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body


def get_citations(doi, full=False, opener=None):
    """Print the works that cite an ACM DL article, one per line.

    The ACM id is taken from the end of *doi*, so a bare id ('1290120'),
    a DOI suffix ('1290082.1290120') and a full DOI
    ('10.1145/1290082.1290120') all resolve to the same article.

    Args:
        doi: ACM id, DOI suffix or full DOI of the cited article.
        full: When true, make one extra request per citation and print
            ``DOI_URL <tab> ACM REF`` for every citation whose page links
            to dx.doi.org; citations without such a link (and all
            citations when *full* is false) print as the ACM REF only.
        opener: Optional object with a urllib-style ``open(url)`` method.
            When omitted, a default opener with browser-like headers is
            built.

    Returns:
        None; results are written to standard output.
    """
    import re

    if opener is None:
        try:
            import urllib2 as url_lib  # Python 2
        except ImportError:
            import urllib.request as url_lib  # Python 3
        opener = url_lib.build_opener()
        # No Accept-Encoding header on purpose: the response body is used
        # exactly as read, so asking for gzip/deflate/sdch could yield a
        # compressed payload that the regexes below can never match.
        opener.addheaders = [('Connection', 'keep-alive'),
                             ('Accept-Language', 'en-US,en;q=0.8'),
                             ('User-Agent', 'Python2.x/urllib2'),
                             ('Accept', '*/*'),
                             ('DNT', '1')]

    # Raw strings keep the backslash escapes intact for the regex engine.
    find_cites_regex = re.compile(r'<a href="(citation.+)">\s*(.+)\s+</a>',
                                  re.MULTILINE | re.UNICODE)
    find_dois_regex = re.compile(r'<a href="(http://dx.doi.org/.+)" target')

    # The ACM id is whatever follows the last '/' and then the last '.',
    # which copes with bare ids, DOI suffixes and full DOIs alike.
    acm_id = doi.rsplit('/', 1)[-1].rsplit('.', 1)[-1]

    citings_url = 'http://dl.acm.org/tab_citings.cfm?id=%s' % acm_id
    citings_html = _fetch_page(opener, citings_url)

    for href, reference in find_cites_regex.findall(citings_html):
        if full:
            # Secondary request: the citation's own page carries its DOI.
            citation_html = _fetch_page(opener,
                                        'http://dl.acm.org/%s' % href)
            doi_links = find_dois_regex.findall(citation_html)
            if doi_links:
                print("%s\t%s" % (doi_links[0], reference))
                continue
        print('%s' % reference)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment