Last active
August 29, 2015 14:16
-
-
Save ayman/72385154220bf5fb1d0a to your computer and use it in GitHub Desktop.
Get the DOIs for an ACM DL Citation...a simple scraper.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Get the DOIs for an ACM DL Citation.
###
### David A. Shamma - GNU 2.0 License
###
## Call like this:
##   get_citations('1290082.1290120', True)
## or just the tail/acm id
##   get_citations('1290120')
##
## Prints to the console:
##   DOI_URL <tab> ACM REF.
## or just the ACM REF if full is defaulted to False.
##
## Setting full to true will make a secondary request for every
## citation...slow but will fetch the proper DOI for each citation.
import re

# Anchor on the "cited by" tab: group 1 is the relative citation link,
# group 2 the reference text.  ``[^"]+`` keeps the href capture inside its
# own attribute, and the lazy ``.+?`` leaves trailing whitespace out of the
# reference text.
_CITATION_LINK_RE = re.compile(r'<a href="(citation[^"]+)">\s*(.+?)\s+</a>',
                               re.MULTILINE | re.UNICODE)

# DOI resolver link on a citation page.  Accepts both the legacy
# http://dx.doi.org/ form and the current https://doi.org/ form.
_DOI_LINK_RE = re.compile(r'<a href="(https?://(?:dx\.)?doi\.org/[^"]+)" target')


def _build_opener():
    """Return a urllib opener carrying the request headers used for ACM."""
    try:
        import urllib2 as url_lib  # Python 2
    except ImportError:
        import urllib.request as url_lib  # Python 3
    opener = url_lib.build_opener()
    # No Accept-Encoding header on purpose: the body is fed straight to the
    # regexes, so asking for gzip/deflate/sdch would hand us compressed
    # bytes that can never match.
    opener.addheaders = [('Connection', 'keep-alive'),
                         ('Accept-Language', 'en-US,en;q=0.8'),
                         ('User-Agent', 'Python2.x/urllib2'),
                         ('Accept', '*/*'),
                         ('DNT', '1')]
    return opener


def _acm_id(doi):
    """Return the trailing ACM id of *doi*.

    Works for a bare id ('1290120'), a dotted pair ('1290082.1290120') and
    a full DOI ('10.1145/1290082.1290120').
    """
    return doi.strip().rsplit('/', 1)[-1].rsplit('.', 1)[-1]


def _read_text(opener, url):
    """Fetch *url* with *opener*, close the response, return the body as text."""
    response = opener.open(url)
    try:
        body = response.read()
    finally:
        response.close()
    # Python 3 hands back bytes; the regexes are text patterns.
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body


def get_citations(doi, full=False, opener=None):
    """Take a DOI and print the DOIs (ACM Optimized/Defaulted).

    Fetches the ACM DL "cited by" tab for the given id and prints one line
    per citing reference to stdout.

    Args:
        doi: ACM id, dotted id pair, or full DOI; only the trailing ACM id
            is used to build the request.
        full: When true, make a secondary request for every citation and
            print ``DOI_URL<tab>ACM REF``; a citation whose page has no DOI
            link falls back to the reference text alone.  When false, print
            only the reference text.
        opener: Optional object with an ``open(url)`` method returning a
            response that supports ``read()`` and ``close()``.  Defaults to
            a urllib opener with the standard headers.

    Returns:
        None.  Errors raised by the opener propagate to the caller.
    """
    if opener is None:
        opener = _build_opener()

    listing_url = 'http://dl.acm.org/tab_citings.cfm?id=%s' % _acm_id(doi)
    listing = _read_text(opener, listing_url)

    for href, reference in _CITATION_LINK_RE.findall(listing):
        doi_links = []
        if full:
            page = _read_text(opener, 'http://dl.acm.org/%s' % href)
            doi_links = _DOI_LINK_RE.findall(page)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if doi_links:
            print('%s\t%s' % (doi_links[0], reference))
        else:
            print('%s' % reference)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment