## ayman/get_cited_by_dois.py

## get_cited_by_dois.py
### Get the DOIs for an ACM DL Citation.
###
### David A. Shamma - GNU 2.0 License
###
## Call like this:
## get_citations('1290082.1290120', True)
## or just the tail/acm id
## get_citations('1290120')
##
## Prints to the console:
## DOI_URL <tab> ACM REF.
## or just the ACM REF if full is defaulted to False.
##
## Setting full to true will make a secondary request for every
## citation...slow but will fetch the proper DOI for each citation.

def get_citations(doi, full=False):
  """Print the references that cite an ACM Digital Library article.

  Fetches the "cited by" tab for the article from dl.acm.org and prints
  one line per citing reference found in the returned HTML.

  Args:
    doi: Identifier string. Either the bare ACM id ('1290120') or a
      dotted form whose last dot-separated component is the ACM id
      ('1290082.1290120', '10.1145/1290082.1290120').
    full: When true, one extra request is made per citation and the line
      printed is 'DOI_URL<tab>REFERENCE' if a dx.doi.org link is found
      on the citation page. When false, or when no DOI link is found,
      only the reference text is printed.

  Returns:
    None. All results are written to standard output.

  Errors raised by urllib2 while opening a URL are not caught here and
  propagate to the caller.
  """
  import re
  import urllib2
  from contextlib import closing

  opener = urllib2.build_opener()
  # Deliberately no Accept-Encoding header: urllib2 never decompresses a
  # response body, so advertising gzip/deflate/sdch could yield compressed
  # bytes that the regexes below would silently fail to match.
  opener.addheaders = [('Connection', 'keep-alive'),
                       ('Accept-Language', 'en-US,en;q=0.8'),
                       ('User-Agent', 'Python2.x/urllib2'),
                       ('Accept', '*/*'),
                       ('DNT', '1')]

  # Group 1: relative link to the citing article; group 2: reference text.
  find_cites_regex = re.compile(r'<a href="(citation.+)">\s*(.+)\s+</a>',
                                re.MULTILINE | re.UNICODE)
  # Group 1: the dx.doi.org URL on a citation page.
  find_dois_regex = re.compile(r'<a href="(http://dx.doi.org/.+)" target')

  def _fetch(url):
    "Return the body served at url, always closing the response."
    with closing(opener.open(url)) as response:
      return response.read()

  # The ACM id is the last dot-separated component. Taking the last piece
  # (rather than requiring exactly two pieces) also handles a full DOI such
  # as '10.1145/1290082.1290120'.
  acm_id = doi.rsplit('.', 1)[-1]

  listing = _fetch('http://dl.acm.org/tab_citings.cfm?id=%s' % acm_id)
  for href, reference in find_cites_regex.findall(listing):
    if full:
      # Slow path: open the citing article's page to look for its DOI.
      dois = find_dois_regex.findall(_fetch('http://dl.acm.org/%s' % href))
      if dois:
        print('%s\t%s' % (dois[0], reference))
        continue
    # Either full was not requested or the page had no DOI link.
    print('%s' % reference)
	### Get the DOIs for an ACM DL Citation.
	###
	### David A. Shamma - GNU 2.0 License
	###
	## Call like this:
	## get_citations('1290082.1290120', True)
	## or just the tail/acm id
	## get_citations('1290120')
	##
	## Prints to the console:
	## DOI_URL <tab> ACM REF.
	## or just the ACM REF if full is defaulted to False.
	##
	## Setting full to true will make a secondary request for every
	## citation...slow but will fetch the proper DOI for each citation.

	def get_citations(doi, full=False):
		"""Print the references that cite an ACM Digital Library article.

		Fetches the "cited by" tab for the article from dl.acm.org and prints
		one line per citing reference found in the returned HTML.

		Args:
			doi: Identifier string. Either the bare ACM id ('1290120') or a
				dotted form whose last dot-separated component is the ACM id
				('1290082.1290120', '10.1145/1290082.1290120').
			full: When true, one extra request is made per citation and the
				line printed is 'DOI_URL<tab>REFERENCE' if a dx.doi.org link is
				found on the citation page. When false, or when no DOI link is
				found, only the reference text is printed.

		Returns:
			None. All results are written to standard output.

		Errors raised by urllib2 while opening a URL are not caught here and
		propagate to the caller.
		"""
		import re
		import urllib2
		from contextlib import closing

		opener = urllib2.build_opener()
		# Deliberately no Accept-Encoding header: urllib2 never decompresses a
		# response body, so advertising gzip/deflate/sdch could yield
		# compressed bytes that the regexes below would silently fail to match.
		opener.addheaders = [
			('Connection', 'keep-alive'),
			('Accept-Language', 'en-US,en;q=0.8'),
			('User-Agent', 'Python2.x/urllib2'),
			('Accept', '*/*'),
			('DNT', '1'),
		]

		# Group 1: relative link to the citing article; group 2: reference text.
		find_cites_regex = re.compile(
			r'<a href="(citation.+)">\s*(.+)\s+</a>',
			re.MULTILINE | re.UNICODE)
		# Group 1: the dx.doi.org URL on a citation page.
		find_dois_regex = re.compile(r'<a href="(http://dx.doi.org/.+)" target')

		def _fetch(url):
			"Return the body served at url, always closing the response."
			with closing(opener.open(url)) as response:
				return response.read()

		# The ACM id is the last dot-separated component. Taking the last piece
		# (rather than requiring exactly two pieces) also handles a full DOI
		# such as '10.1145/1290082.1290120'.
		acm_id = doi.rsplit('.', 1)[-1]

		listing = _fetch('http://dl.acm.org/tab_citings.cfm?id=%s' % acm_id)
		for href, reference in find_cites_regex.findall(listing):
			if full:
				# Slow path: open the citing article's page to look for its DOI.
				dois = find_dois_regex.findall(
					_fetch('http://dl.acm.org/%s' % href))
				if dois:
					print('%s\t%s' % (dois[0], reference))
					continue
			# Either full was not requested or the page had no DOI link.
			print('%s' % reference)