Look up UTs in InCites and UTs, PMIDs, DOIs, or ISSNs in AMR, writing the results to CSV.
"""
Look up batches of UTs in InCites.
Run as:
$ python incites_batch_lookup.py sample_file.csv outputfile.csv
"""
import csv
import os
import sys
import time
from itertools import izip_longest

import requests

INCITES_KEY = os.environ['INCITES_KEY']
def grouper(iterable, n, fillvalue=None):
    """
    Group iterable into n sized chunks.
    See: http://stackoverflow.com/a/312644/758157
    """
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)
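# Illustration: grouper("ABCDE", 2) yields ('A', 'B'), ('C', 'D'),
# ('E', None); the trailing None fill values are why get() below filters
# them out before joining the UTs.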
def get(batch):
    rsp = requests.get(
        'https://api.thomsonreuters.com/incites_ps/v1/DocumentLevelMetricsByUT/json',
        params={
            'X-TR-API-APP-ID': INCITES_KEY,
            'UT': ",".join([b for b in batch if b is not None]),
        }
    )
    if rsp.status_code != 200:
        print>>sys.stderr, "Batch failed with", len(batch), "items."
        print>>sys.stderr, rsp
        return []
    return rsp.json()['api'][0]['rval']
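# Shape inferred from the parsing above: the JSON body looks like
# {"api": [{"rval": [{...one metrics dict per UT...}]}]}, and each rval
# entry becomes one output row in main() below.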
def main():
    # Number of UTs to send to InCites at once
    BATCH_SIZE = 200
    to_check = []
    with open(sys.argv[1]) as infile:
        for row in csv.DictReader(infile):
            for k, v in row.items():
                if k.lower().strip() == "ut":
                    to_check.append(v.strip().replace("WOS:", ""))
    with open(sys.argv[2], 'wb') as outfile:
        writer = csv.writer(outfile)
        first = True
        for idx, batch in enumerate(grouper(to_check, BATCH_SIZE)):
            print>>sys.stderr, "Processing batch", idx
            found = get(batch)
            for grp in found:
                if first is True:
                    # write header
                    writer.writerow(grp.keys())
                    first = False
                writer.writerow(grp.values())
            time.sleep(.5)


if __name__ == "__main__":
    main()
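# Note: os.environ['INCITES_KEY'] raises a KeyError at import time if the
# key is not set, so export an InCites developer key first (shell sketch,
# value elided):
#
#   $ export INCITES_KEY=...
#   $ python incites_batch_lookup.py sample_file.csv outputfile.csv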
"""
Expects an incoming CSV file with ISSNs and will generate output from AMR.
E.g.
ISSN
1234-4900
3902-3829
You can optionally include an ID column for the journal
ID,ISSN
13, 2309-9302
39, 3990-2123
Run as:
$ python issns_to_jcr.py sample_file.csv outputfile.csv
"""
import csv
import os
import sys
import xml.etree.ElementTree as ET

import requests

USER = os.environ['LAMR_USER']
PASSWORD = os.environ['LAMR_PASSWORD']

ns = {'isi': 'http://www.isinet.com/xrpc41'}
request_template = u"""<?xml version="1.0" encoding="UTF-8" ?>
<request xmlns="http://www.isinet.com/xrpc41" src="app.id=InternalVIVODemo">
  <fn name="LinksAMR.retrieve">
    <list>
      <!-- authentication -->
      <map>
        <val name="username">{user}</val>
        <val name="password">{password}</val>
      </map>
      <!-- what to return -->
      <map>
        <list name="JCR">
          <val>impactGraphURL</val>
          <val>issn</val>
        </list>
      </map>
      <!-- LOOKUP DATA -->
      {items}
    </list>
  </fn>
</request>
"""
def prep_amr(items):
    """
    Build one lookup element per (id, issn) pair, e.g.:

    <map name="cite_1">
        <val name="{id_type}">{value}</val>
    </map>
    """
    map_items = ET.Element("map")
    for item_id, issn in items:
        if (item_id is None) or (issn is None):
            continue
        this_item = ET.Element("map", name=str(item_id))
        de = ET.Element("val", name="issn")
        de.text = issn
        this_item.append(de)
        map_items.append(this_item)
    request_items = ET.tostring(map_items)
    return request_template.format(user=USER, password=PASSWORD, items=request_items)
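# For example, prep_amr([(13, "2309-9302")]) (hypothetical ID/ISSN pair)
# embeds
#
#   <map><map name="13"><val name="issn">2309-9302</val></map></map>
#
# as the LOOKUP DATA portion of the request.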
def read(raw):
    """Parse the xrpc response into a dict of {cite_key: {field: value}}."""
    tree = ET.fromstring(raw)
    out = {}
    for cite in tree.findall('isi:fn/isi:map/isi:map', ns):
        cite_key = cite.attrib['name']
        meta = {}
        for val in cite.findall('isi:map/isi:val', ns):
            meta[val.attrib['name']] = val.text
        out[cite_key] = meta
    return out
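# Shape inferred from the XPath above: the response nests one
# <map name="{cite_key}"> per lookup item under fn/map, and each of those
# holds an inner <map> of <val name="...">...</val> fields.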
def get(msg):
    rsp = requests.post(
        'https://ws.isiknowledge.com/cps/xrpc',
        data=msg,
        headers={'Content-Type': "application/xml"}
    )
    return read(rsp.text)
def main():
    journals = []
    with open(sys.argv[1]) as infile:
        for num, row in enumerate(csv.DictReader(infile)):
            print>>sys.stderr, "Processing", row['ISSN']
            jid = row.get('ID', num)
            journals.append((jid, row['ISSN']))
    amr_message = prep_amr(journals)
    found = get(amr_message)
    with open(sys.argv[2], 'wb') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(('number', 'ISSN', 'JCR'))
        for item in found:
            writer.writerow([item, found[item].get('issn', 'na'),
                             found[item].get('impactGraphURL', 'na')])


if __name__ == "__main__":
    main()
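# The output CSV has one row per matched journal; a hypothetical row:
#
#   number,ISSN,JCR
#   13,2309-9302,na
#
# where 'na' appears whenever AMR returned no issn or impactGraphURL field.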
"""
Expects an incoming CSV file with UT, PMID, or DOI headers and wil post to
AMR in batches of 50.
E.g.
UT
01234
02394
039039
PMID
2093030
2405903
95930303
Run as:
$ python uts_batch_lookup.py sample_file.csv outputfile.csv
"""
import csv
import os
import sys
import xml.etree.ElementTree as ET
from itertools import izip_longest

import requests

USER = os.environ['LAMR_USER']
PASSWORD = os.environ['LAMR_PASSWORD']

ns = {'isi': 'http://www.isinet.com/xrpc41'}
ET.register_namespace("isi", "http://www.isinet.com/xrpc41")
def grouper(iterable, n, fillvalue=None):
    """
    Group iterable into n sized chunks.
    See: http://stackoverflow.com/a/312644/758157
    """
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)
def read(raw):
    """Parse the xrpc response into a dict of {cite_key: {field: value}}."""
    tree = ET.fromstring(raw)
    out = {}
    for cite in tree.findall('isi:fn/isi:map/isi:map', ns):
        cite_key = cite.attrib['name']
        meta = {}
        for val in cite.findall('isi:map/isi:val', ns):
            meta[val.attrib['name']] = val.text
        out[cite_key] = meta
    return out
request_template = u"""<?xml version="1.0" encoding="UTF-8" ?>
<request xmlns="http://www.isinet.com/xrpc41" src="app.id=InternalVIVODemo">
  <fn name="LinksAMR.retrieve">
    <list>
      <!-- authentication -->
      <map>
        <val name="username">{user}</val>
        <val name="password">{password}</val>
      </map>
      <!-- what to return -->
      <map>
        <list name="WOS">
          <val>sourceURL</val>
          <val>ut</val>
          <val>doi</val>
          <val>pmid</val>
          <val>timesCited</val>
        </list>
      </map>
      <!-- LOOKUP DATA -->
      {items}
    </list>
  </fn>
</request>
"""
def get(request_xml):
    rsp = requests.post(
        'https://ws.isiknowledge.com/cps/xrpc',
        data=request_xml,
        headers={'Content-Type': "application/xml"}
    )
    return read(rsp.text)
def prep_amr(items, local_id="id"):
    """
    Build one lookup element per publication, e.g.:

    <map name="cite_1">
        <val name="{id_type}">{value}</val>
    </map>
    """
    map_items = ET.Element("map")
    for idx, pub in enumerate(items):
        if pub is None:
            continue
        local_id_value = pub.get(local_id) or pub.get(local_id.upper())
        if local_id_value is None:
            local_id_value = str(idx)
        this_item = ET.Element("map", name=local_id_value)
        for k, v in pub.items():
            if v is None:
                continue
            de = ET.Element("val", name=k.lower())
            de.text = v.strip()
            this_item.append(de)
        map_items.append(this_item)
    request_items = ET.tostring(map_items)
    return request_template.format(user=USER, password=PASSWORD, items=request_items)
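# For a row {"ut": "000280037800007"} (hypothetical UT) at index 0, this
# produces <map name="0"><val name="ut">000280037800007</val></map>
# inside the request's LOOKUP DATA section.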
def main():
    # AMR will take 50 items at a time.
    BATCH_SIZE = 50
    found = []
    to_check = []
    with open(sys.argv[1]) as infile:
        for row in csv.DictReader(infile):
            d = {}
            for k, v in row.items():
                d[k.lower()] = v.strip()
            to_check.append(d)
    for idx, batch in enumerate(grouper(to_check, BATCH_SIZE)):
        xml = prep_amr(batch)
        print>>sys.stderr, "Processing batch", idx
        # Post the batch
        found.append(get(xml))
    with open(sys.argv[2], 'wb') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(('id', 'ut', 'doi', 'pmid', 'times cited', 'source'))
        for grp in found:
            for k, item in grp.items():
                ut = item.get('ut')
                if ut is not None:
                    ut = "WOS:" + ut
                writer.writerow([k, ut, item.get('doi', ""), item.get('pmid', ""),
                                 item.get('timesCited', '0'), item.get('sourceURL', 'N/A')])


if __name__ == "__main__":
    main()
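# Both AMR scripts read credentials from the LAMR_USER and LAMR_PASSWORD
# environment variables, so export those first (shell sketch, values elided):
#
#   $ export LAMR_USER=... LAMR_PASSWORD=...
#   $ python uts_batch_lookup.py sample_file.csv outputfile.csv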