Skip to content

Instantly share code, notes, and snippets.

@titipata
Last active November 15, 2016 17:24
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save titipata/8466a44e30948f54fbbdd1fb59b2438f to your computer and use it in GitHub Desktop.
Save titipata/8466a44e30948f54fbbdd1fb59b2438f to your computer and use it in GitHub Desktop.
WoS extract

Web of Science parser

Import dependencies

import xml.etree.cElementTree as ET
from lxml import etree
from itertools import chain
import pandas as pd

Function to read one record at a time from the XML file

def get_record(filehandle):
    """Read the next ``<REC>...</REC>`` chunk from an open text file.

    Lines are consumed from ``filehandle`` until a line that starts with
    ``<REC`` is found; that line and every following line are collected up
    to and including the first line whose stripped text ends with
    ``</REC>``.

    Returns the collected lines joined into one string, or ``None`` when
    the file is exhausted before a complete record has been read.
    """
    collected = []
    for line in filehandle:
        # Nothing collected yet means the opening tag has not been seen:
        # skip everything that precedes it.
        if not collected and not line.startswith('<REC'):
            continue
        collected.append(line)
        if line.strip().endswith('</REC>'):
            return ''.join(collected)
    return None

reading xml file

# Read up to 100000 <REC> chunks from the export and parse each with lxml.
# ``records`` holds the parsed elements, ``bad_records`` the raw text of
# chunks that could not be parsed, and ``count`` the number of chunks read.
count = 0
records = []
bad_records = []
with open('/Users/titipat/Desktop/INDIANA_U_1997_2016_20161019125045.xml', 'r') as file:
    while len(records) < 100000:
        record = get_record(file)
        if record is None:
            # get_record returns None once the file is exhausted. Without
            # this check the loop never terminates for files holding fewer
            # than 100000 parseable records (None would be appended to
            # bad_records forever).
            break
        count += 1
        try:
            rec = etree.fromstring(record)
        except (etree.XMLSyntaxError, ValueError):
            # Keep the raw text of malformed chunks for later inspection.
            bad_records.append(record)
        else:
            records.append(rec)

Disambiguation

Now that we have a list of records, we can extract the fields we are interested in.

def extract_wos_id(elem):
    """Return the text of the first ``UID`` child of ``elem``.

    Raises ``IndexError`` when ``elem`` has no ``UID`` child.
    """
    uid_nodes = elem.findall('UID')
    return uid_nodes[0].text
def extract_authors(elem, wos_id):
    """Build one dict per child of every ``static_data/summary/names`` node.

    Each dict carries the columns ``addr_num``, ``position``, ``reprint``,
    ``cluster_id`` and ``role`` (taken from the ``addr_no``, ``seq_no``,
    ``reprint``, ``dais_id`` and ``role`` attributes, ``'NULL'`` when the
    attribute is absent), the given ``wos_id``, and one extra entry per
    element yielded by ``iter()`` on the child (the child itself and its
    descendants), keyed by tag with ``str(text)`` as value.

    Returns the list of dicts in document order.
    """
    # (output column, XML attribute) pairs, in output order.
    attribute_columns = (
        ('addr_num', 'addr_no'),
        ('position', 'seq_no'),
        ('reprint', 'reprint'),
        ('cluster_id', 'dais_id'),
        ('role', 'role'),
    )

    authors = []
    for names_node in elem.iterfind('./static_data/summary/names'):
        for name_node in names_node:
            row = {}
            for column, attribute in attribute_columns:
                row[column] = name_node.attrib.get(attribute, 'NULL')
            row['wos_id'] = wos_id
            # Later tags overwrite earlier entries that share the same key.
            row.update((str(node.tag), str(node.text)) for node in name_node.iter())
            authors.append(row)
    return authors

Then you can just run

# One list of author rows per record, in the same order as ``records``.
authors = []
for r in records:
    wos_id = extract_wos_id(r)
    authors.append(extract_authors(r, wos_id))

# Flatten the per-record lists into a single table of authors.
authors_df = pd.DataFrame(list(chain.from_iterable(authors)))

To get institution addresses

def extract_addresses(elem, wos_id):
    """Extract address rows and name/address links from one record.

    For every ``static_data/fullrecord_metadata/addresses/address_name``
    node:

    * a base row is built from the ``addr_no`` attribute of its first
      ``address_spec`` child plus the text of any ``full_address``,
      ``city``, ``state``, ``country`` and ``zip`` elements found inside it;
    * one address row is emitted for every (organization, suborganization)
      pair found inside it, using ``'NULL'`` when there is no organization
      or no suborganization;
    * one relation row (``position``, ``addr_num``, ``wos_id``) is emitted
      for every ``names/name`` child, from its ``seq_no`` and ``addr_no``
      attributes.

    Returns the tuple ``(addresslist, name_address_relation)``.

    Raises ``IndexError`` if an ``address_name`` has no ``address_spec``
    child and ``KeyError`` if a required attribute is missing.
    """
    address_fields = ('full_address', 'city', 'state', 'country', 'zip')
    addresslist = []
    name_address_relation = []

    for address_name in elem.iterfind('./static_data/fullrecord_metadata/addresses/address_name'):
        first_spec = address_name.findall('./address_spec')[0]
        base_row = {'addr_num': first_spec.attrib['addr_no'],
                    'organization': 'NULL'}

        # Single walk over the subtree: pick up the scalar address fields
        # and gather every organization / suborganization name.
        organizations = []
        suborganizations = []
        for node in address_name.iter():
            if node.tag in address_fields:
                base_row[str(node.tag)] = str(node.text)
            elif node.tag == 'organization':
                organizations.append(node.text)
            elif node.tag == 'suborganization':
                suborganizations.append(node.text)

        # One row per organization/suborganization combination.
        addresslist.extend(
            dict(base_row, organization=org, suborganization=suborg, wos_id=wos_id)
            for org in (organizations or ['NULL'])
            for suborg in (suborganizations or ['NULL'])
        )

        # Link each author listed under this address to its address number.
        name_address_relation.extend(
            {'position': name.attrib['seq_no'],
             'addr_num': name.attrib['addr_no'],
             'wos_id': wos_id}
            for name in address_name.iterfind('./names/name')
        )

    return addresslist, name_address_relation
# Run the address extraction over every record, keeping the two kinds of
# result (address rows and name/address links) in parallel per-record lists.
address_lists, name_address_relations = [], []
for r in records:
    wos_id = extract_wos_id(r)
    addresslist, name_address_relation = extract_addresses(r, wos_id)
    address_lists += [addresslist]
    name_address_relations += [name_address_relation]

Now, transform the results into pandas DataFrames

# Flatten the per-record lists so that each DataFrame has one row per dict.
address_lists_df = pd.DataFrame(list(chain.from_iterable(address_lists)))
name_address_relations_df = pd.DataFrame(list(chain.from_iterable(name_address_relations)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment