titipata/wos_extract.md

## wos_extract.md

      
    Raw
  

              wos_extract.md
            
          
    Web of Science parser

Import dependencies (install `lxml` and `pandas` first)
import xml.etree.cElementTree as ET
from lxml import etree
from itertools import chain
import pandas as pd
Function to read one record from the XML file
def get_record(filehandle):
    """Read the next complete ``<REC ...> ... </REC>`` record from a file.

    Lines are consumed from ``filehandle`` and discarded until one begins
    with ``<REC`` (leading whitespace is ignored, matching the way the
    closing tag is detected).  From that line on, every line is kept until
    one ends with ``</REC>`` once surrounding whitespace is stripped.

    Args:
        filehandle: An iterable of text lines, e.g. an open text file.
            Consecutive calls on the same handle return consecutive records.

    Returns:
        The record as a single string with its original line endings, or
        ``None`` when the input ends before a complete record was read.
    """
    lines = []
    for line in filehandle:
        # Until the opening tag has been seen, ``lines`` is empty and
        # everything else (XML prolog, root element, ...) is skipped.
        if not lines and not line.lstrip().startswith('<REC'):
            continue
        lines.append(line)
        if line.strip().endswith('</REC>'):
            # Join once instead of growing a string line by line.
            return ''.join(lines)
    return None
Reading the XML file
# Location of the Web of Science XML export and the maximum number of
# successfully parsed records to keep in memory.
XML_PATH = '/Users/titipat/Desktop/INDIANA_U_1997_2016_20161019125045.xml'
MAX_RECORDS = 100000

count = 0          # number of raw records read from the file
records = []       # parsed <REC> elements
bad_records = []   # raw text of records that could not be parsed
with open(XML_PATH, 'r') as xml_file:
    while len(records) < MAX_RECORDS:
        record = get_record(xml_file)
        if record is None:
            # get_record returns None at end of file; stop here instead of
            # looping forever on None when the file holds fewer than
            # MAX_RECORDS parseable records.
            break
        count += 1
        try:
            rec = etree.fromstring(record)
        except (etree.XMLSyntaxError, ValueError):
            # Keep the raw text so malformed records can be inspected later.
            bad_records.append(record)
        else:
            records.append(rec)
Disambiguation

Now that we have a list of records, we can extract the fields we are interested in.
def extract_wos_id(elem):
    """Return the text of the first direct ``UID`` child of ``elem``.

    Raises:
        IndexError: if ``elem`` has no ``UID`` child.
    """
    uid_nodes = elem.findall('UID')
    first_uid = uid_nodes[0]
    return first_uid.text
def extract_authors(elem, wos_id):
    """Collect one dictionary per entry under ``static_data/summary/names``.

    Each dictionary starts with the columns ``addr_num``, ``position``,
    ``reprint``, ``cluster_id`` and ``role`` (read from the entry's
    ``addr_no``, ``seq_no``, ``reprint``, ``dais_id`` and ``role``
    attributes, ``'NULL'`` when absent) followed by ``wos_id``.  After that,
    the entry itself and every descendant contribute a ``tag -> text``
    pair, both converted with ``str``.

    Args:
        elem: A parsed record element.
        wos_id: Value stored under the ``'wos_id'`` key of every dictionary.

    Returns:
        A list of dictionaries in document order.
    """
    # (output column, XML attribute) pairs, in output order.
    attribute_columns = (
        ('addr_num', 'addr_no'),
        ('position', 'seq_no'),
        ('reprint', 'reprint'),
        ('cluster_id', 'dais_id'),
        ('role', 'role'),
    )

    collected = []
    for group in elem.iterfind('./static_data/summary/names'):
        for person in group:
            entry = {column: person.attrib.get(attribute, 'NULL')
                     for column, attribute in attribute_columns}
            entry['wos_id'] = wos_id
            # iter() yields the entry itself first, then its descendants.
            entry.update((str(node.tag), str(node.text))
                         for node in person.iter())
            collected.append(entry)
    return collected
Then you can just run
# One list of author dictionaries per record.
authors = [extract_authors(r, extract_wos_id(r)) for r in records]
# Flatten the per-record lists into a single table of authors.
authors_df = pd.DataFrame(list(chain.from_iterable(authors)))
To get institution addresses
def extract_addresses(elem, wos_id):
    """Extract address rows and author/address links from a record.

    Every ``address_name`` element under
    ``static_data/fullrecord_metadata/addresses`` is processed in turn:

    * ``addr_num`` comes from the ``addr_no`` attribute of its first
      ``address_spec`` child.
    * The text of any ``full_address``, ``city``, ``state``, ``country``
      or ``zip`` descendant is stored (via ``str``) under its tag name.
    * One row is produced for every (organization, suborganization)
      combination found below the element; a missing organization or
      suborganization is represented by ``'NULL'``.
    * Each ``names/name`` child yields a link holding its ``seq_no``
      (as ``position``) and ``addr_no`` (as ``addr_num``).

    Args:
        elem: A parsed record element.
        wos_id: Value stored under the ``'wos_id'`` key of every dictionary.

    Returns:
        A tuple ``(addresslist, name_address_relation)`` of two lists of
        dictionaries.

    Raises:
        IndexError: if an ``address_name`` has no ``address_spec`` child.
        KeyError: if a required ``addr_no`` / ``seq_no`` attribute is absent.
    """
    address_fields = ('full_address', 'city', 'state', 'country', 'zip')
    address_rows = []
    author_links = []

    for block in elem.iterfind('./static_data/fullrecord_metadata/addresses/address_name'):
        spec = block.findall('./address_spec')[0]
        base = {'addr_num': spec.attrib['addr_no'],
                'organization': 'NULL'}

        # A single walk over the subtree gathers the plain address fields
        # together with the organization and suborganization names.
        organizations = []
        suborganizations = []
        for node in block.iter():
            if node.tag in address_fields:
                base[str(node.tag)] = str(node.text)
            elif node.tag == 'organization':
                organizations.append(node.text)
            elif node.tag == 'suborganization':
                suborganizations.append(node.text)

        # Cross product of organizations and suborganizations, with 'NULL'
        # standing in for an empty side.
        for org in organizations or ['NULL']:
            for suborg in suborganizations or ['NULL']:
                row = dict(base)
                row.update({'organization': org,
                            'suborganization': suborg,
                            'wos_id': wos_id})
                address_rows.append(row)

        author_links.extend(
            {'position': person.attrib['seq_no'],
             'addr_num': person.attrib['addr_no'],
             'wos_id': wos_id}
            for person in block.iterfind('./names/name')
        )

    return address_rows, author_links
# Run the address extraction once per record, then split the
# (addresslist, name_address_relation) pairs into two parallel lists.
extracted_pairs = [extract_addresses(r, extract_wos_id(r)) for r in records]
address_lists = [pair[0] for pair in extracted_pairs]
name_address_relations = [pair[1] for pair in extracted_pairs]
Now, transform the results into pandas DataFrames
# Flatten the per-record lists so each dictionary becomes one row.
address_lists_df = pd.DataFrame(list(chain.from_iterable(address_lists)))
name_address_relations_df = pd.DataFrame(
    list(chain.from_iterable(name_address_relations))
)