# Import dependencies
import xml.etree.cElementTree as ET
from lxml import etree
from itertools import chain
import pandas as pd
# Function to read one record at a time from the XML file
def get_record(filehandle):
    """Read the next complete ``<REC>...</REC>`` record from a line iterator.

    Lines are skipped until one starts with ``<REC``. From that line on,
    every line is collected up to and including the first line whose
    stripped text ends with ``</REC>``.

    Args:
        filehandle: An iterable of text lines, such as an open text file.
            It is advanced past the record that is returned, so repeated
            calls yield successive records.

    Returns:
        The record as one string (original line endings preserved), or
        ``None`` when the input is exhausted before a complete record has
        been read. A record that is cut off by the end of input is
        discarded.
    """
    # Collect lines in a list and join once; repeated string concatenation
    # is quadratic for large records.
    lines = []
    for line in filehandle:
        # Nothing collected yet: ignore everything before the opening tag.
        if not lines and not line.startswith('<REC'):
            continue
        lines.append(line)
        if line.strip().endswith('</REC>'):
            return ''.join(lines)
    return None
# Read the XML file
# Parse records from the export until 100000 have been parsed successfully
# or the file runs out.
#   count       -- number of records read from the file (parsed or not)
#   records     -- successfully parsed record elements
#   bad_records -- raw text of records that could not be parsed
count = 0
records = []
bad_records = []
with open('/Users/titipat/Desktop/INDIANA_U_1997_2016_20161019125045.xml', 'r') as file:
    while len(records) < 100000:
        record = get_record(file)
        if record is None:
            # End of file. Without this check the loop never terminates for
            # a file holding fewer than 100000 parsable records, because
            # every further call keeps returning None.
            break
        count += 1
        try:
            rec = etree.fromstring(record)
        except (etree.XMLSyntaxError, ValueError):
            # Keep the raw text of malformed records for later inspection
            # instead of aborting the whole run.
            bad_records.append(record)
        else:
            records.append(rec)
# Now that we have a list of records, we can extract the fields we are
# interested in.
def extract_wos_id(elem):
    """Return the text of the first direct ``UID`` child of ``elem``.

    Args:
        elem: A parsed record element.

    Returns:
        The ``text`` of the first ``UID`` child.

    Raises:
        IndexError: If ``elem`` has no ``UID`` child.
    """
    uid_elements = elem.findall('UID')
    first_uid = uid_elements[0]
    return first_uid.text
def extract_authors(elem, wos_id):
    """Build one dictionary per name entry found in a record.

    Every child of each ``./static_data/summary/names`` element yields a
    row. A row starts with five XML attributes copied under fixed keys
    (``'NULL'`` when the attribute is absent) plus ``wos_id``. Then the tag
    and text of the name element itself and of all its descendants are
    added as strings, overwriting any earlier key of the same name.

    Args:
        elem: A parsed record element.
        wos_id: Identifier stored in every row under ``'wos_id'``.

    Returns:
        A list of row dictionaries, in document order.
    """
    # (XML attribute, output key) pairs; the order fixes the key order.
    attribute_columns = (
        ('addr_no', 'addr_num'),
        ('seq_no', 'position'),
        ('reprint', 'reprint'),
        ('dais_id', 'cluster_id'),
        ('role', 'role'),
    )
    rows = []
    for name_group in elem.iterfind('./static_data/summary/names'):
        for person in name_group:
            row = {}
            for attribute, column in attribute_columns:
                row[column] = person.attrib.get(attribute, 'NULL')
            row['wos_id'] = wos_id
            # iter() yields the name element first, then its descendants.
            row.update((str(node.tag), str(node.text)) for node in person.iter())
            rows.append(row)
    return rows
# Then you can just run:
# Gather the author rows of every record (one inner list per record).
authors = []
for r in records:
    wos_id = extract_wos_id(r)
    record_authors = extract_authors(r, wos_id)
    authors.append(record_authors)

# Flatten the per-record lists into a single table of authors.
authors_df = pd.DataFrame(list(chain.from_iterable(authors)))
# To get institution addresses:
def extract_addresses(elem, wos_id):
    """Extract institution addresses and name-to-address links from a record.

    Each ``./static_data/fullrecord_metadata/addresses/address_name``
    element is processed in turn:

    * An address row is produced for every (organization, suborganization)
      pair found anywhere below it. When either kind is absent, the single
      placeholder ``'NULL'`` is used in its place. Every row also carries
      ``addr_num`` (the ``addr_no`` attribute of the first ``address_spec``
      child), ``wos_id``, and the stringified text of any ``full_address``,
      ``city``, ``state``, ``country`` and ``zip`` elements.
    * A link row is produced for every ``./names/name`` element, holding
      its ``seq_no`` (as ``position``) and ``addr_no`` (as ``addr_num``).

    Missing ``address_spec`` elements and missing ``addr_no`` / ``seq_no``
    attributes yield ``'NULL'`` instead of raising, matching the handling
    of missing attributes in ``extract_authors``.

    Args:
        elem: A parsed record element.
        wos_id: Identifier stored in every row under ``'wos_id'``.

    Returns:
        A tuple ``(addresslist, name_address_relation)`` of two lists of
        dictionaries, both in document order.
    """
    address_fields = ('full_address', 'city', 'state', 'country', 'zip')
    addresslist = []
    name_address_relation = []
    for addresses in elem.iterfind('./static_data/fullrecord_metadata/addresses/address_name'):
        # Compare with None explicitly: an element without children is falsy.
        spec = addresses.find('./address_spec')
        addr_num = spec.attrib.get('addr_no', 'NULL') if spec is not None else 'NULL'
        addr = {'addr_num': addr_num,
                'organization': 'NULL'}

        # One traversal gathers the address fields and both organization
        # lists (previously three separate walks over the same subtree).
        orgs = []
        suborgs = []
        for node in addresses.iter():
            if node.tag in address_fields:
                addr[str(node.tag)] = str(node.text)
            elif node.tag == 'organization':
                orgs.append(node.text)
            elif node.tag == 'suborganization':
                suborgs.append(node.text)

        # Cross product of organizations and suborganizations, each side
        # falling back to a single 'NULL' entry when empty.
        for org in orgs or ['NULL']:
            for suborg in suborgs or ['NULL']:
                row = addr.copy()
                row.update({'organization': org,
                            'suborganization': suborg,
                            'wos_id': wos_id})
                addresslist.append(row)

        for name in addresses.iterfind('./names/name'):
            name_address_relation.append({'position': name.attrib.get('seq_no', 'NULL'),
                                          'addr_num': name.attrib.get('addr_no', 'NULL'),
                                          'wos_id': wos_id})
    return addresslist, name_address_relation
# Run the address extraction over every record, keeping the address rows and
# the name-to-address links in two parallel lists (one entry per record).
address_lists, name_address_relations = [], []
for r in records:
    wos_id = extract_wos_id(r)
    addresslist, name_address_relation = extract_addresses(r, wos_id)
    address_lists.append(addresslist)
    name_address_relations.append(name_address_relation)
# Now, transform the results into pandas DataFrames
# Flatten the per-record lists into one row list per table before building
# the DataFrames.
address_rows = list(chain.from_iterable(address_lists))
address_lists_df = pd.DataFrame(address_rows)
relation_rows = list(chain.from_iterable(name_address_relations))
name_address_relations_df = pd.DataFrame(relation_rows)