Skip to content

Instantly share code, notes, and snippets.

@cjbayesian
Created December 20, 2013 17:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cjbayesian/8058267 to your computer and use it in GitHub Desktop.
Save cjbayesian/8058267 to your computer and use it in GitHub Desktop.
Parse BMC OA articles from XML to utf-8 text of the title, abstract, and body.
#!/usr/bin/python
"""
Parse BMC OA articles from XML to utf-8 text of the title, abstract, and body.
"""
import libxml2
from os import listdir
from time import gmtime, strftime
# Where the article XML files are read from and where the extracted text
# files are written.
input_dir = './BMC_FTP/content/articles/'
files = listdir(input_dir)
outdir = './parsed_articles/'

# XPath expressions for the parts of each article that are kept: title,
# abstract, and main body sections. Loop-invariant, so built once.
nodes_of_interest = ['//art/fm/bibl/title', '//art/fm/abs', '//art/bdy/sec']

# One entry per file that could not be processed; joined into the log at the
# end of the run.
failures = []

for f in files:
    outfile = outdir + f.replace('.xml', '.txt')
    # Parenthesised single argument: prints the same text on Python 2 and 3.
    print('Parsing: ' + f)
    infile = input_dir + f

    # Text fragments collected for this article, concatenated when written.
    parts = []
    doc = None
    try:
        doc = libxml2.parseFile(infile)

        # Remove table nodes from the XML tree so their content is not part
        # of the extracted text.
        # NOTE(review): presumably xpathEval returns nodes in document order;
        # walking the list backwards then frees a nested table before the
        # table that encloses it, so no already-freed node is touched.
        for n in reversed(doc.xpathEval('//*[name() = "tbl"]')):
            n.unlinkNode()
            n.freeNode()

        # Pull out title, abstract, and main body text.
        for noe in nodes_of_interest:
            for node in doc.xpathEval(noe):
                parts.append(node.content)
    except Exception:
        # Best effort: record the failure and carry on with the next file.
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # still stop the run.
        failures.append('failed to parse: ' + f + '\n')
    finally:
        # Release the parsed document even when extraction failed part-way.
        if doc is not None:
            doc.freeDoc()

    # The output file is written for every input, including failures; it
    # then holds whatever text was collected before the error (possibly
    # nothing).
    art_txt = "".join(parts)
    with open(outfile, "w") as text_file:
        text_file.write(art_txt)

# Write a single log for the whole run, named after the current UTC time.
# NOTE(review): the ':' characters in the timestamp are not allowed in
# Windows file names.
log_string = ''.join(failures)
curtime = strftime("%Y-%m-%d_%H:%M:%S", gmtime())
with open('parse_' + curtime + '.log', 'w') as log_file:
    log_file.write(log_string)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment