Skip to content

Instantly share code, notes, and snippets.

@sorrat
Created August 25, 2014 15:08
Show Gist options
  • Save sorrat/da404e306a44098eacda to your computer and use it in GitHub Desktop.
Save sorrat/da404e306a44098eacda to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import os
import argparse
from lxml import etree
# Prefixed name of the XML element whose text content is extracted; the
# "tmk" prefix is resolved through ELEM_NAMESPACE.
ELEM_TAG = "tmk:MarkSignificantVerbalElementText"
# Prefix-to-URI mapping passed to findall() together with ELEM_TAG.
ELEM_NAMESPACE = dict(tmk="http://www.wipo.int/standards/XMLSchema/Trademark/1")
def isfile(f):
    """argparse ``type=`` callback: accept only a path that names a file.

    :param f: path given on the command line.
    :returns: ``f`` unchanged when it exists and is not a directory.
    :raises argparse.ArgumentTypeError: when ``f`` does not exist or is a
        directory.
    """
    if not os.path.exists(f):
        raise argparse.ArgumentTypeError("%s does not exist" % f)
    # os.path.exists() is also true for directories; reject them here so the
    # user gets a clear argument error instead of a failure inside the parser.
    if os.path.isdir(f):
        raise argparse.ArgumentTypeError("%s is a directory, not a file" % f)
    return f
def cmdline_args():
    """Parse the process command line and return the resulting namespace.

    A single positional argument, ``infile``, is accepted; its value is
    validated by isfile().
    """
    parser = argparse.ArgumentParser(
        description='Extract text inside %s elements' % ELEM_TAG,
    )
    parser.add_argument('infile', help='Input XML-file', type=isfile)
    return parser.parse_args()
def parse_xml(infile, outfile):
    """Write the text of every ELEM_TAG element in ``infile`` to ``outfile``.

    The output contains one element text per line, encoded as UTF-8, with no
    trailing newline. Elements that have no text produce an empty line.

    :param infile: path of the XML document to read.
    :param outfile: path of the text file to create or overwrite.
    """
    # Pass the path itself instead of an open() handle: the handle was never
    # closed, and a text-mode handle is the wrong input for an XML parser
    # that does its own encoding detection.
    tree = etree.parse(infile)
    # './/' is the explicit relative form of the descendant search; it does
    # not rely on the tree rewriting an absolute '//' path.
    elems = tree.findall('.//' + ELEM_TAG, ELEM_NAMESPACE)
    # .text is None for elements without text content.
    lines = [(e.text or '').encode('utf8') for e in elems]
    # The lines are bytes, so join with a bytes separator and write in binary
    # mode; this works on both Python 2 and Python 3.
    with open(outfile, 'wb') as f:
        f.write(b'\n'.join(lines))
def get_filename_without_extension(path):
    """Return the last component of ``path`` with its final extension removed.

    Only the last suffix is dropped, so ``'a.tar.gz'`` yields ``'a.tar'``.
    """
    _, leaf = os.path.split(path)
    stem, _suffix = os.path.splitext(leaf)
    return stem
if __name__ == '__main__':
    # Script entry point: the output file is named after the input file's
    # base name and is written relative to the current working directory.
    cli = cmdline_args()
    stem = get_filename_without_extension(cli.infile)
    parse_xml(cli.infile, 'results_%s.txt' % stem)
# -*- coding: utf-8 -*-
from parsexml import get_filename_without_extension
def test_get_filename_without_extension():
    """Check that both an absolute and a bare file name reduce to the stem."""
    expectations = (
        ('/tmp/a.py', 'a'),
        ('a.py', 'a'),
    )
    for path, stem in expectations:
        assert get_filename_without_extension(path) == stem
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment