Skip to content

Instantly share code, notes, and snippets.

@kynan
Created November 22, 2014 17:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kynan/c594fb3d68afdceb3921 to your computer and use it in GitHub Desktop.
Save kynan/c594fb3d68afdceb3921 to your computer and use it in GitHub Desktop.
from lxml import etree
# Namespace URIs used by the XPath expressions below.
ns = 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0/CSD11'
ukm = 'http://www.legislation.gov.uk/namespaces/metadata'

# Pre-compiled XPaths evaluated relative to a single <section> element.
hPath = etree.XPath('.//a:heading//text()', namespaces={'a': ns})
iPath = etree.XPath('.//a:intro//text()', namespaces={'a': ns})
# Deliberately not named cPath: that name belongs to the DocumentCategory
# XPath further down, and sharing it would make section content be read with
# the metadata expression instead of this one.
contentPath = etree.XPath('.//a:content//text()|.//a:content//a:ref/text()',
                          namespaces={'a': ns})

# mdPath selects the metadata container; the remaining XPaths are relative
# and are evaluated against the element mdPath returns.
mdPath = etree.XPath('//ukm:PrimaryMetadata[1]', namespaces={'ukm': ukm})
yPath = etree.XPath('./ukm:Year/@Value', namespaces={'ukm': ukm})
nPath = etree.XPath('./ukm:Number/@Value', namespaces={'ukm': ukm})
dPath = etree.XPath('./ukm:EnactmentDate/@Date', namespaces={'ukm': ukm})
tPath = etree.XPath('./ukm:DocumentMainType/@Value', namespaces={'ukm': ukm})
sPath = etree.XPath('./ukm:DocumentStatus/@Value', namespaces={'ukm': ukm})
cPath = etree.XPath('./ukm:DocumentCategory/@Value', namespaces={'ukm': ukm})


def _count_text(xp, element):
    """Join the text nodes *xp* selects under *element* and measure them.

    Returns a dict with the joined 'text', its whitespace-separated 'words'
    count and its 'chars' length.
    """
    text = ' '.join(xp(element))
    return {'text': text, 'words': len(text.split()), 'chars': len(text)}


def _parse_section(section):
    """Return heading/intro/content statistics and their totals for a section."""
    parts = ('heading', 'intro', 'content')
    sec = {'heading': _count_text(hPath, section),
           'intro': _count_text(iPath, section),
           'content': _count_text(contentPath, section)}
    # Section-level totals sit next to the per-part dicts under the same keys.
    for c in ('words', 'chars'):
        sec[c] = sum(sec[part][c] for part in parts)
    return sec


def parse_file(fname):
    """Collect per-section text statistics and metadata from an XML file.

    fname is passed to lxml and is parsed twice: once incrementally for the
    sections and once in full for the metadata.

    Returns a dict with:
      'sections': {eId: section dict}, where each section dict has
          'heading', 'intro' and 'content' entries (each a dict with
          'text', 'words', 'chars') plus 'words' and 'chars' totals;
      'year', 'number', 'date', 'type', 'status', 'category': lists of
          attribute values as returned by the XPath evaluation (empty when
          nothing matches or the file has no PrimaryMetadata element);
      'words', 'chars': totals summed over all sections.

    Raises KeyError if a section element has no 'eId' attribute.
    """
    sections = {}
    # Listen for 'end' events: lxml only guarantees that an element's text
    # and children are present once its closing tag has been read, so on
    # 'start' the section would still be (partly) empty.
    context = etree.iterparse(fname, events=('end',), tag='{%s}section' % ns,
                              remove_blank_text=True, encoding='utf-8')
    for _, section in context:
        sections[section.attrib['eId']] = _parse_section(section)

    def count(key):
        # Document-wide total of one counter across all parsed sections.
        return sum(sec[key] for sec in sections.values())

    # Calling an XPath yields a list of matches, but the relative metadata
    # XPaths need a single element as their context node.
    matches = mdPath(etree.parse(fname))
    md = matches[0] if matches else None

    def meta(xp):
        # Explicit None test: truth-testing an lxml element checks whether
        # it has children, not whether it exists.
        return xp(md) if md is not None else []

    return {'sections': sections,
            'year': meta(yPath),
            'number': meta(nPath),
            'date': meta(dPath),
            'type': meta(tPath),
            'status': meta(sPath),
            'category': meta(cPath),
            'words': count('words'),
            'chars': count('chars')}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment