Skip to content

Instantly share code, notes, and snippets.

@twneale
Created January 10, 2014 22:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save twneale/8363605 to your computer and use it in GitHub Desktop.
Save twneale/8363605 to your computer and use it in GitHub Desktop.
Get a set of all paragraph numbers used in the Code of Federal Regulations
import re
import os
import sys
import json
import contextlib
import lxml.etree
@contextlib.contextmanager
def cd(path):
    '''Run the ``with`` body with *path* as the working directory.

    The directory (and any missing parents) is created first if it does not
    exist. The previous working directory is restored when the block exits,
    whether it exits normally or through an exception.

    Raises:
        OSError: if *path* cannot be created or cannot be entered.
    '''
    old_dir = os.getcwd()
    try:
        os.makedirs(path)
    except OSError:
        # An already-existing directory is the expected case. Anything else
        # (permission denied, a regular file in the way, ...) is a genuine
        # failure: surface it here instead of letting the chdir below fail
        # with a less informative error.
        if not os.path.isdir(path):
            raise
    os.chdir(path)
    try:
        yield
    finally:
        # Always go back, even if the body raised.
        os.chdir(old_dir)
def main():
    '''Collect the section and paragraph enumerators found under ``data``.

    Walks the ``data`` directory tree, parses every ``*.xml`` file with lxml
    and gathers:

    * the text of each ``SECTNO`` element, with leading/trailing section
      signs (U+00A7) and spaces stripped;
    * each parenthesised enumerator that opens the text of a ``P`` element,
      e.g. both ``a`` and ``1`` from ``(a)(1) ...``.

    The distinct values are written as a sorted JSON array to
    ``cfr-enum.json`` in the current working directory.
    '''
    # A run of one or more "(x)" groups at the very start of the paragraph.
    # The enumerator itself may not contain whitespace or parentheses, so
    # "(a)(1)" is never captured as the single bogus value "a)(1".
    leading_run = re.compile(r'^\s*((?:\([^\s()]+\))+)')
    single_enum = re.compile(r'\(([^\s()]+)\)')

    enums = set()
    for folder, _subfolders, files in os.walk('data'):
        with cd(folder):
            for filename in files:
                if not filename.lower().endswith('.xml'):
                    continue
                # Single parenthesised argument: prints the same text under
                # both the Python 2 print statement and Python 3 print().
                print('Starting %s' % filename)
                doc = lxml.etree.parse(filename).getroot()

                for el in doc.xpath('//SECTNO'):
                    enum = (el.text or '').strip(u'\xa7 ')
                    if enum:
                        enums.add(enum)

                for el in doc.xpath('//P'):
                    # el.text is only the text before the first child
                    # element, and is None when the element starts with one.
                    run = leading_run.search(el.text or '')
                    if run:
                        enums.update(single_enum.findall(run.group(1)))

    # Sort so the output file is stable from one run to the next.
    with open('cfr-enum.json', 'w') as f:
        json.dump(sorted(enums), f)
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment