Skip to content

Instantly share code, notes, and snippets.

@skorgu
Created March 27, 2011 03:22
Show Gist options
  • Save skorgu/888881 to your computer and use it in GitHub Desktop.
Save skorgu/888881 to your computer and use it in GitHub Desktop.
Extract individual files from EDGAR's 'txt' files
#!/usr/bin/python
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import mmap
import os
import re
#ftp://ftp.sec.gov/edgar/data/732717/0000732717-09-000007.txt
filename = "0000732717-09-000007.txt"
dest = "%s-extracted" % filename

# One "<KEY>value" header line.  This is a bytes pattern because it is applied
# to slices of the memory-mapped submission file.
_HEADER_RE = re.compile(br"<(?P<key>.*?)>(?P<val>.*)")


def _parse_header(raw):
    """Return the ``<KEY>value`` lines found in *raw* (bytes) as a text dict.

    Values run to the end of their line and are stripped, so a CRLF file does
    not leave a trailing carriage return on e.g. the file name.  When a key
    occurs more than once the last occurrence wins.
    """
    meta = {}
    for match in _HEADER_RE.finditer(raw):
        key = match.group("key").decode("latin-1")
        meta[key] = match.group("val").decode("latin-1").strip()
    return meta


def extract_documents(path, out_dir):
    """Split the submission file *path* into one file per ``<DOCUMENT>``.

    Each document is written into *out_dir* (created if missing) under the
    base name given by its ``<FILENAME>`` header line.  Documents without a
    usable ``<FILENAME>`` are written as ``document-N.txt``, where N is the
    1-based position of the document among those written.

    Returns the list of paths written, in file order.
    Raises OSError if *out_dir* cannot be created or a file cannot be opened.
    """
    try:
        os.mkdir(out_dir)
    except OSError:
        # An existing directory is fine; anything else is a real error.
        if not os.path.isdir(out_dir):
            raise

    written = []
    if os.path.getsize(path) == 0:
        # mmap refuses to map an empty file.
        return written

    with open(path, "rb") as source:
        data = mmap.mmap(source.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            start = 0
            while True:
                # find() reports "not found" as -1; offset 0 is a valid hit.
                loc_doc = data.find(b"<DOCUMENT>", start)
                if loc_doc < 0:
                    break

                # The payload starts at the newline that ends the <TEXT> line.
                loc_tag = data.find(b"<TEXT>", loc_doc)
                loc_text = data.find(b"\n", loc_tag) if loc_tag >= 0 else -1
                if loc_text < 0:
                    break

                loc_enddoc = data.find(b"</DOCUMENT>", loc_text)
                if loc_enddoc < 0:
                    # Unterminated final document: take the rest of the file.
                    loc_enddoc = len(data)

                meta = _parse_header(data[loc_doc:loc_text])
                start = loc_enddoc

                # Keep only the base name so the header cannot direct the
                # write outside out_dir.
                name = os.path.basename(meta.get("FILENAME", ""))
                if name in ("", ".", ".."):
                    name = "document-%d.txt" % (len(written) + 1)
                outfile = os.path.join(out_dir, name)

                with open(outfile, "wb") as f:
                    print("Writing %s to %s" % (meta.get("DESCRIPTION"), outfile))
                    # NOTE(review): this span is unchanged from the original
                    # script, so the output begins with the newline after
                    # <TEXT> and still contains the closing </TEXT> line.
                    f.write(data[loc_text:loc_enddoc])
                written.append(outfile)
        finally:
            data.close()
    return written


if __name__ == "__main__":
    extract_documents(filename, dest)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment