Skip to content

Instantly share code, notes, and snippets.

@cornchz
Forked from e9t/README.md
Created August 31, 2012 13:20
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save cornchz/3552576 to your computer and use it in GitHub Desktop.
Save cornchz/3552576 to your computer and use it in GitHub Desktop.
Extract text in .smi files and convert to .txt files
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
"""
The `.smi` files should be in the `./smi` folder.
Extracted text will be written to `.txt` files located in the `./txt` folder.
"""
import html5lib
import os
from glob import glob
# Folder that is scanned for the input `.smi` files.
directory = './smi/'
# XPath expression evaluated against each parsed file: every text node
# found anywhere below a <body> element.
xpaths = '//body//text()'
def get_filenames(directory):
    """Return the paths of all glob-visible entries directly inside `directory`.

    directory -- folder to list.

    The result is the list produced by `glob` for the pattern
    `<directory>/*`, so entries whose name starts with a dot are not
    included and the order is whatever `glob` reports.
    """
    pattern = os.path.join(directory, '*')
    matches = glob(pattern)
    return matches
def get_xpaths(filename):
    """Parse `filename` as HTML and return the nodes selected by `xpaths`.

    The file is parsed by html5lib using the lxml tree builder, and the
    module-level `xpaths` expression is then evaluated on the parsed tree.

    filename -- path of the file to parse.

    Returns the result of `page.xpath(xpaths)`.
    Errors raised by `open` (e.g. a missing file) propagate to the caller.
    """
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        # Keep element names un-namespaced so a plain `//body` step matches.
        namespaceHTMLElements=False)
    # Binary mode: hand html5lib the raw bytes so it performs the encoding
    # detection itself, with no platform newline translation or locale
    # decoding applied first.
    with open(filename, 'rb') as f:
        page = parser.parse(f)
    return page.xpath(xpaths)
def print_lines(filename, lines):
    """Write every item of `lines` to `filename`, replacing any old content.

    filename -- path of the file to (over)write.
    lines    -- iterable of strings; they are written back to back, with
                no separator or newline added between them.
    """
    with open(filename, 'w') as out:
        out.writelines(lines)
# Driver: convert every file found in `directory` into a UTF-8 text file
# inside the `txt` folder, keeping the base name and using a `.txt` suffix.
output_directory = 'txt'
# open() does not create missing folders, so make sure the target exists.
if not os.path.isdir(output_directory):
    os.makedirs(output_directory)
filenames = get_filenames(directory)
for oldfile in filenames:
    # The glob pattern matches every entry, including sub-folders, which
    # cannot be opened as files; skip anything that is not a regular file.
    if not os.path.isfile(oldfile):
        continue
    # Build the output name from the path components instead of fixed slice
    # offsets, so it stays correct if `directory` or the extension changes.
    stem = os.path.splitext(os.path.basename(oldfile))[0]
    newfile = os.path.join(output_directory, stem + '.txt')
    print('processing ' + oldfile)
    lines = get_xpaths(oldfile)
    # Encode lazily; each piece is written out as UTF-8 bytes.
    encoded = (line.encode('utf-8') for line in lines)
    print_lines(newfile, encoded)
print('done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment