Skip to content

Instantly share code, notes, and snippets.

@alexstorer
Created August 9, 2012 19:24
Show Gist options
  • Save alexstorer/3307317 to your computer and use it in GitHub Desktop.
Save alexstorer/3307317 to your computer and use it in GitHub Desktop.
Convert relevant html docs to text!
# Open each HTML file in the directory, parse it, and get the text.
# e.g. element.text_content()
import glob
from lxml import etree
import lxml.html
import csv
import urllib2
import urllib
import re
# Convert every HTML document in the output directory to a plain-text file
# written next to it (same name, ".txt" extension).
flist = glob.glob('/Users/astorer/Work/sgrossman/output/*.html')
# NOTE(review): this parser is never passed to document_fromstring below;
# kept only in case code outside this section relies on the name.
parser = etree.HTMLParser()
for fname in flist:
    # Read raw bytes so lxml can honour the encoding declared in the document.
    with open(fname, 'rb') as f:
        tree = lxml.html.document_fromstring(f.read())

    # Look the container up BEFORE creating the output file, so a document
    # without it neither leaves an empty .txt behind nor aborts the batch.
    matches = tree.xpath('//*[@id="myfile"]')
    if not matches:
        print('Skipping %s: no element with id "myfile"' % fname)
        continue
    textelem = matches[0]

    # Swap only the trailing extension; str.replace would also rewrite any
    # ".html" that happens to appear earlier in the path.
    ftname = fname[:-len('.html')] + '.txt'

    # Binary mode + explicit UTF-8 bytes behaves the same on Python 2 and 3,
    # and the with-block guarantees the file is flushed and closed.
    with open(ftname, 'wb') as ft:
        # One output line per direct child of the container element.
        for el in textelem:
            ft.write(el.text_content().encode('utf-8') + b'\n')
@alexstorer
Copy link
Author

There are details on the lxml.html module here: http://lxml.de/lxmlhtml.html

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment