Skip to content

Instantly share code, notes, and snippets.

@ArnaudD
Created March 22, 2011 11:00
Show Gist options
  • Save ArnaudD/881059 to your computer and use it in GitHub Desktop.
Save ArnaudD/881059 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
from BeautifulSoup import BeautifulSoup
import feedparser, glob, codecs, csv, sys
from xml.dom.minidom import parse, parseString
# Export every question/answer pair found in the HTML files of the current
# directory as CSV rows on stdout: id, source file name, question, answer.

# Ids are handed out sequentially; the counter is bumped before each row is
# written, so the first exported row gets id 8889.
i = 8888

# QUOTE_NONNUMERIC leaves the integer id bare and wraps every string field in
# double quotes (doubling any embedded quote), and "\n" is the line ending a
# plain print would produce. Going through the csv module also escapes quotes
# in the file name, which hand-built concatenation did not.
writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC, lineterminator="\n")

# glob.glob() returns names in arbitrary order; sorting keeps the id assigned
# to each question stable from one run to the next.
for htmlFile in sorted(glob.glob('*.html')):
    # Read the raw bytes and close the file right away instead of leaking
    # the handle until interpreter exit.
    with codecs.open(htmlFile, "rb") as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for question in soup.findAll("div", {"class": "question"}):
        i += 1
        # The answer is looked up in the next sibling "bloc_reponse" div of
        # the question's parent element, inside a nested "reponse" div.
        bloc = question.parent.findNextSibling('div', {'class': 'bloc_reponse'})
        answer = bloc.find("div", {"class": "reponse"}) if bloc is not None else None
        if answer is None:
            # Report the gap and keep going with an empty answer rather than
            # dying on an AttributeError in the middle of the export.
            sys.stderr.write("%s: no answer found for question %d\n" % (htmlFile, i))
            reponse = ""
        else:
            # Turn literal backslash-n character pairs into real line breaks.
            reponse = answer.renderContents().replace('\\n', "\n")
        writer.writerow([i, htmlFile, question.renderContents().strip(), reponse.strip()])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment