Skip to content

Instantly share code, notes, and snippets.

@aparrish
Forked from ktibb/oprah.py
Created March 5, 2012 15:48
Show Gist options
  • Save aparrish/1978923 to your computer and use it in GitHub Desktop.
Save aparrish/1978923 to your computer and use it in GitHub Desktop.
import urllib
import BeautifulSoup
import re
# Download the page markup. The response handle is closed explicitly so the
# underlying connection is released even if read() raises.
_response = urllib.urlopen('http://www.oprah.com/relationships/What-Kind-of-Woman-Watches-Porn-Researchers-Find-Answers')
try:
    html = _response.read()
finally:
    _response.close()

# Parse the markup and select the first <div class="arial14"> element.
# (An earlier variant collected every text node with soup.findAll(text=True).)
soup = BeautifulSoup.BeautifulSoup(html)
texts = soup.find("div", {"class": "arial14"})
if texts is None:
    # find() returns None when nothing matches; stop with a clear message
    # instead of failing later when the result is iterated.
    raise SystemExit('no <div class="arial14"> element found in the page')
def visible(element):
    """Return True if *element* should be treated as visible page text.

    An element is rejected when its parent's tag name is one of ``style``,
    ``script``, ``[document]``, ``head`` or ``title``, or when its string
    form starts with an HTML comment (``<!-- ... -->``).

    Args:
        element: A parsed node exposing ``.parent.name`` and convertible
            with ``str()``.

    Returns:
        False for nodes under non-rendered tags and for comments,
        True otherwise.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # re.DOTALL lets '.' match newlines, so comments that span several lines
    # are recognised as well as single-line ones.
    if re.match('<!--.*-->', str(element), re.DOTALL):
        return False
    return True
# Keep only the children of `texts` that visible() accepts, and dump the
# resulting list. Single-argument parenthesised print produces the same
# output under Python 2's print statement.
visible_texts = [node for node in texts if visible(node)]
print(visible_texts)

# Print each remaining piece of text under a "-----" separator line.
for tag in visible_texts:
    line = tag.string
    # Skip nodes without a string, and bare newline / <br /> / empty entries.
    if line is None or line in ('\n', ' <br /> ', ''):
        continue
    print("-----")
    # strip() returns a new string rather than modifying in place, so the
    # result must be assigned back for the punctuation trimming to apply.
    line = line.strip(";:#-?.,")
    print(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment