Skip to content

Instantly share code, notes, and snippets.

@bngsudheer
Created June 24, 2011 13:05
Show Gist options
  • Save bngsudheer/1044721 to your computer and use it in GitHub Desktop.
Save bngsudheer/1044721 to your computer and use it in GitHub Desktop.
Grab editorial news from The Indian Express
#!/usr/bin/python
import urllib
import lxml
from lxml import etree
import StringIO
# Scrape the editorial linked from the Indian Express "supplement" index page
# and print its text. The script targets Python 2 (urllib.urlopen, StringIO).
# NOTE(review): the XPath expressions below depend on the site's page layout
# at the time of writing -- confirm they still match before relying on them.


def _fetch_html(url):
    """Download *url* and return the raw response body, closing the handle."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()


parser = etree.HTMLParser()
todays_paper_url = "http://www.indianexpress.com/supplement/"

# Find the editorial link: the href of an <a> inside an <h5> that follows
# an <h4> whose text is exactly 'Editorials'.
html = _fetch_html(todays_paper_url)
tree = etree.parse(StringIO.StringIO(html), parser)
editorial_item_xpath = ".//h4[text()='Editorials']/following-sibling::h5/a/@href"
result = tree.xpath(editorial_item_xpath)
if not result:
    # Fail with a readable message instead of an IndexError on result[0].
    raise SystemExit("No editorial link found at %s" % todays_paper_url)
editorial_url = "%s" % result[0]

content_parts = []
# The article is read from the editorial URL and from the same URL with a
# "/2" suffix (presumably its second page).
for url in [editorial_url, "%s/2" % editorial_url]:
    # Fetch the page for *this* iteration; the original always re-fetched
    # editorial_url, so the "/2" page was never read and page 1 was doubled.
    html = _fetch_html(url)
    html = html.decode('iso-8859-1')
    tree = etree.parse(StringIO.StringIO(html), parser)
    paragraphs = tree.xpath(".//*[@id='box_left']/div[7]//text()")
    paragraphs = [para.strip() for para in paragraphs]
    for para in paragraphs:
        # Skip the "... contd." continuation marker.
        if "... contd." in para:
            continue
        # Ignore fragments of three characters or fewer.
        if len(para) > 3:
            content_parts.append(para)

# Pieces are concatenated with no separator, as in the original.
news_content = "".join(content_parts)
print(news_content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment