Created
June 24, 2011 13:05
-
-
Save bngsudheer/1044721 to your computer and use it in GitHub Desktop.
Grab editorial news from the Indian Express
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Print the text of today's Indian Express editorial.

The script downloads the index page at ``todays_paper_url``, takes the
first link found under the "Editorials" heading, downloads that URL and
the same URL with ``/2`` appended, and prints the text nodes extracted
from both pages joined into a single string.

Written for Python 2 (it relies on ``urllib.urlopen`` and the
``StringIO`` module) and requires the third-party ``lxml`` package.
"""
import urllib
import lxml
from lxml import etree
import StringIO

# One lenient HTML parser instance, reused for every page.
parser = etree.HTMLParser()

# Index page from which the editorial link is extracted.
todays_paper_url = "http://www.indianexpress.com/supplement/"
page = urllib.urlopen(todays_paper_url)
html = page.read()
page.close()
tree = etree.parse(StringIO.StringIO(html), parser)

# href of every <h5><a> that is a following sibling of the <h4> whose text
# is exactly "Editorials"; only the first match is used.
editorial_item_xpath = ".//h4[text()='Editorials']/following-sibling::h5/a/@href"
result = tree.xpath(editorial_item_xpath)
if not result:
    # Fail with a readable message instead of an IndexError on result[0].
    raise SystemExit("No editorial link found at %s" % todays_paper_url)
editorial_url = "%s" % result[0]

# NOTE(review): assumes the editorial continues on a second page reachable
# at "<editorial_url>/2" -- confirm against the live site.
fragments = []
for url in [editorial_url, "%s/2" % editorial_url]:
    # Fetch the page for *this* iteration. The previous code requested
    # editorial_url every time, so the "/2" page was never downloaded and
    # the first page's text was emitted twice.
    page = urllib.urlopen(url)
    html = page.read()
    page.close()
    html = html.decode('iso-8859-1')
    tree = etree.parse(StringIO.StringIO(html), parser)
    # All text nodes under the seventh <div> child of the element with
    # id "box_left" (presumably the article body -- verify if the site
    # layout changes).
    paragraphs = tree.xpath(".//*[@id='box_left']/div[7]//text()")
    paragraphs = [para.strip() for para in paragraphs]
    for para in paragraphs:
        # Drop the "... contd." pagination marker.
        if "... contd." in para:
            continue
        # Keep only fragments longer than three characters after stripping.
        if len(para) > 3:
            fragments.append(para)

# Fragments are concatenated with no separator, as before.
news_content = "".join(fragments)
# Parenthesised form prints the same output under Python 2.
print(news_content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment