bngsudheer/todays-editorial-text.py

## todays-editorial-text.py
#!/usr/bin/python
import urllib
import lxml
from lxml import etree
import StringIO

# Parser reused for every page parsed below.
parser = etree.HTMLParser()

# Index page that is searched for the link to today's editorial.
todays_paper_url = "http://www.indianexpress.com/supplement/"

page = urllib.urlopen(todays_paper_url)
try:
    html = page.read()
finally:
    # Release the connection even if the read fails.
    page.close()

tree = etree.parse(StringIO.StringIO(html), parser)
# href of every <h5><a> that is a following sibling of the <h4> whose text is
# exactly "Editorials".
editorial_item_xpath = ".//h4[text()='Editorials']/following-sibling::h5/a/@href"
result = tree.xpath(editorial_item_xpath)
if not result:
    # Fail with a readable message instead of an IndexError on result[0].
    raise SystemExit("No editorial link found on %s" % todays_paper_url)
editorial_url = "%s" % result[0]

# Text fragments are collected here and joined once at the end.
news_parts = []

# Visit the editorial URL and its "/2" companion URL (presumably the
# continuation page -- confirm against the site).
for url in [editorial_url, "%s/2" % editorial_url]:
    # Fetch the URL of this iteration; fetching editorial_url here would
    # download the first page twice and never read the "/2" page.
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()
    html = html.decode('iso-8859-1')

    tree = etree.parse(StringIO.StringIO(html), parser)

    # Every text node below the 7th <div> child of the element whose id is
    # "box_left".
    paragraphs = tree.xpath(".//*[@id='box_left']/div[7]//text()")

    paragraphs = [para.strip() for para in paragraphs]

    for para in paragraphs:

        # Drop fragments that carry the continuation marker.
        if "... contd." in para:
            continue

        # Keep only fragments longer than three characters.
        if len(para) > 3:
            news_parts.append(para)

news_content = "".join(news_parts)

print(news_content)
	#!/usr/bin/python
	import urllib
	import lxml
	from lxml import etree
	import StringIO

	# Parser reused for every page parsed below.
	parser = etree.HTMLParser()

	# Index page that is searched for the link to today's editorial.
	todays_paper_url = "http://www.indianexpress.com/supplement/"

	page = urllib.urlopen(todays_paper_url)
	try:
		html = page.read()
	finally:
		# Release the connection even if the read fails.
		page.close()

	tree = etree.parse(StringIO.StringIO(html), parser)
	# href of every <h5><a> that is a following sibling of the <h4> whose text
	# is exactly "Editorials".
	editorial_item_xpath = ".//h4[text()='Editorials']/following-sibling::h5/a/@href"
	result = tree.xpath(editorial_item_xpath)
	if not result:
		# Fail with a readable message instead of an IndexError on result[0].
		raise SystemExit("No editorial link found on %s" % todays_paper_url)
	editorial_url = "%s" % result[0]

	# Text fragments are collected here and joined once at the end.
	news_parts = []

	# Visit the editorial URL and its "/2" companion URL (presumably the
	# continuation page -- confirm against the site).
	for url in [editorial_url, "%s/2" % editorial_url]:
		# Fetch the URL of this iteration; fetching editorial_url here would
		# download the first page twice and never read the "/2" page.
		page = urllib.urlopen(url)
		try:
			html = page.read()
		finally:
			page.close()
		html = html.decode('iso-8859-1')

		tree = etree.parse(StringIO.StringIO(html), parser)

		# Every text node below the 7th <div> child of the element whose id
		# is "box_left".
		paragraphs = tree.xpath(".//*[@id='box_left']/div[7]//text()")

		paragraphs = [para.strip() for para in paragraphs]

		for para in paragraphs:

			# Drop fragments that carry the continuation marker.
			if "... contd." in para:
				continue

			# Keep only fragments longer than three characters.
			if len(para) > 3:
				news_parts.append(para)

	news_content = "".join(news_parts)

	print(news_content)