bartgee/webscrap.py

## webscrap.py
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# author Bart Grzybicki <bgrzybicki@gmail.com>

from lxml import html
import requests

# Path of the text file that main() writes the scraped entries to.
OUT_FILE = u'linuxtoday.txt'

def main():
    """Scrape the Linux Today front page and save its entries to OUT_FILE.

    Downloads http://www.linuxtoday.com, walks the numbered entry ``div``
    elements until one has no title, and collects the title, date, link
    and description of each entry.  Every entry is printed and written to
    OUT_FILE as UTF-8, followed by a blank line.

    A missing date, link or description is replaced by a ``<missing ...>``
    placeholder so that one incomplete entry does not abort the run.
    """
    # NOTE(review): this absolute path (including ``tbody``) looks like it
    # was copied from a browser inspector -- confirm it matches the HTML
    # that the server actually sends.
    entry_xpath = '//*[@id="container"]/table[3]/tbody/tr/td[2]/div[3]/div[{}]'

    page = requests.get('http://www.linuxtoday.com')
    tree = html.fromstring(page.text)
    lt_list = []
    index = 1
    while True:
        entry = entry_xpath.format(index)
        title = tree.xpath(entry + '/a/strong/text()')
        if not title:
            # No title at this position: we have run past the last entry.
            break
        post_date = tree.xpath(entry + '/span/text()[1]')
        link = tree.xpath(entry + '/a/@href')
        # The description is looked up in <p><span> first, then in <p>.
        desc = (tree.xpath(entry + '/p/span/text()')
                or tree.xpath(entry + '/p/text()'))

        if post_date:
            # Keep only the first line of the date text and strip the
            # ' (' and ')' markers; split() tolerates a missing newline.
            date_text = post_date[0].split('\n', 1)[0]
            date_text = date_text.replace(' (', '').replace(')', '')
        else:
            date_text = '<missing date>'

        lt_list.append({
            'title': title[0],
            'date': date_text,
            'link': link[0] if link else '<missing link>',
            'description': desc[0] if desc else '<missing description>',
        })
        index += 1

    # Binary mode: only encoded bytes are written, which behaves the same
    # on Python 2 and Python 3.  ``with`` closes the file even on error.
    with open(OUT_FILE, 'wb') as out_file:
        for item in lt_list:
            final_out = (item['title'] + '\n'
                         + item['date'] + '\n'
                         + item['link'] + '\n'
                         + item['description'] + '\n')
            final_utf8 = final_out.encode('utf-8')
            print(final_utf8)
            print(u'Saving data to ' + OUT_FILE + u'...')
            out_file.write(final_utf8)
            out_file.write(b'\n')
    print(u'Done.')

# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python2
	# -*- coding: utf-8 -*-
	# author Bart Grzybicki <bgrzybicki@gmail.com>

	from lxml import html
	import requests

	# Path of the text file that main() writes the scraped entries to.
	OUT_FILE = u'linuxtoday.txt'

	def main():
	    """Scrape the Linux Today front page and save its entries to OUT_FILE.

	    Downloads http://www.linuxtoday.com, walks the numbered entry ``div``
	    elements until one has no title, and collects the title, date, link
	    and description of each entry.  Every entry is printed and written to
	    OUT_FILE as UTF-8, followed by a blank line.

	    A missing date, link or description is replaced by a ``<missing ...>``
	    placeholder so that one incomplete entry does not abort the run.
	    """
	    # NOTE(review): this absolute path (including ``tbody``) looks like it
	    # was copied from a browser inspector -- confirm it matches the HTML
	    # that the server actually sends.
	    entry_xpath = '//*[@id="container"]/table[3]/tbody/tr/td[2]/div[3]/div[{}]'

	    page = requests.get('http://www.linuxtoday.com')
	    tree = html.fromstring(page.text)
	    lt_list = []
	    index = 1
	    while True:
	        entry = entry_xpath.format(index)
	        title = tree.xpath(entry + '/a/strong/text()')
	        if not title:
	            # No title at this position: we have run past the last entry.
	            break
	        post_date = tree.xpath(entry + '/span/text()[1]')
	        link = tree.xpath(entry + '/a/@href')
	        # The description is looked up in <p><span> first, then in <p>.
	        desc = (tree.xpath(entry + '/p/span/text()')
	                or tree.xpath(entry + '/p/text()'))

	        if post_date:
	            # Keep only the first line of the date text and strip the
	            # ' (' and ')' markers; split() tolerates a missing newline.
	            date_text = post_date[0].split('\n', 1)[0]
	            date_text = date_text.replace(' (', '').replace(')', '')
	        else:
	            date_text = '<missing date>'

	        lt_list.append({
	            'title': title[0],
	            'date': date_text,
	            'link': link[0] if link else '<missing link>',
	            'description': desc[0] if desc else '<missing description>',
	        })
	        index += 1

	    # Binary mode: only encoded bytes are written, which behaves the same
	    # on Python 2 and Python 3.  ``with`` closes the file even on error.
	    with open(OUT_FILE, 'wb') as out_file:
	        for item in lt_list:
	            final_out = (item['title'] + '\n'
	                         + item['date'] + '\n'
	                         + item['link'] + '\n'
	                         + item['description'] + '\n')
	            final_utf8 = final_out.encode('utf-8')
	            print(final_utf8)
	            print(u'Saving data to ' + OUT_FILE + u'...')
	            out_file.write(final_utf8)
	            out_file.write(b'\n')
	    print(u'Done.')

	# Run the scraper only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == '__main__':
	    main()