Create a gist now

Instantly share code, notes, and snippets.

@bartgee /webscrap.py
Last active Jul 23, 2018

Embed
What would you like to do?
A simple introduction to web scraping in Python. It is not perfect, but it gives you an idea of how it goes...
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# author Bart Grzybicki <bgrzybicki@gmail.com>
from lxml import html
import requests
# Name of the text file that main() writes the scraped entries to.
OUT_FILE = u'linuxtoday.txt'
def _clean_date(raw_date):
    """Return the date portion of a raw byline text node.

    Only the text before the first newline is kept, and the ' (' and ')'
    wrappers are stripped from it.  Text that contains no newline is used
    in full rather than raising ValueError.
    """
    first_line = raw_date.partition('\n')[0]
    return first_line.replace(' (', '').replace(')', '')


def _scrape_entries(tree):
    """Collect the entries found in the parsed page *tree*.

    Entry containers are probed by 1-based position; scanning stops at the
    first position that has no title.

    Returns:
        A list of dicts with the keys 'title', 'date', 'link' and
        'description'.  A field whose node is absent gets a
        '<missing ...>' placeholder instead of aborting the whole run.
    """
    # NOTE(review): the `tbody` step presumably only matches when the served
    # HTML contains an explicit <tbody> element -- verify against the page.
    entry_path = '//*[@id="container"]/table[3]/tbody/tr/td[2]/div[3]/div[{}]'
    entries = []
    index = 1
    while True:
        base = entry_path.format(index)
        title = tree.xpath(base + '/a/strong/text()')
        if not title:
            # No title at this position: we have run past the last entry.
            break
        post_date = tree.xpath(base + '/span/text()[1]')
        link = tree.xpath(base + '/a/@href')
        # The description sits either in <p><span> or directly in <p>.
        desc = (tree.xpath(base + '/p/span/text()')
                or tree.xpath(base + '/p/text()'))
        entries.append({
            'title': title[0],
            'date': _clean_date(post_date[0]) if post_date else '<missing date>',
            'link': link[0] if link else '<missing link>',
            'description': desc[0] if desc else '<missing description>',
        })
        index += 1
    return entries


def main(url='http://www.linuxtoday.com', timeout=30):
    """Scrape the entries from *url*, print them and save them to OUT_FILE.

    Args:
        url: Page to download; defaults to the Linux Today front page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of scraping an error page.
    page.raise_for_status()
    entries = _scrape_entries(html.fromstring(page.text))

    print(u'Saving data to ' + OUT_FILE + u'...')
    # `with` guarantees the file is closed even if a write fails.
    with open(OUT_FILE, 'w') as out_file:
        for entry in entries:
            record = '\n'.join([
                entry['title'],
                entry['date'],
                entry['link'],
                entry['description'],
            ]) + '\n'
            # Encode once at the I/O boundary; this is a Python 2 script
            # (see the shebang), where print and write take byte strings.
            encoded = record.encode('utf-8')
            print(encoded)
            out_file.write(encoded)
            # Blank line between records.
            out_file.write('\n')
    print(u'Done.')
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment