Skip to content

Instantly share code, notes, and snippets.

Python 2.7.11 |Anaconda 4.0.0 (64-bit)| (default, Dec 6 2015, 18:08:32)
Type "copyright", "credits" or "license" for more information.
IPython 4.1.2 -- An enhanced Interactive Python.
? -> Introduction and overview of IPython's features.
%quickref -> Quick reference.
help -> Python's own help system.
object? -> Details about 'object', use 'object??' for extra details.
In [1]: from lxml import etree
In [5]: etree.SubElement(root, 'head')
Out[5]: <Element head at 0x7f43a5c51e60>
In [6]: etree.SubElement(root, 'body')
Out[6]: <Element body at 0x7f43a5c51f38>
In [7]: print etree.tostring(root)
<html><head/><body/></html>
In [19]: import requests
In [20]: from lxml import html
In [21]: page = requests.get('http://www.cnn.com')
In [22]: html_content = html.fromstring(page.content)
In [23]: for i in html_content.iterchildren():
....: print i
....:
<Element head at 0x7f43a5737db8>
<Element body at 0x7f43a5737e10>
In [24]: news_stories = html_content.xpath('//h3[@data-analytics]/a/span/text()')
In [25]: news_links = html_content.xpath('//h3[@data-analytics]/a/@href')
In [28]: top_stories = []
In [29]: for i in zip(news_stories, news_links):
....: top_stories.append(i)
....:
In [30]: top_stories
Out[30]:
@datahutrepo
datahutrepo / etree2
Last active September 7, 2016 06:31
In [2]: root = etree.Element('html')
In [3]: root
Out[3]: <Element html at 0x7f43a5c51ab8>
In [4]: print root.tag
html
In [8]: root = etree.Element('html')
In [9]: head = etree.SubElement(root, 'head')
In [10]: body = etree.SubElement(root, 'body')
In [11]: title = etree.SubElement(head, 'title')
In [12]: title.text = 'lxml Example'
In [13]: h2 = etree.SubElement(body, 'h2')
In [14]: h2.text = 'Learning to lxml, a XML toolkit library in python'
In [15]: print etree.tos
etree.tostring etree.tostringlist
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class AmazonPipeline(object):
    """Pass-through item pipeline: hands every item back unmodified.

    As the module header notes, the pipeline only takes effect once it is
    added to the ITEM_PIPELINES setting.
    """

    def process_item(self, item, spider):
        """Return ``item`` unchanged.

        Args:
            item: The item handed to this pipeline stage.
            spider: Accepted as part of the method signature; not used by
                this implementation.

        Returns:
            The very same ``item`` object that was passed in.
        """
        return item
[
{"product_category": "Electronics,Computers & Accessories,Data Storage,External Hard Drives", "product_sale_price": "$949.95", "product_name": "G-Technology G-SPEED eS PRO High-Performance Fail-Safe RAID Solution for HD/2K Production 8TB (0G01873)", "product_availability": "Only 1 left in stock."},
{"product_category": "Electronics,Computers & Accessories,Data Storage,USB Flash Drives", "product_sale_price": "", "product_name": "G-Technology G-RAID with Removable Drives High-Performance Storage System 4TB (Gen7) (0G03240)", "product_availability": "Available from these sellers."},
{"product_category": "Electronics,Computers & Accessories,Data Storage,USB Flash Drives", "product_sale_price": "$549.95", "product_name": "G-Technology G-RAID USB Removable Dual Drive Storage System 8TB (0G04069)", "product_availability": "Only 1 left in stock."},
{"product_category": "Electronics,Computers & Accessories,Data Storage,External Hard Drives", "product_sale_price": "$89.95", "product_name": "G-Technology G-DRIVE ev U