datahutrepo

## gist:e95d03962e8f07b6237fb19bd352f240
Python 2.7.11 |Anaconda 4.0.0 (64-bit)| (default, Dec 6 2015, 18:08:32)
Type "copyright", "credits" or "license" for more information.

IPython 4.1.2 -- An enhanced Interactive Python.
? -> Introduction and overview of IPython's features.
%quickref -> Quick reference.
help -> Python's own help system.
object? -> Details about 'object', use 'object??' for extra details.

In [1]: from lxml import etree

## etree2
In [2]: root = etree.Element('html')
In [3]: root
Out[3]: <Element html at 0x7f43a5c51ab8>

In [4]: print root.tag
html

## etree-3
In [5]: etree.SubElement(root, 'head')
Out[5]: <Element head at 0x7f43a5c51e60>

In [6]: etree.SubElement(root, 'body')
Out[6]: <Element body at 0x7f43a5c51f38>

In [7]: print etree.tostring(root)
<html><head/><body/></html>

## etree-4
In [8]: root = etree.Element('html')
In [9]: head = etree.SubElement(root, 'head')
In [10]: body = etree.SubElement(root, 'body')
In [11]: title = etree.SubElement(head, 'title')
In [12]: title.text = 'lxml Example'
In [13]: h2 = etree.SubElement(body, 'h2')
In [14]: h2.text = 'Learning to lxml, a XML toolkit library in python'
In [15]: print etree.tos
etree.tostring etree.tostringlist


## import-request
In [19]: import requests
In [20]: from lxml import html

## requestcnn
In [21]: page = requests.get('http://www.cnn.com')
In [22]: html_content = html.fromstring(page.content)

## iteratecnn

In [23]: for i in html_content.iterchildren():
   ....:     print i
   ....:
<Element head at 0x7f43a5737db8>
<Element body at 0x7f43a5737e10>

In [24]: news_stories = html_content.xpath('//h3[@data-analytics]/a/span/text()')

In [25]: news_links = html_content.xpath('//h3[@data-analytics]/a/@href')

## iterate-final
In [28]: top_stories = []

In [29]: for i in zip(news_stories, news_links):
   ....:     top_stories.append(i)
   ....:


In [30]: top_stories
Out[30]:

## amazon_item
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class AmazonItem(scrapy.Item):

## amazon_parser_scrapy
# -*- coding: utf-8 -*-
import scrapy
from amazon.items import AmazonItem

class AmazonProductSpider(scrapy.Spider):
  name = "AmazonDeals"
  allowed_domains = ["amazon.com"]

  #Use working product URL below
  start_urls = [
	Python 2.7.11 |Anaconda 4.0.0 (64-bit)| (default, Dec 6 2015, 18:08:32)
	Type "copyright", "credits" or "license" for more information.

	IPython 4.1.2 -- An enhanced Interactive Python.
	? -> Introduction and overview of IPython's features.
	%quickref -> Quick reference.
	help -> Python's own help system.
	object? -> Details about 'object', use 'object??' for extra details.

	In [1]: from lxml import etree
	In [2]: root = etree.Element('html')
	In [3]: root
	Out[3]: <Element html at 0x7f43a5c51ab8>

	In [4]: print root.tag
	html
	In [5]: etree.SubElement(root, 'head')
	Out[5]: <Element head at 0x7f43a5c51e60>

	In [6]: etree.SubElement(root, 'body')
	Out[6]: <Element body at 0x7f43a5c51f38>

	In [7]: print etree.tostring(root)
	<html><head/><body/></html>
	In [8]: root = etree.Element('html')
	In [9]: head = etree.SubElement(root, 'head')
	In [10]: body = etree.SubElement(root, 'body')
	In [11]: title = etree.SubElement(head, 'title')
	In [12]: title.text = 'lxml Example'
	In [13]: h2 = etree.SubElement(body, 'h2')
	In [14]: h2.text = 'Learning to lxml, a XML toolkit library in python'
	In [15]: print etree.tos
	etree.tostring etree.tostringlist
	In [21]: page = requests.get('http://www.cnn.com')
	In [22]: html_content = html.fromstring(page.content)

	In [23]: for i in html_content.iterchildren():
	   ....:     print i
	   ....:
	<Element head at 0x7f43a5737db8>
	<Element body at 0x7f43a5737e10>

	In [24]: news_stories = html_content.xpath('//h3[@data-analytics]/a/span/text()')

	In [25]: news_links = html_content.xpath('//h3[@data-analytics]/a/@href')
	In [28]: top_stories = []

	In [29]: for i in zip(news_stories, news_links):
	   ....:     top_stories.append(i)
	   ....:



	In [30]: top_stories
	Out[30]:
	# -*- coding: utf-8 -*-

	# Define here the models for your scraped items
	#
	# See documentation in:
	# http://doc.scrapy.org/en/latest/topics/items.html

	import scrapy

	class AmazonItem(scrapy.Item):
	# -*- coding: utf-8 -*-
	import scrapy
	from amazon.items import AmazonItem

	class AmazonProductSpider(scrapy.Spider):
	name = "AmazonDeals"
	allowed_domains = ["amazon.com"]

	#Use working product URL below
	start_urls = [