parsel HtmlParser benchmark
@kmike, last active August 9, 2016

In [1]:
import json
import parsel
from parsel import selector
from lxml import etree, html

In [2]:
selector._ctgroup['html_html'] = {
    '_parser': html.HTMLParser,
    '_csstranslator': selector.HTMLTranslator(),
    '_tostring_method': 'html'
}
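
parsel looks up per-type parser configuration in the module-level selector._ctgroup dict, so registering an 'html_html' entry makes Selector(text, type='html_html') parse with lxml.html.HTMLParser, while the built-in 'html' type keeps parsel's default parser. A small sketch of using the new type on its own after running the cell above (markup invented for illustration, not part of the benchmark):

# Hypothetical quick check: the registered type behaves like a normal
# Selector type; lxml.html.HTMLParser builds lxml.html.HtmlElement trees.
import parsel

sel = parsel.Selector('<html><body><p>hello</p></body></html>', type='html_html')
print(sel.xpath('//p/text()').extract())  # ['hello']
print(type(sel.root))                     # <class 'lxml.html.HtmlElement'>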

In [3]:
!wc -l pages.json

 3703 pages.json

In [4]:
%%time
def load():
    pages = []
    with open('pages.json', 'rt') as f:
        for line in f:
            if line.strip() in '[]':
                continue
            try:
                page = json.loads(line.strip().rstrip(','))
            except json.JSONDecodeError:
                print(line)
                break
            pages.append(page)
    return pages

pages = load()

CPU times: user 4.28 s, sys: 788 ms, total: 5.06 s
Wall time: 5.13 s
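
The bracket-skipping and comma-stripping above are needed because scrapy's -o pages.json feed is a single JSON array with roughly one item per line. A simpler (but more memory-hungry) alternative sketch, assuming the crawl finished cleanly so the array is well formed:

import json

with open('pages.json', 'rt') as f:
    pages = json.load(f)  # parses the whole JSON array at once
print(len(pages))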

In [5]:
def parse(pages, type):
    return [parsel.Selector(p['html'], type=type) for p in pages]

def run_xpath(selectors):
    for sel in selectors:
        sel.xpath('//p').extract()

In [6]:
%%timeit -r5
selectors = parse(pages, 'html')

1 loop, best of 5: 12 s per loop

In [7]:
selectors = parse(pages, 'html')

In [8]:
%%timeit -r5
run_xpath(selectors)

1 loop, best of 5: 1.15 s per loop

In [9]:
%%timeit -r5
selectors = parse(pages, 'html_html')

1 loop, best of 5: 12.4 s per loop

In [10]:
selectors = parse(pages, 'html_html')

In [11]:
%%timeit -r5
run_xpath(selectors)

1 loop, best of 5: 1.21 s per loop

(Notebook kernel: Python 3, version 3.5.1)
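
Summary: parsing the sampled pages takes about the same time with both selector types (12 s for 'html' vs 12.4 s for 'html_html'), and XPath extraction is likewise close (1.15 s vs 1.21 s), so backing parsel with lxml.html.HTMLParser costs only a few percent here. As a rough cross-check outside parsel, the two lxml parser classes can be timed directly; a sketch with synthetic markup, assuming the default 'html' type is backed by lxml.etree.HTMLParser in this parsel version:

import timeit
from lxml import etree, html

doc = '<html><body>' + '<p>hello</p>' * 2000 + '</body></html>'

# Time each parser class on the same markup, constructing a fresh parser
# instance per call; absolute numbers will differ by machine.
t_etree = timeit.timeit(
    lambda: etree.fromstring(doc, parser=etree.HTMLParser()), number=200)
t_html = timeit.timeit(
    lambda: html.fromstring(doc, parser=html.HTMLParser()), number=200)
print('etree.HTMLParser: %.2fs, html.HTMLParser: %.2fs' % (t_etree, t_html))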

#!/usr/bin/env bash
# get-random-domains.sh: print $1 domains sampled at random from the Alexa top 1M list.
pip install subsample csvkit 1>&2
curl http://s3.amazonaws.com/alexa-static/top-1m.csv.zip > top-1m.csv.zip
unzip ./top-1m.csv.zip 1>&2
subsample --sample-size $1 top-1m.csv | csvcut -c 2
rm ./top-1m.csv
rm ./top-1m.csv.zip
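
The same sampling can be done without the subsample and csvkit dependencies. A hypothetical Python equivalent, assuming top-1m.csv is already downloaded and unzipped and contains rank,domain rows with no header:

import csv
import random
import sys

def sample_domains(path, n):
    # column 2 of the Alexa CSV is the domain name
    with open(path, newline='') as f:
        domains = [row[1] for row in csv.reader(f) if len(row) > 1]
    return random.sample(domains, min(n, len(domains)))

if __name__ == '__main__':
    for domain in sample_domains('top-1m.csv', int(sys.argv[1])):
        print(domain)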

# -*- coding: utf-8 -*-
"""
1. Create a urls.txt file with URLs, one URL per line::

       ./get-random-domains.sh 1000 > urls.txt

2. Run the spider to get page contents::

       scrapy runspider savehtml.py -a urls=urls.txt -o pages.json -L INFO
"""
import random
import scrapy
from scrapy.utils.url import guess_scheme
from scrapy.linkextractors import LinkExtractor


class SavehtmlSpider(scrapy.Spider):
    name = "savehtml"
    requests_per_domain = 5
    custom_settings = {
        'CONCURRENT_REQUESTS': 50,
        'REACTOR_THREADPOOL_MAXSIZE': 20,
        'AJAXCRAWL_ENABLED': True,
    }

    def start_requests(self):
        self.le = LinkExtractor(canonicalize=False)
        with open(self.urls, 'rt') as f:
            for line in f:
                if not line.strip():
                    continue
                url = guess_scheme(line.strip())
                yield scrapy.Request(url, self.parse)

    def parse(self, response):
        if not hasattr(response, 'text'):
            return
        links = self.le.extract_links(response)
        n_links = min(len(links), int(self.requests_per_domain) - 1)
        links = random.sample(links, n_links)
        for link in links:
            yield scrapy.Request(link.url, self.parse_other)
        yield {'url': response.url, 'html': response.text}

    def parse_other(self, response):
        if not hasattr(response, 'text'):
            return
        yield {'url': response.url, 'html': response.text}
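
The docstring shows how to run the spider with scrapy runspider; as a sketch, it can also be launched from Python, assuming this file is importable as savehtml and using the Scrapy 1.x FEED_* settings to reproduce -o pages.json:

from scrapy.crawler import CrawlerProcess
from savehtml import SavehtmlSpider  # the spider defined above

process = CrawlerProcess(settings={
    'LOG_LEVEL': 'INFO',
    'FEED_FORMAT': 'json',    # same JSON-array feed as -o pages.json
    'FEED_URI': 'pages.json',
})
process.crawl(SavehtmlSpider, urls='urls.txt')  # urls becomes a spider attribute
process.start()  # blocks until the crawl finishes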