Skip to content

Instantly share code, notes, and snippets.

@aguegu
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aguegu/e45f24b2c74a0f2bc661 to your computer and use it in GitHub Desktop.
Save aguegu/e45f24b2c74a0f2bc661 to your computer and use it in GitHub Desktop.
BendixCrawler
# EditorConfig is awesome: http://EditorConfig.org
# top-most EditorConfig file
root = true
# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
insert_final_newline = true
charset = utf-8
indent_style = space
[*.py]
indent_size = 4
tab_width = 4
[*.yml]
indent_size = 2
tab_width = 2
[Makefile]
indent_style = tab
indent_size = 4
tab_width = 4
*.pyc
*.sqlite
*.zip
pdf/
gif/
sql:
brand:
init: |
DROP TABLE IF EXISTS BRAND;
CREATE TABLE BRAND (
ID INTEGER PRIMARY KEY AUTOINCREMENT,
TITLE TEXT NOT NULL UNIQUE);
insert: |
INSERT INTO BRAND (TITLE) VALUES (?);
select: |
SELECT TITLE,
ID
FROM BRAND;
serie:
init: |
DROP TABLE IF EXISTS SERIE;
CREATE TABLE SERIE (
ID INTEGER PRIMARY KEY AUTOINCREMENT,
TITLE TEXT NOT NULL,
BRAND_ID INTEGER NOT NULL);
insert: |
INSERT INTO SERIE (TITLE, BRAND_ID) VALUES (?, ?);
select: |
SELECT BRAND.TITLE,
SERIE.TITLE,
SERIE.ID
FROM BRAND, SERIE
WHERE SERIE.BRAND_ID = BRAND.ID;
model:
init: |
DROP TABLE IF EXISTS MODEL;
CREATE TABLE MODEL (
ID INTEGER PRIMARY KEY AUTOINCREMENT,
HREF TEXT NOT NULL,
TITLE TEXT NOT NULL,
YEAR TEXT NOT NULL,
FRONT TEXT,
REAR TEXT,
PARK TEXT,
SERIE_ID INTEGER NOT NULL);
insert: |
INSERT INTO MODEL (
HREF,
TITLE,
YEAR,
FRONT,
REAR,
PARK,
SERIE_ID)
VALUES (?, ?, ?, ?, ?, ?, ?);
select: |
SELECT HREF,
ID
FROM MODEL
WHERE ID
NOT IN (SELECT DISTINCT MODEL_ID
FROM PRODUCT);
product:
init: |
CREATE TABLE IF NOT EXISTS PRODUCT (
ID INTEGER PRIMARY KEY AUTOINCREMENT,
TITLE TEXT NOT NULL,
CONTENT TEXT,
IMAGE TEXT,
PDF TEXT,
MODEL_ID INTEGER);
insert: |
INSERT INTO PRODUCT (
TITLE,
CONTENT,
IMAGE,
PDF,
MODEL_ID)
VALUES (?, ?, ?, ?, ?);
image:
select: |
SELECT DISTINCT IMAGE
FROM PRODUCT
WHERE IMAGE != "";
pdf:
select: |
SELECT DISTINCT PDF
FROM PRODUCT
WHERE PDF != ""
AND PDF != '#';
# -*- coding: utf-8 -*-
import gevent.monkey
gevent.monkey.patch_socket()
import requests
import gevent
import simplejson
import pickle
from pyquery import PyQuery
from gevent.queue import JoinableQueue, Queue
from sqliter import Sqliter
import yaml
with open('config.yml') as f:
rules = yaml.load(f)
def queue2list(q):
l = []
while not q.empty():
l.append(q.get())
return l
def list2queue(l, queue_type=Queue):
q = queue_type()
map(q.put, l)
return q
def pick(s, tag):
s = s.replace('\\x3c', '<').replace('\\x3e', '>')
s = s.replace('\\"', '"')
s = s[s.rfind('<%s' % tag): s.rfind('</%s>' % tag) + len('</%s>' % tag)]
return s
def inject(q_out, conn, sql):
running = True
while running:
l = queue2list(q_out)
if len(l):
if isinstance(l[-1], str) and l[-1] == 'done':
running = False
l.pop()
conn.execute_many(sql, l)
gevent.sleep(2)
def fetch_brand(q_in, q_out):
while True:
s = q_in.get()
if isinstance(s, str) and s == 'done':
q_in.task_done()
break
while True:
try:
r = requests.get('http://www.bendix.com.au/autocomplete/manufacturer/%c' % s)
j = r.json()
except simplejson.JSONDecodeError:
pass
else:
if type(j) is dict:
map(q_out.put, [(x, ) for x in j.values()])
q_in.task_done()
break
def get_brand():
conn = Sqliter('bendix.sqlite')
conn.execute_script(rules['sql']['brand']['init'])
q_in = list2queue([chr(x) for x in range(ord('A'), ord('Z') + 1)], JoinableQueue)
q_out = Queue()
for _ in xrange(4):
q_in.put('done')
gevent.spawn(fetch_brand, q_in, q_out)
injector = gevent.spawn(inject, q_out, conn, rules['sql']['brand']['insert'])
q_in.join()
q_out.put('done')
injector.join()
def fetch_serie(q_in, q_out):
while True:
brand = q_in.get()
if isinstance(brand, str) and brand == 'done':
q_in.task_done()
break
params = dict(
manufacturer=brand[0],
view_name="search_by_vehicle",
view_display_id="page_1",
)
r = requests.get('http://www.bendix.com.au/views/ajax', params=params)
s = pick(r.text, 'select')
d = PyQuery(s)
l = [e.attr['value'] for e in d('option').items() if e.attr['value']]
map(q_out.put, zip(l, [brand[1]]*len(l)))
q_in.task_done()
def get_serie():
conn = Sqliter('bendix.sqlite')
conn.execute_script(rules['sql']['serie']['init'])
q_in = list2queue(conn.select(rules['sql']['brand']['select']), JoinableQueue)
q_out = Queue()
for _ in xrange(8):
q_in.put('done')
gevent.spawn(fetch_serie, q_in, q_out)
injector = gevent.spawn(inject, q_out, conn, rules['sql']['serie']['insert'])
q_in.join()
q_out.put('done')
injector.join()
def fetch_model(q_in, q_out):
while True:
serie = q_in.get()
if isinstance(serie, str) and serie == 'done':
q_in.task_done()
break
print serie[2]
params = dict(
manufacturer=serie[0],
model=serie[1],
view_name="search_by_vehicle",
view_display_id="page_1",
)
r=requests.get('http://www.bendix.com.au/views/ajax', params=params)
s = pick(r.text, 'tbody')
d = PyQuery(s)
l = zip (
[x.attr['href'] for x in d('tr').items()],
[x.text() for x in d('td.views-field-date a').items()],
[x.text() for x in d('td.views-field-variant a').items()],
[x.text() for x in d('td.views-field-front a').items()],
[x.text() for x in d('td.views-field-rear a').items()],
[x.text() for x in d('td.views-field-park a').items()],
[serie[2] for _ in d('tr').items()])
map(q_out.put, l)
q_in.task_done()
def get_model():
conn = Sqliter('bendix.sqlite')
conn.execute_script(rules['sql']['model']['init'])
q_in = list2queue(conn.select(rules['sql']['serie']['select']), JoinableQueue)
q_out = Queue()
for _ in xrange(16):
q_in.put('done')
gevent.spawn(fetch_model, q_in, q_out)
injector = gevent.spawn(inject, q_out, conn, rules['sql']['model']['insert'])
q_in.join()
q_out.put('done')
injector.join()
def fetch_product(q_in, q_out):
while True:
model = q_in.get()
if isinstance(model, str) and model == 'done':
q_in.task_done()
break
print model[1]
try:
r = requests.get('http://www.bendix.com.au%s' % model[0])
d = PyQuery(r.text)
except requests.ConnectionError, e:
print model[1], e
else:
for p in d('table.single-vehicle:not(:empty)').items():
q_out.put((p('span').text(),
p('p:last').text(),
p('td.image img').attr('src'),
p('a:last').attr('href'),
model[1]))
finally:
q_in.task_done()
def get_product():
conn = Sqliter('bendix.sqlite')
conn.execute_script(rules['sql']['product']['init'])
q_in = list2queue(conn.select(rules['sql']['model']['select']), JoinableQueue)
q_out = Queue()
for _ in xrange(16):
q_in.put('done')
gevent.spawn(fetch_product, q_in, q_out)
injector = gevent.spawn(inject, q_out, conn, rules['sql']['product']['insert'])
q_in.join()
q_out.put('done')
injector.join()
def fetch_file(q_in, dest):
while True:
url = q_in.get()
if isinstance(url, str) and url == 'done':
q_in.task_done()
break
filename = url[0][url[0].rfind('/') + 1:]
u = 'http://www.bendix.com.au' + url[0] if url[0][0] == '/' else url[0]
r = requests.get(u)
with open('%s/%s' % (dest, filename), 'wb') as f:
f.write(r.content)
q_in.task_done()
def get_file():
conn = Sqliter('bendix.sqlite')
q_in = list2queue(conn.select(rules['sql']['image']['select']), JoinableQueue)
for _ in xrange(16):
q_in.put('done')
gevent.spawn(fetch_file, q_in, 'gif')
q_in.join()
q_in = list2queue(conn.select(rules['sql']['pdf']['select']), JoinableQueue)
for _ in xrange(16):
q_in.put('done')
gevent.spawn(fetch_file, q_in, 'pdf')
q_in.join()
if __name__ == '__main__':
#get_brand()
#get_serie()
#get_model()
#get_product()
get_file()
#!/usr/bin/env python
import sqlite3
import re
class Sqliter:
@staticmethod
def match(expr, item):
return re.match(expr, item) is not None
def __init__(self, db):
self.conn = sqlite3.connect(db)
self.conn.create_function("MATCHES", 2, Sqliter.match)
def select(self, sql, row_factory = None):
self.conn.row_factory = row_factory
cursor = self.conn.cursor()
cursor.execute(sql)
lst = cursor.fetchall()
cursor.close()
return lst
def execute_script(self, sql):
self.conn.executescript(sql)
self.conn.commit()
def execute_many(self, sql, lst):
self.conn.executemany(sql, lst)
self.conn.commit()
def __del__(self):
self.conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment