# coding: utf-8
from requests import request
from bs4 import BeautifulSoup as bs
from pyquery import PyQuery as pq
from time import sleep
import json
import re
import MySQLdb  # reserved for the database dump; not used elsewhere in this script

BASE = 'http://www.tmvan.com'
class SimpleIterator:
    def __init__(self, elems):
        self.elems = elems
        self.index = 0

    def next(self):
        if self.index >= len(self.elems):
            return None
        save = self.elems[self.index]
        self.index += 1
        return save.strip()
class ItemListBuilder:
    # Accumulates headings, items, references and comments into a nested id/children tree.
    def __init__(self, id, title):
        self.result = []
        self.current = None
        self.item = None
        self.id = id
        self.title = title
        self.comment_for_parent = None
        self.reference_for_parent = None

    def end_up_item(self):
        # Flush the pending item into the current child.
        if self.item is not None:
            self.current['children'].append(self.item)
            self.item = None

    def end_up_child(self):
        # Flush the current child (and its pending item) into the result list.
        if self.current is not None:
            self.end_up_item()
            self.result.append(self.current)

    def add(self, id):
        # Start a new child whose id is the page title plus the heading name.
        self.end_up_child()
        self.current = { 'id': self.title + id, 'children': [] }

    def text(self, text):
        # Start a new item whose content is the comma-separated text.
        if self.current is None:
            self.current = { 'children': [] }
        if self.item is not None:
            self.current['children'].append(self.item)
        self.item = dict()
        self.item['content'] = text.split(',')

    def reference(self, text):
        # A reference seen before the first child belongs to the parent node.
        if self.current is None:
            self.reference_for_parent = text
            return
        self.item['reference'] = map(lambda x: x.strip(), text.split(','))

    def comment(self, text):
        # A comment seen before the first child belongs to the parent node.
        if self.current is None:
            self.comment_for_parent = text
            return
        if self.current.get('comment') is None:
            self.current['comment'] = text
        else:
            self.current['comment'] += text

    def get(self):
        self.end_up_item()
        self.end_up_child()
        obj = {
            'id': self.id,
            'title': self.title,
            'children': self.result
        }
        if self.comment_for_parent is not None:
            obj['comment'] = self.comment_for_parent
        if self.reference_for_parent is not None:
            obj['reference'] = self.reference_for_parent
        return obj
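# Rough usage sketch of the builder (the values below are made up, for illustration only):
#     b = ItemListBuilder('0101', u'某个子类')
#     b.add(u'一')               # open child '某个子类一'
#     b.text(u'肥料,化肥')        # attach an item; its content is the comma-split text
#     b.reference(u'0104,0105')  # cross-references for the item just added
#     b.comment(u'本部分不包括…')  # free-form note on the current child
#     b.get()                    # -> {'id': '0101', 'title': u'某个子类', 'children': [...]}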
# download the content of a URL, normalizing non-breaking spaces
def download(url):
    return request('GET', url).text.replace(u'\xa0', u' ')


# fetch the section-level comment via the goodsclass.ashx endpoint
def fetch_comment(index):
    URL = "http://www.tmvan.com/Script/goodsclass.ashx"
    payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"goodsclassid\"\r\n\r\n%d\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--" % index
    headers2 = {
        'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'accept': "text/html, */*; q=0.01",
        'cache-control': "no-cache",
        'postman-token': "f0d491b7-14e7-728c-a17b-a212ce38272c"
    }
    return request("POST", URL, data=payload, headers=headers2).text
def process_text_node(text):
    # Classify a stripped text node as a heading, reference, comment, list entry or plain text.
    stripped = text.strip()
    if stripped == '':
        return None
    # Headings are wrapped in full-width parentheses, e.g. （一）; the enclosed
    # characters become the heading name. Unicode patterns are used so they
    # match the unicode text handed back by pyquery.
    m = re.match(u'^（(.{1,3})）(.*)$', stripped)
    if m is not None:
        return {
            'type': 'heading',
            'name': m.group(1),
            'text': m.group(2)
        }
    # Lines starting with ※ are cross-references.
    m = re.match(u'^※(.*)$', stripped)
    if m is not None:
        return {
            'type': 'ref',
            'text': m.group(1)
        }
    # Lines starting with 注: are notes.
    m = re.match(u'^注:(.*)$', stripped)
    if m is not None:
        return {
            'type': 'comment',
            'text': m.group(1)
        }
    # Numbered lines such as "1.…" are list entries.
    m = re.match(r'^\d+\.(.*)$', stripped)
    if m is not None:
        return {
            'type': 'list',
            'text': stripped
        }
    return {
        'type': 'text',
        'text': stripped
    }
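# Rough illustration of the mapping above (the sample strings are made up, not taken from the site):
#     process_text_node(u'（一）化学原料')  # -> {'type': 'heading', 'name': u'一', 'text': u'化学原料'}
#     process_text_node(u'※0104,0105')     # -> {'type': 'ref', 'text': u'0104,0105'}
#     process_text_node(u'注:不包括化肥')    # -> {'type': 'comment', 'text': u'不包括化肥'}
#     process_text_node(u'1.工业用化学品')   # -> {'type': 'list', 'text': u'1.工业用化学品'}
#     process_text_node(u'  ')              # -> None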
# process a single subsection page
def page(url, id, title):
    URL = BASE + url
    q = pq(download(URL))
    child_nodes = q('.content1').contents()
    # Keep only the bare text nodes (lxml hands them back as str or unicode).
    text_nodes = filter(lambda x: isinstance(x, basestring), child_nodes)
    builder = ItemListBuilder(id, title)
    for obj in filter(lambda x: x is not None, map(process_text_node, text_nodes)):
        if obj['type'] == 'heading':
            builder.add(obj['name'])
            builder.text(obj['text'])
        elif obj['type'] == 'text':
            builder.text(obj['text'])
        elif obj['type'] == 'ref':
            builder.reference(obj['text'])
        elif obj['type'] == 'list':
            builder.comment(obj['text'])
        elif obj['type'] == 'comment':
            builder.comment(obj['text'])
        else:
            raise Exception('This will never happen')
    result = builder.get()
    # The floated div at the bottom of the page holds additional notes.
    comment = q('div[style="float:left; width:890px;"]').text().strip()
    if comment is not None and len(comment) != 0:
        if result.get('comment') is not None:
            result['comment'] += '\n' + comment
        else:
            result['comment'] = comment
    return result
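# For reference, page() returns a nested dict shaped roughly like this
# (field values are illustrative, not real data; 'comment' and 'reference' only
# appear when the page carries them):
#     {
#       'id': '0101',
#       'title': u'子类标题',
#       'children': [
#         { 'id': u'子类标题一',
#           'children': [ { 'content': [u'肥料', u'化肥'], 'reference': [u'0104'] } ],
#           'comment': u'…' }
#       ],
#       'comment': u'…'
#     }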
# process the table-of-contents page
def content():
    URL = BASE + '/tool/goodsclass.aspx'
    tree = bs(download(URL), 'html.parser')
    sections = []
    for index, section in enumerate(tree.find_all('div', { 'class': 'one' })):
        # limit the crawl to the first section
        if index > 0:
            break
        # Fetch metadata of this section
        left = section.find('div', { 'class': 'left' })
        title = left.find('div', { 'class': 'title' }).get_text().strip()
        content = left.find('div', { 'class': 'content1' }).get_text().strip()
        print '-----------------', title, '-----------------'
        # Fetch each subsection
        right = section.find('div', { 'class': 'right' })
        parts = []
        for sub_index, link in enumerate(right.find_all('a', { 'target': '_blank' })):
            url = link.get('href')
            sub_title = link.get_text()
            print sub_index + 1, sub_title
            part = page(url, id = '{:02d}{:02d}'.format(index + 1, sub_index + 1), title = sub_title)
            parts.append(part)
            sleep(0.01)
        sections.append({
            'id': '{:02d}'.format(index + 1),
            'title': title,
            'content': content,
            'comment': fetch_comment(index + 1),
            'children': parts
        })
    return sections
def db_dump(node, parent_id = ''):
    # Placeholder for the MySQL dump: just print the tree for now.
    # Leaf items carry no 'id'/'title'/'children', hence the .get() defaults.
    print node.get('id', ''), node.get('title', ''), len(node.get('children', []))
    for child in node.get('children', []):
        db_dump(child, node.get('id', ''))


def db_dump_all(elems):
    for el in elems:
        db_dump(el)


# for debug use only
def inspect(obj):
    print json.dumps(obj, indent = 2, ensure_ascii = True)


db_dump_all(content())
# inspect(page('/tool/goodsclassitem.aspx?goodsclassid=47'))