@koolay
Created May 19, 2014 16:19
Crawl product listings and per-day prices from lailaihui.com into a local SQLite database: each product becomes a Subject row, and each available date on its detail page becomes a DaySell row recording that day's price and remaining stock.
# -*- coding: utf-8 -*-
# ------------------------------------------------------------
# Requirements: peewee, requests, beautifulsoup4
# ------------------------------------------------------------
import os
import re
import datetime

import requests
from bs4 import BeautifulSoup
from peewee import *

# Keep the SQLite file in the current working directory;
# os.path.join makes the path portable instead of Windows-only.
db_file = os.path.join(os.getcwd(), 'spider.db')
db = SqliteDatabase(db_file)
class Subject(Model):
    """A product (tour/ticket) listing on lailaihui.com."""
    title = CharField()
    url = CharField()

    class Meta:
        database = db


class DaySell(Model):
    """Price and remaining stock for one Subject on one day."""
    price = IntegerField()
    amount = IntegerField()
    subject = ForeignKeyField(Subject)
    updatetime = DateTimeField()
    day = CharField()

    class Meta:
        database = db
def crawl_lailaihui():
    page = 1
    while True:
        url_list = 'http://www.lailaihui.com/search/start/all/all/all/all/recommend/%s' % page
        page += 1
        html_content = requests.get(url_list, timeout=15)
        soup = BeautifulSoup(html_content.text, 'html.parser')
        items = soup.find_all('div', class_='products')
        if not items:
            print('download complete!')
            break
        for item in items:
            info = item.find(class_='rinfo')
            detail_link = info.find('h3').find('a')
            title = detail_link.text
            href = 'http://lailaihui.com%s' % detail_link.get('href')
            print(href)
            subject = Subject(title=title, url=href)
            subject.save()
            detail_page = requests.get(href, timeout=15)
            soup_detail = BeautifulSoup(detail_page.text, 'html.parser')
            if not soup_detail.find(class_='dateCon'):
                continue  # no availability calendar on this detail page
            table = soup_detail.find(class_='dateCon').find('table')
            # Each sellable day is a <td> whose startdateid attribute is numeric.
            all_td = table.find_all(startdateid=re.compile(r'\d+'))
            for td in all_td:
                pubdate = td.get('rel')  # the calendar day this cell represents
                yw = td.find(class_='yw').text.strip()  # remaining-stock text
                price_day = td.find(class_='price').text
                m = re.search(r'\d+', price_day)
                if not m:
                    continue  # cell carries no numeric price; skip it
                price_day_int = int(m.group(0))
                print(yw)
                m = re.search(r'\d+', yw)
                yw = int(m.group(0)) if m else 0
                print(pubdate, price_day_int, yw)
                sell = DaySell(price=price_day_int, amount=yw, subject=subject,
                               updatetime=datetime.datetime.now(), day=pubdate)
                sell.save()
                print('saved!')
            price = info.find(class_='price2').text
            print(title)
            print(price)
        if not soup.find(id='NextPage'):
            print('download complete!')
            break
if __name__ == '__main__':
    # Create the tables on first run; safe=True is a no-op if they exist.
    Subject.create_table(safe=True)
    DaySell.create_table(safe=True)
    crawl_lailaihui()
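Once a crawl has finished, the rows can be read back with ordinary peewee queries. A minimal sketch follows; the join, ordering, and limit are illustrative additions, not part of the original gist.

# Hypothetical usage sketch: print the ten cheapest scraped day-prices.
# Assumes spider.db exists and crawl_lailaihui() has populated both tables.
query = (DaySell
         .select(DaySell, Subject)
         .join(Subject)
         .order_by(DaySell.price)
         .limit(10))
for sell in query:
    print(sell.subject.title, sell.day, sell.price, sell.amount)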