A simple crawler based on requests and pyquery.
# -*- coding: utf-8 -*-
'''
1. Construct the item url from a num_iid, e.g. http://a.m.tmall.com/i15110720150.htm, where 15110720150 is the num_iid.
2. Get the html text.
3. Parse the img urls and insert the num_iid and img urls into sqlite.
'''
import requests
from pyquery import PyQuery as pq
import threadpool   # expected to be a local helper module; the PyPI threadpool package has no feed_queue/wait_for_queue
import urllib       # not called in this file; presumably used by the thread pool jobs to download images
def getNumiids():
    '''
    Crawl num_iids from the tmall list search pages and write them to num_iids.txt.
    '''
    num_iid_list = []
    # cat=50025145 is the dress category
    url = r'http://list.tmall.com/search_product.htm?type=pc&totalPage=100&cat=50025145&style=l'
    try:
        page_sum = 100
        num_per_page = 90
        for page_num in range(page_sum):
            print page_num
            detail_url = url + '&jumpto=' + str(page_num + 1)
            r = requests.get(detail_url, timeout=20)
            if r.status_code == 200:
                d = pq(r.content)
                for i in range(num_per_page):
                    # each product div carries its num_iid in the data-id attribute
                    num_iid = d('body').find('div').filter('.product').eq(i).attr('data-id')
                    if num_iid is not None:
                        num_iid_list.append(num_iid)
    except Exception, e:
        print e
    # use a set here to de-duplicate if needed
    # num_iid_list = list(set(num_iid_list))
    with open('num_iids.txt', 'w') as f:
        for num_iid in num_iid_list:
            f.write(str(num_iid) + '\n')
def getImgUrls(num_iid):
    '''
    Construct the item url from num_iid and parse the first two img urls.
    '''
    url = r'http://a.m.tmall.com/i' + str(num_iid) + '.htm'
    print url
    img_urls = []
    try:
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            d = pq(r.content)
            for i in range(2):
                img_url = d('div').filter('.bd').find('div').eq(0).find('img').eq(i).attr.src
                if img_url is not None:
                    # drop anything after the .jpg suffix (e.g. a resize query string)
                    img_url = img_url[: img_url.find('jpg') + 3]
                    print img_url
                    img_urls.append(img_url)
            return img_urls
        else:
            print 'status_code != 200', r.status_code
            return []
    except Exception, e:
        print e
        return []
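# Step 3 of the module docstring (insert the num_iid and img urls into sqlite) is not
# implemented in this file. Below is a minimal sketch of what that step could look like;
# the database file name, table name and column names are assumptions, not part of the gist.
import sqlite3

def saveImgUrls(num_iid, img_urls, db_path='tmall.db'):
    '''
    Insert (num_iid, img_url) rows into a sqlite table, creating the table if needed.
    '''
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('CREATE TABLE IF NOT EXISTS item_imgs (num_iid TEXT, img_url TEXT)')
        conn.executemany('INSERT INTO item_imgs (num_iid, img_url) VALUES (?, ?)',
                         [(str(num_iid), img_url) for img_url in img_urls])
        conn.commit()
    finally:
        conn.close()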
def getCidNumiids():
    '''
    Read the num_iids back from the txt file written by getNumiids.
    '''
    results = []
    with open('num_iids.txt', 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                results.append(str(int(line)))
    return results
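# The threadpool module used in main() is not included in this gist, and urllib is imported
# above but never called here; presumably the worker threads download each img url fed to
# the queue. A minimal sketch of such a download job (directory layout and file naming are
# assumptions):
import os

def downloadImg(img_url, save_dir='imgs'):
    '''
    Download a single img url into save_dir, naming the file after the url's last segment.
    '''
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    file_name = img_url.split('/')[-1]
    urllib.urlretrieve(img_url, os.path.join(save_dir, file_name))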
def main():
    getNumiids()
    # threadpool.job is assumed to be the worker function of the local threadpool module
    thread_pool = threadpool.ThreadPool(threadpool.job)
    id_list = getCidNumiids()
    for num_id in id_list:
        img_url_list = getImgUrls(num_id)
        # feed the queue with img urls for the worker threads
        thread_pool.feed_queue(img_url_list)
    # wait for the queue to drain
    thread_pool.wait_for_queue()

if __name__ == '__main__':
    main()