#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# Wu Kun, 2015010625 ee@tsinghua
# A multithreaded implementation of dbrt.py
import json
import queue
import time
import urllib.request as req
from bs4 import BeautifulSoup
import threading
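
# Design overview: the spider starts from start_url and follows Douban's
# "recommendations" links. Worker threads pull candidate URLs from a shared
# queue.Queue, and a threading.Lock guards the shared movie_info and
# checked_urls lists. Third-party dependencies: beautifulsoup4 and lxml
# (the 'lxml' parser passed to BeautifulSoup).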
class DoubanSpider:
    def __init__(self, start_url_ext):
        self.start_url = start_url_ext  # change this to your own starting page if needed
        self.checked_urls = [self.start_url]
        self.movie_info = []
        self.recommendations = queue.Queue()
        self.recommendations.put(self.start_url)
        self.mutex = threading.Lock()

    def slave(self):
        print(threading.current_thread().name, 'starts!')
        # while not self.recommendations.empty() and len(self.movie_info) < 20:  # do not set this limit too high
        while len(self.movie_info) < 20:  # do not set this limit too high; TODO: find a reliable way to tell that all work is done, too hard for now
            print(threading.current_thread().name, 'gathering info')
            checked_urls_to_append = []
            movie_info_to_append = []
            url = self.recommendations.get()
            info, reco = self.extract_info(url)
            if info is not None:
                movie_info_to_append.append(info)
                for new_url in reco:
                    if new_url not in self.checked_urls:
                        self.recommendations.put(new_url)
                        checked_urls_to_append.append(new_url)
            self.mutex.acquire()
            try:
                self.movie_info.extend(movie_info_to_append)
                self.checked_urls.extend(checked_urls_to_append)
            finally:
                self.mutex.release()
            print(len(self.movie_info), 'finished!')
            time.sleep(1)  # do not set this interval too short
        print(threading.current_thread().name, 'ends!')
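
    # One way to address the TODO above (a sketch using queue.Queue's built-in
    # task tracking, not what this script does): have each worker call
    # self.recommendations.task_done() after processing a URL, let main() block
    # on self.recommendations.join() instead of polling len(self.movie_info),
    # and stop workers with sentinel values (put one None per thread and break
    # when a worker receives it). Note that len(self.movie_info) is read outside
    # the lock, so the final count can slightly exceed 20, and
    # self.recommendations.get() blocks forever once the queue drains.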

    def extract_info(self, url):  # better left unmodified unless you are familiar with the page structure
        try:
            content = req.urlopen(url).read()
            soup = BeautifulSoup(content, 'lxml', from_encoding='utf-8')
            title = soup.find('span', attrs={'property': 'v:itemreviewed'}).get_text()
            rating = float(soup.find('strong').get_text())
            reco_content = soup.find('div', attrs={'class': 'recommendations-bd'})
            # [:-18] presumably strips the trailing '?from=subject-page' query string from each link
            recomms = [x.get('href')[:-18] for x in reco_content.find_all('a')]
            print(title, threading.current_thread().name)
            return {'title': title, 'rating': rating, 'url': url}, recomms
        except Exception as err:
            print(url, str(err))
            return None, None

    def main(self):
        threads = []
        for i in range(8):
            threads.append(threading.Thread(target=self.slave, name='{0}'.format(i)))
            threads[-1].start()
        for t in threads:
            t.join()
        self.movie_info.sort(key=lambda x: x['rating'], reverse=True)
        with open('result-mt.json', 'wt', encoding='utf-8') as f:  # results are written to result-mt.json
            json.dump(self.movie_info, f, ensure_ascii=False, sort_keys=True, indent=4)


if __name__ == '__main__':
    a = time.time()
    foo = DoubanSpider(start_url_ext='https://movie.douban.com/subject/20495792/')
    foo.main()
    b = time.time()
    print(b - a)