Created
July 28, 2017 10:01
-
-
Save K-Wu/b4fc829dc34c42643a45a8726bba9c7b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
#Wu Kun,2015010625 ee@tsinghua | |
#This is a multithread implement of dbrt.py | |
import json | |
import queue | |
import time | |
import urllib.request as req | |
from bs4 import BeautifulSoup | |
import threading | |
class DoubanSpider:
    """Multithreaded Douban movie crawler (threaded version of dbrt.py).

    Starting from one movie page, workers follow the page's
    "recommendations" links and collect up to ``max_movies`` records of the
    form ``{'title', 'rating', 'url'}``; ``main`` sorts them by rating and
    writes ``result-mt.json``.
    """

    def __init__(self, start_url_ext, max_movies=20):
        # Seed URL; it also counts as "already checked" so it is never revisited.
        self.start_url = start_url_ext  # change to taste
        # Crawl target; was a hard-coded 20 inside slave().  Don't set too large.
        self.max_movies = max_movies
        self.checked_urls = [self.start_url]
        self.movie_info = []
        # Frontier of URLs still to visit; queue.Queue is itself thread-safe.
        self.recommendations = queue.Queue()
        self.recommendations.put(self.start_url)
        # Guards movie_info and checked_urls, which are plain shared lists.
        self.mutex = threading.Lock()

    def slave(self):
        """Worker loop: pull a URL, scrape it, enqueue new recommendations.

        Exits either when enough movies have been collected or when the
        frontier stays empty for a few seconds.  The timeout on ``get``
        fixes the original bug where a blocking ``Queue.get()`` made the
        workers hang forever once all known URLs were consumed.
        """
        name = threading.current_thread().name
        print(name, 'starts!')
        while True:
            # Check the stop condition under the lock so the read of
            # len(movie_info) is not racy against concurrent appends.
            with self.mutex:
                if len(self.movie_info) >= self.max_movies:
                    break
            try:
                # Bounded wait instead of blocking forever (original TODO).
                url = self.recommendations.get(timeout=5)
            except queue.Empty:
                break
            print(name, 'gathering info')
            info, reco = self.extract_info(url)
            if info is not None:
                with self.mutex:
                    self.movie_info.append(info)
                    # Membership test and enqueue happen under the same lock
                    # so two workers cannot both enqueue the same URL.
                    for new_url in reco:
                        if new_url not in self.checked_urls:
                            self.checked_urls.append(new_url)
                            self.recommendations.put(new_url)
                    print(len(self.movie_info), 'finished!')
            time.sleep(1)  # polite crawl delay; don't set too small
        print(name, 'ends!')

    def extract_info(self, url):
        """Fetch one movie page and parse it.

        Returns a ``({'title', 'rating', 'url'}, recommendation_urls)``
        pair, or ``(None, None)`` on any network/parse failure so the
        worker can simply skip the broken page.
        """
        try:
            content = req.urlopen(url).read()
            soup = BeautifulSoup(content, 'lxml', from_encoding='utf-8')
            title = soup.find('span', attrs={'property': 'v:itemreviewed'}).get_text()
            rating = float(soup.find('strong').get_text())
            reco_content = soup.find('div', attrs={'class': 'recommendations-bd'})
            # [:-18] presumably strips a fixed-length suffix Douban appends
            # to recommendation hrefs, leaving the bare subject URL — TODO
            # confirm against the live page markup.
            recomms = [x.get('href')[:-18] for x in reco_content.find_all('a')]
            print(title, threading.current_thread().name)
            return {'title': title, 'rating': rating, 'url': url}, recomms
        except Exception as err:
            # Best-effort crawl: log the failing URL and move on.
            print(url, str(err))
            return None, None

    def main(self):
        """Run 8 worker threads, wait for them, then dump sorted results."""
        threads = []
        for i in range(8):
            threads.append(threading.Thread(target=self.slave, name='{0}'.format(i)))
            threads[-1].start()
        for t in threads:
            t.join()
        # Highest-rated first.
        self.movie_info.sort(key=lambda x: x['rating'], reverse=True)
        with open('result-mt.json', 'wt', encoding='utf-8') as f:  # results go to result-mt.json
            json.dump(self.movie_info, f, ensure_ascii=False, sort_keys=True, indent=4)
if __name__ == '__main__':
    # Time a full crawl starting from the same seed movie page.
    started = time.time()
    spider = DoubanSpider(start_url_ext='https://movie.douban.com/subject/20495792/')
    spider.main()
    elapsed = time.time() - started
    print(elapsed)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment