Last active
July 4, 2018 05:39
-
-
Save p3nj/31842fc902125d55f3977f1468618403 to your computer and use it in GitHub Desktop.
預設路徑 D:\Beauty
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests, re, time, queue, threading | |
from bs4 import BeautifulSoup | |
from urllib.request import urlretrieve | |
from fake_useragent import UserAgent | |
class myThread (threading.Thread): | |
def __init__(self, threadID, name, q): | |
threading.Thread.__init__(self) | |
self.threadID = threadID | |
self.name = name | |
self.q = q | |
def run(self): | |
#print ("開始線程 : " + self.name) | |
process_data(self.name, self.q) | |
#print ("結束線程 : " + self.name) | |
def process_data(threadName, q): | |
while not exitFlag: | |
queueLock.acquire() | |
if not workQueue.empty(): | |
temp_list = q.get() | |
queueLock.release() | |
url = temp_list[0] | |
title = temp_list[1] | |
useragent = temp_list[2] | |
if 'Re:' in title: | |
title = title.replace('Re: ', 'Re_') | |
print('{}: 文章標題處理: {}'.format(threadName, title)) | |
elif 'Fw:' in title: | |
title = title.replace('Fw: ', 'Fw_') | |
print('{}: 文章標題處理: {}'.format(threadName, title)) | |
res = requests.get(url, headers=useragent) | |
soup = BeautifulSoup(res.text, 'lxml') | |
if len(soup.findAll('a', {'href':re.compile('http:\/\/i\.imgur\.com\/.*')})) > 0: | |
print('{}: 正在尋找正妹……'.format(threadName)) | |
for index, img_url in enumerate(soup.findAll('a', {'href':re.compile('http:\/\/i\.imgur\.com\/.*')})): | |
try: | |
urlretrieve(img_url['href'], 'D:\Beauty\{}_{}.jpg'.format(title, index)) | |
print('{}: {} {}_{}.jpg 下載成功!'.format(threadName, img_url['href'], title, index)) | |
except: | |
print('{}: {} {}_{}.jpg 下載失敗!'.format(threadName, img_url['href'], title, index)) | |
time.sleep(0.1) | |
else: | |
queueLock.release() | |
#time.sleep(1) | |
if __name__ == '__main__': | |
pagenum = 1200 | |
print('讀取 User Agent 中...') | |
ua = UserAgent() | |
while pagenum < 2559: | |
exitFlag = 0 | |
#threadList = ["大叔甲", "大叔乙", "大叔丙", "大叔丁", "大叔戊"] | |
threadList = [] | |
for x in range(1,10): | |
threadList.append(x) | |
queueLock = threading.Lock() | |
workQueue = queue.Queue() | |
threads = [] | |
threadID = 1 | |
for tName in threadList: | |
thread = myThread(threadID, tName, workQueue) | |
thread.start() | |
threads.append(thread) | |
threadID += 1 | |
queueLock.acquire() | |
print('目前頁面: {}'.format(pagenum)) | |
curres = 'https://www.ptt.cc/bbs/Beauty/index{}.html'.format(pagenum) | |
useragent = {'User-Agent': ua.random} | |
#print('偽裝為裝置: {}'.format(useragent)) | |
res = requests.get(curres, headers=useragent) | |
soup = BeautifulSoup(res.text, 'lxml') | |
print('正在爬取頁面: {}'.format(curres)) | |
for article in soup.select('.r-ent a'): | |
temp_list = [] | |
url = 'https://www.ptt.cc' + article['href'] | |
title = article.text | |
if 'search' in url: | |
pass | |
elif '[帥哥]' in title: | |
print('發現非正妹, 跳過! {}'.format(title)) | |
pass | |
elif '鲜肉' in title: | |
print('發現非正妹, 跳過! {}'.format(title)) | |
pass | |
elif '男' in title: | |
print('發現非正妹, 跳過! {}'.format(title)) | |
pass | |
else: | |
temp_list.append(url) | |
temp_list.append(title) | |
temp_list.append(useragent) | |
workQueue.put(temp_list) | |
queueLock.release() | |
while not workQueue.empty(): | |
pass | |
exitFlag = 1 | |
for t in threads: | |
t.join() | |
#print ("完成頁面: {}".format(curres)) | |
pagenum += 1 | |
time.sleep(2.0) | |
print('全部完成') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment