Created
May 2, 2019 13:52
-
-
Save jack841002/55988a0af6b5419a28fe15bbc284f9f9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
# Crawler (PTT Beauty board)
# Fetch the raw HTML of the page
import requests
requests.get("https://www.ptt.cc/bbs/beauty/index.html")
res = requests.get("https://www.ptt.cc/bbs/beauty/index.html")
print(res.text)
# Work with the HTML
from bs4 import BeautifulSoup
soup = BeautifulSoup(res.text,"html.parser")
tag_name = "div.title a" # selects the link of every article
articles = soup.select(tag_name)
# Get each article's URL and title
for art in articles:
    print(art["href"], art.text)
paging = soup.select("div.btn-group-paging a")
"""
import requests
from bs4 import BeautifulSoup
import re  # find image URLs in the article HTML
from urllib.request import urlretrieve  # save images to disk
import os  # create the download folders
import sys  # short for "system"; intended for controlling how many pages are crawled
import pymysql  # database access
# ---------------------------------------------------------------------------
# PTT Beauty board image crawler.
#
# Walks backwards through the board's index pages, and for every article:
#   1. creates a folder under ``download/`` named after the article title,
#   2. downloads every i.imgur.com jpg/png/gif linked in the article,
#   3. records (file name, image URL) in the ``picture`` table.
#
# Usage: ``python script.py [pages]`` -- ``pages`` defaults to 3.
# ---------------------------------------------------------------------------

# NOTE(review): the credentials are hard-coded in source; consider moving them
# to environment variables or a config file that is not committed.
db = pymysql.connect(host='localhost', port=3306, password='841002', user='jack', db='pttpicture', charset='utf8')
cursor = db.cursor()

PTT_ROOT = "https://www.ptt.cc"
DOWNLOAD_DIR = "download"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever

# Root folder that receives one sub-folder per article.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

url = PTT_ROOT + "/bbs/beauty/index.html"

# Raw string with escaped dots so "." only matches a literal dot.
reg_imgur_file = re.compile(r"https?://i\.imgur\.com/\w+\.(?:jpg|png|gif)")

# Characters that are path separators or invalid in Windows file names.
# Article titles such as "Re: ..." or "A/B" would otherwise break mkdir.
_unsafe_dir_chars = re.compile(r'[\\/:*?"<>|]')

# Placeholders are filled in by the driver, which escapes the values.
insert_sql = "INSERT INTO picture(name, pic) VALUES (%s, %s)"

# Number of index pages to crawl; an optional numeric first command-line
# argument overrides the default.
pages = 3
if len(sys.argv) > 1 and sys.argv[1].isdigit():
    pages = int(sys.argv[1])

try:
    for _ in range(pages):
        try:
            res = requests.get(url, timeout=REQUEST_TIMEOUT)
            res.raise_for_status()
        except requests.RequestException as err:
            print("failed to fetch index page:", url, err)
            break

        soup = BeautifulSoup(res.text, "html.parser")
        articles = soup.select("div.title a")
        paging = soup.select("div.btn-group-paging a")

        # The second paging button is used as the next page to crawl. It may
        # be absent or carry no href, in which case crawling stops after
        # this page.
        next_url = None
        if len(paging) > 1 and paging[1].get("href"):
            next_url = PTT_ROOT + paging[1]["href"]

        for article in articles:
            print(article["href"], article.text)

            # One folder per article, named after its (sanitized) title.
            folder_name = _unsafe_dir_chars.sub("_", article.text).strip() or "untitled"
            article_dir = os.path.join(DOWNLOAD_DIR, folder_name)
            os.makedirs(article_dir, exist_ok=True)

            try:
                article_res = requests.get(PTT_ROOT + article["href"], timeout=REQUEST_TIMEOUT)
                article_res.raise_for_status()
            except requests.RequestException as err:
                print("failed to fetch article:", article["href"], err)
                continue

            images = reg_imgur_file.findall(article_res.text)  # every image URL in the article
            print(images)

            for image in set(images):  # set(): download each distinct URL once
                # The last path segment ("abc123.jpg") doubles as the file name.
                file_name = image.rsplit("/", 1)[-1]
                try:
                    urlretrieve(image, os.path.join(article_dir, file_name))
                except OSError as err:  # URLError/HTTPError are OSError subclasses
                    print("failed to download image:", image, err)
                    continue

                try:
                    cursor.execute(insert_sql, (file_name, image))
                    db.commit()
                except pymysql.MySQLError as err:
                    db.rollback()
                    print("failed to store image record:", image, err)

        if next_url is None:
            break
        url = next_url
finally:
    # Always release the connection, even if the crawl aborts midway.
    db.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment