Skip to content

Instantly share code, notes, and snippets.

@arthur-tomsjj
Created July 22, 2020 08:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arthur-tomsjj/5873fc9f311783682b1940ff89edb878 to your computer and use it in GitHub Desktop.
i am hentai senshi
####### e-hentai doujinshi scraper test #######
import urllib.request as req
import requests
from bs4 import BeautifulSoup
import os
import time
# First viewer page of the gallery to start crawling from.
url = 'https://e-hentai.org/s/42331c9fc6/1467036-1'
# Browser User-Agent header so the site serves pages to the scraper.
Header = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}
pagenum = 96 # total number of pages in the gallery
count = 1 # index of the page currently being saved
def crawl(url, count):
    """Download gallery pages starting at *url*, saving one image per page.

    Fetches the viewer page, extracts the image URL and the next-page
    link from the div with id ``i3``, saves the image as
    '爬蟲img/<count>.jpg', and continues until ``pagenum`` pages are saved.

    Args:
        url: URL of the viewer page to start from.
        count: 1-based index assigned to the first saved image.
    """
    # Make sure the output directory exists; the original crashed with
    # FileNotFoundError when it was missing ('os' was imported but unused).
    os.makedirs('爬蟲img', exist_ok=True)
    # Iterate instead of recursing one frame per page: the original hit
    # Python's recursion limit (~1000) on long galleries.
    while True:
        request = req.Request(url, headers=Header)
        with req.urlopen(request) as response:
            data = response.read().decode('utf-8')
        page = BeautifulSoup(data, "html.parser")
        div = page.find('div', id='i3')   # viewer container div
        next_link = div.a['href']         # href of the next-page anchor
        img_src = div.a.img['src']        # direct URL of the page image
        fname = '爬蟲img/' + str(count) + '.jpg'
        # Send the same browser User-Agent as the page fetch; the original
        # dropped it here, which some hosts reject.
        image = requests.get(img_src, headers=Header)
        with open(fname, 'wb') as f:
            f.write(image.content)
        print('downloaded: ' + fname.split('/')[-1])
        if count >= pagenum:
            break
        count += 1
        url = next_link
# Start crawling from the first page.
crawl(url,count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment