Skip to content

Instantly share code, notes, and snippets.

@takahub1
Last active October 13, 2017 09:34
Show Gist options
  • Save takahub1/7006803dfd1a3641a10b440127be75d9 to your computer and use it in GitHub Desktop.
Save takahub1/7006803dfd1a3641a10b440127be75d9 to your computer and use it in GitHub Desktop.
Usage: python3 getImage.py "filename" (where "filename" is a text file containing one page URL per line)
#-*- coding:utf-8 -*-
import http.client
import os
import shutil
import subprocess
import sys
import time
import urllib.parse
import urllib.request

import bs4
def _directory_name_from_title(title):
    """Turn a page title into a single, safe directory name.

    Spaces become underscores, which is the naming this script has always
    used. Path separators, NUL, tabs and line breaks are replaced as well so
    the result is always one path component inside the working directory.
    Returns ``"untitled"`` when the title is missing or would resolve to the
    current or parent directory.
    """
    unsafe_chars = " /\\\0\r\n\t"
    replacements = str.maketrans(unsafe_chars, "_" * len(unsafe_chars))
    name = (title or "").translate(replacements)
    if name in ("", ".", ".."):
        return "untitled"
    return name


def crawring(url):
    """Download the background images of one page and pack them into a zip.

    The page at *url* is fetched and parsed, and every ``<div>`` carrying a
    ``data-background-image`` attribute is treated as an image reference.
    The images are downloaded with ``wget`` into a directory named after the
    page title, the directory is archived as ``<title>.zip`` in the current
    working directory, and the directory is then removed.

    Args:
        url: Address of the page to process.

    Raises:
        SystemExit: With status 1 when no HTML could be retrieved.
        FileNotFoundError: When the ``wget`` executable is not installed.
    """
    # Fetch the HTML of the given URL.
    html = get_html_string(url)
    if not html:
        print("HTMLが取得できませんでした。")
        print("URLを確認してください。")
        sys.exit(1)

    soup = bs4.BeautifulSoup(html, "lxml")
    title_tag = soup.title
    # Both the <title> tag and its .string may be missing on real pages.
    book_title = _directory_name_from_title(
        title_tag.string if title_tag is not None else None
    )

    # The title and the image URLs come from the downloaded page, so they are
    # never interpolated into a shell command line.
    os.makedirs(book_title, exist_ok=True)

    for div in soup.find_all("div"):
        image_ref = div.get("data-background-image")
        if not image_ref:
            continue
        # Resolve relative and protocol-relative references against the page.
        image_url = urllib.parse.urljoin(url, image_ref)
        print(image_url)
        # "--" stops wget from parsing a hostile URL as a command-line option.
        result = subprocess.run(
            ["wget", "-q", "-P", "./" + book_title, "--", image_url]
        )
        if result.returncode != 0:
            print("download failed: " + image_url, file=sys.stderr)

    # Produces "<book_title>.zip" whose entries sit under "<book_title>/".
    shutil.make_archive(book_title, "zip", root_dir=".", base_dir=book_title)
    shutil.rmtree(book_title)
def get_html_string(url, timeout=30):
    """Fetch *url* and return the response body as text.

    The request is sent with a desktop Firefox ``User-Agent`` header. The
    body is decoded with the charset announced in the response's
    ``Content-Type`` header, falling back to UTF-8; undecodable bytes are
    replaced rather than raising.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations.

    Returns:
        The decoded body, or an empty string when the URL is invalid or the
        request fails. The reason for a failure is written to stderr.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    try:
        request = urllib.request.Request(url=url, headers=headers)
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            raw_body = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
    except (OSError, ValueError, http.client.HTTPException) as error:
        # URLError/HTTPError are OSError subclasses; ValueError covers
        # malformed URLs. The caller treats "" as "could not fetch".
        print("failed to fetch {!r}: {}".format(url, error), file=sys.stderr)
        return ""

    try:
        return raw_body.decode(charset, errors="replace")
    except LookupError:
        # The server announced a charset name Python does not know.
        return raw_body.decode("utf-8", errors="replace")
def main():
    """Process every URL listed in the file named on the command line.

    Expects exactly one argument: the path of a text file with one page URL
    per line. Surrounding whitespace (including the line terminator) is
    removed from each line and blank lines are skipped; every remaining URL
    is handed to ``crawring``.

    Raises:
        SystemExit: With status 1 when the argument count is wrong.
    """
    # Validate arguments.
    if len(sys.argv) != 2:
        program = sys.argv[0] if sys.argv else "getImage.py"
        print(
            "Usage: python3 {} <url-list-file>".format(program),
            file=sys.stderr,
        )
        sys.exit(1)

    # Read everything first so the file is closed before any download starts.
    with open(sys.argv[1]) as url_file:
        urls = [line.strip() for line in url_file]

    for url in urls:
        if url:
            crawring(url)
# Run the downloader only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment