# takahub1/getImage.py

## getImage.py
#-*- coding:utf-8 -*-

import os
import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.request

import bs4

def _directory_name(title):
	"""Turn a page title into a name usable as a single directory component.

	Spaces become underscores (as before) and path separators are replaced
	too, so the name can never point outside the current directory. Empty
	titles and the special names "." and ".." fall back to "untitled" so the
	later cleanup step cannot remove the current or parent directory.
	"""
	name = (title or "").replace(" ", "_")
	for separator in (os.sep, os.altsep):
		if separator:
			name = name.replace(separator, "_")
	if name in ("", os.curdir, os.pardir):
		return "untitled"
	return name


def crawring(url):
	"""Fetch the page at *url* and archive its background images as a zip.

	The page title (sanitized by ``_directory_name``) names a working
	directory in the current directory. The ``data-background-image`` value
	of every ``<div>`` that has one is downloaded into that directory with
	``wget``. The directory is then packed into ``<title>.zip`` and removed.

	Args:
		url: Address of the page to scrape.

	Raises:
		SystemExit: With status 1 when no HTML could be obtained.
	"""
	# Fetch the HTML of the given URL.
	html = get_html_string(url)
	if not html:
		print("HTMLが取得できませんでした。")
		print("URLを確認してください。")
		sys.exit(1)

	soup = bs4.BeautifulSoup(html, "lxml")
	# A page may have no <title>, or an empty one; fall back instead of
	# failing with AttributeError.
	page_title = soup.title.string if soup.title is not None else None
	book_title = _directory_name(page_title)
	download_dir = os.path.join(os.curdir, book_title)
	os.makedirs(download_dir, exist_ok=True)

	try:
		for div_tag in soup.find_all("div"):
			image_url = div_tag.get("data-background-image")
			if image_url is None:
				continue
			print(image_url)
			# Title and URL come from untrusted HTML: pass an argument list
			# (no shell), and use "--" so a URL starting with "-" is not
			# parsed as a wget option.
			subprocess.run(
				["wget", "-q", "-P", download_dir, "--", image_url],
				check=False,
			)

		# Produces <book_title>.zip containing the book_title/ directory,
		# the same layout `zip -r title title` created.
		shutil.make_archive(
			book_title, "zip", root_dir=os.curdir, base_dir=book_title
		)
	finally:
		# Always remove the working directory, even if a step above failed.
		shutil.rmtree(download_dir, ignore_errors=True)

def get_html_string(url):
	"""Return the decoded body of the resource at *url*.

	The request is sent with a desktop Firefox User-Agent header. The body
	is decoded with the charset declared in the response's Content-Type
	header, or UTF-8 when none is declared or the declared one is unknown.
	Undecodable bytes are replaced rather than raising.

	Args:
		url: Address to fetch.

	Returns:
		The decoded text, or an empty string when the URL is malformed or
		the request fails (the reason is printed to stderr).
	"""
	headers = {
		"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
	}

	try:
		# Request() raises ValueError for a malformed URL, so it belongs
		# inside the try block as well.
		request = urllib.request.Request(url=url, headers=headers)
		# The context manager closes the connection even if read() fails.
		with urllib.request.urlopen(request) as response:
			raw_html = response.read()
			charset = response.headers.get_content_charset() or "utf-8"
	except (urllib.error.URLError, ValueError) as err:
		# Callers treat an empty string as "could not fetch".
		print("{}: {}".format(url, err), file=sys.stderr)
		return ""

	try:
		return raw_html.decode(charset, errors="replace")
	except LookupError:
		# The server declared a charset Python does not know.
		return raw_html.decode("utf-8", errors="replace")

def main():
	"""Crawl every URL listed in the file named by the first CLI argument.

	The file is read as one URL per line. Surrounding whitespace is
	stripped and blank lines are skipped, so a trailing empty line does not
	produce a request for an empty URL.

	Raises:
		SystemExit: With status 1 when the script is not given exactly one
			argument.
	"""
	# Validate the command-line arguments.
	if len(sys.argv) != 2:
		print("usage: getImage.py URL_LIST_FILE", file=sys.stderr)
		sys.exit(1)

	# Read everything up front so the file is closed before crawling starts.
	with open(sys.argv[1]) as url_file:
		urls = [line.strip() for line in url_file]

	for url in urls:
		if url:
			crawring(url)

# Run the crawler only when executed as a script, not when imported.
# (A second, indentation-flattened copy of this whole module used to follow
# here; it made the file fail with IndentationError and has been removed.)
if __name__ == "__main__":
	main()