# takahub1/getURLlist.py

## getURLlist.py
#-*- coding:utf-8 -*-

import os
import sys
import bs4
import urllib.request

def crawring(url, filename, base_url="http://mangamura.org/new_pc_view"):
	"""Collect the links inside the page's ``div`` with id ``B`` and save them.

	The HTML at *url* is fetched with ``get_html_string`` and parsed with
	BeautifulSoup using the lxml parser. For every ``<a>`` tag inside the
	container that has an ``href`` attribute, the first character of the href
	is dropped and the remainder is appended to *base_url*. Each resulting URL
	is printed and written to *filename* on its own line (CR LF terminated).

	Args:
		url: Address of the page to crawl.
		filename: Path of the output file; existing content is overwritten.
		base_url: Prefix prepended to each trimmed href.

	Exits the process with status 1 (via ``sys.exit``) when the fetched HTML
	is empty or when the page contains no ``div`` with id ``B``.
	"""
	# Fetch the HTML of the given URL.
	html = get_html_string(url)
	if not html:
		print("HTMLが取得できませんでした。")
		print("URLを確認してください。")
		sys.exit(1)

	soup = bs4.BeautifulSoup(html, "lxml")
	main_body = soup.find("div", {"id": "B"})
	if main_body is None:
		# Nothing to extract; stop before the output file gets truncated.
		print("id=\"B\" の div が見つかりませんでした。")
		print("URLを確認してください。")
		sys.exit(1)

	# The context manager closes the file even if the loop raises.
	with open(filename, 'w') as f:
		for a_tag in main_body.find_all("a"):
			href_str = a_tag.get("href")
			if href_str is None:
				continue
			# Drop the first character of the href (presumably the leading
			# "." of a relative path -- confirm against the site's markup).
			link = base_url + href_str[1:]
			print(link)
			f.write(link + "\r\n")

def get_html_string(url):
	"""Download *url* and return its body decoded as UTF-8.

	A desktop Firefox ``User-Agent`` header is sent with the request.

	Args:
		url: Address to fetch.

	Returns:
		The response body as a ``str``.

	Raises:
		urllib.error.URLError: Propagated from ``urlopen`` when the request
			fails.
		UnicodeDecodeError: If the body is not valid UTF-8.
	"""
	headers = {
		"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
	}

	request = urllib.request.Request(url=url, headers=headers)
	# The context manager closes the response even if read/decode raises.
	with urllib.request.urlopen(request) as response:
		return response.read().decode('utf-8')

def main():
	"""Command-line entry point: ``getURLlist.py <url> <output_file>``.

	Passes ``sys.argv[1]`` (page URL) and ``sys.argv[2]`` (output path) to
	``crawring``. When fewer than two arguments are given, prints a usage
	message to stderr and exits with status 1.
	"""
	if len(sys.argv) < 3:
		print("使い方: python getURLlist.py <URL> <出力ファイル名>", file=sys.stderr)
		sys.exit(1)
	crawring(sys.argv[1], sys.argv[2])

if __name__ == "__main__":
	# Run the crawler only when executed as a script, not when imported.
	main()