Skip to content

Instantly share code, notes, and snippets.

@takahub1
Last active October 13, 2017 09:34
Show Gist options
  • Save takahub1/4c2985e389057d0de259c29a03f8e859 to your computer and use it in GitHub Desktop.
Save takahub1/4c2985e389057d0de259c29a03f8e859 to your computer and use it in GitHub Desktop.
python3 "url" "filename"
#-*- coding:utf-8 -*-
import os
import sys
import bs4
import urllib.request
def crawring(url, filename):
    """Collect the links found in ``<div id="B">`` of *url* and save them.

    The page is downloaded with ``get_html_string`` and parsed with
    BeautifulSoup (``lxml`` parser).  For every ``<a>`` tag inside the
    ``div`` whose id is ``B`` that has an ``href`` attribute, the href
    (minus its first character) is appended to a fixed base URL; the
    result is printed and written to *filename*, one URL per line,
    terminated by CRLF.

    Args:
        url: Address of the page to scan.
        filename: Path of the text file to (over)write with the URLs.

    Raises:
        SystemExit: With status 1 when no HTML was obtained or the page
            has no ``<div id="B">`` element.
    """
    base_url = "http://mangamura.org/new_pc_view"

    # Fetch the HTML of the given URL.
    html = get_html_string(url)
    if not html:
        print("HTMLが取得できませんでした。")
        print("URLを確認してください。")
        sys.exit(1)

    soup = bs4.BeautifulSoup(html, "lxml")
    main_body = soup.find("div", {"id": "B"})
    if main_body is None:
        # Bail out before touching the output file so an unexpected page
        # layout does not truncate an existing link list.
        print("id=\"B\" の div 要素が見つかりませんでした。")
        print("URLを確認してください。")
        sys.exit(1)

    # newline="" stops text mode from turning the explicit "\r\n" into
    # "\r\r\n" on Windows; the explicit encoding keeps the output independent
    # of the locale.  The context manager closes the file even on errors.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        for a_tag in main_body.find_all("a"):
            href_str = a_tag.get("href")
            if href_str is None:
                continue
            # The first character of the href is dropped before joining
            # (presumably the leading "." of a relative link -- confirm).
            link = base_url + href_str[1:]
            print(link)
            f.write(link + "\r\n")
def get_html_string(url, timeout=30.0):
    """Download *url* and return its body decoded to ``str``.

    A Firefox ``User-Agent`` header is sent with the request.  The body is
    decoded with the charset declared in the response's ``Content-Type``
    header, falling back to UTF-8 when none is declared.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations.

    Returns:
        The decoded document, or ``""`` when it could not be fetched
        (callers test for an empty result to detect failure).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    try:
        request = urllib.request.Request(url=url, headers=headers)
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            charset = response.headers.get_content_charset() or "utf-8"
            raw = response.read()
    except (OSError, ValueError) as err:
        # URLError/HTTPError and timeouts are OSError subclasses; a malformed
        # URL raises ValueError.  Report the reason and signal failure with
        # an empty string instead of an unhandled traceback.
        print(f"{url}: {err}", file=sys.stderr)
        return ""

    try:
        return raw.decode(charset)
    except (LookupError, UnicodeDecodeError):
        # Unknown or wrong declared charset: best-effort UTF-8 decode.
        return raw.decode("utf-8", errors="replace")
def main():
    """Command-line entry point.

    Reads the page URL from ``sys.argv[1]`` and the output file name from
    ``sys.argv[2]`` and passes them to ``crawring``.  Additional arguments
    are ignored.

    Raises:
        SystemExit: With a usage message when fewer than two arguments
            were supplied.
    """
    if len(sys.argv) < 3:
        # sys.exit with a string prints it to stderr and exits with status 1,
        # replacing the bare IndexError a missing argument used to cause.
        sys.exit('usage: python3 <script> "url" "filename"')
    crawring(sys.argv[1], sys.argv[2])


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment