Skip to content

Instantly share code, notes, and snippets.

@hokekiyoo
Created July 2, 2017 04:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hokekiyoo/76656ee992926c3669a4da2cae20c920 to your computer and use it in GitHub Desktop.
import csv
from argparse import ArgumentParser
from urllib import request
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup
def articles_to_csv(url, output):
    """Scrape every article title and link from a blog archive into a CSV.

    Fetches ``{url}/archive?page=N`` for N = 1, 2, ... until a page yields
    no ``a.entry-title-link`` anchors, writing one ``[title, href]`` row per
    article found.

    Args:
        url: Base URL of the blog (without the trailing ``/archive``).
        output: Path of the CSV file to create/overwrite.
    """
    is_articles = True
    page = 1
    # newline="" is required by the csv module; an explicit utf-8 encoding
    # prevents the UnicodeEncodeError the original code had to catch when
    # the platform default encoding couldn't represent a title.
    with open(output, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator='\n')
        while is_articles:
            try:
                html = request.urlopen("{}/archive?page={}".format(url, page))
            except HTTPError as e:
                # HTTP response status was an error (404, 403, 401, ...)
                print(e.reason)  # bug fix: was e.reson (AttributeError)
                break
            except URLError as e:
                # The URL was invalid or unreachable
                print(e.reason)  # bug fix: was e.reson (AttributeError)
                break
            soup = BeautifulSoup(html, "html.parser")
            articles = soup.find_all("a", class_="entry-title-link")
            for article in articles:
                try:
                    writer.writerow([article.text, article.get("href")])
                except UnicodeEncodeError as e:
                    # Defensive: kept in case an exotic character still
                    # cannot be encoded.
                    print(e.reason)
                    print("この記事のタイトルに良くない文字が入ってます :",article.get("href"))
            if not articles:
                # No articles on this page -> past the last page, stop.
                is_articles = False
            page += 1
if __name__ == '__main__':
    # Command-line entry point: scrape the given blog's archive pages
    # and dump every article title/link into a CSV file.
    cli = ArgumentParser()
    cli.add_argument("-u", "--url", type=str, required=True, help="input your url")
    cli.add_argument("-o", "--output", type=str, default="articles.csv", help="output csv name")
    opts = cli.parse_args()
    articles_to_csv(opts.url, opts.output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment