Created
December 24, 2018 07:04
-
-
Save radiocat/da0377cc0d7b29bcd042b0dc495edf71 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
import re | |
import time | |
import datetime | |
import dateutil.parser | |
# Hatena Blog AtomPub credentials.  These are placeholders ("Hatena ID",
# "blog ID", "API key" in Japanese) — replace them with real values before use.
hatena_id = "はてなID"
blog_id = "ブログID"
password = "APIキー"
def get_collection_uri(hatena_id, blog_id, password):
    """Return the AtomPub entry-collection URI for the given blog.

    Fetches the service document with Basic auth (hatena_id / API key),
    parses it as XML, and returns the ``href`` of its ``collection``
    element.  Returns ``False`` when the HTTP request does not succeed.
    """
    service_doc_uri = "https://blog.hatena.ne.jp/{hatena_id:}/{blog_id:}/atom".format(
        hatena_id=hatena_id, blog_id=blog_id
    )
    response = requests.get(url=service_doc_uri, auth=(hatena_id, password))
    if not response.ok:
        # Mirrors the original contract: falsy sentinel on HTTP failure.
        return False
    service_doc = bs4.BeautifulSoup(response.content, features="xml")
    return service_doc.collection.get("href")
# --- List this year's published entries by walking the AtomPub collection ---
collection_uri = get_collection_uri(hatena_id, blog_id, password)

entry_id_list = []   # numeric ids of this year's published entries
category = {}        # category term -> number of posts this year
dt_now = datetime.datetime.now()
entry_count = 0      # published entries this year
MAX_ITERATER_NUM = 50  # safety cap on pagination requests

for i in range(MAX_ITERATER_NUM):
    # Fetch one page of the entry collection (Basic auth: hatena_id / API key).
    res_collection = requests.get(collection_uri, auth=(hatena_id, password))
    if not res_collection.ok:
        # Fixed typo ("faild").  Note: `continue` retries the SAME page on the
        # next iteration; the loop cap above bounds how long that can go on.
        print("failed")
        continue
    soup_collection_xml = bs4.BeautifulSoup(res_collection.content, features="xml")
    entries = soup_collection_xml.find_all("entry")
    # Ignore drafts (app:draft == "yes").
    pub_entry_list = [e for e in entries if e.find("app:draft").string != "yes"]
    for e in pub_entry_list:
        # Published timestamp shifted to JST (UTC+9).
        published = dateutil.parser.parse(e.published.text) + datetime.timedelta(hours=9)
        if published.year != dt_now.year:
            continue  # only entries published this year
        entry_count += 1
        # The numeric entry id is the trailing digit run of the Atom <id>.
        entry_id_list.append(re.search(r"-(\d+)$", string=e.id.string).group(1))
        title = e.title.string
        # Tally every category term on the entry, skipping absent "term"
        # attributes (the original subtracted {None} from the printed set).
        # NOTE(review): source indentation was lost; counting each term per
        # entry is the assumed intent — confirm against the original gist.
        categories = set()
        for t in e.find_all("category"):
            category_name = t.get("term")
            if category_name is None:
                continue
            categories.add(category_name)
            if category_name not in category:
                category[category_name] = 0
            category[category_name] += 1
        print("%s,%s,%s" % (title, published.strftime("%Y/%m/%d"), categories))
    # Follow the rel="next" pagination link; stop when there is none.
    link_next = soup_collection_xml.find("link", rel="next")
    if not link_next:
        break
    collection_uri = link_next.get("href")
    if not collection_uri:
        break
    time.sleep(0.01)  # 10 ms politeness delay between pages

# Per-category totals.
for c in category:
    print("%s,%d" % (c, category[c]))
# Number of entries published this year.
print("entry count = %d" % entry_count)
entry_id_list  # no-op outside a notebook; kept from the original
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment