Created
December 24, 2018 07:04
-
-
Save radiocat/da0377cc0d7b29bcd042b0dc495edf71 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
import re | |
import time | |
import datetime | |
import dateutil.parser | |
# Hatena Blog AtomPub credentials.  These are placeholders ("Hatena ID",
# "blog ID", "API key" in Japanese) — replace them with real values before use.
hatena_id = "はてなID"
blog_id = "ブログID"
password = "APIキー"
def get_collection_uri(hatena_id, blog_id, password):
    """Return the AtomPub entry-collection URI for the given blog.

    Fetches the service document with Basic auth (hatena_id / API key),
    parses it as XML, and returns the ``href`` of its ``collection``
    element.  Returns ``False`` when the HTTP request does not succeed.
    """
    service_doc_uri = "https://blog.hatena.ne.jp/{hatena_id:}/{blog_id:}/atom".format(
        hatena_id=hatena_id, blog_id=blog_id
    )
    response = requests.get(url=service_doc_uri, auth=(hatena_id, password))
    if not response.ok:
        # Mirrors the original contract: falsy sentinel on HTTP failure.
        return False
    service_doc = bs4.BeautifulSoup(response.content, features="xml")
    return service_doc.collection.get("href")
# --- List this year's published entries by walking the AtomPub collection ---
collection_uri = get_collection_uri(hatena_id, blog_id, password)

entry_id_list = []   # numeric ids of this year's published entries
category = {}        # category term -> number of posts this year
dt_now = datetime.datetime.now()
entry_count = 0      # published entries this year
MAX_ITERATER_NUM = 50  # safety cap on pagination requests

for i in range(MAX_ITERATER_NUM):
    # Fetch one page of the entry collection (Basic auth: hatena_id / API key).
    res_collection = requests.get(collection_uri, auth=(hatena_id, password))
    if not res_collection.ok:
        # Fixed typo ("faild").  Note: `continue` retries the SAME page on the
        # next iteration; the loop cap above bounds how long that can go on.
        print("failed")
        continue
    soup_collection_xml = bs4.BeautifulSoup(res_collection.content, features="xml")
    entries = soup_collection_xml.find_all("entry")
    # Ignore drafts (app:draft == "yes").
    pub_entry_list = [e for e in entries if e.find("app:draft").string != "yes"]
    for e in pub_entry_list:
        # Published timestamp shifted to JST (UTC+9).
        published = dateutil.parser.parse(e.published.text) + datetime.timedelta(hours=9)
        if published.year != dt_now.year:
            continue  # only entries published this year
        entry_count += 1
        # The numeric entry id is the trailing digit run of the Atom <id>.
        entry_id_list.append(re.search(r"-(\d+)$", string=e.id.string).group(1))
        title = e.title.string
        # Tally every category term on the entry, skipping absent "term"
        # attributes (the original subtracted {None} from the printed set).
        # NOTE(review): source indentation was lost; counting each term per
        # entry is the assumed intent — confirm against the original gist.
        categories = set()
        for t in e.find_all("category"):
            category_name = t.get("term")
            if category_name is None:
                continue
            categories.add(category_name)
            if category_name not in category:
                category[category_name] = 0
            category[category_name] += 1
        print("%s,%s,%s" % (title, published.strftime("%Y/%m/%d"), categories))
    # Follow the rel="next" pagination link; stop when there is none.
    link_next = soup_collection_xml.find("link", rel="next")
    if not link_next:
        break
    collection_uri = link_next.get("href")
    if not collection_uri:
        break
    time.sleep(0.01)  # 10 ms politeness delay between pages

# Per-category totals.
for c in category:
    print("%s,%d" % (c, category[c]))
# Number of entries published this year.
print("entry count = %d" % entry_count)
entry_id_list  # no-op outside a notebook; kept from the original
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment