@radiocat
Created December 24, 2018 07:04
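# Summary (inferred from the code below): tallies this year's published entries
# on a Hatena Blog via its AtomPub API. It walks the paginated entry collection
# with Basic auth, skips drafts, prints one line per entry (title, date,
# categories), then per-category counts and the total. Requires requests,
# beautifulsoup4 (with lxml for the "xml" parser), and python-dateutil.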
import requests
import bs4
import re
import time
import datetime
import dateutil.parser

hatena_id = "your Hatena ID"
blog_id = "your blog ID"
password = "your AtomPub API key"
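# Note: Hatena's AtomPub endpoints accept HTTP Basic auth, with the Hatena ID
# as the user name and the blog's AtomPub API key as the password; that is what
# the auth=(hatena_id, password) tuples below send.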
def get_collection_uri(hatena_id, blog_id, password):
    # Fetch the AtomPub service document and pull out the entry-collection URI.
    service_doc_uri = "https://blog.hatena.ne.jp/{hatena_id}/{blog_id}/atom".format(hatena_id=hatena_id, blog_id=blog_id)
    res_service_doc = requests.get(url=service_doc_uri, auth=(hatena_id, password))
    if res_service_doc.ok:
        soup_servicedoc_xml = bs4.BeautifulSoup(res_service_doc.content, features="xml")
        collection_uri = soup_servicedoc_xml.collection.get("href")
        return collection_uri
    return None
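# For reference, the response is a standard AtomPub service document (RFC 5023),
# and the collection URI extracted above normally points at .../atom/entry.
# Roughly (illustrative, not a verbatim response):
#
#   <service xmlns="http://www.w3.org/2007/app">
#     <workspace>
#       <collection href="https://blog.hatena.ne.jp/{hatena_id}/{blog_id}/atom/entry">
#         ...
#       </collection>
#     </workspace>
#   </service>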
collection_uri = get_collection_uri(hatena_id, blog_id, password)
entry_id_list = []
category = {}  # category name -> number of entries
dt_now = datetime.datetime.now()
entry_count = 0
MAX_ITERATION_NUM = 50  # safety cap on the number of collection pages fetched
for i in range(MAX_ITERATION_NUM):
    # Fetch one page of the entry list with Basic auth
    res_collection = requests.get(collection_uri, auth=(hatena_id, password))
    if not res_collection.ok:
        print("failed")
        continue
    # Parse the Atom feed with BeautifulSoup4
    soup_collection_xml = bs4.BeautifulSoup(res_collection.content, features="xml")
    # Collect the entry elements
    entries = soup_collection_xml.find_all("entry")
    # Ignore drafts
    pub_entry_list = list(filter(lambda e: e.find("app:draft").string != "yes", entries))
    for e in pub_entry_list:
        # Keep only entries published this year; shift +9h to JST
        # (the script assumes the feed timestamp is UTC)
        published = dateutil.parser.parse(e.published.text) + datetime.timedelta(hours=9)
        if published.year != dt_now.year:
            continue
        entry_count += 1
        # Extract the numeric entry id from the trailing "-<digits>" of the Atom id
        entry_id_list.append(re.search(r"-(\d+)$", string=e.id.string).group(1))
        title = e.title.string
        # Gather this entry's category terms, dropping missing term attributes
        categories = {t.get("term") for t in e.find_all("category")} - {None}
        for category_name in categories:
            if category_name not in category:
                category[category_name] = 0
            category[category_name] += 1
        print("%s,%s,%s" % (title, published.strftime("%Y/%m/%d"), categories))
    # Follow the rel="next" link to the next page, if any
    link_next = soup_collection_xml.find("link", rel="next")
    if not link_next:
        break
    collection_uri = link_next.get("href")
    if not collection_uri:
        break
    time.sleep(0.01)  # wait 10 ms between requests
# Per-category totals
for c in category:
    print("%s,%d" % (c, category[c]))
# Number of entries this year
print("entry count = %d" % entry_count)
# Numeric ids of this year's entries, kept for further processing
print(entry_id_list)
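# Usage sketch (the filename is illustrative): set hatena_id, blog_id and
# password above (the AtomPub API key is typically shown in the blog's
# advanced settings, in the AtomPub section), then run:
#
#   python count_hatena_entries.py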