Created
October 17, 2017 15:04
-
-
Save Cartman0/640829438679132e0a936a2a049d6aa2 to your computer and use it in GitHub Desktop.
はてなブログAPI 全記事のentry idを取得
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
import re | |
import time | |
def get_collection_uri(hatena_id, blog_id, password):
    """Fetch the blog's Atom service document and return its collection URI.

    Sends a Basic-auth GET request to
    ``https://blog.hatena.ne.jp/<hatena_id>/<blog_id>/atom`` and reads the
    ``href`` attribute of the first ``collection`` element of the response.

    Args:
        hatena_id: Hatena account ID; also used as the Basic-auth user name.
        blog_id: Blog ID inserted into the service-document URL.
        password: Basic-auth password (presumably the blog's API key --
            confirm against the Hatena Blog API settings).

    Returns:
        The collection URI string, or ``False`` when the HTTP request is not
        successful, the document has no ``collection`` element, or that
        element has no ``href`` attribute.
    """
    service_doc_uri = "https://blog.hatena.ne.jp/{hatena_id}/{blog_id}/atom".format(
        hatena_id=hatena_id, blog_id=blog_id
    )
    res_service_doc = requests.get(url=service_doc_uri, auth=(hatena_id, password))
    if not res_service_doc.ok:
        return False

    soup_servicedoc_xml = bs4.BeautifulSoup(res_service_doc.content, features="xml")
    collection = soup_servicedoc_xml.find("collection")
    if collection is None:
        # A successful response without a <collection> element used to raise
        # AttributeError; report it like any other failure instead.
        return False
    return collection.get("href", False)
def get_entry_id_list(hatena_id, blog_id, password, limit_max_iterations=50, wait_s=0.01, print_collection_uri=True):
    """Collect the entry IDs of every non-draft entry of a Hatena blog.

    Starting from the collection URI returned by ``get_collection_uri``, each
    page of the collection feed is fetched with Basic auth, draft entries are
    dropped, and the digits following the last ``-`` of each entry's ``<id>``
    text are recorded. Paging follows the feed's ``<link rel="next">`` element.

    Args:
        hatena_id: Hatena account ID; also used as the Basic-auth user name.
        blog_id: Blog ID passed through to ``get_collection_uri``.
        password: Basic-auth password.
        limit_max_iterations: Maximum number of feed pages to fetch.
        wait_s: Seconds to sleep between two page requests.
        print_collection_uri: When true, print each page URI before fetching.

    Returns:
        A list of entry-ID strings in the order the feed lists them (the
        original note calls this "released order" -- presumably publication
        order; confirm against the API). Returns ``False`` if any page request
        is not successful. If the page limit is reached while a next page
        still exists, a warning is printed and the partial list is returned.

    Raises:
        Exception: If the collection URI could not be obtained.
    """
    collection_uri = get_collection_uri(hatena_id, blog_id, password)
    if not collection_uri:
        raise Exception("Not get collection uri.")

    # Compiled once: the entry ID is the digit run after the last "-" in <id>.
    entry_id_pattern = re.compile(r"-(\d+)$")

    entry_id_list = []
    for _ in range(limit_max_iterations):
        if print_collection_uri:
            print(collection_uri)

        # Fetch one page of the entry collection with Basic auth.
        res_collection = requests.get(collection_uri, auth=(hatena_id, password))
        if not res_collection.ok:
            return False

        # Parse the page into a DOM with Beautiful Soup 4.
        soup_collection_xml = bs4.BeautifulSoup(res_collection.content, features="xml")

        for entry in soup_collection_xml.find_all("entry"):
            # Ignore drafts. An entry without <app:draft> counts as published
            # (previously this case raised AttributeError).
            draft = entry.find("app:draft")
            if draft is not None and draft.string == "yes":
                continue

            # Extract the entry ID; skip entries whose <id> is missing or has
            # no numeric suffix instead of crashing on a failed match.
            id_tag = entry.find("id")
            id_text = id_tag.string if id_tag is not None else None
            match = entry_id_pattern.search(id_text) if id_text else None
            if match is None:
                print("warning: skipped an entry without a numeric id suffix")
                continue
            entry_id_list.append(match.group(1))

        # Advance to the next page of the collection, if there is one.
        link_next = soup_collection_xml.find("link", rel="next")
        if not link_next:
            return entry_id_list

        collection_uri = link_next.get("href")
        if not collection_uri:
            return entry_id_list

        # Brief pause between requests (10 ms by default).
        time.sleep(wait_s)

    # The page limit was exhausted while a next page still existed.
    print("warning: possible to left some entry_id")
    return entry_id_list
# Example invocation. hatena_id, blog_id and password are not defined in this
# snippet and must be set by the caller before this statement runs.
get_entry_id_list(
    hatena_id,
    blog_id,
    password,
    limit_max_iterations=50,
    wait_s=0.01,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment