Skip to content

Instantly share code, notes, and snippets.

@michimani
Last active December 10, 2019 13:46
Show Gist options
  • Save michimani/9ee3a194996d09f18a94a4dc2b2a856c to your computer and use it in GitHub Desktop.
Save michimani/9ee3a194996d09f18a94a4dc2b2a856c to your computer and use it in GitHub Desktop.
Get the Hatena Bookmark share count for each post of a Hugo site, and save the counts as JSON files on S3.
from time import sleep
import boto3
import json
import logging
import re
import traceback
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
# Hatena Bookmark count API endpoint; the (URL-encoded) post URL is appended.
HATEBU_CNT_API = 'https://bookmark.hatenaapis.com/count/entry?url='
# S3 bucket that holds both the sitemap and the generated count files.
S3_BUCKET = '<your-s3-bucket-name>'
# Key of the Hugo-generated sitemap inside S3_BUCKET.
SITE_MAP_KEY = 'sitemap.xml'
HUGO_HOST = '<your-hugo-site-host>' # eg) https://michimani.net
# Module-level boto3 resource and logger, shared by all handlers below.
s3 = boto3.resource('s3')
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def get_hatebu_count(post_url):
    """Fetch the Hatena Bookmark count for a single post URL.

    Args:
        post_url: absolute URL of the post to look up.

    Returns:
        The bookmark count as an int, or 0 if the API call fails or the
        response body is not an integer (best-effort: one bad URL must
        not abort the whole run).
    """
    count = 0
    hatebu_url = HATEBU_CNT_API + urllib.parse.quote(post_url)
    try:
        # A timeout keeps a slow/unresponsive API from stalling the
        # Lambda until its own execution timeout.
        with urllib.request.urlopen(hatebu_url, timeout=10) as res:
            count = int(res.read())
    except Exception:
        # logger.exception records the traceback automatically.
        logger.exception('Hatebu count request failed: %s', hatebu_url)
    return count
def get_post_url_list():
    """Read sitemap.xml from S3 and return the URLs of all blog posts.

    Only URLs under HUGO_HOST/post/ are kept. Returns an empty list if
    the sitemap cannot be fetched or parsed.
    """
    urls = []
    try:
        sitemap_obj = s3.Object(bucket_name=S3_BUCKET, key=SITE_MAP_KEY)
        sitemap_xml = sitemap_obj.get()['Body'].read().decode('utf-8')
        root = ET.fromstring(sitemap_xml)
        namespaces = {'post': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
        post_pattern = re.compile('^' + re.escape(HUGO_HOST + '/post/') + '.+')
        urls = [
            loc.text
            for loc in root.findall('post:url/post:loc', namespaces)
            if post_pattern.match(loc.text)
        ]
    except Exception:
        logger.error('Get post url failed: %s', traceback.format_exc())
    return urls
def put_hatebu_count_file(post_url, hatebu_count):
    """Store {"cnt": hatebu_count} as JSON at the post's S3 count key.

    Failures are logged and swallowed so the caller's loop continues.
    """
    try:
        target = s3.Object(S3_BUCKET, get_key_from_post_url(post_url))
        payload = json.dumps({'cnt': hatebu_count}, ensure_ascii=False)
        target.put(Body=payload)
    except Exception:
        logger.error('Put count data failed: %s', traceback.format_exc())
def get_key_from_post_url(post_url):
    """Map a post URL to its S3 count-file key: data/htbcnt/<slug>.json.

    The slug is the URL with the host/post prefix stripped and every
    remaining slash removed.
    """
    slug = post_url.replace(HUGO_HOST + '/post/', '').replace('/', '')
    return 'data/htbcnt/{}.json'.format(slug)
def count_needs_update(post_url, new_count):
    """Decide whether the stored count file must be rewritten.

    Args:
        post_url: absolute URL of the post.
        new_count: freshly fetched Hatena Bookmark count.

    Returns:
        True when new_count exceeds the count stored on S3, or when the
        count file is missing/unreadable (so the first bookmark of a new
        post still gets recorded); False otherwise.
    """
    needs_update = False
    try:
        object_key = get_key_from_post_url(post_url)
        cnt_data_obj = s3.Object(bucket_name=S3_BUCKET, key=object_key)
        cnt_data_raw = cnt_data_obj.get()["Body"].read().decode("utf-8")
        cnt_data = json.loads(cnt_data_raw)
        if new_count > cnt_data['cnt']:
            needs_update = True
    except Exception:
        # Use the module logger (was a bare print(), inconsistent with
        # every other helper); a missing file means first-time write.
        logger.info('Hatebu count file does not exist.')
        needs_update = True
    return needs_update
def lambda_handler(event, context):
    """Lambda entry point: refresh the stored Hatebu count for every post.

    Iterates all post URLs from the sitemap, fetches each current count,
    and rewrites the S3 count file only when the count increased.
    The event/context arguments are unused (scheduled invocation).
    """
    post_list = get_post_url_list()
    for post_url in post_list:
        sleep(0.5)  # throttle requests against the Hatebu API
        count = get_hatebu_count(post_url)
        if count_needs_update(post_url, count):
            put_hatebu_count_file(post_url, count)
            # Lazy %-style args avoid formatting when the level is filtered.
            logger.info('Updated for "%s", new Hatebu count is "%s"', post_url, count)
        else:
            # Fixed typo in the original message ("requred").
            logger.info('No update required for "%s"', post_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment