Get the share count on Hatena Bookmark for each post of a Hugo site, and save the counts as JSON files on S3.
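For context, the Hatena Bookmark count API returns the bookmark count for a given URL as a plain-text integer. The short sketch below shows a single request and the JSON shape the Lambda stores per post; the post URL here is a made-up example, not one taken from this gist.

import json
import urllib.parse
import urllib.request

# Hypothetical post URL, for illustration only.
post_url = 'https://michimani.net/post/example-post/'

# The count endpoint returns the bookmark count as a plain-text integer.
api_url = 'https://bookmark.hatenaapis.com/count/entry?url=' + urllib.parse.quote(post_url)
with urllib.request.urlopen(api_url) as res:
    count = int(res.read())

# The Lambda below stores the same value per post on S3 under data/htbcnt/<slug>.json.
print(json.dumps({'cnt': count}, ensure_ascii=False))  # e.g. {"cnt": 12}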
from time import sleep
import boto3
import json
import logging
import re
import traceback
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

HATEBU_CNT_API = 'https://bookmark.hatenaapis.com/count/entry?url='
S3_BUCKET = '<your-s3-bucket-name>'
SITE_MAP_KEY = 'sitemap.xml'
HUGO_HOST = '<your-hugo-site-host>'  # eg) https://michimani.net

s3 = boto3.resource('s3')
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def get_hatebu_count(post_url):
    """Return the Hatena Bookmark count for a post URL (0 on failure)."""
    count = 0
    hatebu_url = HATEBU_CNT_API + urllib.parse.quote(post_url)
    try:
        with urllib.request.urlopen(hatebu_url) as res:
            count = int(res.read())
    except Exception:
        logger.error('Hatebu count request failed: %s', traceback.format_exc())
    return count


def get_post_url_list():
    """Read sitemap.xml from S3 and return the URLs of post pages."""
    post_url_list = []
    try:
        s3_object = s3.Object(bucket_name=S3_BUCKET, key=SITE_MAP_KEY)
        sitemap = s3_object.get()['Body'].read().decode('utf-8')
        xml_root = ET.fromstring(sitemap)
        ns = {'post': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
        reg = re.compile('^' + re.escape(HUGO_HOST + '/post/') + '.+')
        for url_part in xml_root.findall('post:url/post:loc', ns):
            if reg.match(url_part.text):
                post_url_list.append(url_part.text)
    except Exception:
        logger.error('Get post url failed: %s', traceback.format_exc())
    return post_url_list


def put_hatebu_count_file(post_url, hatebu_count):
    """Save the count as a JSON file on S3, keyed by the post slug."""
    try:
        object_key = get_key_from_post_url(post_url)
        s3obj = s3.Object(S3_BUCKET, object_key)
        data = json.dumps({'cnt': hatebu_count}, ensure_ascii=False)
        s3obj.put(Body=data)
    except Exception:
        logger.error('Put count data failed: %s', traceback.format_exc())


def get_key_from_post_url(post_url):
    """Build the S3 object key for a post, e.g. data/htbcnt/<slug>.json."""
    return 'data/htbcnt/{post_key}.json'.format(
        post_key=post_url.replace(HUGO_HOST + '/post/', '').replace('/', ''))


def count_needs_update(post_url, new_count):
    """Return True if the stored count is missing or lower than new_count."""
    res = False
    try:
        object_key = get_key_from_post_url(post_url)
        cnt_data_obj = s3.Object(bucket_name=S3_BUCKET, key=object_key)
        cnt_data_raw = cnt_data_obj.get()['Body'].read().decode('utf-8')
        cnt_data = json.loads(cnt_data_raw)
        if new_count > cnt_data['cnt']:
            res = True
    except Exception:
        logger.info('Hatebu count file does not exist yet.')
        res = True
    return res


def lambda_handler(event, context):
    post_list = get_post_url_list()
    for post_url in post_list:
        sleep(0.5)  # be gentle with the Hatena Bookmark API
        count = get_hatebu_count(post_url)
        if count_needs_update(post_url, count):
            put_hatebu_count_file(post_url, count)
            logger.info('Updated for "{}", new Hatebu count is "{}"'.format(post_url, count))
        else:
            logger.info('No update required for "{}"'.format(post_url))
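The script is written as an AWS Lambda handler, but lambda_handler does not use the event payload, so a quick local test can simply call it directly. This is a minimal sketch, assuming AWS credentials with access to the S3 bucket are configured and the placeholders above are filled in.

# Hypothetical local invocation; event and context are unused by the handler.
if __name__ == '__main__':
    lambda_handler({}, None)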