ryuuji/openBDのインデックス

## openBDのインデックス
# -*- coding: utf-8 -*-
# MIT


"""
Driver that loads openBD bibliographic data into Elasticsearch.

Reference schema (mapping of the ``unitrad-doc`` type)
    mapping = {
        "unitrad-doc": {
            "properties": {
                "source": {"type": "keyword"},
                "title": {"type": "text", "analyzer": "kuromoji"},
                "author": {"type": "text", "analyzer": "kuromoji"},
                "publisher": {"type": "text", "analyzer": "kuromoji"},
                "pubdate": {"type": "integer"},
                "volume": {"type": "text", "analyzer": "kuromoji"},
                "isbn": {"type": "keyword"},
                "free": {"type": "text", "analyzer": "kuromoji"},
                "url": {"type": "object"},
                "holdings_count": {"type": "integer"},
                "timestamp": {"type": "date"}
            }
        }
    }
"""
# Module metadata
__title__ = 'CALIL openBD Driver'
__copyright__ = "Copyright (C) 2017 CALIL Inc."
__author__ = "Ryuuji Yoshimoto <ryuuji@calil.jp>"

import requests
import isbnlib
import unicodedata
import ujson
import datetime

# Base URL of the openBD v1 API; the endpoint names 'coverage' and 'get'
# are appended to it by the request helpers below.
OPENBD_ENDPOINT = 'https://api.openbd.jp/v1/'


def normalize_pubdate(pubdate):
    """
    Normalize a publication date and return its year as an integer.

    The value is converted to text, NFKC-normalized (full-width digits and
    the ideographic space become their ASCII counterparts), stripped, and
    its first four characters are parsed as the year. Only years starting
    with ``18``, ``19`` or ``20`` are accepted.

    Examples::

        normalize_pubdate(u'1８２５')    # 1825
        normalize_pubdate(u'　1８２５')  # 1825
        normalize_pubdate(u'1925.2')    # 1925
        normalize_pubdate(u'192502')    # 1925
        normalize_pubdate(u'1925/02')   # 1925
        normalize_pubdate(u'２０１５')  # 2015
        normalize_pubdate(u'２０１')    # None (fewer than four characters)
        normalize_pubdate(u'2155')      # None (prefix '21' is not accepted)

    :param pubdate: publication date as a string (any other value is
        converted to text first)
    :return: the four-digit year as an int, or None when no acceptable year
        can be extracted
    """
    if not pubdate:
        return None
    # ``unicode`` only exists on Python 2; fall back to ``str`` on Python 3
    try:
        text_type = unicode
    except NameError:
        text_type = str
    _pubdate = unicodedata.normalize('NFKC', text_type(pubdate)).strip()
    if len(_pubdate) < 4 or _pubdate[0:2] not in ('18', '19', '20'):
        return None
    year = _pubdate[0:4]
    # int() tolerates surrounding whitespace (e.g. '192 '), which would
    # yield a three-digit year, so require four digits up front
    if not year.isdigit():
        return None
    try:
        return int(year)
    except ValueError:
        return None


def chunked(iterable, n):
    """
    Split a sliceable sequence into consecutive pieces of at most n items.

    http://cortyuming.hateblo.jp/entry/2015/12/26/091224

    :param iterable: sequence supporting len() and slicing
    :param n: maximum size of each piece
    :return: list of slices, in order; the last one may be shorter
    """
    pieces = []
    for start in range(0, len(iterable), n):
        stop = start + n
        pieces.append(iterable[start:stop])
    return pieces


def get_coverage():
    """
    Fetch the list of ISBNs that openBD covers.

    Sends a GET request to the ``coverage`` endpoint; an error status is
    raised through ``raise_for_status()``.

    :return: the decoded JSON response body (list of ISBNs)
    """
    url = OPENBD_ENDPOINT + 'coverage'
    response = requests.get(url)
    response.raise_for_status()
    print("load...")
    body = response.content
    return ujson.loads(body)


def get_bibs(items):
    """
    Fetch bibliographic records from openBD with a single POST request.

    :param items: list of ISBN strings; they are joined with commas into
        the ``isbn`` form field
    :return: the decoded JSON response body (list of bibliographic records)
    """
    # Call form of print parses on both Python 2 and Python 3
    print("req:bib")
    r = requests.post(OPENBD_ENDPOINT + 'get', data={'isbn': ','.join(items)})
    # Error statuses are raised instead of being decoded as data
    r.raise_for_status()
    return ujson.loads(r.content)


def normalize_isbn(isbn):
    """
    Normalize an ISBN so that it can be used as an aggregation key.

    The input is NFKC-normalized, stripped and passed through
    ``isbnlib.canonical``. ISBN-10 values are returned as they are, 978-
    prefixed ISBN-13 values are converted to ISBN-10, and a value that
    becomes a valid ISBN-13 once ``978`` is prepended is converted the same
    way. Anything else is returned unchanged when it has at least ten
    characters, otherwise None.

    :param isbn: ISBN string, or None
    :return: normalized ISBN string, or None
    """
    if isbn is None:
        return None
    cleaned = unicodedata.normalize('NFKC', isbn).strip()
    canonical = isbnlib.canonical(cleaned)
    if isbnlib.is_isbn10(canonical):
        return canonical
    if isbnlib.is_isbn13(canonical) and canonical[0:3] == '978':
        return isbnlib.to_isbn10(canonical)
    prefixed = '978' + canonical
    if isbnlib.is_isbn13(prefixed):
        return isbnlib.to_isbn10(prefixed)
    return canonical if len(canonical) >= 10 else None


def index_all():
    """
    Index every record that openBD covers into Elasticsearch.

    The covered ISBNs come from get_coverage() and are requested from
    get_bibs() in chunks of 1000. Each non-empty record becomes a document
    that is bulk-indexed into ``unitrad-openbd-all``. A record whose
    ``pubdate`` is an 8-character YYYYMMDD date later than 30 days before
    today is additionally indexed into ``unitrad-openbd-future``.

    :return: None
    """
    from elasticsearch import Elasticsearch
    from elasticsearch import helpers

    ES_SERVERS = ['*****']

    es = Elasticsearch(ES_SERVERS, sniff_on_connection_fail=True, maxsize=25)

    # Split the ISBN list into chunks of 1000
    chunked_coverage = chunked(get_coverage(), 1000)

    # Records published after this date also go to the "future" index
    cutoff = datetime.date.today() - datetime.timedelta(days=30)

    actions = []
    for isbns in chunked_coverage:
        result = get_bibs(isbns)
        for _book in result:
            # Falsy entries in the response carry no record to index
            if not _book:
                continue
            summary = _book['summary']
            book = {
                'id': normalize_isbn(summary['isbn']),
                'isbn': summary['isbn'],
                'title': summary['title'],
                'author': summary['author'],
                'publisher': summary['publisher'],
                'volume': summary['volume'],
                'pubdate': normalize_pubdate(summary['pubdate']),
                'timestamp': datetime.datetime.today(),
                'source': 'openBD',
                'url': [],
                'holdings': [],
                'holdings_count': 1,
            }

            # Free-text field: title, volume, author, series and every
            # ONIX TextContent entry, separated by single spaces
            free_parts = [book['title']]
            if book['volume']:
                free_parts.append(book['volume'])
            if book['author']:
                free_parts.append(book['author'])
            if summary['series']:
                free_parts.append(summary['series'])
            for text in _book["onix"]["CollateralDetail"].get("TextContent") or []:
                free_parts.append(text['Text'])
            book['free'] = ' '.join(free_parts)

            actions.append({
                '_index': 'unitrad-openbd-all',
                '_type': 'unitrad-doc',
                '_id': book['id'],
                '_source': book
            })

            # Treat a missing pubdate like an empty one instead of failing
            # on len(None)
            raw_pubdate = summary['pubdate'] or ''
            if len(raw_pubdate) == 8:
                try:
                    published = datetime.date(int(raw_pubdate[0:4]),
                                              int(raw_pubdate[4:6]),
                                              int(raw_pubdate[6:8]))
                except ValueError:
                    # Not a valid YYYYMMDD date (non-digits or out-of-range
                    # month/day): leave it out of the "future" index
                    published = None
                if published is not None and published > cutoff:
                    actions.append({
                        '_index': 'unitrad-openbd-future',
                        '_type': 'unitrad-doc',
                        '_id': book['id'],
                        '_source': book
                    })

            # Flush once more than 1000 actions have accumulated
            if len(actions) > 1000:
                helpers.bulk(es, actions)
                actions = []
    # Send whatever is left after the last chunk
    if actions:
        helpers.bulk(es, actions)


# Run a full indexing pass when the file is executed as a script
if __name__ == '__main__':
    index_all()
	# -*- coding: utf-8 -*-
	# MIT


	"""
	Driver that loads openBD bibliographic data into Elasticsearch.

	Reference schema (mapping of the ``unitrad-doc`` type)
	    mapping = {
	        "unitrad-doc": {
	            "properties": {
	                "source": {"type": "keyword"},
	                "title": {"type": "text", "analyzer": "kuromoji"},
	                "author": {"type": "text", "analyzer": "kuromoji"},
	                "publisher": {"type": "text", "analyzer": "kuromoji"},
	                "pubdate": {"type": "integer"},
	                "volume": {"type": "text", "analyzer": "kuromoji"},
	                "isbn": {"type": "keyword"},
	                "free": {"type": "text", "analyzer": "kuromoji"},
	                "url": {"type": "object"},
	                "holdings_count": {"type": "integer"},
	                "timestamp": {"type": "date"}
	            }
	        }
	    }
	"""
	# Module metadata
	__title__ = 'CALIL openBD Driver'
	__copyright__ = "Copyright (C) 2017 CALIL Inc."
	__author__ = "Ryuuji Yoshimoto <ryuuji@calil.jp>"

	import requests
	import isbnlib
	import unicodedata
	import ujson
	import datetime

	# Base URL of the openBD v1 API; the endpoint names 'coverage' and 'get'
	# are appended to it by the request helpers.
	OPENBD_ENDPOINT = 'https://api.openbd.jp/v1/'


	def normalize_pubdate(pubdate):
	    """
	    Normalize a publication date and return its year as an integer.

	    The value is converted to text, NFKC-normalized (full-width digits
	    and the ideographic space become their ASCII counterparts), stripped,
	    and its first four characters are parsed as the year. Only years
	    starting with ``18``, ``19`` or ``20`` are accepted.

	    Examples::

	        normalize_pubdate(u'1８２５')    # 1825
	        normalize_pubdate(u'　1８２５')  # 1825
	        normalize_pubdate(u'1925.2')    # 1925
	        normalize_pubdate(u'192502')    # 1925
	        normalize_pubdate(u'1925/02')   # 1925
	        normalize_pubdate(u'２０１５')  # 2015
	        normalize_pubdate(u'２０１')    # None (fewer than four characters)
	        normalize_pubdate(u'2155')      # None (prefix '21' is not accepted)

	    :param pubdate: publication date as a string (any other value is
	        converted to text first)
	    :return: the four-digit year as an int, or None when no acceptable
	        year can be extracted
	    """
	    if not pubdate:
	        return None
	    # ``unicode`` only exists on Python 2; fall back to ``str`` on Python 3
	    try:
	        text_type = unicode
	    except NameError:
	        text_type = str
	    _pubdate = unicodedata.normalize('NFKC', text_type(pubdate)).strip()
	    if len(_pubdate) < 4 or _pubdate[0:2] not in ('18', '19', '20'):
	        return None
	    year = _pubdate[0:4]
	    # int() tolerates surrounding whitespace (e.g. '192 '), which would
	    # yield a three-digit year, so require four digits up front
	    if not year.isdigit():
	        return None
	    try:
	        return int(year)
	    except ValueError:
	        return None


	def chunked(iterable, n):
	    """
	    Split a sliceable sequence into consecutive pieces of at most n items.

	    http://cortyuming.hateblo.jp/entry/2015/12/26/091224

	    :param iterable: sequence supporting len() and slicing
	    :param n: maximum size of each piece
	    :return: list of slices, in order; the last one may be shorter
	    """
	    return [iterable[x:x + n] for x in range(0, len(iterable), n)]


	def get_coverage():
	    """
	    Fetch the list of ISBNs that openBD covers.

	    Sends a GET request to the ``coverage`` endpoint; an error status is
	    raised through ``raise_for_status()``.

	    :return: the decoded JSON response body (list of ISBNs)
	    """
	    r = requests.get(OPENBD_ENDPOINT + 'coverage')
	    r.raise_for_status()
	    print("load...")
	    return ujson.loads(r.content)


	def get_bibs(items):
	    """
	    Fetch bibliographic records from openBD with a single POST request.

	    :param items: list of ISBN strings; they are joined with commas into
	        the ``isbn`` form field
	    :return: the decoded JSON response body (list of bibliographic
	        records)
	    """
	    # Call form of print parses on both Python 2 and Python 3
	    print("req:bib")
	    r = requests.post(OPENBD_ENDPOINT + 'get', data={'isbn': ','.join(items)})
	    # Error statuses are raised instead of being decoded as data
	    r.raise_for_status()
	    return ujson.loads(r.content)


	def normalize_isbn(isbn):
	    """
	    Normalize an ISBN so that it can be used as an aggregation key.

	    The input is NFKC-normalized, stripped and passed through
	    ``isbnlib.canonical``. ISBN-10 values are returned as they are, 978-
	    prefixed ISBN-13 values are converted to ISBN-10, and a value that
	    becomes a valid ISBN-13 once ``978`` is prepended is converted the
	    same way. Anything else is returned unchanged when it has at least
	    ten characters, otherwise None.

	    :param isbn: ISBN string, or None
	    :return: normalized ISBN string, or None
	    """
	    if isbn is None:
	        return None
	    _isbn = unicodedata.normalize('NFKC', isbn).strip()
	    _isbn = isbnlib.canonical(_isbn)
	    if isbnlib.is_isbn10(_isbn):
	        return _isbn
	    if isbnlib.is_isbn13(_isbn) and _isbn[0:3] == '978':
	        return isbnlib.to_isbn10(_isbn)
	    if isbnlib.is_isbn13('978' + _isbn):
	        return isbnlib.to_isbn10('978' + _isbn)
	    if len(_isbn) < 10:
	        return None
	    return _isbn


	def index_all():
	    """
	    Index every record that openBD covers into Elasticsearch.

	    The covered ISBNs come from get_coverage() and are requested from
	    get_bibs() in chunks of 1000. Each non-empty record becomes a
	    document that is bulk-indexed into ``unitrad-openbd-all``. A record
	    whose ``pubdate`` is an 8-character YYYYMMDD date later than 30 days
	    before today is additionally indexed into ``unitrad-openbd-future``.

	    :return: None
	    """
	    from elasticsearch import Elasticsearch
	    from elasticsearch import helpers

	    ES_SERVERS = ['*****']

	    es = Elasticsearch(ES_SERVERS, sniff_on_connection_fail=True, maxsize=25)

	    # Split the ISBN list into chunks of 1000
	    chunked_coverage = chunked(get_coverage(), 1000)

	    # Records published after this date also go to the "future" index
	    cutoff = datetime.date.today() - datetime.timedelta(days=30)

	    actions = []
	    for isbns in chunked_coverage:
	        result = get_bibs(isbns)
	        for _book in result:
	            # Falsy entries in the response carry no record to index
	            if not _book:
	                continue
	            summary = _book['summary']
	            book = {
	                'id': normalize_isbn(summary['isbn']),
	                'isbn': summary['isbn'],
	                'title': summary['title'],
	                'author': summary['author'],
	                'publisher': summary['publisher'],
	                'volume': summary['volume'],
	                'pubdate': normalize_pubdate(summary['pubdate']),
	                'timestamp': datetime.datetime.today(),
	                'source': 'openBD',
	                'url': [],
	                'holdings': [],
	                'holdings_count': 1,
	            }

	            # Free-text field: title, volume, author, series and every
	            # ONIX TextContent entry, separated by single spaces
	            free_parts = [book['title']]
	            if book['volume']:
	                free_parts.append(book['volume'])
	            if book['author']:
	                free_parts.append(book['author'])
	            if summary['series']:
	                free_parts.append(summary['series'])
	            for text in _book["onix"]["CollateralDetail"].get("TextContent") or []:
	                free_parts.append(text['Text'])
	            book['free'] = ' '.join(free_parts)

	            actions.append({
	                '_index': 'unitrad-openbd-all',
	                '_type': 'unitrad-doc',
	                '_id': book['id'],
	                '_source': book
	            })

	            # Treat a missing pubdate like an empty one instead of
	            # failing on len(None)
	            raw_pubdate = summary['pubdate'] or ''
	            if len(raw_pubdate) == 8:
	                try:
	                    published = datetime.date(int(raw_pubdate[0:4]),
	                                              int(raw_pubdate[4:6]),
	                                              int(raw_pubdate[6:8]))
	                except ValueError:
	                    # Not a valid YYYYMMDD date (non-digits or
	                    # out-of-range month/day): leave it out of the
	                    # "future" index
	                    published = None
	                if published is not None and published > cutoff:
	                    actions.append({
	                        '_index': 'unitrad-openbd-future',
	                        '_type': 'unitrad-doc',
	                        '_id': book['id'],
	                        '_source': book
	                    })

	            # Flush once more than 1000 actions have accumulated
	            if len(actions) > 1000:
	                helpers.bulk(es, actions)
	                actions = []
	    # Send whatever is left after the last chunk
	    if actions:
	        helpers.bulk(es, actions)


	# Run a full indexing pass when the file is executed as a script
	if __name__ == '__main__':
	    index_all()