Skip to content

Instantly share code, notes, and snippets.

@ryuuji
Last active February 8, 2019 02:27
Show Gist options
  • Save ryuuji/fca9742c63153e9051f371c7d546e777 to your computer and use it in GitHub Desktop.
Save ryuuji/fca9742c63153e9051f371c7d546e777 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# MIT
"""
参考スキーマ
mapping = {
"unitrad-doc": {
"properties": {
"source": {"type": "keyword"},
"title": {"type": "text", "analyzer": "kuromoji"},
"author": {"type": "text", "analyzer": "kuromoji"},
"publisher": {"type": "text", "analyzer": "kuromoji"},
"pubdate": {"type": "integer"},
"volume": {"type": "text", "analyzer": "kuromoji"},
"isbn": {"type": "keyword"},
"free": {"type": "text", "analyzer": "kuromoji"},
"url": {"type": "object"},
"holdings_count": {"type": "integer"},
"timestamp": {"type": "date"}
}
}
}
"""
__title__ = 'CALIL openBD Driver'
__copyright__ = "Copyright (C) 2017 CALIL Inc."
__author__ = "Ryuuji Yoshimoto <ryuuji@calil.jp>"
import requests
import isbnlib
import unicodedata
import ujson
import datetime
# Base URL of the openBD v1 API; endpoint names such as 'coverage' and 'get'
# are appended directly to it.
OPENBD_ENDPOINT = 'https://api.openbd.jp/v1/'
def normalize_pubdate(pubdate):
    """
    Normalize a publication date and return its year as an integer.

    The value is converted to text, NFKC-normalized (so full-width digits
    become ASCII digits) and stripped. When the result starts with '18',
    '19' or '20' and its first four characters are all digits, those four
    characters are returned as an int.

    >>> normalize_pubdate(u'1825')
    1825
    >>> normalize_pubdate(u' 1825')
    1825
    >>> normalize_pubdate(u'1925.2')
    1925
    >>> normalize_pubdate(u'192502')
    1925
    >>> normalize_pubdate(u'1925/02')
    1925
    >>> normalize_pubdate(u'2015')
    2015
    >>> normalize_pubdate(u'201') is None
    True
    >>> normalize_pubdate(u'2155') is None
    True

    :param pubdate: publication date (a string, or any value convertible to text)
    :return: the year as an int, or None when no year can be extracted
    """
    if not pubdate:
        return None

    # `unicode` only exists on Python 2; on Python 3 `str` is already text.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    text = unicodedata.normalize('NFKC', text_type(pubdate)).strip()
    if len(text) < 4 or text[0:2] not in ('18', '19', '20'):
        return None

    year = text[0:4]
    # int() silently accepts surrounding whitespace (and, on Python 3,
    # underscores), so '201 5' would otherwise be reported as the year 201.
    if not year.isdigit():
        return None
    try:
        return int(year)
    except ValueError:
        # isdigit() accepts a few characters that int() cannot parse.
        return None
def chunked(iterable, n):
    """
    Split a sequence into consecutive slices of at most n items each.

    The argument must support len() and slicing; the last slice may be
    shorter than n.
    http://cortyuming.hateblo.jp/entry/2015/12/26/091224

    :param iterable: sequence to split
    :param n: maximum number of items per slice
    :return: list of slices
    """
    pieces = []
    for start in range(0, len(iterable), n):
        stop = start + n
        pieces.append(iterable[start:stop])
    return pieces
def get_coverage():
    """
    Fetch the list of ISBNs that openBD holds records for.

    Issues a GET request to the ``coverage`` endpoint and raises the
    ``requests`` HTTP error if the response status indicates a failure.

    :return: the decoded JSON response body (a list of ISBNs)
    """
    coverage_url = OPENBD_ENDPOINT + 'coverage'
    response = requests.get(coverage_url)
    response.raise_for_status()
    print("load...")
    payload = response.content
    return ujson.loads(payload)
def get_bibs(items):
    """
    Fetch bibliographic records from openBD with a single POST request.

    The ISBNs are joined with commas and sent as the ``isbn`` form field to
    the ``get`` endpoint. Raises the ``requests`` HTTP error if the response
    status indicates a failure.

    :param items: list of ISBN strings
    :return: the decoded JSON response body (a list of records)
    """
    # Call form of print: valid on Python 2 and 3, and consistent with
    # get_coverage(). With a single argument the output is the same on both.
    print("req:bib")
    r = requests.post(OPENBD_ENDPOINT + 'get', data={'isbn': ','.join(items)})
    r.raise_for_status()
    return ujson.loads(r.content)
def normalize_isbn(isbn):
    """
    Normalize an ISBN into the form used as an aggregation key.

    The input is NFKC-normalized, stripped and passed through
    ``isbnlib.canonical``. Valid ISBN-10 values are returned as they are;
    valid 978-prefixed ISBN-13 values, and values that become a valid
    ISBN-13 once '978' is prepended, are converted with
    ``isbnlib.to_isbn10``. Anything else is returned in canonical form when
    it has at least 10 characters, and dropped otherwise.

    :param isbn: ISBN string, or None
    :return: normalized ISBN string, or None
    """
    if isbn is None:
        return None

    cleaned = unicodedata.normalize('NFKC', isbn).strip()
    canonical = isbnlib.canonical(cleaned)

    if isbnlib.is_isbn10(canonical):
        return canonical

    if isbnlib.is_isbn13(canonical) and canonical[0:3] == '978':
        return isbnlib.to_isbn10(canonical)

    prefixed = '978' + canonical
    if isbnlib.is_isbn13(prefixed):
        return isbnlib.to_isbn10(prefixed)

    # Unrecognized value: keep it only if it is at least ISBN-10 sized.
    return canonical if len(canonical) >= 10 else None
def index_all():
    """
    Index every record available from openBD into Elasticsearch.

    The ISBN coverage list is fetched and split into chunks of 1000 ISBNs;
    each chunk is requested with get_bibs() and every non-empty record is
    converted into a document for the ``unitrad-openbd-all`` index. A record
    whose ``pubdate`` is an 8-character YYYYMMDD date later than 30 days
    before today is additionally written to ``unitrad-openbd-future``.
    Actions are sent with ``helpers.bulk`` whenever more than 1000 have
    accumulated, and once more at the end for the remainder.
    """
    from elasticsearch import Elasticsearch
    from elasticsearch import helpers
    ES_SERVERS = ['*****']
    es = Elasticsearch(ES_SERVERS, sniff_on_connection_fail=True, maxsize=25)
    # Split the ISBN list into chunks of 1000 (one chunk per openBD request)
    chunked_coverage = chunked(get_coverage(), 1000)
    # Records published after this date also go to the "future" index.
    cutoff = datetime.date.today() - datetime.timedelta(30)
    actions = []
    for isbns in chunked_coverage:
        for _book in get_bibs(isbns):
            # Falsy entries in the response carry no record to index.
            if not _book:
                continue
            summary = _book['summary']
            book = {
                'id': normalize_isbn(summary['isbn']),
                'isbn': summary['isbn'],
                'title': summary['title'],
                'author': summary['author'],
                'publisher': summary['publisher'],
                'volume': summary['volume'],
                'pubdate': normalize_pubdate(summary['pubdate']),
                'timestamp': datetime.datetime.today(),
                'source': 'openBD',
                'url': [],
                'holdings': [],
                'holdings_count': 1,
            }
            # 'free' is the free-text field: title plus every other
            # available textual attribute, separated by spaces.
            book['free'] = book['title']
            if book['volume']:
                book['free'] += " " + book['volume']
            if book['author']:
                book['free'] += " " + book['author']
            if summary['series']:
                book['free'] += " " + summary['series']
            text_contents = _book["onix"]["CollateralDetail"].get("TextContent")
            if text_contents:
                for text in text_contents:
                    book['free'] += ' ' + text['Text']
            actions.append({
                '_index': 'unitrad-openbd-all',
                '_type': 'unitrad-doc',
                '_id': book['id'],
                '_source': book
            })
            pubdate = summary['pubdate']
            if pubdate and len(pubdate) == 8:
                try:
                    published = datetime.date(int(pubdate[0:4]),
                                              int(pubdate[4:6]),
                                              int(pubdate[6:8]))
                except ValueError:
                    # Not a valid YYYYMMDD date: skip the "future" index only.
                    published = None
                if published is not None and published > cutoff:
                    actions.append({
                        '_index': 'unitrad-openbd-future',
                        '_type': 'unitrad-doc',
                        '_id': book['id'],
                        '_source': book
                    })
            # Flush in batches to keep the pending action list bounded.
            if len(actions) > 1000:
                helpers.bulk(es, actions)
                actions = []
    if actions:
        helpers.bulk(es, actions)
# Run the full indexing job when the file is executed as a script.
if __name__ == '__main__':
    index_all()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment