Skip to content

Instantly share code, notes, and snippets.

@ryuuji
Last active June 21, 2020 10:38
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save ryuuji/8e5f5eb141d445e766cfd82b0dcd2eab to your computer and use it in GitHub Desktop.
OPACのための汎用的なインデックスとスコアリング戦略 CC-0 / CALIL Inc / Ryuuji Yoshimoto
def create_index(es, index):
    """
    Create the Elasticsearch index used for OPAC bibliographic search.

    The index name is prefixed with 'negima-'.  Three custom analyzers are
    defined for Japanese text:

    * ``wakachi``     - kuromoji morphological tokenization (word split)
    * ``ngram``       - 2-3 character n-grams for partial-string matching
    * ``readingform`` - katakana reading form, for phonetic matching

    :param es: elasticsearch client instance (``Elasticsearch``)
    :param index: index name suffix (string); full name is ``negima-<index>``
    """
    es.indices.create(index='negima-' + index, body={
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 0,
            # Relaxed refresh interval: bulk-load oriented, near-real-time
            # search is not required.
            "refresh_interval": "30s",
            "analysis": {
                "analyzer": {
                    "wakachi": {
                        "type": "custom",
                        "tokenizer": "kuromoji_tokenizer",
                        # NOTE(review): "mode" is normally a kuromoji_tokenizer
                        # option, not a custom-analyzer option — confirm ES
                        # accepts/honors it here.
                        "mode": "search",
                        "char_filter": [
                            "icu_normalizer",
                            "kuromoji_iteration_mark"
                        ],
                        "filter": [
                            "hiragana_to_katakana",
                            "kuromoji_number",
                            "kuromoji_stemmer",
                            "kuromoji_baseform"
                        ]
                    },
                    "ngram": {
                        "type": "custom",
                        "tokenizer": "ngram_tokenizer",
                        "char_filter": [
                            "icu_normalizer",
                            "remove_kigou",
                            "kuromoji_iteration_mark"
                        ],
                        "filter": [
                            "hiragana_to_katakana",
                            "kuromoji_number",
                            "kuromoji_stemmer"
                        ]
                    },
                    "readingform": {
                        "type": "custom",
                        "tokenizer": "kuromoji_tokenizer",
                        # NOTE(review): same "mode" placement caveat as above.
                        "mode": "search",
                        "char_filter": [
                            "icu_normalizer",
                            "kuromoji_iteration_mark"
                        ],
                        "filter": [
                            "hiragana_to_katakana",
                            "kuromoji_number",
                            "kuromoji_stemmer",
                            # Converts tokens to their katakana reading.
                            "katakana_readingform"
                        ]
                    }
                },
                "tokenizer": {
                    # 2-3 gram tokenizer over letters and digits only
                    # (punctuation/symbols never become tokens).
                    "ngram_tokenizer": {
                        "type": "ngram",
                        "min_gram": "2",
                        "max_gram": "3",
                        "token_chars": ["letter", "digit"]
                    }
                },
                "filter": {
                    "katakana_readingform": {
                        "type": "kuromoji_readingform",
                        # Keep readings in katakana, not romaji.
                        "use_romaji": False
                    },
                    # Unify hiragana/katakana so both script variants match.
                    "hiragana_to_katakana": {
                        "type": "icu_transform",
                        "id": "Hiragana-Katakana"
                    },
                },
                "char_filter": {
                    # Strip decorative symbols frequently found in titles
                    # before n-gramming.
                    "remove_kigou": {
                        "type": "mapping",
                        "mappings": [
                            "!=>",
                            "・=>",
                            "&=>",
                            "==>",
                            "★=>",
                            "☆=>",
                            "\\s=>"
                        ]
                    }
                },
                "normalizer": {
                    # Keyword normalizer for exact-title matching
                    # (title_exact field): NFKC-ish + kana unification.
                    "title_normalizer": {
                        "type": "custom",
                        "char_filter": [
                            "icu_normalizer",
                            "kuromoji_iteration_mark"
                        ],
                        "filter": [
                            "hiragana_to_katakana"
                        ]
                    }
                }
            }
        },
        "mappings": {
            "_meta": {
                # Schema version of this mapping; bump when fields change.
                "version": 5
            },
            "properties": {
                "source": {"type": "keyword"},
                "title": {"type": "text", "analyzer": "kuromoji"},
                # Exact-match variant of the title (see title_normalizer).
                "title_exact": {"type": "keyword", "normalizer": "title_normalizer"},
                "author": {"type": "text", "analyzer": "kuromoji"},
                "publisher": {"type": "text", "analyzer": "kuromoji"},
                "class": {"type": "text", "analyzer": "kuromoji"},
                "pubdate": {"type": "integer"},
                "volume": {"type": "text", "analyzer": "kuromoji"},
                "isbn": {"type": "keyword"},
                # Canonicalized ISBN (see normalize_isbn) used for exact lookup.
                "isbn_normalized": {"type": "keyword"},
                # Catch-all full-text field, indexed four ways for recall.
                "free": {
                    "type": 'text',
                    "analyzer": 'wakachi',
                    "fields": {
                        "english": {"type": 'text', "analyzer": 'english'},
                        "ngram": {"type": 'text', "analyzer": 'ngram'},
                        "readingform": {"type": 'text', "analyzer": 'readingform'}
                    }
                },
                "url": {"type": "keyword"},
                # Number of holding libraries; used as a popularity boost.
                "holdings_count": {"type": "integer"},
                "timestamp": {"type": "date"}
            }
        }
    })
import re
import unicodedata

import isbnlib
def normalize_isbn(isbn):
    """
    Canonicalize an ISBN for de-duplication/aggregation.

    978-prefixed ISBN-13s are collapsed to their ISBN-10 form so that the
    two representations of the same book aggregate together.

    :param isbn: ISBN string (may be None/empty, full- or half-width)
    :return: canonical ISBN string, or None when the input is not an ISBN
    """
    if not isbn:
        return None
    candidate = isbnlib.canonical(unicodedata.normalize('NFKC', isbn).strip())
    # A valid 978-prefixed ISBN-13 collapses to its ISBN-10 equivalent.
    if isbnlib.is_isbn13(candidate) and candidate.startswith('978'):
        return isbnlib.to_isbn10(candidate)
    # Some sources drop the 978 prefix; try restoring it.
    prefixed = '978' + candidate
    if isbnlib.is_isbn13(prefixed):
        return isbnlib.to_isbn10(prefixed)
    # 13-char value whose last 10 chars form a valid ISBN-10 (added for negima).
    if len(candidate) == 13 and isbnlib.is_isbn10(candidate[3:]):
        return candidate[3:]
    # Otherwise accept only values that already validate as ISBN-10/13.
    if isbnlib.is_isbn10(candidate) or isbnlib.is_isbn13(candidate):
        return candidate
    return None
def normalize(s):
    """
    Normalize a string for indexing/querying.

    Applies Unicode NFKC (folds full-width forms to ASCII etc.), strips
    leading/trailing whitespace, and collapses internal runs of whitespace
    (including newlines and tabs) into a single space.

    :param s: string (UTF-8) or None
    :return: normalized string; '' when the input is None
    """
    if s is None:
        return ''
    # NFKC first so full-width spaces become ASCII spaces and are caught
    # by the \s+ collapse below.
    return re.sub(r'\s+', ' ', unicodedata.normalize('NFKC', s).strip())
def build_v2(query, limit=1000, mode='phrase'):
    """
    Build an Elasticsearch query body from OPAC search parameters.

    :param query: dict of search params; supported keys are 'free', 'title',
        'author', 'publisher', 'year_start', 'year_end', 'isbn', 'class'.
        The 'isbn' value is canonicalized in place via normalize_isbn.
    :param limit: maximum number of hits to return (int, max 1000)
    :param mode: 'phrase' matches the title as an exact phrase; any other
        value uses an AND term match instead
    :return: query body dict ready to pass to the ES search API
    :raises Exception: when the query contains an unsupported key
    """
    # Minimum relevance floor; disabled for exact ISBN lookups below.
    min_score = 4
    # Canonicalize the ISBN up front so the term query can hit the
    # normalized keyword field.  (Original checked query.get('ISBN') but
    # read query['isbn'] — the normalization never ran.)  Keep the raw
    # value when it cannot be normalized.
    if query.get('isbn'):
        normalized = normalize_isbn(query['isbn'])
        if normalized:
            query['isbn'] = normalized
    et = []

    def match(field, value):
        """Build a relevance clause for one field/value pair."""
        if field == 'free':
            if len(normalize(value)) <= 4:
                # Short queries: also try an exact title hit with a huge
                # boost so one-word titles outrank partial matches.
                return {
                    "bool": {
                        "should": [
                            {
                                "multi_match": {
                                    "query": normalize(value),
                                    "type": "phrase",
                                    "fields": [
                                        "title^8",
                                        "free^4",
                                        "free.ngram^2",
                                        "free.english^2",
                                        "free.readingform"
                                    ]
                                }
                            },
                            {
                                "term": {
                                    "title_exact": {
                                        "value": normalize(value),
                                        "boost": 500
                                    }
                                }
                            }
                        ]
                    }
                }
            # Longer queries: loose phrase match (slop) across free fields.
            return {
                "multi_match": {
                    "query": normalize(value),
                    "type": "phrase",
                    "slop": 40,
                    "fields": [
                        "title^8",
                        "free^4",
                        "free.ngram^2",
                        "free.readingform"
                    ]
                }
            }
        return {
            "match": {
                field: {
                    "query": normalize(value),
                    "operator": "and"
                }
            }
        }

    def match_phrase(field, value):
        """Build an exact-phrase clause for a single field."""
        return {
            "match_phrase": {
                field: {
                    "query": normalize(value)
                }
            }
        }

    for k, v in query.items():
        # Skip empty values.  `not v` also tolerates None and numeric 0
        # (the original `len(v) == 0` raised TypeError on non-sized values).
        if not v:
            continue
        if k == 'free':
            # Each whitespace-separated term must match (AND semantics).
            for m in v.split(' '):
                et.append(match('free', m))
        elif k == 'title':
            if mode == 'phrase':
                et.append(match_phrase('title', v))
            else:
                et.append(match('title', v))
        elif k == 'author':
            et.append(match_phrase('author', v))
        elif k == 'publisher':
            et.append(match_phrase('publisher', v))
        elif k == 'year_start':
            et.append({
                "range": {
                    "pubdate": {
                        "gte": normalize_pubdate(str(v))
                    }
                }
            })
        elif k == 'year_end':
            et.append({
                "range": {
                    "pubdate": {
                        "lte": normalize_pubdate(str(v))
                    }
                }
            })
        elif k == 'isbn':
            # The mapping declares this field as 'isbn_normalized';
            # the original queried the nonexistent 'normalized_isbn'.
            et.append({
                "term": {
                    "isbn_normalized": {
                        "value": v
                    }
                }
            })
            # An exact ISBN hit needs no relevance floor.
            min_score = 0
        elif k == 'class':
            et.append(match('class', v))
        else:
            raise Exception('unsupported param:' + k)
    return {
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "must": et
                    }
                },
                # Take the strongest boost function and multiply it into
                # the relevance score.
                "score_mode": "max",
                "boost_mode": "multiply",
                "functions": [
                    {
                        # Popular titles (more holding libraries) rank higher;
                        # sqrt dampens the effect of very large counts.
                        "field_value_factor": {
                            "field": "holdings_count",
                            "factor": 1,
                            "modifier": "sqrt",
                            "missing": 1
                        }
                    },
                    {
                        # Gentle recency decay centered on 2020.
                        "exp": {
                            "pubdate": {
                                "origin": 2020,
                                "scale": 50,
                                "offset": 3,
                                "decay": 0.8
                            }
                        }
                    }
                ]
            }
        },
        "min_score": min_score,
        "size": limit
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment