Skip to content

Instantly share code, notes, and snippets.

@pianotaiq
Last active November 21, 2018 02:13
Show Gist options
  • Save pianotaiq/6b81f18ccee24fc8f5d85d5b0e8844f9 to your computer and use it in GitHub Desktop.
Save pianotaiq/6b81f18ccee24fc8f5d85d5b0e8844f9 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# コード例1:Trie木の例(書きかけ)\n",
"class Node:\n",
" def __init__(self, left=-1, right=-1, ch='', word_id=-1):\n",
" self.left = left\n",
" self.right = right\n",
" self.ch = ch\n",
" self.word_id = word_id\n",
"\n",
"class Trie:\n",
" def __init__(self):\n",
" self.data = dict()\n",
" self.data[0] = Node()\n",
" \n",
" # その他、登録メソッドとか検索メソッドとか..."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# コード例2:Wikipediaタイトルを加工して見出し語リストを作る\n",
"#   元ファイルはこちら:https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-all-titles-in-ns0.gz\n",
"import gzip\n",
"with gzip.open('jawiki-latest-all-titles-in-ns0.gz', 'r') as f:\n",
" with open('words.txt', 'w', encoding='utf-8') as fout:\n",
" for ln in f:\n",
" w = ln.decode('utf-8').strip()\n",
" if len(w) >= 2:\n",
" print(w, file=fout)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"len(words): 1806416\n",
"len(cs): 8106\n",
"len(db): 7469426\n",
"0 (1) 1000000 (1030923) 2000000 (2059003) 3000000 (3090354) 4000000 (4121030) 5000000 (5154989) 6000000 (6191933) 7000000 (7228090) \n",
"len(base), len(check): 7715231 7715231\n",
"max(base), max(check): 7721348 7721348\n",
"75.93 s\n"
]
}
],
"source": [
"# コード例3:簡単にdouble arrayを作ってみたい\n",
"import sys\n",
"from collections import defaultdict\n",
"import random\n",
"import time\n",
"\n",
"def get_branch_db(words):\n",
" '''\n",
" 分岐データベースの準備\n",
" (先頭からの部分文字列の文字IDのtuple => 次に来る文字の文字IDのset)\n",
" 同時に文字テーブルとその逆引きも作っておく\n",
" '''\n",
" db = defaultdict(set) # 分岐データベース\n",
" db_keys = [] # ordered defaultdictにしたいのでこのようにする\n",
" db_keys_set = set() # db_keysの作成補助\n",
" cs = [''] # 文字テーブル\n",
" c2code = {'':0} # 文字テーブル逆引き辞書\n",
" for word in words[1:]: # 先頭は空文字列なので省く\n",
" for i in range(len(word)+1):\n",
" pre = word[:i]\n",
" c = word[i] if i < len(word) else ''\n",
" if c in c2code:\n",
" code = c2code[c]\n",
" else:\n",
" cs.append(c)\n",
" code = len(c2code)\n",
" c2code[c] = code\n",
" pre_codes = tuple(c2code[c] for c in pre)\n",
" db[pre_codes].add(code)\n",
" if not pre_codes in db_keys_set:\n",
" db_keys.append(pre_codes)\n",
" db_keys_set.add(pre_codes)\n",
" return dict(db), db_keys, cs, c2code\n",
"\n",
"def get_base_s(s, v, check, slot_start=1):\n",
" '''\n",
" 現在のノード位置:s, 次の文字IDのset:v, check配列:checkを入力して、\n",
" 空きスロットを左から探してbase[s]を求める\n",
" '''\n",
" min_v = min(v)\n",
" offsets = [vn - min_v for vn in v]\n",
" # checkの空きを見つける\n",
" p = slot_start # 検索開始位置\n",
" while True:\n",
" found = True\n",
" for offset in offsets:\n",
" if p + offset in check:\n",
" found = False\n",
" break\n",
" if found:\n",
" break\n",
" p += 1\n",
" base_s = p - min_v\n",
" return base_s\n",
"\n",
"def update_slot_start(check, slot_start):\n",
" '''\n",
" darrayの空きスロット左端の更新\n",
" '''\n",
" p = slot_start\n",
" while True:\n",
" if p in check:\n",
" p += 1\n",
" else:\n",
" break\n",
" return p\n",
"\n",
"def make_darray(base, check, db, db_keys, wdict):\n",
" '''\n",
" darrayを作成\n",
" '''\n",
" nd_inds = dict() # 遷移元データベース(部分文字列tuple => 末尾ノード位置)\n",
" slot_start = 1\n",
" for i, k in enumerate(db_keys):\n",
" if i % 1000000 == 0:\n",
" print(i, end=' ')\n",
" print('(' + str(slot_start) + ') ', end='')\n",
" v = db[k] \n",
" slot_start = update_slot_start(check, slot_start)\n",
" \n",
" if len(k) == 0: # ルート\n",
" s = 1\n",
" for c in v:\n",
" t = base[s] + c\n",
" check[t] = s\n",
" nd_inds[() + (c,)] = t # 遷移先を登録\n",
" else: # ルート以外\n",
" s = nd_inds[k] # ノードsに移動\n",
" #pv(s)\n",
" if v == {0}: # 分岐しない葉ノードは圧縮\n",
" base[s] = -wdict[k]\n",
" else:\n",
" # checkの空きスロットからbase[s]の値を求める\n",
" base[s] = get_base_s(s, v, check, slot_start)\n",
" for c in v:\n",
" t = base[s] + c\n",
" check[t] = s\n",
" if c == 0: # 分岐のある葉ノード\n",
" base[t] = -wdict[k]\n",
" else:\n",
" nd_inds[k + (c,)] = t # 遷移先を登録\n",
" del(nd_inds[k]) # 使い終わったら消す(増えるので)\n",
" print() \n",
" \n",
"def save_darray(base, check, cs):\n",
" '''\n",
" darrayをファイルに保存\n",
" '''\n",
" with open('darray.base', 'wb') as f:\n",
" ind_max = max(base.keys())\n",
" for i in range(0, ind_max + 1):\n",
" num = base[i] if i in base else 0\n",
" f.write(num.to_bytes(3, byteorder='little', signed=True))\n",
" \n",
" with open('darray.check', 'wb') as f:\n",
" ind_max = max(check.keys())\n",
" for i in range(0, ind_max + 1):\n",
" num = check[i] if i in check else 0\n",
" f.write(num.to_bytes(3, byteorder='little'))\n",
" \n",
" with open('darray.code', 'w', encoding='utf-8') as f:\n",
" print(*cs, sep='', file=f)\n",
"\n",
"def main():\n",
" t1 = time.time()\n",
"\n",
" # 見出し語リスト\n",
" words = []\n",
" if True:\n",
" with open('words.txt', 'r', encoding='utf-8') as f:\n",
" for ln in f:\n",
" words.append(ln.strip())\n",
" else:\n",
" words = ['でん', 'どこ', 'どん', 'どんちゃん' ,'どんどん', 'どんべぇ']\n",
" words = [''] + words # 先頭に空文字列を追加(必須)\n",
" print('len(words):', len(words))\n",
" \n",
" # 分岐データベースを作成\n",
" # ついでに、文字テーブルとその逆引きも、この関数内で作っておく\n",
" db, db_keys, cs, c2code = get_branch_db(words)\n",
" print('len(cs):', len(cs))\n",
" print('len(db):', len(db))\n",
" \n",
" # 単語辞書を作成(見出し語 => 単語ID)\n",
" # ただし見出し語は文字コードのtupleに変換\n",
" wdict = {tuple(c2code[c] for c in words[i]):i for i in range(len(words))}\n",
"\n",
" # base配列とcheck配列をdictで表現\n",
" # ルートを登録\n",
" base = {1:1}\n",
" check = {1:0}\n",
" \n",
" make_darray(base, check, db, db_keys, wdict)\n",
" print('len(base), len(check):', len(base), len(check))\n",
" print('max(base), max(check):', max(base), max(check))\n",
"\n",
" # ファイル書き出し\n",
" save_darray(base, check, cs)\n",
"\n",
" t2 = time.time()\n",
" print(int((t2 - t1) * 100 + 0.5) / 100, 's')\n",
"\n",
"if __name__ == '__main__':\n",
" main()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# コード例4:double arrayの利用クラス DArray\n",
"class DArray:\n",
" def __init__(self):\n",
" with open('darray.base', 'rb') as f:\n",
" self._base = f.read()\n",
" with open('darray.check', 'rb') as f:\n",
" self._check = f.read()\n",
" with open('darray.code', 'r', encoding='utf-8') as f:\n",
" self._code = f.read()\n",
" self.c2code = {'':0}\n",
" for c in self._code:\n",
" self.c2code[c] = len(self.c2code)\n",
"\n",
" def base(self, p):\n",
" base_p = int.from_bytes(self._base[3*p:3*(p+1)], byteorder='little', signed=True)\n",
" return base_p\n",
"\n",
" def check(self, p):\n",
" check_p = int.from_bytes(self._check[3*p:3*(p+1)], byteorder='little')\n",
" return check_p\n",
"\n",
" def search_gen(self, key):\n",
" s = 1\n",
" for i, c in enumerate(key):\n",
" if not c in self.c2code:\n",
" return\n",
" code = self.c2code[c]\n",
" t = self.base(s) + code\n",
" if self.check(t) == s:\n",
" if self.base(t) < 0:\n",
" yield i, -self.base(t)\n",
" else:\n",
" t2 = self.base(t)\n",
" if self.base(t2) < 0 and self.check(t2) == t:\n",
" yield i, -self.base(t2)\n",
" s = t\n",
" next\n",
" else:\n",
" return"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"#\tword\tword_id\tstart\tend\tline_num\n",
"0\tヨー\t831624\t0\t1\t0\n",
"1\tヨーロッパ\t832167\t0\t4\t0\n",
"2\tヨーロッパにおける政教分離の歴史\t832181\t0\t15\t0\n",
"3\tロッパ\t878705\t2\t4\t0\n",
"4\tにお\t231924\t5\t6\t0\n",
"5\tおけ\t207929\t6\t7\t0\n",
"6\t政教分離\t1303794\t9\t12\t0\n",
"7\t政教分離の歴史\t1303797\t9\t15\t0\n",
"8\t分離\t1020920\t11\t12\t0\n",
"9\t歴史\t1432124\t14\t15\t0\n",
"\n",
"84972 hits & 10081 unique words\n",
"1.91 s\n"
]
}
],
"source": [
"# コード例5:DArrayを使ってテキスト検索する\n",
"import time\n",
"\n",
"def readline_gen(fn):\n",
" '''\n",
" (1)ファイルの行を読む\n",
" '''\n",
" with open(fn, 'r', encoding='utf-8') as f:\n",
" for lnum, ln in enumerate(f):\n",
" yield ln.rstrip(), lnum\n",
" \n",
"def get_str2scan_gen(rl_gen):\n",
" '''\n",
" (2)その行の文字列から各文字列位置を先頭とする部分文字列を作る\n",
" '''\n",
" for ln, lnum in rl_gen:\n",
" for start in range(len(ln)):\n",
" yield ln[start:], start, lnum\n",
" \n",
"def search_incr_gen(gs_gen, da):\n",
" '''\n",
" (3)それぞれの部分文字列についてインクリメンタル検索をする\n",
" '''\n",
" for s, start, lnum in gs_gen:\n",
" for end, wid in da.search_gen(s):\n",
" yield (s[:end+1], wid, start, start + end, lnum)\n",
" \n",
"def main():\n",
" # 長いページはここから => https://ja.wikipedia.org/wiki/特別:長いページ\n",
" fn = 'church_state.txt'\n",
" \n",
" t1 = time.time()\n",
" da = DArray()\n",
" # 多段generator\n",
" gen1 = readline_gen(fn)\n",
" gen2 = get_str2scan_gen(gen1)\n",
" gen3 = search_incr_gen(gen2, da)\n",
" print('#', 'word', 'word_id', 'start', 'end', 'line_num', sep='\\t')\n",
" ws = set()\n",
" for i, (word, wid, start, end, lnum) in enumerate(gen3):\n",
" ws.add(word)\n",
" if i < 10: # 先頭のいくつかだけ表示してみる\n",
" print(i, word, wid, start, end, lnum, sep='\\t')\n",
" print()\n",
" print(i + 1, 'hits &', len(ws), 'unique words' )\n",
"\n",
" t2 = time.time()\n",
" print(int((t2 - t1) * 100 + 0.5) / 100, 's')\n",
"\n",
"if __name__ == '__main__':\n",
" main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment