Last active
November 21, 2018 02:13
-
-
Save pianotaiq/6b81f18ccee24fc8f5d85d5b0e8844f9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# コード例1:Trie木の例(書きかけ)\n", | |
"class Node:\n", | |
" def __init__(self, left=-1, right=-1, ch='', word_id=-1):\n", | |
" self.left = left\n", | |
" self.right = right\n", | |
" self.ch = ch\n", | |
" self.word_id = word_id\n", | |
"\n", | |
"class Trie:\n", | |
" def __init__(self):\n", | |
" self.data = dict()\n", | |
" self.data[0] = Node()\n", | |
" \n", | |
" # その他、登録メソッドとか検索メソッドとか..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# コード例2:Wikipediaタイトルを加工して見出し語リストを作る\n", | |
"# 元ファイルはこちら:https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-all-titles-in-ns0.gz\n", | |
"import gzip\n", | |
"with gzip.open('jawiki-latest-all-titles-in-ns0.gz', 'r') as f:\n", | |
" with open('words.txt', 'w', encoding='utf-8') as fout:\n", | |
" for ln in f:\n", | |
" w = ln.decode('utf-8').strip()\n", | |
" if len(w) >= 2:\n", | |
" print(w, file=fout)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"len(words): 1806416\n", | |
"len(cs): 8106\n", | |
"len(db): 7469426\n", | |
"0 (1) 1000000 (1030923) 2000000 (2059003) 3000000 (3090354) 4000000 (4121030) 5000000 (5154989) 6000000 (6191933) 7000000 (7228090) \n", | |
"len(base), len(check): 7715231 7715231\n", | |
"max(base), max(check): 7721348 7721348\n", | |
"75.93 s\n" | |
] | |
} | |
], | |
"source": [ | |
"# コード例3:簡単にdouble arrayを作ってみたい\n", | |
"import sys\n", | |
"from collections import defaultdict\n", | |
"import random\n", | |
"import time\n", | |
"\n", | |
"def get_branch_db(words):\n", | |
" '''\n", | |
" 分岐データベースの準備\n", | |
" (先頭からの部分文字列の文字IDのtuple => 次に来る文字の文字IDのset)\n", | |
" 同時に文字テーブルとその逆引きも作っておく\n", | |
" '''\n", | |
" db = defaultdict(set) # 分岐データベース\n", | |
" db_keys = [] # ordered defaultdictにしたいのでこのようにする\n", | |
" db_keys_set = set() # db_keysの作成補助\n", | |
" cs = [''] # 文字テーブル\n", | |
" c2code = {'':0} # 文字テーブル逆引き辞書\n", | |
" for word in words[1:]: # 先頭は空文字列なので省く\n", | |
" for i in range(len(word)+1):\n", | |
" pre = word[:i]\n", | |
" c = word[i] if i < len(word) else ''\n", | |
" if c in c2code:\n", | |
" code = c2code[c]\n", | |
" else:\n", | |
" cs.append(c)\n", | |
" code = len(c2code)\n", | |
" c2code[c] = code\n", | |
" pre_codes = tuple(c2code[c] for c in pre)\n", | |
" db[pre_codes].add(code)\n", | |
" if not pre_codes in db_keys_set:\n", | |
" db_keys.append(pre_codes)\n", | |
" db_keys_set.add(pre_codes)\n", | |
" return dict(db), db_keys, cs, c2code\n", | |
"\n", | |
"def get_base_s(s, v, check, slot_start=1):\n", | |
" '''\n", | |
" 現在のノード位置:s, 次の文字IDのset:v, check配列:checkを入力して、\n", | |
" 空きスロットを左から探してbase[s]を求める\n", | |
" '''\n", | |
" min_v = min(v)\n", | |
" offsets = [vn - min_v for vn in v]\n", | |
" # checkの空きを見つける\n", | |
" p = slot_start # 検索開始位置\n", | |
" while True:\n", | |
" found = True\n", | |
" for offset in offsets:\n", | |
" if p + offset in check:\n", | |
" found = False\n", | |
" break\n", | |
" if found:\n", | |
" break\n", | |
" p += 1\n", | |
" base_s = p - min_v\n", | |
" return base_s\n", | |
"\n", | |
"def update_slot_start(check, slot_start):\n", | |
" '''\n", | |
" darrayの空きスロット左端の更新\n", | |
" '''\n", | |
" p = slot_start\n", | |
" while True:\n", | |
" if p in check:\n", | |
" p += 1\n", | |
" else:\n", | |
" break\n", | |
" return p\n", | |
"\n", | |
"def make_darray(base, check, db, db_keys, wdict):\n", | |
" '''\n", | |
" darrayを作成\n", | |
" '''\n", | |
" nd_inds = dict() # 遷移元データベース(部分文字列tuple => 末尾ノード位置)\n", | |
" slot_start = 1\n", | |
" for i, k in enumerate(db_keys):\n", | |
" if i % 1000000 == 0:\n", | |
" print(i, end=' ')\n", | |
" print('(' + str(slot_start) + ') ', end='')\n", | |
" v = db[k] \n", | |
" slot_start = update_slot_start(check, slot_start)\n", | |
" \n", | |
" if len(k) == 0: # ルート\n", | |
" s = 1\n", | |
" for c in v:\n", | |
" t = base[s] + c\n", | |
" check[t] = s\n", | |
" nd_inds[() + (c,)] = t # 遷移先を登録\n", | |
" else: # ルート以外\n", | |
" s = nd_inds[k] # ノードsに移動\n", | |
" #pv(s)\n", | |
" if v == {0}: # 分岐しない葉ノードは圧縮\n", | |
" base[s] = -wdict[k]\n", | |
" else:\n", | |
" # checkの空きスロットからbase[s]の値を求める\n", | |
" base[s] = get_base_s(s, v, check, slot_start)\n", | |
" for c in v:\n", | |
" t = base[s] + c\n", | |
" check[t] = s\n", | |
" if c == 0: # 分岐のある葉ノード\n", | |
" base[t] = -wdict[k]\n", | |
" else:\n", | |
" nd_inds[k + (c,)] = t # 遷移先を登録\n", | |
" del(nd_inds[k]) # 使い終わったら消す(増えるので)\n", | |
" print() \n", | |
" \n", | |
"def save_darray(base, check, cs):\n", | |
" '''\n", | |
" darrayをファイルに保存\n", | |
" '''\n", | |
" with open('darray.base', 'wb') as f:\n", | |
" ind_max = max(base.keys())\n", | |
" for i in range(0, ind_max + 1):\n", | |
" num = base[i] if i in base else 0\n", | |
" f.write(num.to_bytes(3, byteorder='little', signed=True))\n", | |
" \n", | |
" with open('darray.check', 'wb') as f:\n", | |
" ind_max = max(check.keys())\n", | |
" for i in range(0, ind_max + 1):\n", | |
" num = check[i] if i in check else 0\n", | |
" f.write(num.to_bytes(3, byteorder='little'))\n", | |
" \n", | |
" with open('darray.code', 'w', encoding='utf-8') as f:\n", | |
" print(*cs, sep='', file=f)\n", | |
"\n", | |
"def main():\n", | |
" t1 = time.time()\n", | |
"\n", | |
" # 見出し語リスト\n", | |
" words = []\n", | |
" if True:\n", | |
" with open('words.txt', 'r', encoding='utf-8') as f:\n", | |
" for ln in f:\n", | |
" words.append(ln.strip())\n", | |
" else:\n", | |
" words = ['でん', 'どこ', 'どん', 'どんちゃん' ,'どんどん', 'どんべぇ']\n", | |
" words = [''] + words # 先頭に空文字列を追加(必須)\n", | |
" print('len(words):', len(words))\n", | |
" \n", | |
" # 分岐データベースを作成\n", | |
" # ついでに、文字テーブルとその逆引きも、この関数内で作っておく\n", | |
" db, db_keys, cs, c2code = get_branch_db(words)\n", | |
" print('len(cs):', len(cs))\n", | |
" print('len(db):', len(db))\n", | |
" \n", | |
" # 単語辞書を作成(見出し語 => 単語ID)\n", | |
" # ただし見出し語は文字コードのtupleに変換\n", | |
" wdict = {tuple(c2code[c] for c in words[i]):i for i in range(len(words))}\n", | |
"\n", | |
" # base配列とcheck配列をdictで表現\n", | |
" # ルートを登録\n", | |
" base = {1:1}\n", | |
" check = {1:0}\n", | |
" \n", | |
" make_darray(base, check, db, db_keys, wdict)\n", | |
" print('len(base), len(check):', len(base), len(check))\n", | |
" print('max(base), max(check):', max(base), max(check))\n", | |
"\n", | |
" # ファイル書き出し\n", | |
" save_darray(base, check, cs)\n", | |
"\n", | |
" t2 = time.time()\n", | |
" print(int((t2 - t1) * 100 + 0.5) / 100, 's')\n", | |
"\n", | |
"if __name__ == '__main__':\n", | |
" main()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# コード例4:double arrayの利用クラス DArray\n", | |
"class DArray:\n", | |
" def __init__(self):\n", | |
" with open('darray.base', 'rb') as f:\n", | |
" self._base = f.read()\n", | |
" with open('darray.check', 'rb') as f:\n", | |
" self._check = f.read()\n", | |
" with open('darray.code', 'r', encoding='utf-8') as f:\n", | |
" self._code = f.read()\n", | |
" self.c2code = {'':0}\n", | |
" for c in self._code:\n", | |
" self.c2code[c] = len(self.c2code)\n", | |
"\n", | |
" def base(self, p):\n", | |
" base_p = int.from_bytes(self._base[3*p:3*(p+1)], byteorder='little', signed=True)\n", | |
" return base_p\n", | |
"\n", | |
" def check(self, p):\n", | |
" check_p = int.from_bytes(self._check[3*p:3*(p+1)], byteorder='little')\n", | |
" return check_p\n", | |
"\n", | |
" def search_gen(self, key):\n", | |
" s = 1\n", | |
" for i, c in enumerate(key):\n", | |
" if not c in self.c2code:\n", | |
" return\n", | |
" code = self.c2code[c]\n", | |
" t = self.base(s) + code\n", | |
" if self.check(t) == s:\n", | |
" if self.base(t) < 0:\n", | |
" yield i, -self.base(t)\n", | |
" else:\n", | |
" t2 = self.base(t)\n", | |
" if self.base(t2) < 0 and self.check(t2) == t:\n", | |
" yield i, -self.base(t2)\n", | |
" s = t\n", | |
" next\n", | |
" else:\n", | |
" return" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"#\tword\tword_id\tstart\tend\tline_num\n", | |
"0\tヨー\t831624\t0\t1\t0\n", | |
"1\tヨーロッパ\t832167\t0\t4\t0\n", | |
"2\tヨーロッパにおける政教分離の歴史\t832181\t0\t15\t0\n", | |
"3\tロッパ\t878705\t2\t4\t0\n", | |
"4\tにお\t231924\t5\t6\t0\n", | |
"5\tおけ\t207929\t6\t7\t0\n", | |
"6\t政教分離\t1303794\t9\t12\t0\n", | |
"7\t政教分離の歴史\t1303797\t9\t15\t0\n", | |
"8\t分離\t1020920\t11\t12\t0\n", | |
"9\t歴史\t1432124\t14\t15\t0\n", | |
"\n", | |
"84972 hits & 10081 unique words\n", | |
"1.91 s\n" | |
] | |
} | |
], | |
"source": [ | |
"# コード例5:DArrayを使ってテキスト検索する\n", | |
"import time\n", | |
"\n", | |
"def readline_gen(fn):\n", | |
" '''\n", | |
" (1)ファイルの行を読む\n", | |
" '''\n", | |
" with open(fn, 'r', encoding='utf-8') as f:\n", | |
" for lnum, ln in enumerate(f):\n", | |
" yield ln.rstrip(), lnum\n", | |
" \n", | |
"def get_str2scan_gen(rl_gen):\n", | |
" '''\n", | |
" (2)その行の文字列から各文字列位置を先頭とする部分文字列を作る\n", | |
" '''\n", | |
" for ln, lnum in rl_gen:\n", | |
" for start in range(len(ln)):\n", | |
" yield ln[start:], start, lnum\n", | |
" \n", | |
"def search_incr_gen(gs_gen, da):\n", | |
" '''\n", | |
" (3)それぞれの部分文字列についてインクリメンタル検索をする\n", | |
" '''\n", | |
" for s, start, lnum in gs_gen:\n", | |
" for end, wid in da.search_gen(s):\n", | |
" yield (s[:end+1], wid, start, start + end, lnum)\n", | |
" \n", | |
"def main():\n", | |
" # 長いページはここから => https://ja.wikipedia.org/wiki/特別:長いページ\n", | |
" fn = 'church_state.txt'\n", | |
" \n", | |
" t1 = time.time()\n", | |
" da = DArray()\n", | |
" # 多段generator\n", | |
" gen1 = readline_gen(fn)\n", | |
" gen2 = get_str2scan_gen(gen1)\n", | |
" gen3 = search_incr_gen(gen2, da)\n", | |
" print('#', 'word', 'word_id', 'start', 'end', 'line_num', sep='\\t')\n", | |
" ws = set()\n", | |
" for i, (word, wid, start, end, lnum) in enumerate(gen3):\n", | |
" ws.add(word)\n", | |
" if i < 10: # 先頭のいくつかだけ表示してみる\n", | |
" print(i, word, wid, start, end, lnum, sep='\\t')\n", | |
" print()\n", | |
" print(i + 1, 'hits &', len(ws), 'unique words' )\n", | |
"\n", | |
" t2 = time.time()\n", | |
" print(int((t2 - t1) * 100 + 0.5) / 100, 's')\n", | |
"\n", | |
"if __name__ == '__main__':\n", | |
" main()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment