Skip to content

Instantly share code, notes, and snippets.

@hideaki-t
Created July 28, 2012 04:11
Show Gist options
  • Save hideaki-t/3191764 to your computer and use it in GitHub Desktop.
Save hideaki-t/3191764 to your computer and use it in GitHub Desktop.
Comparing Whoosh index size by indexing method (tokenizer embedded in the schema vs. text tokenized before indexing)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from whoosh import query
from whoosh.fields import *
from whoosh.filedb.filestore import RamStorage
import whooshjp
from whooshjp.IgoTokenizer import IgoTokenizer
import igo.Tagger
# Sample sentence that both index variants below will store and search.
c = 'こんにちは世界'

# Tokenizer shared by both variants. 'ipadic' is handed straight to the igo
# Tagger (presumably the dictionary location -- confirm against igo-python).
tk = IgoTokenizer(igo.Tagger.Tagger('ipadic'))

# "emb" variant: the tokenizer is set as the field's analyzer in the schema.
st_emb = RamStorage()
scm_emb = Schema(
    c=TEXT(analyzer=tk, stored=True),
)

# "ext" variant: the field is declared without an analyzer argument; the
# script tokenizes the text itself before handing it to the writer.
st_ext = RamStorage()
scm_ext = Schema(
    c=TEXT(stored=True),
)
def check(storage):
    """Search the index held in ``storage`` and report the storage's files.

    Opens the index from ``storage``, prints the result of searching field
    "c" for the term "世界", then prints one line per file in the storage
    in the form "<length> <name>".

    :param storage: object providing ``open_index()``, ``list()`` and
        ``file_length(name)`` (the script passes ``RamStorage`` instances).
    """
    ix = storage.open_index()
    with ix.searcher() as s:
        # The search and the print both happen while the searcher is open.
        # A single parenthesised argument prints identically on Python 2
        # and also parses on Python 3.
        print(s.search(query.Term("c", "世界")))
    for f in storage.list():
        # Same text as the two-item print statement: values joined by a space.
        print("%s %s" % (storage.file_length(f), f))
# Variant 1: the schema carries the tokenizer, so the raw string is passed
# and the field's analyzer tokenizes it at indexing time.
with st_emb.create_index(scm_emb).writer() as w:
    w.add_document(c=c)
check(st_emb)

# Variant 2: tokenize up front and index the list of token texts.
# Whoosh's stored-value override keyword is "_stored_<fieldname>"; the field
# here is named "c", so the override must be "_stored_c". The previous
# "_stored_content" matched no field, so the token list was stored instead
# of the original string.
with st_ext.create_index(scm_ext).writer() as w:
    w.add_document(c=[token.text for token in tk(c)], _stored_c=c)
check(st_ext)
<Top 1 Results for Term(u'c', u'\u4e16\u754c') runtime=0.00155806541443>
3775 MAIN_qe05n6qbhe6w.seg
41186237 _MAIN_1.toc
<Top 1 Results for Term(u'c', u'\u4e16\u754c') runtime=0.00134181976318>
3788 MAIN_oay2jj05pa3z.seg
996 _MAIN_1.toc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment