This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import MeCab | |
import sys | |
raw_text_file = sys.argv[1] | |
preprocessed_file = sys.argv[2] | |
# preprocess for pke | |
m = MeCab.Tagger() | |
with open(raw_text_file) as src: | |
with open(preprocessed_file, 'w') as dst: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"ア": ["a"], "イ": ["i"], "ウ": ["u"], "エ": ["e"], "オ": ["o"], | |
"カ": ["ka"], "キ": ["ki"], "ク": ["ku"], "ケ": ["ke"], "コ": ["ko"], "キャ": ["kya"], "キュ": ["kyu"], "キョ": ["kyo"], | |
"ガ": ["ga"], "ギ": ["gi"], "グ": ["gu"], "ゲ": ["ge"], "ゴ": ["go"], "ギャ": ["gya"], "ギュ": ["gyu"], "ギョ": ["gyo"], | |
"サ": ["sa"], "シ": ["si", "shi"], "ス": ["su"], "セ": ["se"], "ソ": ["so"], "シャ": ["sya", "sha"], "シュ": ["syu", "shu"], "シェ": ["sye", "she"], "ショ": ["syo", "sho"], | |
"ザ": ["za"], "ジ": ["zi", "ji"], "ズ": ["zu"], "ゼ": ["ze"], "ゾ": ["zo"], "ジャ": ["zya", "ja"], "ジュ": ["zyu", "ju"], "ジェ": ["zye", "je"], "ジョ": ["zyo", "jo"], | |
"タ": ["ta"], "チ": ["ti", "chi"], "ツ": ["tu", "tsu"], "テ": ["te"], "ト": ["to"], "チャ": ["tya", "cha", "cya"], "チュ": ["tyu", "chu", "cyu"], "チョ": ["tyo", "cho", "cyo"], | |
"ダ": ["da"], "ヂ": ["di", "zi", "ji"], "ヅ": ["du", "zu"], "デ": ["de"], "ド": ["do"], "ヂャ": ["dya", "zya", "ja"], "ヂュ": ["dyu", "zyu", "ju"], "ヂョ": ["dyo", "zyo", "jo"], | |
"ナ": ["na"], "ニ": ["ni"], "ヌ": ["nu"], "ネ": ["ne"], "ノ": ["no"], "ニャ": ["nya"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
FROM docker.elastic.co/elasticsearch/elasticsearch:7.7.0 | |
ENV PATH /usr/share/elasticsearch/bin:$PATH | |
# switch user to elasticsearch | |
USER elasticsearch | |
# install plugins | |
RUN elasticsearch-plugin install analysis-kuromoji |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# check | |
lucene $ ./gradlew check | |
BUILD SUCCESSFUL in 3m 59s | |
# packaging | |
lucene $ ./gradlew clean | |
lucene $ ./gradlew assembleRelease | |
BUILD SUCCESSFUL in 49s | |
# luke |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.document.StringField; | |
import org.apache.lucene.document.VectorField; | |
import org.apache.lucene.index.*; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.KnnGraphQuery; | |
import org.apache.lucene.search.ScoreDoc; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.store.Directory; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: utf-8 -*- | |
def ed(s1, s2, detail=False): | |
u""" s1, s2 の編集距離を計算する. ※置換のコストは 1 """ | |
len_s1 = len(s1) | |
len_s2 = len(s2) | |
# initialize | |
m = [[0 for i in range(len_s2 + 1)] for j in range(len_s1 + 1)] | |
for i in range(1, (len_s1+1)): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* PoC Indexing/Querying example code for LUCENE-9004 | |
* @see https://github.com/mocobeta/lucene-solr-mirror/tree/jira/LUCENE-9004-aknn | |
*/ | |
public class VectorValuesFieldExample { | |
public static void main(String[] args) { | |
String indexDir = "/tmp/vector-search"; | |
String vectorField = "vector"; | |
int maxDoc = 100_000; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.ja.JapaneseAnalyzer; | |
import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute; | |
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute; | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | |
import org.apache.lucene.analysis.util.CharArraySet; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@State(Scope.Benchmark) | |
public class SearchBenchmark { | |
private static final String dirPath = System.getProperty("index.dir"); | |
private static final String[] terms1 = new String[]{"電車", "列車", "鉄道"}; | |
private Directory dir; | |
private IndexReader reader; | |
private Query query1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use rand::prelude::*; | |
fn main() { | |
let mut rng = thread_rng(); | |
let p: f32 = 0.00001; | |
let max_doc: usize = 1_000_000; | |
let mut postings: Vec<usize> = vec![rng.gen_range(1, 1000) as usize]; | |
loop { | |
let next = postings.last().unwrap() + geo_random(p); | |
if next > max_doc { |
NewerOlder