Skip to content

Instantly share code, notes, and snippets.

View mocobeta's full-sized avatar

Tomoko Uchida mocobeta

View GitHub Profile
@mocobeta
mocobeta / 01_preprocess-jp.py
Created October 10, 2017 15:45
Japanese keyphrase extraction by pke
import MeCab
import sys
raw_text_file = sys.argv[1]
preprocessed_file = sys.argv[2]
# preprocess for pke
m = MeCab.Tagger()
with open(raw_text_file) as src:
with open(preprocessed_file, 'w') as dst:
{
"ア": ["a"], "イ": ["i"], "ウ": ["u"], "エ": ["e"], "オ": ["o"],
"カ": ["ka"], "キ": ["ki"], "ク": ["ku"], "ケ": ["ke"], "コ": ["ko"], "キャ": ["kya"], "キュ": ["kyu"], "キョ": ["kyo"],
"ガ": ["ga"], "ギ": ["gi"], "グ": ["gu"], "ゲ": ["ge"], "ゴ": ["go"], "ギャ": ["gya"], "ギュ": ["gyu"], "ギョ": ["gyo"],
"サ": ["sa"], "シ": ["si", "shi"], "ス": ["su"], "セ": ["se"], "ソ": ["so"], "シャ": ["sya", "sha"], "シュ": ["syu", "shu"], "シェ": ["sye", "she"], "ショ": ["syo", "sho"],
"ザ": ["za"], "ジ": ["zi", "ji"], "ズ": ["zu"], "ゼ": ["ze"], "ゾ": ["zo"], "ジャ": ["zya", "ja"], "ジュ": ["zyu", "ju"], "ジェ": ["zye", "je"], "ジョ": ["zyo", "jo"],
"タ": ["ta"], "チ": ["ti", "chi"], "ツ": ["tu", "tsu"], "テ": ["te"], "ト": ["to"], "チャ": ["tya", "cha", "cya"], "チュ": ["tyu", "chu", "cyu"], "チョ": ["tyo", "cho", "cyo"],
"ダ": ["da"], "ヂ": ["di", "zi", "ji"], "ヅ": ["du", "zu"], "デ": ["de"], "ド": ["do"], "ヂャ": ["dya", "zya", "ja"], "ヂュ": ["dyu", "zyu", "ju"], "ヂョ": ["dyo", "zyo", "jo"],
"ナ": ["na"], "ニ": ["ni"], "ヌ": ["nu"], "ネ": ["ne"], "ノ": ["no"], "ニャ": ["nya"]
@mocobeta
mocobeta / Dockerfile-0.1.0
Last active May 25, 2022 16:39
Configuring Elasticsearch 7 cluster on GKE
# Elasticsearch 7.7.0 image with the analysis-kuromoji plugin pre-installed.
FROM docker.elastic.co/elasticsearch/elasticsearch:7.7.0
# Prepend the Elasticsearch bin directory to PATH so its CLI tools can be
# invoked by bare name (presumably where elasticsearch-plugin, used below, lives).
# Written in the `ENV key=value` form; the space-separated `ENV key value`
# form is legacy syntax.
ENV PATH=/usr/share/elasticsearch/bin:$PATH
# switch user to elasticsearch
USER elasticsearch
# install plugins
RUN elasticsearch-plugin install analysis-kuromoji
@mocobeta
mocobeta / checks.sh
Last active December 18, 2021 04:28
sanity checks on jms2
# check
lucene $ ./gradlew check
BUILD SUCCESSFUL in 3m 59s
# packaging
lucene $ ./gradlew clean
lucene $ ./gradlew assembleRelease
BUILD SUCCESSFUL in 49s
# luke
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.VectorField;
import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnGraphQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
@mocobeta
mocobeta / ed.py
Last active June 10, 2021 00:16
途中結果を表示しながら編集距離(レーベンシュタイン距離)を計算する。
#-*- coding: utf-8 -*-
def ed(s1, s2, detail=False):
u""" s1, s2 の編集距離を計算する. ※置換のコストは 1 """
len_s1 = len(s1)
len_s2 = len(s2)
# initialize
m = [[0 for i in range(len_s2 + 1)] for j in range(len_s1 + 1)]
for i in range(1, (len_s1+1)):
@mocobeta
mocobeta / VectorValuesFieldExample.java
Last active April 3, 2021 07:11
PoC example for approximate vector search for Lucene
/**
* PoC Indexing/Querying example code for LUCENE-9004
* @see https://github.com/mocobeta/lucene-solr-mirror/tree/jira/LUCENE-9004-aknn
*/
public class VectorValuesFieldExample {
public static void main(String[] args) {
String indexDir = "/tmp/vector-search";
String vectorField = "vector";
int maxDoc = 100_000;
@mocobeta
mocobeta / HelloKuromoji.java
Last active December 21, 2020 05:33
Hello Lucene! (5.0)
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@State(Scope.Benchmark)
public class SearchBenchmark {
private static final String dirPath = System.getProperty("index.dir");
private static final String[] terms1 = new String[]{"電車", "列車", "鉄道"};
private Directory dir;
private IndexReader reader;
private Query query1;
@mocobeta
mocobeta / main.rs
Last active December 13, 2020 06:52
compression algorithms performance comparison
use rand::prelude::*;
fn main() {
let mut rng = thread_rng();
let p: f32 = 0.00001;
let max_doc: usize = 1_000_000;
let mut postings: Vec<usize> = vec![rng.gen_range(1, 1000) as usize];
loop {
let next = postings.last().unwrap() + geo_random(p);
if next > max_doc {