Skip to content

Instantly share code, notes, and snippets.

View mocobeta's full-sized avatar

Tomoko Uchida mocobeta

View GitHub Profile
@mocobeta
mocobeta / build.py
Last active August 29, 2015 14:17
Rough implementation of Minimal Acyclic Subsequential Transducers
import time
from fst import *
from struct import pack
def build_dict(enc, outfile, *files):
entries = {}
for file in files:
with open(file, encoding=enc) as f:
for line in f:
line = line.rstrip()
@mocobeta
mocobeta / HelloKuromoji.java
Last active December 21, 2020 05:33
Hello Lucene! (5.0)
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@mocobeta
mocobeta / wikidata2pgsql.py
Created December 13, 2014 15:02
Python script for importing Wikipedia XML dump data into PostgreSQL
# -*- coding: utf-8 -*-
import psycopg2
import xml.sax
from xml.sax.handler import ContentHandler
from dicttoxml import dicttoxml
INSERT_STMT = "INSERT INTO pages (id, page) VALUES('%s', '%s')"
COMMIT_WINDOW = 10000
@mocobeta
mocobeta / SpatialIndexSample.java
Last active August 29, 2015 14:07
(Lucene) Spatial search のサンプル
package indexer;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
@mocobeta
mocobeta / FastVectorHighlighterSample.java
Last active August 29, 2015 14:06
(Lucene) Highlighter のサンプル
package higlighter;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
@mocobeta
mocobeta / sharded_set.py
Created May 5, 2014 03:03
Redis : SET のシャーディング
import binascii
import random
TOTAL_ELEMENTS = 2000
SHARD_SIZE = 512
def shard_key(base, key, total_elements, shard_size):
"""
シャードキーを計算する関数
「Redis入門」リスト9-7 から引用
@mocobeta
mocobeta / multi_taskqueue.py
Last active August 29, 2015 14:00
Redis を使ったタスクキューのデモ (http://mocobeta-backup.tumblr.com/post/83910499938/redis-2)
import redis
import json
import time
import datetime
import uuid
import threading
import argparse
def provider(conn, name, total_tasks, queue, sleep=0.1):
""" Provider """
@mocobeta
mocobeta / ed.py
Last active June 10, 2021 00:16
途中結果を表示しながら編集距離(レーベンシュタイン距離)を計算する。
#-*- coding: utf-8 -*-
def ed(s1, s2, detail=False):
u""" s1, s2 の編集距離を計算する. ※置換のコストは 1 """
len_s1 = len(s1)
len_s2 = len(s2)
# initialize
m = [[0 for i in range(len_s2 + 1)] for j in range(len_s1 + 1)]
for i in range(1, (len_s1+1)):
@mocobeta
mocobeta / detect_update.py
Last active July 2, 2016 12:07
SQLite のレプリケーション
import sqlite3
import time
import traceback
conn_meta = sqlite3.connect('/db/meta.db')
# get current db file and connect
cur = conn_meta.cursor()
cur.execute('SELECT path FROM db_file WHERE status = 0 ORDER BY ver DESC')
row = cur.fetchone()
current_db = row[0] if row else None
@mocobeta
mocobeta / get_ranking_data.py
Last active December 19, 2015 17:48
SQLite vs Redis ソーティング速度比較
import sqlite3
import redis
import sys
from timeit import Timer
loop = int(sys.argv[1])
def ranking_sqlite(conn, key):
cur = conn.cursor()
cur.execute("SELECT val FROM ranking WHERE key = '%s' ORDER BY score DESC limit 10" % key)