Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Created August 2, 2012 17:38
Show Gist options
  • Save mocobeta/3238964 to your computer and use it in GitHub Desktop.
Save mocobeta/3238964 to your computer and use it in GitHub Desktop.
Solr カスタムanalyzerもどきを作る
schema.xml は schema version 1.5 に合わせて修正された版を、以下から頂きました。
http://johtani.jugem.jp/?eid=44
--text_ja の analyzer の箇所だけを抜粋--
<analyzer type="index">
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ja.txt"/>
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms-ja.txt" ignoreCase="true" expand="true"
tokenizerFactory="solr.JapaneseTokenizerFactory"/>
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="pos-deny.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords-ja.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
-------------------------------------
package test.solrbook;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.JapaneseKatakanaStemFilterFactory;
import org.apache.solr.analysis.JapanesePartOfSpeechStopFilterFactory;
import org.apache.solr.analysis.JapaneseTokenizerFactory;
import org.apache.solr.analysis.LowerCaseFilterFactory;
import org.apache.solr.analysis.MappingCharFilterFactory;
import org.apache.solr.analysis.StopFilterFactory;
import org.apache.solr.analysis.SynonymFilterFactory;
/**
* 「Apache Solr入門」2章
* text_ja型のanalyzerと同等の動きをする(はずの)クラス
* Lucene/Solr 4.0 alphaで動作します。
*/
public class SolrbookAnalyzer {
private ResourceLoader resourceLoader = new MyResourceLoader();
public static void main(String[] args) {
String content = "ソーラーは検索エンジンです。";
SolrbookAnalyzer analyzer = new SolrbookAnalyzer();
analyzer.analyze(new StringReader(content));
}
private void analyze(Reader reader) {
CharStream charStream = charFilter(reader);
TokenStream tokenizer = tokenizer(charStream);
TokenStream stream = filter(tokenizer);
try {
while(stream.incrementToken()) {
CharTermAttribute charAtt = stream.getAttribute(CharTermAttribute.class);
BaseFormAttribute bfAtt = stream.getAttribute(BaseFormAttribute.class);
InflectionAttribute infAtt = stream.getAttribute(InflectionAttribute.class);
ReadingAttribute readAtt = stream.getAttribute(ReadingAttribute.class);
PartOfSpeechAttribute posAtt = stream.getAttribute(PartOfSpeechAttribute.class);
System.out.println(
charAtt + "\t" + // トークン
bfAtt.getBaseForm() + "\t" + // 基本形
infAtt.getInflectionForm() + "\t" + // 活用形
readAtt.getReading() + "\t" + // 読み
readAtt.getPronunciation() + "\t" + // 発音
posAtt.getPartOfSpeech() // 品詞
);
}
} catch (IOException e) {
e.printStackTrace();
}
}
/** charFilter もどき */
private CharStream charFilter(Reader reader) {
// solr.MappingCharFilterFactory
MappingCharFilterFactory factory = new MappingCharFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put("mapping", "mapping-ja.txt");
factory.init(args);
factory.inform(resourceLoader);
CharStream stream = factory.create(CharReader.get(reader));
return stream;
}
/** tokenizer もどき */
private TokenStream tokenizer(CharStream charStream) {
// solr.JapaneseTokenizerFactory
JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
Map<String, String> args = new HashMap<String, String>();
args.put("mode", "search");
factory.init(args);
factory.inform(resourceLoader);
TokenStream stream = factory.create(charStream);
return stream;
}
/** filter もどき */
private TokenStream filter(TokenStream tokenizer) {
TokenStream stream = null;
// solr.JapaneseKatakanaStemFilterFactory
JapaneseKatakanaStemFilterFactory factory1 = new JapaneseKatakanaStemFilterFactory();
Map<String, String> args1 = new HashMap<String, String>();
args1.put("minimumLength", "4");
factory1.init(args1);
stream = factory1.create(tokenizer);
// solr.SynonymFilterFactory
SynonymFilterFactory factory2 = new SynonymFilterFactory();
Map<String, String> args2 = new HashMap<String, String>();
args2.put("synonyms", "synonyms-ja.txt");
args2.put("ignoreCase", "true");
args2.put("expand", "true");
args2.put("tokenizerFactory", "org.apache.solr.analysis.JapaneseTokenizerFactory");
factory2.setLuceneMatchVersion(Version.LUCENE_40);
factory2.init(args2);
factory2.inform(resourceLoader);
stream = factory2.create(stream);
// solr.JapanesePartOfSpeechStopFilterFactory
JapanesePartOfSpeechStopFilterFactory factory3 = new JapanesePartOfSpeechStopFilterFactory();
Map<String, String> args3 = new HashMap<String, String>();
args3.put("tags", "pos-deny.txt");
args3.put("enablePositionIncrements", "true");
factory3.setLuceneMatchVersion(Version.LUCENE_40);
factory3.init(args3);
factory3.inform(resourceLoader);
stream = factory3.create(stream);
// solr.StopFilterFactory
StopFilterFactory factory4 = new StopFilterFactory();
Map<String, String> args4 = new HashMap<String, String>();
args4.put("ignoreCase", "true");
args4.put("words", "stopwords-ja.txt");
factory4.setLuceneMatchVersion(Version.LUCENE_40);
factory4.init(args4);
factory4.inform(resourceLoader);
stream = factory4.create(stream);
// solr.LowerCaseFilterFactory
LowerCaseFilterFactory factory5 = new LowerCaseFilterFactory();
factory5.setLuceneMatchVersion(Version.LUCENE_40);
stream = factory5.create(stream);
return stream;
}
/** リソースローダーもどき */
class MyResourceLoader implements ResourceLoader {
// 設定ファイルを置くディレクトリ
// ここに、mapping-ja.txt, pos-deny.txt, stopwords-ja.txt, synonyms-ja.txt を置いておく
// 本のサンプル(サポートサイトからダウンロード)をそのまま使用
private static final String confdir = "solrbook";
@Override
public InputStream openResource(String resource) throws IOException {
return new FileInputStream(new File(confdir, resource));
}
@Override
public List<String> getLines(String resource) throws IOException {
List<String> lines = new ArrayList<String>();
File file = new File(confdir, resource);
BufferedReader reader = new BufferedReader(new FileReader(file));
String line = null;
while ((line = reader.readLine()) != null) {
if (line.startsWith("#")) continue;
if (line.length() == 0) continue;
lines.add(line);
}
reader.close();
return lines;
}
@Override
public <T> T newInstance(String cname, Class<T> expectedType,
String... subpackages) {
// !! とりあえずなにかnewするだけのダメコード
T obj = null;
try {
ClassLoader classLoader = MyResourceLoader.class.getClassLoader();
Class<? extends T> clazz = Class.forName(cname, true, classLoader).asSubclass(expectedType);
obj = clazz.newInstance();
} catch (Exception e) {
e.printStackTrace();
}
return obj;
}
}
}
solr null null null null null
ソーラ null null null null null
検索 null null ケンサク ケンサク 名詞-サ変接続
エンジン null null エンジン エンジン 名詞-一般
(実際にSolrを起動して、管理画面から「Field Analysis」にかけた時と同じ出力となります。)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment