Skip to content

Instantly share code, notes, and snippets.

@kijun
Created February 3, 2010 06:02
Show Gist options
  • Save kijun/293372 to your computer and use it in GitHub Desktop.
Save kijun/293372 to your computer and use it in GitHub Desktop.
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Emits the entire input as a single token.
 *
 * <p>The first call to {@link #incrementToken()} drains the reader into the
 * term attribute and returns {@code true}; subsequent calls return
 * {@code false} until the tokenizer is re-armed with {@link #reset(Reader)}.
 */
public final class KeywordTokenizer extends Tokenizer {

  /** Initial capacity requested for the term buffer when none is supplied. */
  private static final int DEFAULT_BUFFER_SIZE = 256;

  /** True once the single token has been produced for the current reader. */
  private boolean done;

  /** Corrected end offset of the emitted token; published again by {@link #end()}. */
  private int finalOffset;

  private TermAttribute termAtt;
  private OffsetAttribute offsetAtt;

  /**
   * Creates a tokenizer with the default initial buffer size.
   *
   * @param input reader supplying the text to emit as one token
   */
  public KeywordTokenizer(Reader input) {
    this(input, DEFAULT_BUFFER_SIZE);
  }

  /**
   * Creates a tokenizer with an explicit initial buffer size.
   *
   * @param input reader supplying the text to emit as one token
   * @param bufferSize initial size requested for the term buffer
   */
  public KeywordTokenizer(Reader input, int bufferSize) {
    super(input);
    init(bufferSize);
  }

  /** Registers the attributes this tokenizer fills in and pre-sizes the term buffer. */
  private void init(int bufferSize) {
    this.done = false;
    termAtt = addAttribute(TermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt.resizeTermBuffer(bufferSize);
  }

  /**
   * Reads the whole input into the term attribute and emits it as one token.
   *
   * @return {@code true} on the first call for a given reader, {@code false} afterwards
   * @throws IOException if reading from the underlying reader fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (done) {
      return false;
    }
    clearAttributes();
    done = true;

    int upto = 0;
    // Start from the buffer that was pre-sized in init().
    char[] buffer = termAtt.termBuffer();
    // Keep reading until the reader reports end of input.
    while (true) {
      final int length = input.read(buffer, upto, buffer.length - upto);
      if (length == -1) {
        break;
      }
      upto += length;
      if (upto == buffer.length) {
        // Buffer is full: ask the attribute for a larger one before reading more.
        buffer = termAtt.resizeTermBuffer(1 + buffer.length);
      }
    }

    // Everything that was read forms a single term.
    termAtt.setTermLength(upto);

    // Record offsets; the end offset is kept so end() can report it later.
    finalOffset = correctOffset(upto);
    offsetAtt.setOffset(correctOffset(0), finalOffset);
    return true;
  }

  /**
   * Publishes the final offset after the stream has been consumed.
   *
   * <p>Without this override the value stored in {@code finalOffset} was never
   * used. NOTE(review): relies on {@code TokenStream.end()} as present in
   * Lucene 2.9/3.0 — confirm against the Lucene version on the classpath.
   */
  @Override
  public final void end() {
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /**
   * Re-arms the tokenizer for a new reader so the next
   * {@link #incrementToken()} call emits a token again.
   *
   * <p>NOTE(review): relies on {@code Tokenizer.reset(Reader)} as present in
   * Lucene 2.9/3.0 — confirm against the Lucene version on the classpath.
   *
   * @param input the new reader to tokenize
   * @throws IOException if the superclass reset fails
   */
  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    this.done = false;
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment