Skip to content

Instantly share code, notes, and snippets.

@need4spd
Created November 27, 2012 07:06
Show Gist options
  • Save need4spd/4152863 to your computer and use it in GitHub Desktop.
lucene 4.0에서 FilteringTokenFilter 설명
package com.tistory.devyongsik.analyzer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
/**
 * Token filter that removes a fixed set of stop words ({@code "the"} and
 * {@code "."}) from the wrapped token stream.
 *
 * <p>Matching is an exact, case-sensitive comparison against the current
 * term text.
 *
 * @author need4spd, need4spd@cplanet.co.kr, 2011. 7. 8.
 */
public class DevysStopFilter extends FilteringTokenFilter {

    private static final Log LOGGER = LogFactory.getLog(DevysStopFilter.class);

    /**
     * Terms that are dropped from the stream. Shared by all instances and
     * immutable, so it is built once instead of once per filter.
     */
    private static final Set<String> STOP_WORDS =
            Collections.unmodifiableSet(new HashSet<String>(Arrays.asList("the", ".")));

    /** Term text of the token currently being examined by {@link #accept()}. */
    private final CharTermAttribute charTermAtt = addAttribute(CharTermAttribute.class);

    /**
     * Creates a stop filter over the given stream.
     *
     * @param enablePositionIncrements forwarded unchanged to the
     *        {@code FilteringTokenFilter} constructor
     * @param input the upstream token stream to filter
     */
    public DevysStopFilter(boolean enablePositionIncrements, TokenStream input) {
        super(enablePositionIncrements, input);
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("initailize...");
        }
    }

    /**
     * Decides whether the current token is kept.
     *
     * @return {@code false} when the current term is one of the stop words,
     *         {@code true} otherwise
     * @throws IOException declared by the overridden method; not thrown here
     */
    @Override
    protected boolean accept() throws IOException {
        return !STOP_WORDS.contains(charTermAtt.toString());
    }
}
package com.tistory.devyongsik.analyzer;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Verifies that {@link DevysStopFilter} only lets non-stop-word tokens through.
 *
 * @author need4spd, need4spd@cplanet.co.kr, 2011. 7. 8.
 */
public class DevysStopFilterTest {

    /** Every token that is allowed to come out of the filter. */
    private static Set<String> tokens = new HashSet<String>();

    // The stop words are "the" and "."
    StringReader reader = new StringReader("the 개발하고 꼭 이것을 잘 해야합니다. 공백입니다.");

    /** Fills the set of tokens expected to survive filtering. */
    @BeforeClass
    public static void setUp() {
        tokens.add("개발하고");
        tokens.add("이것을");
        tokens.add("해야합니다");
        tokens.add("공백입니다");
        tokens.add("꼭");
        tokens.add("잘");
    }

    /**
     * Runs the sample sentence through the tokenizer and the stop filter and
     * checks that every emitted term is one of the expected tokens.
     *
     * @throws IOException if the token stream fails while being consumed
     */
    @Test
    public void stopFilter() throws IOException {
        // DevysStopFilter only declares (boolean, TokenStream); the flag must be passed explicitly.
        TokenStream stream = new DevysStopFilter(true, new DevysTokenizer(reader));
        try {
            CharTermAttribute charTermAttr = stream.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute positionAttr = stream.getAttribute(PositionIncrementAttribute.class);

            // TokenStream consumer workflow: reset() -> incrementToken()* -> end() -> close().
            stream.reset();

            int emitted = 0;
            while (stream.incrementToken()) {
                System.out.println("charTermAttr : " + charTermAttr.toString());
                System.out.println("positionAttr : " + positionAttr.getPositionIncrement());
                Assert.assertTrue(tokens.contains(charTermAttr.toString()));
                emitted++;
            }
            stream.end();

            // Without this the test would pass vacuously on an empty stream.
            Assert.assertTrue("filter emitted no tokens", emitted > 0);
        } finally {
            stream.close();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment