Created
November 27, 2012 07:06
-
-
Save need4spd/4152863 to your computer and use it in GitHub Desktop.
lucene 4.0에서 FilteringTokenFilter 설명
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.tistory.devyongsik.analyzer; | |
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
/** | |
* @author need4spd, need4spd@cplanet.co.kr, 2011. 7. 8. | |
* | |
*/ | |
public class DevysStopFilter extends FilteringTokenFilter { | |
private Log logger = LogFactory.getLog(DevysStopFilter.class); | |
private final CharTermAttribute charTermAtt = addAttribute(CharTermAttribute.class); | |
private List stopWords = new ArrayList(); | |
private void initStopWord() { | |
stopWords.add("the"); | |
stopWords.add("."); | |
} | |
public DevysStopFilter(boolean enablePositionIncrements, TokenStream input) { | |
super(enablePositionIncrements, input); | |
initStopWord(); | |
if(logger.isDebugEnabled()) | |
logger.debug("initailize..."); | |
} | |
@Override | |
protected boolean accept() throws IOException { | |
boolean isAccept = !stopWords.contains(charTermAtt.toString()); | |
return isAccept; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.tistory.devyongsik.analyzer; | |
import java.io.IOException; | |
import java.io.StringReader; | |
import java.util.HashSet; | |
import java.util.Set; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | |
import org.junit.Assert; | |
import org.junit.BeforeClass; | |
import org.junit.Test; | |
/** | |
* @author need4spd, need4spd@cplanet.co.kr, 2011. 7. 8. | |
* | |
*/ | |
public class DevysStopFilterTest { | |
private static Set tokens = new HashSet(); | |
//불용어는 the와 . | |
StringReader reader = new StringReader("the 개발하고 꼭 이것을 잘 해야합니다. 공백입니다."); | |
@BeforeClass | |
public static void setUp() { | |
tokens.add("개발하고"); | |
tokens.add("이것을"); | |
tokens.add("해야합니다"); | |
tokens.add("공백입니다"); | |
tokens.add("꼭"); | |
tokens.add("잘"); | |
} | |
@Test | |
public void stopFilter() throws IOException { | |
TokenStream stream = new DevysStopFilter(new DevysTokenizer(reader)); | |
CharTermAttribute charTermAttr = stream.getAttribute(CharTermAttribute.class); | |
PositionIncrementAttribute positionAttr = stream.getAttribute(PositionIncrementAttribute.class); | |
while(stream.incrementToken()) { | |
System.out.println("charTermAttr : " + charTermAttr.toString()); | |
System.out.println("positionAttr : " + positionAttr.getPositionIncrement()); | |
Assert.assertTrue(tokens.contains(charTermAttr.toString())); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment