Skip to content

Instantly share code, notes, and snippets.

@molekilla
Created April 16, 2012 19:29
Show Gist options
  • Save molekilla/2400941 to your computer and use it in GitHub Desktop.
Save molekilla/2400941 to your computer and use it in GitHub Desktop.
Lucene Spanish Tokenizer (with UAX29URLEmailTokenizer and SpanishAnalyzer stop words)
// RM: Imports
import scala.collection._
import org.apache.lucene.util._
import org.apache.lucene.analysis.es._
import org.apache.lucene.analysis.standard._
import org.apache.lucene.analysis.snowball._
// Analyzer definition: a StandardAnalyzer (Lucene 3.5 compatibility mode)
// configured with the default stop-word set shipped with SpanishAnalyzer.
val TextAnalyzer = {
  val spanishStopWords = SpanishAnalyzer.getDefaultStopSet
  new StandardAnalyzer(Version.LUCENE_35, spanishStopWords)
}
/**
 * Splits `text` into tokens with Lucene's `UAX29URLEmailTokenizer`.
 *
 * Before tokenizing, every `" @"` is rewritten to `" __"` and every `" #"`
 * to `" _ht_"`. Only markers preceded by a space are rewritten, so a marker
 * at the very start of `text` is left untouched.
 *
 * Tokens whose length is 2 or less are dropped; the remaining tokens are
 * trimmed and returned in the order the tokenizer produced them.
 *
 * If an `IOException` is raised while tokenizing, its stack trace is printed
 * and the tokens collected up to that point are returned.
 *
 * @param analyzer not used by this implementation (the tokenizer is built
 *                 directly); kept so existing callers keep compiling
 * @param text     raw input text
 * @return the accepted tokens, in order of appearance
 */
def getTokenList(analyzer: Analyzer, text: String): List[String] = {
  // Replace Twitter markers with plain-text placeholders before tokenizing
  // (presumably so the markers are not lost by the tokenizer -- TODO confirm).
  val textWithTweet = text.replaceAll(" @", " __").replaceAll(" #", " _ht_")
  val reader = new StringReader(textWithTweet)

  // The tokenizer is used directly instead of analyzer.tokenStream("contents", reader).
  val ts = new UAX29URLEmailTokenizer(reader)

  // TermAttribute is used instead of CharTermAttribute because the original
  // author found it simpler when handling Spanish text.
  val termAtt = ts.addAttribute(classOf[TermAttribute])

  // A builder gives amortized O(1) appends; the previous `list ::: List(x)`
  // copied the whole list for every token.
  val tokens = List.newBuilder[String]

  try {
    try {
      while (ts.incrementToken()) {
        val w = termAtt.term()
        // The length filter is applied to the raw term, before trimming.
        if (w.length > 2) {
          tokens += w.trim
        }
      }
      // Signal end-of-stream as required by the TokenStream consumer workflow.
      ts.end()
    } finally {
      // Always release the tokenizer (and the underlying reader).
      ts.close()
    }
  } catch {
    case e: IOException =>
      // printStackTrace returns Unit; wrapping it in println would also
      // print a stray "()" line, so call it directly.
      e.printStackTrace()
  }

  tokens.result()
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment