molekilla/gist:2400941

## gistfile1.scala
// RM: Imports
import scala.collection._
import org.apache.lucene.util._
import org.apache.lucene.analysis.es._
import org.apache.lucene.analysis.standard._
import org.apache.lucene.analysis.snowball._

// RM: Analyzer definition: a Lucene 3.5 StandardAnalyzer built with the
// default stop-word set provided by SpanishAnalyzer.
val TextAnalyzer = {
  val spanishStopWords = SpanishAnalyzer.getDefaultStopSet
  new StandardAnalyzer(Version.LUCENE_35, spanishStopWords)
}


/**
 * Splits `text` into word tokens using a UAX29URLEmailTokenizer.
 *
 * Before tokenizing, every " @" is rewritten to " __" and every " #" to
 * " _ht_". Only tokens longer than two characters are kept; each kept token
 * is trimmed before being added to the result.
 *
 * NOTE(review): a mention or hashtag at the very start of `text` has no
 * leading space, so it is not rewritten -- confirm whether that is intended.
 *
 * @param analyzer currently not consulted: the tokenizer is constructed
 *                 directly instead of calling `analyzer.tokenStream`. Kept so
 *                 existing callers keep compiling.
 * @param text     raw text to tokenize
 * @return the kept tokens in the order the tokenizer produced them; if an
 *         IOException interrupts tokenizing, the tokens collected so far
 */
def getTokenList(analyzer:Analyzer, text:String):List[String] =
{
  // RM: Replace twitter commands with other values.
  // Plain literal replacement; no regex is needed for these patterns.
  val textWithTweet = text.replace(" @", " __").replace(" #", " _ht_")
  val reader = new StringReader(textWithTweet)

  // RM: The tokenizer is used directly instead of analyzer.tokenStream("contents", reader)
  val ts = new UAX29URLEmailTokenizer(reader)

  // RM: You can use CharTermAttribute but tokenizing in Spanish gets complicated. Easier with TermAttribute
  val termAtt = ts.addAttribute(classOf[TermAttribute])

  // A builder appends in constant time, unlike appending to an immutable List with `:::`.
  val tokens = List.newBuilder[String]

  try {
    // Follow the TokenStream consumer workflow: reset, incrementToken*, end, close.
    ts.reset()
    while (ts.incrementToken()) {
      val w = termAtt.term
      if (w.length > 2) {
        tokens += w.trim
      }
    }
    ts.end()
  } catch {
    case e: IOException =>
      // Best effort: report the failure and return whatever was collected so far.
      e.printStackTrace()
  } finally {
    // Always release the tokenizer and its underlying reader.
    try {
      ts.close()
    } catch {
      case e: IOException => e.printStackTrace()
    }
  }

  tokens.result()
}
}
	// RM: Imports
	import scala.collection._
	import org.apache.lucene.util._
	import org.apache.lucene.analysis.es._
	import org.apache.lucene.analysis.standard._
	import org.apache.lucene.analysis.snowball._

	// RM: Analyzer definition: a Lucene 3.5 StandardAnalyzer built with the
	// default stop-word set provided by SpanishAnalyzer.
	val TextAnalyzer = {
		val spanishStopWords = SpanishAnalyzer.getDefaultStopSet
		new StandardAnalyzer(Version.LUCENE_35, spanishStopWords)
	}


	/**
	 * Splits `text` into word tokens using a UAX29URLEmailTokenizer.
	 *
	 * Before tokenizing, every " @" is rewritten to " __" and every " #" to
	 * " _ht_". Only tokens longer than two characters are kept; each kept token
	 * is trimmed before being added to the result.
	 *
	 * NOTE(review): a mention or hashtag at the very start of `text` has no
	 * leading space, so it is not rewritten -- confirm whether that is intended.
	 *
	 * @param analyzer currently not consulted: the tokenizer is constructed
	 *                 directly instead of calling `analyzer.tokenStream`. Kept so
	 *                 existing callers keep compiling.
	 * @param text     raw text to tokenize
	 * @return the kept tokens in the order the tokenizer produced them; if an
	 *         IOException interrupts tokenizing, the tokens collected so far
	 */
	def getTokenList(analyzer:Analyzer, text:String):List[String] =
	{
		// RM: Replace twitter commands with other values.
		// Plain literal replacement; no regex is needed for these patterns.
		val textWithTweet = text.replace(" @", " __").replace(" #", " _ht_")
		val reader = new StringReader(textWithTweet)

		// RM: The tokenizer is used directly instead of analyzer.tokenStream("contents", reader)
		val ts = new UAX29URLEmailTokenizer(reader)

		// RM: You can use CharTermAttribute but tokenizing in Spanish gets complicated. Easier with TermAttribute
		val termAtt = ts.addAttribute(classOf[TermAttribute])

		// A builder appends in constant time, unlike appending to an immutable List with `:::`.
		val tokens = List.newBuilder[String]

		try {
			// Follow the TokenStream consumer workflow: reset, incrementToken*, end, close.
			ts.reset()
			while (ts.incrementToken()) {
				val w = termAtt.term
				if (w.length > 2) {
					tokens += w.trim
				}
			}
			ts.end()
		} catch {
			case e: IOException =>
				// Best effort: report the failure and return whatever was collected so far.
				e.printStackTrace()
		} finally {
			// Always release the tokenizer and its underlying reader.
			try {
				ts.close()
			} catch {
				case e: IOException => e.printStackTrace()
			}
		}

		tokens.result()
	}
	}