@fozziethebeat
Created February 8, 2012 19:43
A sample Scala driver for doing morphological analysis with lttoolbox
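For concreteness, the two setup steps described in the header comment below might look like the following shell session. The jar path, dictionary path, and script filename are illustrative, and the `lt-comp` argument form is taken from the comment in the code itself:

```shell
# 1. Compile the .dix dictionary into a binary transducer (paths are examples;
#    adjust to where you built lttoolbox-java and unpacked apertium-mk-en).
java -jar target/lttoolbox-java-3.2.0-SNAPSHOT.jar lt-comp apertium-mk-en.en.dix en.bin
# 2. Run this script with the jar on the classpath and the compiled
#    dictionary as the first argument (assuming it is saved as analyze.scala).
scala -cp target/lttoolbox-java-3.2.0-SNAPSHOT.jar analyze.scala en.bin
```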
/**
* Sample code for using the <a
* href="http://wiki.apertium.org/wiki/Lttoolbox-java">lttoolbox-java</a> code
* from within Scala. It morphologically analyzes a simple sentence using the
* English dictionary from this <a
* href="http://sourceforge.net/projects/apertium/files/apertium-mk-en/apertium-mk-en-0.1.0.tar.gz/download">apertium-mk-en
* package</a>. To run this code, first complete two steps:
*
* <ul>
* <li> Download and compile lttoolbox-java as per <a href="http://wiki.apertium.org/wiki/Lttoolbox-java">these instructions</a>.</li>
* <li> Compile the English dictionary apertium-mk-en.en.dix as in the same
* instructions.</li>
* </ul>
*
* Then run this code with the lttoolbox-java jar on the classpath and the
* compiled dictionary as the first argument.
*/
// Include the Finite State Transducer processor and some handy io utilities.
import org.apertium.lttoolbox.process.FSTProcessor
import org.apertium.utils.IOUtils._
// We need regular expressions to recognize tokens and morphological tags
// after analysis.
import scala.util.matching.Regex
// Readers and Writers for handling input/output.
import java.io.StringReader
import java.io.StringWriter
// Create the Finite State Transducer processor.
val fstp = new FSTProcessor()
// Load the finite state transducer with the compiled dictionary file. The
// dictionary file must be compiled with:
// java -jar target/lttoolbox-java-3.2.0-SNAPSHOT.jar lt-comp <dic.dix> <dic.bin>
// Then pass <dic.bin> to this program as the compiled dictionary file.
fstp.load(openInFileStream(args(0)))
// Set up the transducer to do morphological analysis and make sure it's valid.
fstp.initAnalysis
if (!fstp.valid)
  println("Error: the transducer did not load a valid analysis dictionary")
// Create a sample sentence for the transducer to analyze. All words but
// "blubber" should be analyzed correctly; "blubber" won't be in the
// dictionary, so it serves as a special test case.
val in = new StringReader("cats, dogs and blubber all running quickly!")
// Create the set of features that we don't care about. These are standard
// part of speech features and some others related to quantifiers.
val rejectFeatures = Set("<n>", "<cnjcoo>", "<cm>", "<prn>",
                         "<qnt>", "<mf>", "<vblex>", "<adv>", "<sp>")
// Create a writer for the output to go into.
val out = new StringWriter()
// Do the analysis.
fstp.analysis(in, out)
// Create some regular expressions for recognizing and splitting each part of
// the output.
// 1: Recognize a fully analyzed word so that the words can be tokenized. In
// the test sentence above, "cats," is not separated from the comma by white
// space, so we need this more careful splitting method.
val parseRegex = """\^.*?\$""".r
// 2: Recognize a word with morphological tags.
val morphredRegex = """\^(.+)/(.+?)(<.*)\$""".r
// 3: Match a word that the transducer could not recognize. The transducer
// prepends "*" to unrecognized tokens, so we match and eliminate it.
val unknownRegex = """\^(.+)/\*(.+?)\$""".r
// 4: A regular expression for matching morphological tags. This is simpler
// than writing a splitting rule.
val featureRegex = """<.*?>""".r
// Iterate through the analyzed words and return a list of the tokens we care
// about.
val tokens = parseRegex.findAllIn(out.toString).map(parseMatch =>
  // Match the current analyzed word as either morphed or unknown. For morphed
  // words, create a list of the lemma and the tags. For unknown words, just
  // create a list of the lemma.
  parseMatch match {
    case morphredRegex(surface, lemma, tags) =>
      lemma :: featureRegex.findAllIn(tags).toList
    case unknownRegex(surface, lemma) =>
      List(lemma)
  }).reduceLeft(_ ++ _).filter(!rejectFeatures.contains(_))
// Print out the features after being fully split. Each token and tag should be
// separated by white space.
println(tokens.mkString(" "))
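As a quick sanity check of the parsing logic above, the same regexes can be exercised against a hand-written sample in the Apertium stream format (the sample string is illustrative, not captured transducer output; the `demo`-prefixed names avoid clashing with the vals defined earlier):

```scala
// Recognized words appear as ^surface/lemma<tags>$ and unknown words as
// ^surface/*surface$ in the Apertium stream format.
val sampleOut = "^cats/cat<n><pl>$ ^blubber/*blubber$"
val demoParse = """\^.*?\$""".r
val demoMorphed = """\^(.+)/(.+?)(<.*)\$""".r
val demoUnknown = """\^(.+)/\*(.+?)\$""".r
val demoFeature = """<.*?>""".r
val demoTokens = demoParse.findAllIn(sampleOut).map {
  // A recognized word yields its lemma plus one entry per tag; an unknown
  // word yields just the bare form with the "*" stripped.
  case demoMorphed(surface, lemma, tags) =>
    lemma :: demoFeature.findAllIn(tags).toList
  case demoUnknown(surface, lemma) =>
    List(lemma)
}.reduceLeft(_ ++ _)
// Prints: cat <n> <pl> blubber
println(demoTokens.mkString(" "))
```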