@krrrr38
Created March 15, 2013 02:43
Generating Japanese noun data in MALLET format with MeCab (Wikipedia + Hatena keyword dictionary)
import java.io.File
import com.krrrr38.mallet.create_model.ImportJapaneseNoun

// Driver: read Japanese text files, pipe them into MALLET instances and save them.
object DataImport extends App {
  val inputDirJa = "../data/sample-data/web/ja"
  val stopWordJa = "../data/stoplists/jp.txt"
  val outputFileJa = "data/import_test_ja.mallet"
  val importer = new ImportJapaneseNoun(new File(stopWordJa))
  val instances = importer.readDirectory(new File(inputDirJa))
  instances.save(new File(outputFileJa))
}
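For reference, the saved .mallet file can be loaded back and used for classification (the class labels come from the last directory name, as configured in readDirectories below). This is a minimal sketch, not part of the gist: the file path, the 80/20 split and the choice of a MaxEnt trainer are illustrative assumptions.

import java.io.File
import cc.mallet.classify.{MaxEntTrainer, Trial}
import cc.mallet.types.InstanceList

object ClassifyExample extends App {
  // Load the instances written by DataImport above
  val instances = InstanceList.load(new File("data/import_test_ja.mallet"))
  // Split into training and test sets (80% / 20%)
  val Array(training, testing) = instances.split(Array(0.8, 0.2))
  // Train a maximum-entropy classifier and report test-set accuracy
  val classifier = new MaxEntTrainer().train(training)
  val trial = new Trial(classifier, testing)
  println("test accuracy = " + trial.getAccuracy)
}

Note that the pipe below ends with FeatureSequence2FeatureVector, so the instances carry feature vectors; MALLET's topic models expect a FeatureSequence instead, so that step would have to be dropped if the data were meant for LDA-style training.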
package com.krrrr38.mallet.create_model

import java.io._
import java.util.ArrayList
import java.util.regex._
import scala.collection.JavaConversions._
import cc.mallet.pipe._
import cc.mallet.pipe.iterator._
import cc.mallet.types._
import cc.mallet.extract.StringSpan
import cc.mallet.extract.StringTokenization
class ImportJapaneseNoun(stopWord: File) {
  val pipe: Pipe = buildPipe(stopWord)

  def buildPipe(stopWord: File): Pipe = {
    val pipeList = new ArrayList[Pipe]()
    // Read data from File objects
    pipeList.add(new Input2CharSequence("UTF-8"))
    // Regular expression for what constitutes a token
    // (unused here; tokenization is done by MecabNounPipe below)
    val tokenPattern = Pattern.compile("[\\p{L}\\p{N}_]+")
    // Tokenize raw strings into noun tokens with MeCab
    pipeList.add(new MecabNounPipe())
    // Normalize all tokens to lowercase
    pipeList.add(new TokenSequenceLowercase())
    // Remove stopwords using the supplied Japanese stoplist.
    // options: [include default list] [case sensitive] [mark deletions]
    pipeList.add(new TokenSequenceRemoveStopwords(stopWord, "UTF-8", true, false, false))
    // Rather than storing tokens as strings, convert
    // them to integers by looking them up in an alphabet.
    pipeList.add(new TokenSequence2FeatureSequence())
    // Do the same thing for the "target" field:
    // convert a class label string to a Label object,
    // which has an index in a Label alphabet.
    pipeList.add(new Target2Label())
    // Now convert the sequence of features to a sparse vector,
    // mapping feature IDs to counts.
    pipeList.add(new FeatureSequence2FeatureVector())
    // Print out the features and the label
    pipeList.add(new PrintInputAndTarget())
    new SerialPipes(pipeList)
  }
  def readDirectory(directory: File): InstanceList =
    readDirectories(Array(directory))

  def readDirectories(directories: Array[File]): InstanceList = {
    // Construct a file iterator, starting with the
    // specified directories, and recursing through subdirectories.
    // The second argument specifies a FileFilter to use to select
    // files within a directory.
    // The third argument is a Pattern that is applied to the
    // filename to produce a class label. In this case, I've
    // asked it to use the last directory name in the path.
    val iterator = new FileIterator(
      directories,
      new TxtFilter(),
      FileIterator.LAST_DIRECTORY)
    // Construct a new instance list, passing it the pipe
    // we want to use to process instances.
    val instances = new InstanceList(pipe)
    // Now process each instance provided by the iterator.
    instances.addThruPipe(iterator)
    instances
  }
  /** This class illustrates how to build a simple file filter. */
  class TxtFilter extends FileFilter {
    /** Test whether the string representation of the file
     * ends with the correct extension. Note that FileIterator
     * will only call this filter if the file is not a directory,
     * so we do not need to test that it is a file.
     */
    def accept(file: File) = file.toString.endsWith(".txt")
  }
}
class MecabNounPipe extends Pipe {
  import net.moraleboost.mecab.Lattice
  import net.moraleboost.mecab.Tagger
  import net.moraleboost.mecab.impl.StandardTagger
  import net.moraleboost.mecab.Node

  // MeCab part-of-speech feature strings for nouns start with 名詞
  val nounRegex = """名詞""".r

  override def pipe(carrier: Instance): Instance = {
    val input = carrier.getData.asInstanceOf[CharSequence]
    val string = input.toString
    // Tagger with the user dictionary built from Wikipedia/Hatena keywords
    val tagger = new StandardTagger("-u ../data/hatena-wiki.dic")
    val lattice = tagger.createLattice
    lattice.setSentence(string)
    tagger.parse(lattice)
    var node = lattice.bosNode
    var cursor = 0
    val ts = new StringTokenization(input)
    while (node != null) {
      // Keep only morphemes whose feature starts with 名詞 (noun)
      nounRegex findPrefixOf node.feature foreach { _ =>
        val surface = node.surface
        cursor = string.indexOf(surface, cursor)
        ts.add(new StringSpan(input, cursor, cursor + surface.length))
      }
      node = node.next
    }
    lattice.destroy
    tagger.destroy
    carrier.setData(ts)
    carrier
  }
}
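A minimal smoke test of the pipe chain, assuming MeCab, cmecab-java and the user dictionary at ../data/hatena-wiki.dic are available; the sentence, label and instance name below are made up for illustration. The PrintInputAndTarget step in the pipe prints the surviving noun features and the label.

import java.io.File
import cc.mallet.types.{Instance, InstanceList}

object PipeSmokeTest extends App {
  val importer = new ImportJapaneseNoun(new File("../data/stoplists/jp.txt"))
  val instances = new InstanceList(importer.pipe)
  // Pipe a single hand-written sentence; only the nouns should survive.
  instances.addThruPipe(new Instance("東京は日本の首都です。", "sample", "doc1", null))
}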