@krrrr38
Created March 15, 2013 02:43
Generating Japanese noun data in MALLET format with MeCab (Wikipedia + Hatena keyword dictionary)
import java.io.File
import com.krrrr38.mallet.create_model.ImportJapaneseNoun

// Driver: read Japanese text files, pipe them into MALLET instances and save them.
object DataImport extends App {
  val inputDirJa = "../data/sample-data/web/ja"
  val stopWordJa = "../data/stoplists/jp.txt"
  val outputFileJa = "data/import_test_ja.mallet"
  val importer = new ImportJapaneseNoun(new File(stopWordJa))
  val instances = importer.readDirectory(new File(inputDirJa))
  instances.save(new File(outputFileJa))
}
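For reference, the saved .mallet file can be loaded back and used for classification (the class labels come from the last directory name, as configured in readDirectories below). This is a minimal sketch, not part of the gist: the file path, the 80/20 split and the choice of a MaxEnt trainer are illustrative assumptions.

import java.io.File
import cc.mallet.classify.{MaxEntTrainer, Trial}
import cc.mallet.types.InstanceList

object ClassifyExample extends App {
  // Load the instances written by DataImport above
  val instances = InstanceList.load(new File("data/import_test_ja.mallet"))
  // Split into training and test sets (80% / 20%)
  val Array(training, testing) = instances.split(Array(0.8, 0.2))
  // Train a maximum-entropy classifier and report test-set accuracy
  val classifier = new MaxEntTrainer().train(training)
  val trial = new Trial(classifier, testing)
  println("test accuracy = " + trial.getAccuracy)
}

Note that the pipe below ends with FeatureSequence2FeatureVector, so the instances carry feature vectors; MALLET's topic models expect a FeatureSequence instead, so that step would have to be dropped if the data were meant for LDA-style training.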
package com.krrrr38.mallet.create_model

import java.io._
import java.util.ArrayList
import java.util.regex._
import scala.collection.JavaConversions._
import cc.mallet.pipe._
import cc.mallet.pipe.iterator._
import cc.mallet.types._
import cc.mallet.extract.StringSpan
import cc.mallet.extract.StringTokenization
class ImportJapaneseNoun(stopWord: File) {
  val pipe: Pipe = buildPipe(stopWord)

  def buildPipe(stopWord: File): Pipe = {
    val pipeList = new ArrayList[Pipe]()
    // Read data from File objects
    pipeList.add(new Input2CharSequence("UTF-8"))
    // Regular expression for what constitutes a token
    // (unused here; tokenization is done by MecabNounPipe below)
    val tokenPattern = Pattern.compile("[\\p{L}\\p{N}_]+")
    // Tokenize raw strings into noun tokens with MeCab
    pipeList.add(new MecabNounPipe())
    // Normalize all tokens to lowercase
    pipeList.add(new TokenSequenceLowercase())
    // Remove stopwords using the supplied Japanese stoplist.
    // options: [include default list] [case sensitive] [mark deletions]
    pipeList.add(new TokenSequenceRemoveStopwords(stopWord, "UTF-8", true, false, false))
    // Rather than storing tokens as strings, convert
    // them to integers by looking them up in an alphabet.
    pipeList.add(new TokenSequence2FeatureSequence())
    // Do the same thing for the "target" field:
    // convert a class label string to a Label object,
    // which has an index in a Label alphabet.
    pipeList.add(new Target2Label())
    // Now convert the sequence of features to a sparse vector,
    // mapping feature IDs to counts.
    pipeList.add(new FeatureSequence2FeatureVector())
    // Print out the features and the label
    pipeList.add(new PrintInputAndTarget())
    new SerialPipes(pipeList)
  }
  def readDirectory(directory: File): InstanceList =
    readDirectories(Array(directory))

  def readDirectories(directories: Array[File]): InstanceList = {
    // Construct a file iterator, starting with the
    // specified directories, and recursing through subdirectories.
    // The second argument specifies a FileFilter to use to select
    // files within a directory.
    // The third argument is a Pattern that is applied to the
    // filename to produce a class label. In this case, I've
    // asked it to use the last directory name in the path.
    val iterator = new FileIterator(
      directories,
      new TxtFilter(),
      FileIterator.LAST_DIRECTORY)
    // Construct a new instance list, passing it the pipe
    // we want to use to process instances.
    val instances = new InstanceList(pipe)
    // Now process each instance provided by the iterator.
    instances.addThruPipe(iterator)
    instances
  }
  /** This class illustrates how to build a simple file filter. */
  class TxtFilter extends FileFilter {
    /** Test whether the string representation of the file
     * ends with the correct extension. Note that FileIterator
     * will only call this filter if the file is not a directory,
     * so we do not need to test that it is a file.
     */
    def accept(file: File) = file.toString.endsWith(".txt")
  }
}
class MecabNounPipe extends Pipe {
  import net.moraleboost.mecab.Lattice
  import net.moraleboost.mecab.Tagger
  import net.moraleboost.mecab.impl.StandardTagger
  import net.moraleboost.mecab.Node

  // MeCab part-of-speech feature strings for nouns start with 名詞
  val nounRegex = """名詞""".r

  override def pipe(carrier: Instance): Instance = {
    val input = carrier.getData.asInstanceOf[CharSequence]
    val string = input.toString
    // Tagger with the user dictionary built from Wikipedia/Hatena keywords
    val tagger = new StandardTagger("-u ../data/hatena-wiki.dic")
    val lattice = tagger.createLattice
    lattice.setSentence(string)
    tagger.parse(lattice)
    var node = lattice.bosNode
    var cursor = 0
    val ts = new StringTokenization(input)
    while (node != null) {
      // Keep only morphemes whose feature starts with 名詞 (noun)
      nounRegex findPrefixOf node.feature foreach { _ =>
        val surface = node.surface
        cursor = string.indexOf(surface, cursor)
        ts.add(new StringSpan(input, cursor, cursor + surface.length))
      }
      node = node.next
    }
    lattice.destroy
    tagger.destroy
    carrier.setData(ts)
    carrier
  }
}
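A minimal smoke test of the pipe chain, assuming MeCab, cmecab-java and the user dictionary at ../data/hatena-wiki.dic are available; the sentence, label and instance name below are made up for illustration. The PrintInputAndTarget step in the pipe prints the surviving noun features and the label.

import java.io.File
import cc.mallet.types.{Instance, InstanceList}

object PipeSmokeTest extends App {
  val importer = new ImportJapaneseNoun(new File("../data/stoplists/jp.txt"))
  val instances = new InstanceList(importer.pipe)
  // Pipe a single hand-written sentence; only the nouns should survive.
  instances.addThruPipe(new Instance("東京は日本の首都です。", "sample", "doc1", null))
}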