Created
March 15, 2013 02:43
-
-
Save krrrr38/5167122 to your computer and use it in GitHub Desktop.
mallet形式の日本語名詞データ生成 with Mecab(wiki+hatena辞書)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
object DataImport {
  /** Imports Japanese noun data from a directory of UTF-8 text files and
    * saves it as a mallet-format InstanceList.
    *
    * Uses an explicit `main` instead of the `App` trait to avoid its
    * delayed-initialization ordering pitfalls. The original hard-coded paths
    * remain the defaults; they can now be overridden on the command line:
    * args(0) = input directory, args(1) = stop-word file, args(2) = output file.
    */
  def main(args: Array[String]): Unit = {
    // Directory of documents; sub-directory names become class labels.
    val inputDirJa = if (args.length > 0) args(0) else "../data/sample-data/web/ja"
    // Japanese stop-word list.
    val stopWordJa = if (args.length > 1) args(1) else "../data/stoplists/jp.txt"
    // Destination for the serialized mallet InstanceList.
    val outputFileJa = if (args.length > 2) args(2) else "data/import_test_ja.mallet"

    val importer = new ImportJapaneseNoun(new File(stopWordJa))
    val instances = importer.readDirectory(new File(inputDirJa))
    instances.save(new File(outputFileJa))
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.krrrr38.mallet.create_model | |
import java.io._ | |
import java.util.ArrayList | |
import java.util.regex._ | |
import scala.collection.JavaConversions._ | |
import cc.mallet.pipe._ | |
import cc.mallet.pipe.iterator._ | |
import cc.mallet.types._ | |
import cc.mallet.extract.StringSpan; | |
import cc.mallet.extract.StringTokenization; | |
/** Builds a mallet import pipeline that extracts Japanese nouns with Mecab,
  * lower-cases them, removes stop words, and converts each document into a
  * sparse feature vector with a class label taken from its directory name.
  *
  * @param stopWord UTF-8 stop-word file used to filter tokens
  */
class ImportJapaneseNoun(stopWord: File) {
  // Pipeline shared by all instances read through this importer.
  val pipe: Pipe = buildPipe(stopWord)

  /** Assembles the ordered sequence of mallet pipes applied to every instance. */
  def buildPipe(stopWord: File): Pipe = {
    val pipeList = new ArrayList[Pipe]()
    // Read data from File objects as UTF-8 character sequences.
    pipeList.add(new Input2CharSequence("UTF-8"))
    // Tokenize raw strings into noun tokens via Mecab.
    pipeList.add(new MecabNounPipe())
    // Normalize all tokens to lower case.
    pipeList.add(new TokenSequenceLowercase())
    // Remove stop words using the supplied Japanese stop-word list
    // (not mallet's default English one).
    // Trailing booleans per the mallet API — presumably
    // [include default stoplist] [case sensitive] [mark deletions];
    // TODO(review): confirm against the mallet version in use.
    pipeList.add(new TokenSequenceRemoveStopwords(stopWord, "UTF-8", true, false, false))
    // Convert tokens from strings to integer feature indices via an alphabet.
    pipeList.add(new TokenSequence2FeatureSequence())
    // Convert the "target" class-label string to a Label object,
    // which has an index in a Label alphabet.
    pipeList.add(new Target2Label())
    // Convert the feature sequence to a sparse vector of feature-ID counts.
    pipeList.add(new FeatureSequence2FeatureVector())
    // Print out the features and the label (debugging aid).
    pipeList.add(new PrintInputAndTarget())
    new SerialPipes(pipeList)
  }

  /** Convenience wrapper: read a single directory tree. */
  def readDirectory(directory: File): InstanceList =
    readDirectories(Array(directory))

  /** Reads all .txt files under the given directories (recursively) and runs
    * them through the pipeline. The last directory name in each file's path
    * becomes its class label.
    */
  def readDirectories(directories: Array[File]): InstanceList = {
    val iterator = new FileIterator(
      directories,
      new TxtFilter(),
      FileIterator.LAST_DIRECTORY)
    val instances = new InstanceList(pipe)
    instances.addThruPipe(iterator)
    instances // last expression is the result; no `return` needed
  }

  /** Accepts files whose string representation ends with ".txt".
    * FileIterator only applies the filter to non-directories, so there is
    * no need to test `isFile` here.
    */
  class TxtFilter extends FileFilter {
    def accept(file: File): Boolean = file.toString.endsWith(".txt")
  }
}
/** A mallet Pipe that tokenizes a Japanese CharSequence with Mecab and keeps
  * only the morphemes whose feature string starts with 名詞 (noun), emitting
  * them as StringSpans over the original input.
  *
  * @param dicOption Mecab option string passed to the tagger; the default
  *                  preserves the original hard-coded hatena+wiki user
  *                  dictionary, so existing `new MecabNounPipe()` call sites
  *                  are unaffected.
  */
class MecabNounPipe(dicOption: String = "-u ../data/hatena-wiki.dic") extends Pipe {
  import net.moraleboost.mecab.Lattice
  import net.moraleboost.mecab.Tagger
  import net.moraleboost.mecab.impl.StandardTagger
  import net.moraleboost.mecab.Node

  // Matches feature strings that begin with 名詞 ("noun").
  val nounRegex = """名詞""".r

  override def pipe(carrier: Instance): Instance = {
    val input = carrier.getData.asInstanceOf[CharSequence]
    val string = input.toString
    // NOTE(review): a tagger is created per call, matching the original code;
    // reuse was not attempted because Mecab thread-safety is not established here.
    val tagger = new StandardTagger(dicOption)
    val lattice = tagger.createLattice
    try {
      lattice.setSentence(string)
      tagger.parse(lattice)
      val ts = new StringTokenization(input)
      var node = lattice.bosNode
      var cursor = 0
      while (node != null) {
        if (nounRegex.findPrefixOf(node.feature).isDefined) {
          val surface = node.surface
          val start = string.indexOf(surface, cursor)
          // Guard against -1: BOS/EOS nodes or normalized surfaces may not
          // appear literally in the input; the original would have built a
          // span at offset -1 in that case.
          if (start >= 0) {
            ts.add(new StringSpan(input, start, start + surface.length))
            // Advance past this occurrence so a repeated noun maps to its
            // NEXT occurrence instead of the same span again (original bug:
            // cursor was never moved past the matched surface).
            cursor = start + surface.length
          }
        }
        node = node.next
      }
      carrier.setData(ts)
      carrier
    } finally {
      // Release native Mecab resources even if parsing throws.
      lattice.destroy
      tagger.destroy
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.