Created
March 4, 2013 13:53
-
-
Save krrrr38/5082368 to your computer and use it in GitHub Desktop.
日本語用コーパス作成コード (Japanese corpus creation code) w/cmecab https://code.google.com/p/cmecab-java/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.krrrr38.bagofwords_en | |
import java.io.File | |
/** Entry point: reads every `.txt` document under `data/`, tokenizes each into
  * a bag of words, and writes the docword/vocabulary files under `output/`.
  */
object Bagofwords_ja {
  def main(args: Array[String]): Unit = {
    val dir = "data"
    val files = selectDoc(dir)
    val bagDocs = files.map(new BagDoc(_))
    val (docs, voca) = BagofwordsJa.mkBagOfWords(bagDocs)
    docs.export("output/docword.sample.txt")
    voca.export("output/vocab.sample.txt")
  }

  /** Returns the `.txt` files directly under `dir` (empty if `dir` is missing
    * or not a directory).
    *
    * Fixes two defects in the original: `File.listFiles` returns null for a
    * nonexistent path (NPE), and the partial-function `map { case f if ... }`
    * threw `MatchError` as soon as the directory contained any non-.txt entry.
    */
  private def selectDoc(dir: String): Seq[File] =
    Option(new File(dir).listFiles)
      .fold(Seq.empty[File])(_.filter(_.getPath.endsWith(".txt")).toSeq)
}
/** Builds the bag-of-words model: a document/word/frequency table plus the
  * shared vocabulary extracted from all documents.
  */
object BagofwordsJa {
  /** Collects the sorted, distinct vocabulary across all documents and pairs
    * the resulting [[DocsWord]] table with its [[Vocabulary]].
    */
  def mkBagOfWords(bagDocs: Seq[BagDoc]): (DocsWord, Vocabulary) = {
    // flatMap instead of map(...).flatten; distinct after sorted keeps the
    // original vocabulary ordering (lexicographically sorted, unique).
    val words = bagDocs.flatMap(_.getVocabulary).sorted.distinct
    (mkDocsWord(bagDocs, words), Vocabulary(words))
  }

  /** Turns each document's word-frequency map into `WordSet(docId, wordId, freq)`
    * rows, with word ids assigned by position in `words`.
    */
  private def mkDocsWord(bagDocs: Seq[BagDoc], words: Seq[String]) = {
    val wordIds = words.zipWithIndex.toMap
    val wordSets = bagDocs.zipWithIndex.flatMap { case (doc, docId) =>
      for {
        // sortBy(_._1) keeps each document's rows in word order.
        (word, freq) <- doc.bags.toList.sortBy(_._1)
        // -1 should be unreachable when `words` covers every document's
        // vocabulary (as in mkBagOfWords); kept as a defensive sentinel.
        wordId = wordIds.getOrElse(word, -1)
      } yield WordSet(docId, wordId, freq)
    }
    DocsWord(bagDocs.length, words.length, wordSets.length, wordSets)
  }
}
/** Document/word/frequency table.
  *
  * @param docNumD       number of documents
  * @param wordNumW      vocabulary size
  * @param totalWordNumN number of (doc, word) rows
  * @param wordSets      one row per (document, word) pair
  */
case class DocsWord(docNumD: Int, wordNumW: Int, totalWordNumN: Int, wordSets: Seq[WordSet]) {
  import java.io.PrintWriter

  /** Writes the three header counts followed by one `WordSet` row per line. */
  def export(file: String): Unit = {
    val out = new PrintWriter(file)
    try {
      out.println(docNumD)
      out.println(wordNumW)
      out.println(totalWordNumN)
      wordSets.foreach(out.println(_))
    } finally {
      // Original leaked the writer if any println threw.
      out.close()
    }
  }
}
/** The corpus vocabulary, one word per line on export (line number = word id,
  * matching the ids assigned in BagofwordsJa.mkDocsWord).
  */
case class Vocabulary(words: Seq[String]) {
  import java.io.PrintWriter

  /** Writes each word on its own line to `file`. */
  def export(file: String): Unit = {
    val out = new PrintWriter(file)
    try {
      words.foreach(out.println(_))
    } finally {
      // Original leaked the writer if a println threw.
      out.close()
    }
  }
}
/** One document's bag of words: tokenizes `file` with MeCab at construction
  * time and counts every noun surface into `bags`.
  */
class BagDoc(file: File) {
  import scala.io.Source
  import net.moraleboost.mecab.Lattice
  import net.moraleboost.mecab.Tagger
  import net.moraleboost.mecab.impl.StandardTagger
  import net.moraleboost.mecab.Node

  // MeCab feature prefix for nouns ("名詞").
  val nounRegex = """名詞""".r
  // Accepts surfaces of hiragana, katakana, kanji, or ASCII letters.
  // NOTE(review): '|' inside a character class matches a literal pipe, not
  // alternation — presumably unintended, but preserved to keep behavior.
  val termRegex = """[ぁ-ん|ァ-ヶ|一-龠|a-z|A-Z]+""".r
  // word -> frequency for this document; filled once by init below.
  val bags = scala.collection.mutable.Map.empty[String, Int]
  println(file.getPath)
  init

  /** Convenience constructor taking a path instead of a File. */
  def this(file: String) {
    this(new File(file))
  }

  /** Reads the file, runs MeCab over it, and counts matching noun surfaces. */
  private def init = {
    // Original never closed the Source — close it as soon as we have the text.
    val source = Source.fromFile(file)
    val lines = try source.mkString finally source.close()
    val tagger = new StandardTagger("")
    val lattice = tagger.createLattice
    try {
      lattice.setSentence(lines)
      tagger.parse(lattice)
      var node = lattice.bosNode
      while (node != null) {
        // Count only nodes whose feature starts with the noun tag and whose
        // surface matches the allowed-character pattern.
        nounRegex.findPrefixOf(node.feature).foreach { _ =>
          val surface = node.surface
          if (termRegex.findFirstIn(surface).isDefined) addBagWord(surface)
        }
        node = node.next
      }
    } finally {
      // Lattice/tagger wrap native resources; original skipped destroy on
      // failure, leaking them.
      lattice.destroy
      tagger.destroy
    }
  }

  /** Increments the frequency counter for `word`. */
  private def addBagWord(word: String): Unit = {
    bags.update(word, bags.getOrElse(word, 0) + 1)
  }

  /** Distinct words seen in this document. */
  def getVocabulary = bags.keys
}
/** A single (document, word, frequency) row of the docword table. */
case class WordSet(docId: Int, wordId: Int, freq: Int) {
  /** Rendered as the space-separated line `"docId wordId freq"`. */
  override def toString: String = List(docId, wordId, freq).mkString(" ")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment