@krrrr38
Created March 4, 2013 13:53
Corpus-building code for Japanese text, using cmecab: https://code.google.com/p/cmecab-java/
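The program below reads every *.txt file under data/, extracts noun surfaces with MeCab (through the cmecab-java bindings), and writes a document-word file and a vocabulary file under output/. Note that the output/ directory has to exist beforehand, because PrintWriter does not create missing directories.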
package com.krrrr38.bagofwords_ja

import java.io.File

object Bagofwords_ja {
  def main(args: Array[String]) {
    val dir = "data"
    val files = selectDoc(dir)
    val bagDocs = files.map(new BagDoc(_))
    val (docs, voca) = BagofwordsJa.mkBagOfWords(bagDocs)
    docs.export("output/docword.sample.txt")
    voca.export("output/vocab.sample.txt")
  }

  // Pick up only the *.txt files directly under the given directory.
  private def selectDoc(dir: String): Seq[File] =
    new File(dir).listFiles.filter(_.getPath.endsWith(".txt")).toSeq
}

object BagofwordsJa {
  // Build the sparse document-word matrix and the sorted, de-duplicated vocabulary.
  def mkBagOfWords(bagDocs: Seq[BagDoc]) = {
    val words = bagDocs.flatMap(_.getVocabulary).sorted.distinct
    (mkDocsWord(bagDocs, words), Vocabulary(words))
  }

  private def mkDocsWord(bagDocs: Seq[BagDoc], words: Seq[String]) = {
    val wordIds = words.zipWithIndex.toMap
    // One WordSet per (document, word) pair; word ids are zero-based indexes into the sorted vocabulary.
    val wordSets = bagDocs.zipWithIndex.flatMap { case (doc, docId) =>
      for {
        (word, freq) <- doc.bags.toList.sortBy(_._1)
        wordId = wordIds.getOrElse(word, -1)
      } yield WordSet(docId, wordId, freq)
    }
    DocsWord(bagDocs.length, words.length, wordSets.length, wordSets)
  }
}

// Sparse document-word counts: header lines D (documents), W (vocabulary size), N (triples),
// followed by one "docId wordId freq" line per entry.
case class DocsWord(docNumD: Int, wordNumW: Int, totalWordNumN: Int, wordSets: Seq[WordSet]) {
  import java.io.PrintWriter

  def export(file: String) {
    val out = new PrintWriter(file)
    out.println(docNumD)
    out.println(wordNumW)
    out.println(totalWordNumN)
    wordSets.foreach(out.println(_))
    out.close()
  }
}

case class Vocabulary(words: Seq[String]) {
  import java.io.PrintWriter

  def export(file: String) {
    val out = new PrintWriter(file)
    words.foreach(out.println(_))
    out.close()
  }
}

// Tokenize one document with MeCab (via cmecab-java) and count how often each noun surface appears.
class BagDoc(file: File) {
  import scala.io.Source
  import net.moraleboost.mecab.Lattice
  import net.moraleboost.mecab.Tagger
  import net.moraleboost.mecab.impl.StandardTagger
  import net.moraleboost.mecab.Node

  val nounRegex = """名詞""".r // MeCab part-of-speech feature prefix for nouns
  val termRegex = """[ぁ-んァ-ヶ一-龠a-zA-Z]+""".r // hiragana, katakana, kanji or ASCII letters
  val bags = scala.collection.mutable.Map.empty[String, Int]
  println(file.getPath)
  init()

  def this(file: String) {
    this(new File(file))
  }

  private def init() {
    val source = Source.fromFile(file)
    val lines = try source.mkString finally source.close()
    val tagger = new StandardTagger("")
    val lattice = tagger.createLattice()
    lattice.setSentence(lines)
    tagger.parse(lattice)
    // Walk the morpheme list and keep only surfaces whose feature starts with 名詞 (noun).
    var node = lattice.bosNode
    while (node != null) {
      nounRegex.findPrefixOf(node.feature).foreach { _ =>
        val surface = node.surface
        termRegex.findFirstIn(surface) match {
          case Some(_) => addBagWord(surface)
          case None =>
        }
      }
      node = node.next
    }
    lattice.destroy()
    tagger.destroy()
  }

  private def addBagWord(word: String) {
    val preFreq = bags.getOrElse(word, 0)
    bags.update(word, preFreq + 1)
  }

  def getVocabulary = bags.keys
}

case class WordSet(docId: Int, wordId: Int, freq: Int) {
  override def toString = s"$docId $wordId $freq"
}
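
For reference, the docword file produced by export starts with three header lines (D, W, N) followed by one "docId wordId freq" triple per line; both ids are zero-based, and wordId is the line offset into vocab.sample.txt. Below is a minimal sketch of reading the two files back, assuming exactly the layout written by the export methods above (DocwordReader is a hypothetical helper, not part of the gist):

import scala.io.Source

// Reads the files written by DocsWord.export and Vocabulary.export back into memory.
object DocwordReader {
  def main(args: Array[String]) {
    val vocab = Source.fromFile("output/vocab.sample.txt").getLines().toIndexedSeq
    val lines = Source.fromFile("output/docword.sample.txt").getLines().toList
    val (header, body) = lines.splitAt(3)
    val List(docNumD, wordNumW, totalWordNumN) = header.map(_.toInt)
    println(s"D=$docNumD W=$wordNumW N=$totalWordNumN")
    body.foreach { line =>
      // Each body line is "docId wordId freq"; wordId indexes the sorted vocabulary.
      val Array(docId, wordId, freq) = line.split(" ").map(_.toInt)
      println(s"doc $docId: ${vocab(wordId)} x $freq")
    }
  }
}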