Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Convert Elasticsearch search results to MeCab-IPADIC format
import $ivy.`org.scalaz::scalaz-core:7.2.+`
import $ivy.`org.json4s::json4s-jackson:3.+`
import $ivy.`org.json4s::json4s-ext:3.+`
import $ivy.`com.github.nscala-time::nscala-time:2.18.+`
import scalaz._
import Scalaz._
import com.github.nscala_time.time.Imports._
import scala.io.Source
import org.json4s._
import org.json4s.jackson.JsonMethods._
import java.io.File
import java.io.PrintWriter
// json4s requires an implicit Formats in scope for the `extract` calls made later in
// this script. The explicit type keeps implicit resolution predictable.
implicit val formats: Formats = DefaultFormats

// Input file. The rest of the script reads it as a search result with a
// hits.hits[]._source structure.
val filename = "artist_test.json"

// Entire file content as one string, with line terminators dropped.
// The file is decoded as UTF-8 (the JSON interchange encoding) instead of the platform
// default charset, and the handle is always released, even if reading fails.
val line = {
  val source = Source.fromFile(filename, "UTF-8")
  try source.getLines().mkString("")
  finally source.close()
}

// Parsed JSON tree of the whole response.
val artistsJson = parse(line)
/** One artist taken from a search hit.
  *
  * @param names             values of the hit's `names` field
  * @param alternative_names values of the hit's `alternative_names` field
  */
final case class Artist(names: List[String], alternative_names: List[String])
// Builds one Artist per element of the hits.hits array of the parsed response.
val artists: List[Artist] = {
  // Reads `field` from a hit's `_source` as a list of strings.
  //  - array          -> every element extracted as a String
  //  - missing / null -> empty list (instead of a ClassCastException)
  //  - single value   -> one-element list
  def stringValues(source: JValue, field: String): List[String] =
    (source \ field) match {
      case JArray(values)   => values.map(_.extract[String])
      case JNothing | JNull => Nil
      case single           => List(single.extract[String])
    }

  (artistsJson \ "hits" \ "hits") match {
    case JArray(hits) =>
      hits.map { hit =>
        val source = hit \ "_source"
        Artist(stringValues(source, "names"), stringValues(source, "alternative_names"))
      }
    case other =>
      // Fail with a readable message rather than an opaque MatchError when the input
      // does not have the expected hits.hits array.
      sys.error(s"Expected a JSON array at hits.hits but found ${other.getClass.getSimpleName}")
  }
}
println(artists.size)
// Distinct, normalised dictionary entries built from every name and alternative name:
// spaces and ASCII punctuation are removed and the result is lower-cased.
val names: Set[String] = {
  // The hyphen is escaped on purpose. Written bare between ')' and '=' (or '+' and '=')
  // it forms a character range that also matches the digits 0-9, which would silently
  // strip numbers out of artist names.
  // Commas and spaces are members of the class, so no separate comma-to-space
  // replacement is needed before applying it.
  val separatorsAndPunctuation = """[ !@#$%^&*()\-=_+{}:;<>?/'",.]""".r

  artists.flatMap { artist =>
    (artist.names ++ artist.alternative_names).map { raw =>
      separatorsAndPunctuation.replaceAllIn(raw, "").toLowerCase
    }
  }.filterNot(_.isEmpty) // a name made only of punctuation would otherwise yield a blank entry
    .toSet
}
println(names.size)
// Dictionary file: one line per normalised name, in the form
// "<name>,<name>,<name>,カスタム人名" (the last column translates to "custom person name").
// NOTE(review): the meaning of the three repeated columns depends on the analyzer that
// consumes this file - confirm against its user-dictionary format.
// Written as UTF-8 explicitly because the content is Japanese text and the platform
// default charset is not guaranteed to encode it.
val d_writer = new PrintWriter(new File("artist_dictionary.txt"), "UTF-8")
try {
  d_writer.write(names.map(n => s"${n},${n},${n},カスタム人名\n").mkString)
} finally {
  // Always release the file handle, even if building or writing the content fails.
  d_writer.close()
}

// Synonym file: one comma-separated line per artist, listing its distinct lower-cased
// names. Commas inside a name are turned into spaces so they cannot be mistaken for
// the separator. Artists with only one distinct name produce no line.
val s_writer = new PrintWriter(new File("artist_synonym.txt"), "UTF-8")
try {
  val synonymLines = artists.map { artist =>
    (artist.names ++ artist.alternative_names)
      .map(n => n.replace(", ", " ").replace(",", " ").toLowerCase)
      .toSet
      .mkString(",")
  }.filter(_.contains(","))
  s_writer.write(synonymLines.mkString("\n"))
} finally {
  s_writer.close()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment