Skip to content

Instantly share code, notes, and snippets.

@hochgi
Created October 29, 2021 06:29
Show Gist options
  • Save hochgi/a47042323decca6ac2d744c2fdc3c3c4 to your computer and use it in GitHub Desktop.
Save hochgi/a47042323decca6ac2d744c2fdc3c3c4 to your computer and use it in GitHub Desktop.
Convert a tree structure (e.g. one derived from XML) to CSV, where same-labeled sibling nodes are combined as a full outer join of their inner subtrees (empty cells are rendered as `null`).
scala> import com.hochgi.util._
import com.hochgi.util._
scala> val t = Tree("root", List(Content("root text"), Attribute("att","val")), List(Tree("c", List(Content("interesting content")), Nil), Tree("c", List(Attribute("innerAtt","innerVal"), Content("boring content")), Nil)))
val t: com.hochgi.util.Tree = Tree(root,List(Content(root text), Attribute(att,val)),List(Tree(c,List(Content(interesting content)),List()), Tree(c,List(Attribute(innerAtt,innerVal), Content(boring content)),List())))
scala> csv(t)
val res0: List[Map[String,String]] = List(Map(root.@txt -> root text, root.#att -> val, root.c.@txt -> interesting content), Map(root.@txt -> root text, root.#att -> val, root.c.#innerAtt -> innerVal, root.c.@txt -> boring content))
scala> render(res0)
val res1: String =
root.@txt,root.#att,root.c.@txt,root.c.#innerAtt
root text,val,interesting content,null
root text,val,boring content,innerVal
package com.hochgi
package object util {
/** A value attached directly to a tree node: either an [[Attribute]] or text [[Content]]. */
sealed trait Value

/** A key/value attribute of a node.
  *
  * @param k attribute name
  * @param v attribute value
  */
final case class Attribute(k: String, v: String) extends Value

/** Text content of a node.
  *
  * @param s the text
  */
final case class Content(s: String) extends Value

/** A labeled node carrying its own values and any number of child nodes.
  *
  * @param label    the node's label
  * @param values   attributes and text content belonging to this node itself
  * @param children nested sub-trees; several children may share the same label
  */
final case class Tree(label: String, values: List[Value], children: List[Tree])
/** Renders a list of rows as CSV text.
  *
  * The header line is the union of all row keys, in order of first appearance.
  * Each following line holds one row; a column missing from a row is written
  * as the literal `null`. Fields containing a comma, a double quote or a line
  * break are wrapped in double quotes with embedded quotes doubled (RFC 4180),
  * so such values cannot shift columns or break lines.
  *
  * @param matrix rows, each mapping a column header to its cell value
  * @return the CSV text without a trailing newline; the empty string when no
  *         row contributes any column
  */
def render(matrix: List[Map[String, String]]): String = {
  // Quote a field only when it would otherwise be ambiguous in CSV.
  def escape(field: String): String =
    if (field.exists(ch => ch == ',' || ch == '"' || ch == '\n' || ch == '\r'))
      "\"" + field.replace("\"", "\"\"") + "\""
    else field

  // `distinct` keeps first-seen order, so the column layout does not depend
  // on hash ordering once there are more than a handful of columns.
  val headers = matrix.flatMap(_.keys).distinct

  if (headers.isEmpty) ""
  else {
    val headerLine = headers.map(escape).mkString(",")
    val dataLines = matrix.map { row =>
      headers.map(col => row.get(col).fold("null")(escape)).mkString(",")
    }
    (headerLine :: dataLines).mkString("\n")
  }
}
/** Builds a single row from a node's own values.
  *
  * An attribute named `k` becomes the column `#k`; text content becomes the
  * column `@txt`. When several values map to the same column, the last one
  * in `values` wins.
  *
  * @param values attributes and content of one node
  * @return column name to cell value
  */
def mkRow(values: List[Value]): Map[String, String] = {
  val cells = for (value <- values) yield value match {
    case Content(text)         => "@txt" -> text
    case Attribute(name, data) => s"#$name" -> data
  }
  cells.toMap
}
/** Prefixes every column name of a row with `label` followed by a dot.
  *
  * @param label the label to place in front of each column name
  * @param row   the row whose columns are renamed; cell values are untouched
  * @return a row with the same cells under `label.column` names
  */
def nextRowUnderLabel(label: String)(row: Map[String, String]): Map[String, String] =
  for ((column, cell) <- row) yield s"$label.$column" -> cell
/** Flattens a tree into a list of rows.
  *
  * A node without children yields exactly one row built from its own values.
  * Otherwise the children are grouped by label, each child is converted
  * recursively, and the resulting matrices are combined with the node's own
  * row through `join`. In both cases every column name ends up prefixed with
  * the node's label.
  *
  * @param root the tree to convert
  * @return one map per output row, keyed by dotted column path
  */
def csv(root: Tree): List[Map[String, String]] = {
  val Tree(label, values, children) = root
  val ownRow = mkRow(values)
  val underLabel: Map[String, String] => Map[String, String] = nextRowUnderLabel(label)

  children match {
    case Nil =>
      List(underLabel(ownRow))
    case kids =>
      // One entry per distinct child label; each entry holds the matrix of
      // every child carrying that label. Labels come in groupBy's map order.
      val matricesPerLabel: List[List[List[Map[String, String]]]] =
        kids.groupBy(_.label).toList.map { case (_, sameLabel) => sameLabel.map(csv) }
      join(matricesPerLabel, List(ownRow)).map(underLabel)
  }
}
/** Combines an accumulated matrix with the matrices of child nodes, label by label.
  *
  * For the first label, each of its node matrices is cross-multiplied with
  * `thisMatrix` (row maps merged with `++`, so the child's cell wins on a key
  * clash), and that product is then joined with the remaining labels. The
  * results obtained for the individual nodes of a label are concatenated.
  * With no labels left, `thisMatrix` is returned unchanged.
  *
  * @param labelsChildrenRowsCols per label, per node of that label, the node's rows
  * @param thisMatrix             the rows accumulated so far
  * @return the joined rows
  */
def join(labelsChildrenRowsCols: List[List[List[Map[String, String]]]],
         thisMatrix: List[Map[String, String]]): List[Map[String, String]] =
  labelsChildrenRowsCols match {
    case Nil =>
      thisMatrix
    case nodesOfLabel :: remainingLabels =>
      nodesOfLabel.flatMap { nodeMatrix =>
        // Cartesian product of the rows so far with this node's rows.
        val product = thisMatrix.flatMap { left =>
          nodeMatrix.map(right => left ++ right)
        }
        join(remainingLabels, product)
      }
  }
/** Splits a list into its first element and the remainder.
  *
  * @param list the list to split
  * @tparam T element type
  * @return `Some((head, tail))` for a non-empty list, `None` for an empty one
  */
def headAndTailOption[T](list: List[T]): Option[(T, List[T])] =
  list.headOption.map(first => (first, list.tail))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment