Skip to content

Instantly share code, notes, and snippets.

@syedatifakhtar
Created May 28, 2020 19:37
Show Gist options
  • Save syedatifakhtar/e67fbd53d97b964d430e19f077c94fed to your computer and use it in GitHub Desktop.
Save syedatifakhtar/e67fbd53d97b964d430e19f077c94fed to your computer and use it in GitHub Desktop.
Generate random records based on Schema in Scala without Spark
import java.io.{BufferedWriter, DataOutputStream, File, FileWriter}
import scala.io.Source.fromFile
import scala.util.Random
import scala.util.parsing.json.JSON
/**
 * Command-line tool that writes files of randomly generated records for one or
 * more tables, driven by a JSON schema file per table.
 *
 * Expected arguments, each of the form `--key=value`:
 *  - `schemaBasePath`: directory containing one `<table>.json` schema per table
 *  - `outputBasePath`: directory under which `<table>/part-0000.csv` is written
 *  - `numRecords`: number of rows to generate per table
 *  - `tables`: comma-separated list of table names
 *
 * Each schema file is a JSON object mapping a field name to one of the type
 * names registered in [[generators]]. Fields of a row are joined with the
 * `\u0001` separator.
 */
object DBGenSansSpark {
  // Not referenced anywhere inside this object; kept so existing references keep compiling.
  val master = "local[2]"
  val appName = "testing"

  val r = scala.util.Random

  /**
   * Random value generators keyed by schema type name.
   *
   * Built once instead of on every access, since it is looked up for every
   * field of every generated row.
   */
  val generators: Map[String, Unit => Any] = Map[String, Unit => Any](
    "INT" -> (_ => r.nextInt(100)),
    "STRING" -> (_ => r.alphanumeric.take(8).mkString("")),
    "DOUBLE" -> (_ => r.nextDouble()),
    "LONG" -> (_ => r.nextLong())
  )

  type JSONMap = Map[String, String]

  /**
   * Parses `--key=value` command-line arguments into a map.
   *
   * Only the first `=` separates key from value, so values may themselves
   * contain `=` (for example paths or query strings).
   *
   * @param args raw command-line arguments
   * @return map from argument name (without the leading `--`) to its value
   * @throws IllegalArgumentException if an argument is not of the form `--key=value`
   */
  def argsParser(args: Array[String]): Map[String, String] =
    args.map { arg =>
      if (!arg.startsWith("--")) {
        throw new IllegalArgumentException(s"Expected argument of the form --key=value but got: $arg")
      }
      // A limit of 2 keeps any further '=' characters inside the value.
      arg.stripPrefix("--").split("=", 2) match {
        case Array(key, value) if key.nonEmpty => key -> value
        case _ =>
          throw new IllegalArgumentException(s"Expected argument of the form --key=value but got: $arg")
      }
    }.toMap

  /**
   * Entry point: loads every requested table schema, then writes `numRecords`
   * random rows per table to `<outputBasePath>/<table>/part-0000.csv`.
   *
   * All schemas are loaded before any output is written, so a missing or
   * malformed schema fails before files are created.
   *
   * @param args command-line arguments, see the object documentation
   */
  def main(args: Array[String]): Unit = {
    val argsMap = argsParser(args)
    val schemaBasePath = argsMap("schemaBasePath") // /Users/$user/workspace/test_data/schemas
    val outputBasePath = argsMap("outputBasePath") // /Users/$user/workspace/test_data/output
    val numRecords = argsMap("numRecords").toInt
    val tableFiles = argsMap("tables").split(",").map { tableName =>
      (tableName, s"$schemaBasePath/$tableName.json")
    }.toMap //table1,table2

    val jsonSchemasForTables: Map[String, JSONMap] = tableFiles.map {
      case (tableName, schemaPath) => tableName -> loadSchema(schemaPath)
    }

    jsonSchemasForTables.foreach {
      case (tableName, schema) =>
        val tableOutputPath = s"$outputBasePath/$tableName/part-0000.csv"
        println(s"Writing files for table: $tableName at location $tableOutputPath")
        writeRecords(new File(tableOutputPath), schema, numRecords)
        println(s"Write completed for files for table: $tableName at location $tableOutputPath")
    }
  }

  /**
   * Reads a schema file and parses it as a JSON object of field name to type name.
   *
   * @param path location of the schema file
   * @return the parsed schema
   * @throws IllegalArgumentException if the file content is not a JSON object
   */
  private def loadSchema(path: String): JSONMap = {
    val source = fromFile(path)
    // Close the file handle even if reading fails.
    val text = try source.getLines().mkString finally source.close()
    JSON.parseFull(text) match {
      case Some(fields: Map[_, _]) =>
        fields.iterator.map { case (name, tpe) => String.valueOf(name) -> String.valueOf(tpe) }.toMap
      case _ =>
        throw new IllegalArgumentException(s"Schema file $path does not contain a JSON object")
    }
  }

  /**
   * Writes `numRecords` random rows for `schema` to `file`, creating parent
   * directories as needed. Rows are generated one at a time while writing, so
   * the full data set is never held in memory.
   */
  private def writeRecords(file: File, schema: JSONMap, numRecords: Int): Unit = {
    file.getParentFile.mkdirs()
    val writer = new BufferedWriter(new FileWriter(file))
    try {
      Iterator.fill(numRecords)(generateRowForSchemaType(schema)).foreach { record =>
        writer.write(record.mkString("\u0001"))
        writer.newLine()
      }
    } finally {
      // close() flushes buffered output; running it in finally releases the handle on failure too.
      writer.close()
    }
  }

  /**
   * Generates one random value per schema field.
   *
   * Values follow the iteration order of the schema map, which is not
   * guaranteed to match the order of fields in the schema file.
   *
   * @throws NoSuchElementException if a field uses a type name missing from [[generators]]
   */
  private def generateRowForSchemaType(json: JSONMap): Seq[Any] =
    json.values.toVector.map { typeName =>
      val generate = generators.getOrElse(
        typeName,
        throw new NoSuchElementException(
          s"Unsupported schema type '$typeName'; supported types: ${generators.keys.toSeq.sorted.mkString(", ")}"
        )
      )
      generate(())
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment