Skip to content

Instantly share code, notes, and snippets.

@piyo7
Created December 3, 2018 01:06
Show Gist options
  • Save piyo7/618fea6513cd1d0e5261494171dec123 to your computer and use it in GitHub Desktop.
Save piyo7/618fea6513cd1d0e5261494171dec123 to your computer and use it in GitHub Desktop.
SparkのDataFrame/Datasetって型安全なの? ref: https://qiita.com/piyo7/items/15a37e300e78795b09db
sbt.version=1.2.6
// Accept Java 8 source and emit Java 8 bytecode for any Java sources.
javacOptions ++= Seq("-source", "1.8", "-target", "1.8")

scalaVersion := "2.12.7"

// Both Spark modules share one version, so it is spelled out once.
libraryDependencies ++= {
  val sparkVersion = "2.4.0"
  Seq(
    "org.apache.spark" %% "spark-core" % sparkVersion,
    "org.apache.spark" %% "spark-sql" % sparkVersion
  )
}
// Record with a `name` and a `height` field.
case class Person(name: String, height: Int)
// Declaration only: this snippet assigns no value to `people`.
val people: Dataset[Person]
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
// Session running against the "local[*]" master.
val spark = SparkSession
  .builder()
  .master("local[*]")
  .getOrCreate()

// Needed for the implicit encoders and the `toDS()` syntax used below.
import spark.implicits._

// Row type of the sample data: a name and a height.
case class Person(name: String, height: Int)

// Three sample rows turned into a typed Dataset.
val people: Dataset[Person] = {
  val rows = Seq(
    Person("Sayori", 157),
    Person("Natsuki", 149),
    Person("Yuri", 165)
  )
  rows.toDS()
}
+-------+------+
|   name|height|
+-------+------+
| Sayori|   157|
|Natsuki|   149|
|   Yuri|   165|
+-------+------+
// Projects the "name" column out of `heights` and prints it.
// NOTE(review): `heights` is declared elsewhere in this file as `people.as[Height]`,
// yet the table that follows still lists the names — so `as[Height]` evidently
// did not drop the `name` column from the underlying data.
heights.select("name").show()
+-------+
|   name|
+-------+
| Sayori|
|Natsuki|
|   Yuri|
+-------+
// Target type with six fields (m, o, n, i, k, a), each typed `Just[Monika]`.
// NOTE(review): `Just` is not defined anywhere in the visible source — confirm where it comes from.
case class Monika(m: Just[Monika], o: Just[Monika], n: Just[Monika], i: Just[Monika], k: Just[Monika], a: Just[Monika])
// Spells out all six column names by hand, then views the result as Dataset[Monika].
people.select("m", "o", "n", "i", "k", "a").as[Monika]
/**
 * Extension syntax that adds a schema-driven projection to a [[Dataset]].
 *
 * @param dataset the dataset being extended
 * @tparam T element type of the wrapped dataset
 */
implicit class DataSetHelper[T](val dataset: Dataset[T]) extends AnyVal {

  /**
   * Selects one column per field of `U`'s encoder schema, casts each column to
   * that field's data type, and returns the projection typed as `Dataset[U]`.
   *
   * @param encoder encoder for `U`; its schema supplies the column names and types
   * @tparam U target element type
   * @return the projected dataset, viewed as `Dataset[U]`
   */
  def selectAs[U](implicit encoder: Encoder[U]): Dataset[U] = {
    val projection =
      for (field <- encoder.schema.fields)
        yield new ColumnName(field.name).cast(field.dataType)
    dataset.select(projection: _*).as[U]
  }
}
// Uses the `selectAs` extension: the column list is taken from Monika's encoder
// schema instead of being written out by hand.
people.selectAs[Monika]
// Rewrites every row's name to "Monika"; the other field is copied unchanged.
val justMonika: Dataset[Person] = people.map(person => person.copy(name = "Monika"))
// Keeps only the rows whose name is exactly "Monika".
val justMonika: Dataset[Person] = people.filter(person => person.name == "Monika")
+-------+------+
|   name|height|
+-------+------+
| Sayori|   157|
|Natsuki|   149|
|   Yuri|   165|
+-------+------+
// Selecting by column name gives back an untyped DataFrame, not a Dataset[Person].
val names: DataFrame = people.select("name")
+-------+
|   name|
+-------+
| Sayori|
|Natsuki|
|   Yuri|
+-------+
[error] (run-main-2) org.apache.spark.sql.AnalysisException: cannot resolve '`age`' given input columns: [name, height];;
[error] 'Project ['age]
[error] +- LocalRelation [name#2, height#3]
[error] org.apache.spark.sql.AnalysisException: cannot resolve '`age`' given input columns: [name, height];;
[error] 'Project ['age]
[error] +- LocalRelation [name#2, height#3]
// View type with a single `height` field.
case class Height(height: Int)
// Views `people` as Dataset[Height].
val heights: Dataset[Height] = people.as[Height]
// Type whose only field, `age`, is not one of the columns of `people` (name, height).
case class Age(age: Int)
// The error log that follows in this file reports an AnalysisException for this
// conversion: `age` cannot be resolved against the input columns [name, height].
val ages: Dataset[Age] = people.as[Age]
[error] (run-main-0) org.apache.spark.sql.AnalysisException: cannot resolve '`age`' given input columns: [name, height];
[error] org.apache.spark.sql.AnalysisException: cannot resolve '`age`' given input columns: [name, height];
// View type with a single `height` field.
case class Height(height: Int)
// Views `people` as Dataset[Height].
val heights: Dataset[Height] = people.as[Height]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment