Skip to content

Instantly share code, notes, and snippets.

@anthony-cros
Created February 23, 2021 23:08
Show Gist options
  • Save anthony-cros/4a3a9ae9a31881d9fd85c4c67b8a5559 to your computer and use it in GitHub Desktop.
Save anthony-cros/4a3a9ae9a31881d9fd85c4c67b8a5559 to your computer and use it in GitHub Desktop.
import gallia._ // see https://github.com/galliaproject/gallia-core/blob/init/README.md#dependencies
// ===========================================================================
/**
 * Small runnable tour of Gallia operations (add, generate, filter, count, reduce, union,
 * join, value extraction, CSV streaming), applied to a tiny in-memory football dataset.
 *
 * Each step in [[main]] prints its result to stdout; the expected output is reproduced
 * in the comments that follow the step.
 */
object StartUsingSparkWithScala { // see https://towardsdatascience.com/stop-using-pandas-and-start-using-spark-with-scala-f7364077c2e0 by Chloe Connor

  // test meta

  /** Schema of one team row; field names mirror the keys used in `teams` and are used to read the CSV in `main`. */
  case class FootballTeam(
      name             : String,
      league           : String,
      matches_played   : Int,
      goals_this_season: Int,
      top_goal_scorer  : String,
      wins             : Int)

  // ---------------------------------------------------------------------------
  /** Schema of one player row; `injury` is optional, which is why `players` passes this schema explicitly. */
  case class Player(
      player_name : String,
      goals_scored: Int,
      injury      : Option[String])

  // ===========================================================================
  // test data

  /** Two Premier League teams; the base dataset for most steps in `main`. */
  val teams: HeadS =
    bobjs(
      bobj(
        "name"              -> "Brighton and Hove Albion",
        "league"            -> "Premier League",
        "matches_played"    -> 29,
        "goals_this_season" -> 32,
        "top_goal_scorer"   -> "Neil Maupay",
        "wins"              -> 6),
      bobj(
        "name"              -> "Manchester City",
        "league"            -> "Premier League",
        "matches_played"    -> 28,
        "goals_this_season" -> 33,
        "top_goal_scorer"   -> "Sergio Aguero",
        "wins"              -> 6))

  // ---------------------------------------------------------------------------
  /** A single extra team with the same fields as `teams`; used for the union step. */
  val anotherTeams: HeadS =
    bobjs(
      bobj(
        "name"              -> "Nankatsu City",
        "league"            -> "Japanese highschool league",
        "matches_played"    -> 241,
        "goals_this_season" -> 5103,
        "top_goal_scorer"   -> "Tsubasa Ozora",
        "wins"              -> 241))

  // ---------------------------------------------------------------------------
  /** Player rows joined onto `teams` via `top_goal_scorer` / `player_name`. */
  val players: HeadS =
    aobjs(cls[Player])( // have to provide schema because of Option
      obj(
        // NOTE(review): spelled "Maupey" here but "Maupay" in `teams`; the join output
        // documented in `main` shows no player fields for Brighton, which is consistent
        // with this spelling - confirm whether the mismatch is intentional
        "player_name"  -> "Neil Maupey",
        "goals_scored" -> 5
        /* no injury */),
      obj(
        "player_name"  -> "Sergio Aguero",
        "goals_scored" -> 4,
        "injury"       -> "knee"))

  // ===========================================================================
  /**
   * Runs every step in sequence, printing each result to stdout.
   *
   * @param args optional; if present, the first argument is used as the path of the teams
   *             CSV for the last step, otherwise "/data/misc/footballs_teams.csv" is used
   */
  def main(args: Array[String]): Unit = {
    // each val below is bound to the dataset itself (not to the result of printing it),
    // so the name describes what it holds - same pattern as augmentedTeams further down
    val newTeams = teams.add("sport" -> "football")
    newTeams.printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6,"sport":"football"}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6,"sport":"football"}

    // ---------------------------------------------------------------------------
    println()
    val teamsWithLeague = teams.generate("premier_league").from("league").using(_ == "Premier League")
    teamsWithLeague.printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6,"premier_league":true}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6,"premier_league":true}

    // ---------------------------------------------------------------------------
    println()
    val filteredTeams = teams.filterBy("goals_this_season").matches(_ > 50)
    filteredTeams.printJsonl() // empty

    // ---------------------------------------------------------------------------
    println()
    teams.countBy("league").printJsonl() // {"league":"Premier League","_count":2}

    // ---------------------------------------------------------------------------
    println()
    teams
      .reduce(
        // Symbol(...) rather than the 'name literal, which is deprecated since Scala 2.13;
        // a plain String key does not work here yet (see note t210223175659 below)
        Symbol("matches_played")   .mean, // could add more than mean here, eg: "matches_played".aggregates(_.mean, _.stdev),
        Symbol("goals_this_season").count)
      .printJson()
      //{"matches_played":28.5,"goals_this_season":2}

    // ---------------------------------------------------------------------------
    println()
    teams.union(anotherTeams).printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6}
      //{"name":"Nankatsu City","league":"Japanese highschool league","matches_played":241,"goals_this_season":5103,"top_goal_scorer":"Tsubasa Ozora","wins":241}

    // ---------------------------------------------------------------------------
    println()
    val augmentedTeams = // will need it again below
      teams.join(players) {
        _.left
          .on(leftKey  = "top_goal_scorer",
              rightKey = "player_name" /* right-hand key is discarded */) }
    augmentedTeams.printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6,"goals_scored":4,"injury":"knee"}

    // ---------------------------------------------------------------------------
    println()
    val teamNames: Seq[String] = teams.forceStrings("name")
    println(teamNames) // List(Brighton and Hove Albion, Manchester City)

      // or equivalently (results intentionally discarded, shown for illustration only):
      teams.forceTypeds[String]("name")
      teams.grab(_.string("name")).forceValue
      teams.collectValues(_.grab(_.string("name")))

    // ---------------------------------------------------------------------------
    println()
    val filteredPlayers =
      augmentedTeams
        .convert("name").toRequired // FIXME: t210223175332 - this seems to be a bug in Gallia, left join should not make left fields optional
        .filterBy(_.string(/* team */ "name")) // MUST use explicit type for the contains below to work
          .matches(teamNames.contains)
    filteredPlayers.printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6,"goals_scored":4,"injury":"knee"}

    // ---------------------------------------------------------------------------
    println()
    // the CSV location can be overridden by the first program argument; the original hard-coded path stays the default
    val csvPath: String = args.headOption.getOrElse("/data/misc/footballs_teams.csv")
    val footballTeams   = csvPath.stream[FootballTeam] /* header=true is the default */
    footballTeams.printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6}
  }

  // Note: t210223175659 - need to allow "goals_this_season".count above (failed unless Symbol)
}
// ===========================================================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment