Created
February 23, 2021 23:08
-
-
Save anthony-cros/4a3a9ae9a31881d9fd85c4c67b8a5559 to your computer and use it in GitHub Desktop.
Examples from https://towardsdatascience.com/stop-using-pandas-and-start-using-spark-with-scala-f7364077c2e0 by Chloe Connor ported to Gallia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gallia._ // see https://github.com/galliaproject/gallia-core/blob/init/README.md#dependencies | |
// =========================================================================== | |
/**
 * Examples from "Stop using Pandas and start using Spark with Scala" by Chloe Connor,
 * ported to Gallia.
 *
 * See https://towardsdatascience.com/stop-using-pandas-and-start-using-spark-with-scala-f7364077c2e0
 *
 * Each step in [[main]] is followed by a comment recording the output it is expected to print.
 */
object StartUsingSparkWithScala {

  // test meta

  /** Row schema for a football team; used below as the schema when streaming the CSV file. */
  case class FootballTeam(
    name             : String,
    league           : String,
    matches_played   : Int,
    goals_this_season: Int,
    top_goal_scorer  : String,
    wins             : Int)

  // ---------------------------------------------------------------------------
  /** Row schema for a player; `injury` is optional, which is why `players` needs an explicit schema. */
  case class Player(
    player_name : String,
    goals_scored: Int,
    injury      : Option[String])

  // ===========================================================================
  // test data

  /** Two Premier League teams (schema is taken from the values themselves). */
  val teams: HeadS =
    bobjs(
      bobj(
        "name"              -> "Brighton and Hove Albion",
        "league"            -> "Premier League",
        "matches_played"    -> 29,
        "goals_this_season" -> 32,
        "top_goal_scorer"   -> "Neil Maupay",
        "wins"              -> 6),
      bobj(
        "name"              -> "Manchester City",
        "league"            -> "Premier League",
        "matches_played"    -> 28,
        "goals_this_season" -> 33,
        "top_goal_scorer"   -> "Sergio Aguero",
        "wins"              -> 6))

  // ---------------------------------------------------------------------------
  /** A single extra team with the same fields as `teams`, used for the union example. */
  val anotherTeams: HeadS =
    bobjs(
      bobj(
        "name"              -> "Nankatsu City",
        "league"            -> "Japanese highschool league",
        "matches_played"    -> 241,
        "goals_this_season" -> 5103,
        "top_goal_scorer"   -> "Tsubasa Ozora",
        "wins"              -> 241))

  // ---------------------------------------------------------------------------
  /** Players joined against `teams.top_goal_scorer` in the left-join example. */
  val players: HeadS =
    aobjs(cls[Player])( // have to provide schema because of Option
      obj(
        // NOTE(review): spelled "Maupey" here but "Maupay" in `teams`, so the left join below finds no
        // player for Brighton (this matches the recorded expected output) - confirm this is intentional.
        "player_name"  -> "Neil Maupey",
        "goals_scored" -> 5
        /* no injury */),
      obj(
        "player_name"  -> "Sergio Aguero",
        "goals_scored" -> 4,
        "injury"       -> "knee"))

  // ===========================================================================
  /**
   * Runs every ported example in order and prints each result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {

    // newTeams: add a constant field to every row
    teams.add("sport" -> "football").printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6,"sport":"football"}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6,"sport":"football"}

    // ---------------------------------------------------------------------------
    println()

    // teamsWithLeague: derive a boolean field from an existing one
    teams.generate("premier_league").from("league").using(_ == "Premier League").printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6,"premier_league":true}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6,"premier_league":true}

    // ---------------------------------------------------------------------------
    println()

    // filteredTeams: no team has more than 50 goals in the test data
    teams.filterBy("goals_this_season").matches(_ > 50).printJsonl() // empty

    // ---------------------------------------------------------------------------
    println()

    teams.countBy("league").printJsonl() // {"league":"Premier League","_count":2}

    // ---------------------------------------------------------------------------
    println()

    // Symbol("...") is the non-deprecated spelling of the 'symbol literal; a Symbol (rather than a
    // String) is still required here, see note t210223175659 at the bottom of this object.
    teams
      .reduce(
        Symbol("matches_played")   .mean, // could add more than mean here, eg: "matches_played".aggregates(_.mean, _.stdev),
        Symbol("goals_this_season").count)
      .printJson()
      //{"matches_played":28.5,"goals_this_season":2}

    // ---------------------------------------------------------------------------
    println()

    teams.union(anotherTeams).printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6}
      //{"name":"Nankatsu City","league":"Japanese highschool league","matches_played":241,"goals_this_season":5103,"top_goal_scorer":"Tsubasa Ozora","wins":241}

    // ---------------------------------------------------------------------------
    println()

    val augmentedTeams = // will need it again below
      teams.join(players) {
        _.left
          .on(leftKey  = "top_goal_scorer",
              rightKey = "player_name" /* right-hand key is discarded */) }

    augmentedTeams.printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6,"goals_scored":4,"injury":"knee"}

    // ---------------------------------------------------------------------------
    println()

    val teamNames: Seq[String] = teams.forceStrings("name")

    println(teamNames) // List(Brighton and Hove Albion, Manchester City)

    // or equivalently (results intentionally discarded, shown for illustration only):
    teams.forceTypeds[String]("name")
    teams.grab(_.string("name")).forceValue
    teams.collectValues(_.grab(_.string("name")))

    // ---------------------------------------------------------------------------
    println()

    // filteredPlayers: keep only the joined rows whose team name is one of `teamNames`
    augmentedTeams
      .convert("name").toRequired // FIXME: t210223175332 - this seems to be a bug in Gallia, left join should not make left fields optional
      .filterBy(_.string(/* team */ "name")) // MUST use explicit type for the contains below to work
      .matches(teamNames.contains)
      .printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6,"goals_scored":4,"injury":"knee"}

    // ---------------------------------------------------------------------------
    println()

    // footballTeams: read rows from CSV using FootballTeam as the schema.
    // NOTE(review): absolute, machine-specific path - presumably this step fails unless the file
    // exists at exactly this location; confirm before running elsewhere.
    "/data/misc/footballs_teams.csv".stream[FootballTeam] /* header=true is the default */.printJsonl()
      //{"name":"Brighton and Hove Albion","league":"Premier League","matches_played":29,"goals_this_season":32,"top_goal_scorer":"Neil Maupay","wins":6}
      //{"name":"Manchester City","league":"Premier League","matches_played":28,"goals_this_season":33,"top_goal_scorer":"Sergio Aguero","wins":6}
  }

  // Note: t210223175659 - need to allow "goals_this_season".count above (failed unless Symbol)
}
// =========================================================================== |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment