Skip to content

Instantly share code, notes, and snippets.

View AnantMishra30's full-sized avatar
🏠
Working from home

ANANT MISHRA AnantMishra30

🏠
Working from home
View GitHub Profile
from pyspark.sql.functions import broadcast
# Join condition: person's grad_section_id must equal graduateProgram's id.
joinExpr = person["grad_section_id"] == graduateProgram["id"]
# Wrap graduateProgram in the broadcast() hint before joining, then print the
# physical plan for the join with explain().
person.join(broadcast(graduateProgram), joinExpr).explain()
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [grad_section_id#596L], [id#610L], Inner, BuildRight, false
:- Project [_1#586L AS id#594L, _2#587 AS name#595, _3#588L AS grad_section_id#596L, _4#589 AS subject_enrolled_id#597]
# Join condition: person's grad_section_id must equal graduateProgram's id.
joinExpr = person["grad_section_id"] == graduateProgram["id"]
# Join without any broadcast() hint and print the physical plan with explain().
person.join(graduateProgram, joinExpr).explain()
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [grad_section_id#596L], [id#610L], Inner
:- Sort [grad_section_id#596L ASC NULLS FIRST], false, 0
: +- Exchange hashpartitioning(grad_section_id#596L, 200), ENSURE_REQUIREMENTS, [id=#5805]
: +- Project [_1#586L AS id#594L, _2#587 AS name#595, _3#588L AS grad_section_id#596L, _4#589 AS subject_enrolled_id#597]
# 1. Pass the "cross" join type to join(), together with a join condition.
joinType = "cross"
graduateProgram.join(person, joinExpression, joinType)
# 2. Call join() with neither a join condition nor a join type.
# NOTE(review): whether Spark accepts this implicit cross join depends on the
# Spark version/configuration -- confirm for the target cluster.
person.join(graduateProgram)
# 3. Use the dedicated crossJoin() method.
person.crossJoin(graduateProgram)
# Left anti join (graduateProgram is the left side): keeps the left rows that
# have no match in person.
joinType = "left_anti"
graduateProgram.join(person, joinExpression, joinType)
# Left semi join (graduateProgram is the left side): keeps the left rows that
# have a match in person, returning only the left side's columns.
joinType = "left_semi"
graduateProgram.join(person, joinExpression, joinType)
# Right outer join (person is left, graduateProgram is right): keeps every row
# of the right side, with nulls where the left side has no match.
joinType = "right_outer"
person.join(graduateProgram, joinExpression, joinType)
# Left outer join (person is left): keeps every row of the left side, with
# nulls where the right side has no match.
joinType = "left_outer"
person.join(graduateProgram, joinExpression, joinType)
# Full outer join: keeps rows from both sides, with nulls where either side
# has no match.
joinType = "outer"
person.join(graduateProgram, joinExpression, joinType)
DATAFRAME_1.join(DATAFRAME_2, JOIN_CONDITION, JOIN_TYPE)
# .join is a method that can be called on any PySpark DataFrame
# DATAFRAME_2 is a mandatory parameter
# JOIN_CONDITION specifies which key(s) are used to match rows of the two DataFrames
# JOIN_TYPE specifies which type of join to perform
# Join condition: person's grad_section_id must equal graduateProgram's id.
joinExpression = person["grad_section_id"] == graduateProgram["id"]
# No join type is passed, so join() falls back to its default (an inner join
# per the PySpark DataFrame.join documentation).
person.join(graduateProgram, joinExpression)