@1ambda
Created December 27, 2021 12:29
# dfListingSales.explain("FORMATTED")
== Physical Plan ==
AdaptiveSparkPlan (14)
+- Project (13)
   +- SortMergeJoin LeftOuter (12)
      :- Sort (4)
      :  +- Exchange (3)
      :     +- Project (2)
      :        +- Scan csv (1)
      +- Sort (11)
         +- HashAggregate (10)
            +- Exchange (9)
               +- HashAggregate (8)
                  +- Project (7)
                     +- Filter (6)
                        +- Scan csv (5)
(1) Scan csv
Output [2]: [id#2732, name#2736]
Batched: false
Location: InMemoryFileIndex [dbfs:/FileStore/tables/airbnb_listings.csv]
ReadSchema: struct<id:string,name:string>

(2) Project
Output [2]: [id#2732 AS listing_id#3270, name#2736 AS listing_name#3271]
Input [2]: [id#2732, name#2736]

(3) Exchange
Input [2]: [listing_id#3270, listing_name#3271]
Arguments: hashpartitioning(cast(listing_id#3270 as int), 200), ENSURE_REQUIREMENTS, [id=#1373]

(4) Sort
Input [2]: [listing_id#3270, listing_name#3271]
Arguments: [cast(listing_id#3270 as int) ASC NULLS FIRST], false, 0

(5) Scan csv
Output [3]: [listing_id#2718, available#2720, price#2721]
Batched: false
Location: InMemoryFileIndex [dbfs:/FileStore/tables/airbnb_calendar.csv]
PushedFilters: [IsNotNull(available), EqualTo(available,f), IsNotNull(listing_id)]
ReadSchema: struct<listing_id:int,available:string,price:string>

(6) Filter
Input [3]: [listing_id#2718, available#2720, price#2721]
Condition : ((isnotnull(available#2720) AND (available#2720 = f)) AND isnotnull(listing_id#2718))

(7) Project
Output [2]: [listing_id#2718, cast(regexp_extract(price#2721, [0-9]+.[0-9]+, 0) as double) AS price#2986]
Input [3]: [listing_id#2718, available#2720, price#2721]

(8) HashAggregate
Input [2]: [listing_id#2718, price#2986]
Keys [1]: [listing_id#2718]
Functions [2]: [partial_sum(price#2986) AS sum#3076, partial_count(1) AS count#3078L]
Aggregate Attributes [2]: [sum#3075, count#3077L]
Results [3]: [listing_id#2718, sum#3076, count#3078L]

(9) Exchange
Input [3]: [listing_id#2718, sum#3076, count#3078L]
Arguments: hashpartitioning(listing_id#2718, 200), ENSURE_REQUIREMENTS, [id=#1369]

(10) HashAggregate
Input [3]: [listing_id#2718, sum#3076, count#3078L]
Keys [1]: [listing_id#2718]
Functions [2]: [finalmerge_sum(merge sum#3076) AS sum(price#2986)#3062, finalmerge_count(merge count#3078L) AS count(1)#3064L]
Aggregate Attributes [2]: [sum(price#2986)#3062, count(1)#3064L]
Results [3]: [listing_id#2718, sum(price#2986)#3062 AS price_order#3063, count(1)#3064L AS count_order#3065L]

(11) Sort
Input [3]: [listing_id#2718, price_order#3063, count_order#3065L]
Arguments: [listing_id#2718 ASC NULLS FIRST], false, 0

(12) SortMergeJoin
Left keys [1]: [cast(listing_id#3270 as int)]
Right keys [1]: [listing_id#2718]
Join condition: None

(13) Project
Output [4]: [listing_id#3270, listing_name#3271, coalesce(price_order#3063, 0.0) AS price_order#3284, coalesce(count_order#3065L, 0) AS count_order#3285L]
Input [5]: [listing_id#3270, listing_name#3271, listing_id#2718, price_order#3063, count_order#3065L]

(14) AdaptiveSparkPlan
Output [4]: [listing_id#3270, listing_name#3271, price_order#3284, count_order#3285L]
Arguments: isFinalPlan=false