This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The Google Maps library | |
import googlemaps | |
# Date time for easy computations between dates | |
from datetime import datetime | |
# JSON handling | |
import json | |
# Pandas | |
import pandas as pd | |
# Regular expressions | |
import re |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
id | _address | _d_code | _link | _price | type | _bedrooms | _bathrooms | _ber_code | _views | _latest_update | days_listed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Address Obfuscated | D14 | http://obfuscated-address | 855720.365 | HOUSE | 4 | 1 | ber_D1 | 1077 | 2020-09-22 | 0 | |
1 | Address Obfuscated | D14 | http://obfuscated-address | 177902.773 | APARTMENTS | 1 | 1 | ber_C3 | 24636 | 2020-09-23 | 0 | |
2 | Address Obfuscated | D14 | http://obfuscated-address | 543631.118 | APARTMENTS | 3 | 1 | ber_C3 | 2397 | 2020-09-23 | 0 | |
3 | Address Obfuscated | D14 | http://obfuscated-address | 330561.439 | APARTMENTS | 2 | 2 | ber_A3 | 40212 | 2020-09-22 | 0 | |
4 | Address Obfuscated | D14 | http://obfuscated-address | 626830.651 | HOUSE | 3 | 2 | ber_G | 11656 | 2020-09-23 | 0 | |
5 | Address Obfuscated | D14 | http://obfuscated-address | 606444.164 | HOUSE | 3 | 1 | ber_D2 | 7714 | 2020-09-23 | 0 | |
6 | Address Obfuscated | D14 | http://obfuscated-address | 426872.545 | HOUSE | 2 | 1 | ber_B3 | 3599 | 2020-09-22 | 0 | |
7 | Address Obfuscated | D14 | http://obfuscated-address | 203028.846 | HOUSE | 1 | 2 | ber_D1 | 2601 | 2020-09-23 | 0 | |
8 | Address Obfuscated | D14 | http://obfuscated-address | 380964.495 | HOUSE | 2 | 1 | ber_C2 | 757 | 2020-09-21 | 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the sales source table from its Parquet files.
sales_table = spark.read.parquet("./data/sales_parquet")

# Equivalent SQL for the projection built below:
#   SELECT order_id        AS the_order_id,
#          seller_id       AS the_seller_id,
#          num_pieces_sold AS the_number_of_pieces_sold
#   FROM sales_table

# Build the execution plan and trigger the show action in a single statement.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.functions import udf | |
from pyspark.sql.types import ArrayType, StringType | |
# Create the function that will be used in the UDF | |
def array_repeat_custom(element, count):
    """Return a list containing the string form of ``element`` repeated ``count`` times.

    Mirrors Spark's ``array_repeat``, but always stringifies the element so the
    wrapping UDF can declare ``ArrayType(StringType())`` as its return type.

    Args:
        element: Any value; converted to its string form once before repetition.
        count (int): Number of repetitions; ``count <= 0`` yields an empty list.

    Returns:
        list[str]: ``count`` copies of the string form of ``element``.
    """
    # Format once instead of on every iteration, and avoid binding the result
    # to the name ``list`` (the original shadowed the builtin).
    repeated = "{}".format(element)
    return [repeated for _ in range(count)]
# Convert the function into a UDF. It's good practice to indicate the return type of the UDF | |
# In this case the return type is an array of strings |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the source tables in Parquet format | |
sales_table = spark.read.parquet("./data/sales_parquet") | |
''' | |
CREATE TABLE sales_table_aggregated AS | |
SELECT COLLECT_SET(num_pieces_sold) AS num_pieces_sold_set, | |
seller_id | |
FROM sales_table | |
GROUP BY seller_id; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the sales source table from its Parquet files.
sales_table = spark.read.parquet("./data/sales_parquet")

# Equivalent SQL for the aggregation built below:
#   SELECT COLLECT_SET(num_pieces_sold)  AS num_pieces_sold_set,
#          COLLECT_LIST(num_pieces_list) AS num_pieces_sold_list,
#          seller_id
#   FROM sales_table
#   GROUP BY seller_id
# NOTE(review): COLLECT_LIST reads `num_pieces_list` while COLLECT_SET reads
# `num_pieces_sold` — looks like a typo; confirm the intended column name.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the source tables in Parquet format | |
sales_table = spark.read.parquet("./data/sales_parquet") | |
''' | |
SELECT DISTINCT REGEXP_EXTRACT(bill_raw_text, '(ab[cd]{2,4})|(aa[abcde]{1,2})') AS extracted_pattern
FROM sales_table
WHERE REGEXP_EXTRACT(bill_raw_text, '(ab[cd]{2,4})|(aa[abcde]{1,2})') <> ''
''' | |
sales_table_execution_plan = sales_table.select( | |
# The last integer indicates which group to extract |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the source tables in Parquet format | |
sales_table = spark.read.parquet("./data/sales_parquet") | |
''' | |
SELECT * | |
FROM sales_table | |
WHERE bill_raw_text RLIKE '(ab[cd]{2,4})|(aa[abcde]{1,2})' | |
''' | |
sales_table_execution_plan = sales_table.where( | |
col('bill_raw_text').rlike("(ab[cd]{2,4})|(aa[abcde]{1,2})") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT *
FROM sales_table
WHERE bill_raw_text LIKE 'ab%cd%'
'''
# Keep only rows whose bill_raw_text matches the SQL LIKE pattern:
# 'ab%cd%' = starts with "ab" and contains "cd" somewhere after it.
sales_table_execution_plan = sales_table.where(
    col('bill_raw_text').like("ab%cd%")
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import Window | |
from pyspark.sql.window import Window | |
# Read the source tables in Parquet format | |
sales_table = spark.read.parquet("./data/sales_parquet") | |
''' | |
SELECT seller_id, | |
product_id, | |
total_pieces, |