aialenti / google.py
# The Google Maps library
import googlemaps
# Date time for easy computations between dates
from datetime import datetime
# JSON handling
import json
# Pandas
import pandas as pd
# Regular expressions
import re
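The gist stops at the imports; a minimal, hypothetical sketch of how the googlemaps client could be used to geocode a listing (the API key and address are placeholders, not from the original):
# Initialize the client with an API key (placeholder)
gmaps = googlemaps.Client(key="YOUR_API_KEY")
# Geocode an obfuscated listing address to coordinates
geocode_result = gmaps.geocode("Some Address, Dublin 14, Ireland")
location = geocode_result[0]["geometry"]["location"]  # {'lat': ..., 'lng': ...}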
id  address             d_code  link                       price       type        bedrooms  bathrooms  ber_code  views  latest_update  days_listed
0   Address Obfuscated  D14     http://obfuscated-address  855720.365  HOUSE       4         1          ber_D1    1077   2020-09-22     0
1   Address Obfuscated  D14     http://obfuscated-address  177902.773  APARTMENTS  1         1          ber_C3    24636  2020-09-23     0
2   Address Obfuscated  D14     http://obfuscated-address  543631.118  APARTMENTS  3         1          ber_C3    2397   2020-09-23     0
3   Address Obfuscated  D14     http://obfuscated-address  330561.439  APARTMENTS  2         2          ber_A3    40212  2020-09-22     0
4   Address Obfuscated  D14     http://obfuscated-address  626830.651  HOUSE       3         2          ber_G     11656  2020-09-23     0
5   Address Obfuscated  D14     http://obfuscated-address  606444.164  HOUSE       3         1          ber_D2    7714   2020-09-23     0
6   Address Obfuscated  D14     http://obfuscated-address  426872.545  HOUSE       2         1          ber_B3    3599   2020-09-22     0
7   Address Obfuscated  D14     http://obfuscated-address  203028.846  HOUSE       1         2          ber_D1    2601   2020-09-23     0
8   Address Obfuscated  D14     http://obfuscated-address  380964.495  HOUSE       2         1          ber_C2    757    2020-09-21     0
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT order_id AS the_order_id,
seller_id AS the_seller_id,
num_pieces_sold AS the_number_of_pieces_sold
FROM sales_table
'''
# Execution Plan and show action in one line
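The snippet is cut off after this comment; a minimal sketch of the equivalent select-with-alias, following the SQL above:
from pyspark.sql.functions import col

sales_table_execution_plan = sales_table.select(
    col('order_id').alias('the_order_id'),
    col('seller_id').alias('the_seller_id'),
    col('num_pieces_sold').alias('the_number_of_pieces_sold')
)
# Print the execution plan, then trigger the computation with a show action
sales_table_execution_plan.explain()
sales_table_execution_plan.show()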
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
# Create the function that will be used in the UDF
def array_repeat_custom(element, count):
    # Return a list containing `count` string copies of the element
    return ["{}".format(element) for _ in range(count)]
# Convert the function into a UDF. It's good practice to indicate the return type of the UDF
# In this case the return type is an array of strings
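The conversion itself is missing from the gist; completing it as the comments describe (the variable name is assumed):
# Convert the function into a UDF, declaring the return type explicitly
array_repeat_custom_udf = udf(array_repeat_custom, ArrayType(StringType()))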
aialenti / explode.py
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
CREATE TABLE sales_table_aggregated AS
SELECT COLLECT_SET(num_pieces_sold) AS num_pieces_sold_set,
seller_id
FROM sales_table
GROUP BY seller_id;
'''
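The matching DataFrame code isn't in the gist; a minimal sketch of the same aggregation:
from pyspark.sql.functions import collect_set, col

sales_table_aggregated = sales_table.groupBy(col('seller_id')).agg(
    # COLLECT_SET deduplicates the values while aggregating
    collect_set(col('num_pieces_sold')).alias('num_pieces_sold_set')
)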
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT COLLECT_SET(num_pieces_sold) AS num_pieces_sold_set,
COLLECT_LIST(num_pieces_sold) AS num_pieces_sold_list,
seller_id
FROM sales_table
GROUP BY seller_id
'''
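Again, the DataFrame code is cut off; a sketch collecting both the set and the list:
from pyspark.sql.functions import collect_set, collect_list, col

sales_table_execution_plan = sales_table.groupBy(col('seller_id')).agg(
    # The set drops duplicates, the list keeps every value
    collect_set(col('num_pieces_sold')).alias('num_pieces_sold_set'),
    collect_list(col('num_pieces_sold')).alias('num_pieces_sold_list')
)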
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT DISTINCT REGEXP_EXTRACT(bill_raw_text, '(ab[cd]{2,4})|(aa[abcde]{1,2})') AS extracted_pattern
FROM sales_table
WHERE REGEXP_EXTRACT(bill_raw_text, '(ab[cd]{2,4})|(aa[abcde]{1,2})') <> ''
'''
from pyspark.sql.functions import regexp_extract, col

sales_table_execution_plan = sales_table.select(
    # The last integer indicates which group to extract; 0 returns the whole match
    regexp_extract(col('bill_raw_text'), '(ab[cd]{2,4})|(aa[abcde]{1,2})', 0).alias('extracted_pattern')
).where(col('extracted_pattern') != '').distinct()
aialenti / rlike.py
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT *
FROM sales_table
WHERE bill_raw_text RLIKE '(ab[cd]{2,4})|(aa[abcde]{1,2})'
'''
from pyspark.sql.functions import col

sales_table_execution_plan = sales_table.where(
    col('bill_raw_text').rlike("(ab[cd]{2,4})|(aa[abcde]{1,2})")
)
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT *
FROM sales_table
WHERE bill_raw_text LIKE 'ab%cd%'
'''
sales_table_execution_plan = sales_table.where(
    col('bill_raw_text').like("ab%cd%")
)
# Import Window
from pyspark.sql.window import Window
# Read the source tables in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
SELECT seller_id,
product_id,
total_pieces,