Skip to content

Instantly share code, notes, and snippets.

#import numpy as np
#import pandas as pd
#from tqdm import tqdm
#import cv2
#from src.commons import utils
#from constants import *
def resize_images(data, size=None):
    """Resize the images referenced in *data*.

    NOTE(review): only the signature and an unterminated ``'''`` survived
    the paste (the unclosed docstring made everything below it a
    SyntaxError). The implementation — presumably cv2-based, given the
    commented-out imports above — must be restored from the source gist.

    Parameters
    ----------
    data : unknown — presumably a DataFrame/iterable of image paths; TODO confirm.
    size : unknown — presumably the target size; TODO confirm.
    """
    raise NotImplementedError("original body lost in paste; restore from gist")


# ---------------------------------------------------------------------
# Script fragment — appears to come from a SECOND file in the gist:
# builds a per-year DataFrame from the scraped paintings list.
# ---------------------------------------------------------------------
# import pandas as pd
# from constants import *
# from src.commons import utils
# import matplotlib.pyplot as plt
# import numpy as np

# Read the list of all paintings (scraped data).
# NOTE(review): DATA_FOLDER comes from `constants` — not visible here.
data = pd.read_csv('{}/data.csv'.format(DATA_FOLDER))
for year in data['year'].unique():
    # NOTE(review): loop body truncated in the paste; only this
    # per-year accumulator initialisation survived.
    full_data = pd.DataFrame()
// --- Benchmark 1: plain (shuffle) join of fact_table with dimension_table.
// NOTE(review): t0/t1 are presumably read by an elapsed-time print that is
// not visible in this chunk; each benchmark snippet appears to have been run
// separately (t0/t1 are re-declared below), REPL/notebook style.
val t0 = System.nanoTime()
// Create the Execution Plan (joins are lazy — nothing runs yet).
fact_table = fact_table.join(dimension_table,
fact_table.col("dimension_id") === dimension_table.col("id"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()
// --- Benchmark 2: same join, but dimension_table is broadcast to every
// executor, avoiding the shuffle of the (large) fact table.
val t0 = System.nanoTime()
// Create the Execution Plan
fact_table = fact_table.join(broadcast(dimension_table), // Here's the magic!
fact_table.col("dimension_id") === dimension_table.col("id"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()
// --- Benchmark 3: broadcast join against the second dimension table.
val t0 = System.nanoTime()
// Create the Execution Plan.
// FIX: the join key must come from dimension_table2 — the table actually
// being joined. Referencing dimension_table.col("id") here fails attribute
// resolution (dimension_table is not part of this join).
fact_table = fact_table.join(broadcast(dimension_table2),
  fact_table.col("dimension_2_id") === dimension_table2.col("id"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()
// --- Benchmark 4: repartition on an evenly-distributed column first, then join.
val t0 = System.nanoTime() // FIX: was "al t0" — dropped leading 'v' typo
// Do repartitioning: 200 partitions keyed on a column with no skew.
fact_table = fact_table.repartition(200, col("uniformly_distributed_column"))
// Create the Execution Plan.
// FIX: join key must come from dimension_table2 (the table being joined),
// not dimension_table.
fact_table = fact_table.join(broadcast(dimension_table2),
  fact_table.col("dimension_2_id") === dimension_table2.col("id"))
// Perform an action to run the execution
// NOTE(review): the action itself was missing after the comment above —
// restored to mirror the other benchmark snippets.
fact_table.count
val t1 = System.nanoTime()
// Create a dummy column which is a number between 0 and 1000, then append this suffix to the dimension_2_id column
// NOTE(review): this looks like the key-"salting" technique for skewed joins —
// the derived 0..999 suffix spreads a hot key across many distinct values.
// (monotonically_increasing_id % 1000 is deterministic per row, not random.)
fact_table = fact_table.withColumn("dummy", monotonically_increasing_id % 1000).
withColumn("dimension_2_id_suffix",concat(col("dimension_2_id"),lit("-"), col("dummy")))
// Scala random-numbers generator — kept for parity with the original
// snippet; it is not used in the lines visible here.
val r = scala.util.Random
// Build the "population" rows: for each i in 0..1000, one List(d, i)
// with d drawn from the single-element range 1..1 (so d is always 1).
val population: List[List[Int]] =
  (0 to 1000).toList.flatMap { i =>
    (1 to 1).map(d => List(d, i))
  }
// Materialise the pairs as a two-column DataFrame.
val df = population
  .map(row => (row(0), row(1)))
  .toDF("dummy_key", "suffix")
// --- Benchmark 5: join on the salted key against the exploded dimension table.
val t0 = System.nanoTime()
// Create the Execution Plan.
// FIX: the suffixed id column lives on dimension_table2 — the table being
// joined — not on dimension_table (which would fail attribute resolution).
fact_table = fact_table.join(dimension_table2,
  fact_table.col("dimension_2_id_suffix") === dimension_table2.col("id_suffix"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()
// The following row avoids the broadcasting, the dimension_table2
// is very small and my configuration would broadcast it
// (a threshold of -1 disables automatic broadcast joins entirely).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
// I'm using caching to simplify the DAG
dimension_table2.cache
// count is an action: it materialises the cache right away.
dimension_table2.count
// Spread the fact table across 400 partitions before the benchmarks.
fact_table = fact_table.repartition(400)