Skip to content

Instantly share code, notes, and snippets.

#import numpy as np
#import pandas as pd
#from tqdm import tqdm
#import cv2
#from src.commons import utils
#from constants import
def resize_images(data, size=None):
'''
#import pandas as pd
#from constants import *
#from src.commons import utils
#import matplotlib.pyplot as plt
#import numpy as np
# Read the list of all paintings (scraped data)
data = pd.read_csv('{}/data.csv'.format(DATA_FOLDER))
for year in data['year'].unique():
full_data = pd.DataFrame()
// NOTE(review): this region is a collection of independent Spark shell /
// notebook cells (the repeated `val t0` / `val t1` declarations would not
// compile as one Scala program — they are separate timed experiments from
// a join-tuning walkthrough). Only one unambiguous defect is fixed below:
// the dropped character in "al t0" -> "val t0". Everything else is kept
// behaviorally as-is, with review notes where something looks suspicious.

// --- Cell 1: baseline shuffle (sort-merge) join, timed with nanoTime ---
val t0 = System.nanoTime()
// Create the Execution Plan
fact_table = fact_table.join(dimension_table,
  fact_table.col("dimension_id") === dimension_table.col("id"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()

// --- Cell 2: same join, but hinting a broadcast of the small dimension ---
val t0 = System.nanoTime()
// Create the Execution Plan
fact_table = fact_table.join(broadcast(dimension_table), // Here's the magic!
  fact_table.col("dimension_id") === dimension_table.col("id"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()

// --- Cell 3: broadcast join against a second dimension table ---
val t0 = System.nanoTime()
// Create the Execution Plan
// NOTE(review): the right side is dimension_table2 but the join condition
// references dimension_table.col("id") — presumably this should be
// dimension_table2.col("id"); confirm against the original article before
// changing, since as written it is a (valid) condition on the wrong table.
fact_table = fact_table.join(broadcast(dimension_table2),
  fact_table.col("dimension_2_id") === dimension_table.col("id"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()

// --- Cell 4: repartition on a well-distributed column before joining ---
// FIX: original line read "al t0 = ..." — the leading 'v' of `val` was lost
// (every sibling cell declares `val t0`).
val t0 = System.nanoTime()
// Do repartitioning
fact_table = fact_table.repartition(200, col("uniformly_distributed_column"))
// Create the Execution Plan
// NOTE(review): same dimension_table vs dimension_table2 mismatch as Cell 3.
fact_table = fact_table.join(broadcast(dimension_table2),
  fact_table.col("dimension_2_id") === dimension_table.col("id"))
// Perform an action to run the execution

// --- Cell 5: key-salting — append a 0..999 suffix to spread a skewed key ---
// Create a dummy column which is a number between 0 and 1000, then append this suffix to the dimension_2_id column
fact_table = fact_table.withColumn("dummy", monotonically_increasing_id % 1000).
  withColumn("dimension_2_id_suffix",concat(col("dimension_2_id"),lit("-"), col("dummy")))

// --- Cell 6: force a shuffle join by disabling auto-broadcast, cache dim ---
// The following row avoids the broadcasting, the dimension_table2
// is very small and my configuration would broadcast it
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
// I'm using caching to simplify the DAG
dimension_table2.cache
dimension_table2.count

// --- Cell 7: round-robin repartition variant of the same experiment ---
fact_table = fact_table.repartition(400)
// The following row avoids the broadcasting, the dimension_table2 is very small
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
// I'm using caching to simplify the DAG
dimension_table2.cache
dimension_table2.count

// --- Cell 8: align partitioners by giving both sides the same column name ---
// One way to use the same partitioner is to partition on a column with the same name,
// let's rename the columns that we want to join
fact_table = fact_table.withColumnRenamed("dimension_2_key", "repartition_id")
@aialenti
aialenti / file.py
Last active December 10, 2019 20:59
# Import Pandas
import pandas as pd
# Import PuLP modeler functions
from pulp import *
# Math functions for distance calculation
import math
# Networkx to get connected components and subtours
import networkx as nx
# Matplotlib for debugging
import matplotlib.pyplot as plt