Skip to content

Instantly share code, notes, and snippets.

#import numpy as np
#import pandas as pd
#from tqdm import tqdm
#import cv2
#from src.commons import utils
#from constants import
def resize_images(data, size=None):
'''
#import pandas as pd
#from constants import *
#from src.commons import utils
#import matplotlib.pyplot as plt
#import numpy as np
# Read the list of all paintings (scraped data)
data = pd.read_csv('{}/data.csv'.format(DATA_FOLDER))
for year in data['year'].unique():
full_data = pd.DataFrame()
// NOTE(review): this region is a collection of independent Spark shell /
// notebook cells (the repeated `val t0` / `val t1` declarations would not
// compile as one Scala program — they are separate timed experiments from
// a join-tuning walkthrough). Only one unambiguous defect is fixed below:
// the dropped character in "al t0" -> "val t0". Everything else is kept
// behaviorally as-is, with review notes where something looks suspicious.

// --- Cell 1: baseline shuffle (sort-merge) join, timed with nanoTime ---
val t0 = System.nanoTime()
// Create the Execution Plan
fact_table = fact_table.join(dimension_table,
  fact_table.col("dimension_id") === dimension_table.col("id"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()

// --- Cell 2: same join, but hinting a broadcast of the small dimension ---
val t0 = System.nanoTime()
// Create the Execution Plan
fact_table = fact_table.join(broadcast(dimension_table), // Here's the magic!
  fact_table.col("dimension_id") === dimension_table.col("id"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()

// --- Cell 3: broadcast join against a second dimension table ---
val t0 = System.nanoTime()
// Create the Execution Plan
// NOTE(review): the right side is dimension_table2 but the join condition
// references dimension_table.col("id") — presumably this should be
// dimension_table2.col("id"); confirm against the original article before
// changing, since as written it is a (valid) condition on the wrong table.
fact_table = fact_table.join(broadcast(dimension_table2),
  fact_table.col("dimension_2_id") === dimension_table.col("id"))
// Perform an action to run the execution
fact_table.count
val t1 = System.nanoTime()

// --- Cell 4: repartition on a well-distributed column before joining ---
// FIX: original line read "al t0 = ..." — the leading 'v' of `val` was lost
// (every sibling cell declares `val t0`).
val t0 = System.nanoTime()
// Do repartitioning
fact_table = fact_table.repartition(200, col("uniformly_distributed_column"))
// Create the Execution Plan
// NOTE(review): same dimension_table vs dimension_table2 mismatch as Cell 3.
fact_table = fact_table.join(broadcast(dimension_table2),
  fact_table.col("dimension_2_id") === dimension_table.col("id"))
// Perform an action to run the execution

// --- Cell 5: key-salting — append a 0..999 suffix to spread a skewed key ---
// Create a dummy column which is a number between 0 and 1000, then append this suffix to the dimension_2_id column
fact_table = fact_table.withColumn("dummy", monotonically_increasing_id % 1000).
  withColumn("dimension_2_id_suffix",concat(col("dimension_2_id"),lit("-"), col("dummy")))

// --- Cell 6: force a shuffle join by disabling auto-broadcast, cache dim ---
// The following row avoids the broadcasting, the dimension_table2
// is very small and my configuration would broadcast it
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
// I'm using caching to simplify the DAG
dimension_table2.cache
dimension_table2.count

// --- Cell 7: round-robin repartition variant of the same experiment ---
fact_table = fact_table.repartition(400)
// The following row avoids the broadcasting, the dimension_table2 is very small
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
// I'm using caching to simplify the DAG
dimension_table2.cache
dimension_table2.count

// --- Cell 8: align partitioners by giving both sides the same column name ---
// One way to use the same partitioner is to partition on a column with the same name,
// let's rename the columns that we want to join
fact_table = fact_table.withColumnRenamed("dimension_2_key", "repartition_id")
@aialenti
aialenti / file.py
Last active December 10, 2019 20:59
# Import Pandas
import pandas as pd
# Import PuLP modeler functions
from pulp import *
# Math functions for distance calculation
import math
# Networkx to get connected components and subtours
import networkx as nx
# Matplotlib for debugging
import matplotlib.pyplot as plt