Skip to content

Instantly share code, notes, and snippets.

// The following row avoids the broadcasting, the dimension_table2 is very small
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
// I'm using caching to simplify the DAG
dimension_table2.cache
dimension_table2.count
// One way to use the same partitioner is to partition on a column with the same name,
// let's rename the columns that we want to join
fact_table = fact_table.withColumnRenamed("dimension_2_key", "repartition_id")
@aialenti
aialenti / file.py
Last active December 10, 2019 20:59
# Import Pandas
import pandas as pd
# Import PuLP modeler functions
from pulp import *
# Math functions for distance calculation
import math
# Networkx to get connected components and subtours
import networkx as nx
# Matplotlib for debugging
import matplotlib.pyplot as plt
class TSP():
cities = None
santa = None
variables_dict = None
x = None
path = None
def __init__(self, population_lower_bound=10e5 * 3):
cities = pd.read_csv("worldcities.csv")
self.cities = cities.loc[cities["population"] >= population_lower_bound][["city","lat","lng","population"]].reset_index()
@aialenti
aialenti / file.py
Last active December 13, 2019 17:08
# Import Pandas
import pandas as pd
# Import PuLP modeler functions
from pulp import *
# Math functions for distance calculation
import math
# Networkx to get connected components and subtours
import networkx as nx
# Matplotlib for debugging
import matplotlib.pyplot as plt
def calculate_distance(self, lat_a, lng_a, lat_b, lng_b):
# Convert lat lng in radians
lng_a, lat_a, lng_b, lat_b = map(math.radians, [lng_a, lat_a, lng_b, lat_b])
d_lat = lat_b - lat_a
d_lng = lng_a - lng_b
temp = (
math.sin(d_lat / 2) ** 2
+ math.cos(lat_a)
def build_model(self):
# Initialize the problem
santa = LpProblem("santa", LpMinimize)
# Generate distances
w = h = self.cities.shape[0]
distances = [[0 for x in range(w)] for y in range(h)]
for index_a, row_a in tqdm(self.cities.iterrows(), total=self.cities.shape[0]):
lat_a = row_a["lat"]
lon_a = row_a["lng"]
@aialenti
aialenti / file.py
Last active December 13, 2019 13:26
def build_graph_from_current_solution(self):
varsdict = {}
for v in self.santa.variables():
if v.varValue == 1:
varsdict[v.name] = v.varValue
G = nx.Graph()
# Add nodes to the Graph
for index_a, row_a in self.cities.iterrows():
lat_a = row_a["lat"]
def solve(self):
# Now, the model is missing of the subtour elimination constraints.
# The number of these constraints is 2^N-2 where N is the number of cities selected. Too many.
# What we can do is:
# 1. Solve the problem without the Subtour elimination constraints
# 2. Get the solution and see if there are subtours of length lower than N
# 3. Add the constraints that exclude those subtours
# 4. Goto 1 and repeat until there are no more subtours
while True:
self.santa.solve(solvers.PULP_CBC_CMD(msg=True))
tsp = TSP()
tsp.build_model()
tsp.solve()
tsp.extract_solution()
@aialenti
aialenti / tot.py
Last active December 16, 2019 13:59
# Import Pandas - List of cities can be found here https://simplemaps.com/data/world-cities
import pandas as pd
# Import PuLP modeler functions
from pulp import *
# Math functions for distance calculation
import math
# Networkx to get connected components and subtours
import networkx as nx
# Matplotlib for debugging
import matplotlib.pyplot as plt