Skip to content

Instantly share code, notes, and snippets.

@fabian-thomas
Last active July 28, 2021 17:36
Show Gist options
  • Save fabian-thomas/8fff16f65f0339ae70237b0eee8d56f6 to your computer and use it in GitHub Desktop.
Save fabian-thomas/8fff16f65f0339ae70237b0eee8d56f6 to your computer and use it in GitHub Desktop.
BigDataEngineering - Plan Enumeration scripts
#!/bin/python
import sys
inputFile = open(sys.argv[1], 'r') # file to read data from
def sortAndRemoveNonAlpha(s):
return ''.join(sorted(list(''.join(filter(str.isalnum, s)))))
resultSizeMap = {}
selectivityMap = {}
readingState = 0
currentProblems=[]
involvedRelations=[]
while True:
line = inputFile.readline()
if not line:
break
elif line.startswith('#'):
continue
elif line=='\n':
readingState+=1
continue
if readingState==0:
problem=sortAndRemoveNonAlpha(line)
currentProblems.append(problem)
involvedRelations.append(problem)
resultSizeMap[problem]=int(''.join(filter(str.isdigit, inputFile.readline())))
elif readingState==1:
selectivityMap[sortAndRemoveNonAlpha(line)]=float(inputFile.readline())
while len(currentProblems)>1:
newProblems = set()
for problem in currentProblems:
for single in involvedRelations:
alreadyJoined=list(sortAndRemoveNonAlpha(problem))
if not single in alreadyJoined:
selectivity=1
for p in alreadyJoined:
pair=sortAndRemoveNonAlpha(p+single)
if pair in selectivityMap:
selectivity*=selectivityMap[pair]
newProblem=sortAndRemoveNonAlpha(problem+single)
result=int(resultSizeMap[single]*resultSizeMap[problem]*selectivity)
if newProblem in resultSizeMap:
resultSizeMap[newProblem]=min(resultSizeMap[newProblem], result)
else:
resultSizeMap[newProblem]=result
newProblems.add(newProblem)
currentProblems=newProblems
for e in resultSizeMap:
print(e)
print()
for e in resultSizeMap:
print(resultSizeMap[e])
inputFile.close()
# relations with their table sizes
A
20000
B
30000
C
50000
D
30000
# relations that are joined by some predicate and their selectivity (exclude those that are joined by cross product)
AB
0.1
AC
0.05
BD
0.02
CD
0.04
#!/bin/python
import sys
inputFile = open(sys.argv[1], 'r')
def sortAndRemoveNonAlpha(s):
return ''.join(sorted(list(''.join(filter(str.isalnum, s)))))
resultSizeMap = {}
joined = set()
subProblems = []
readingState = 0
subProblemIndex = 0
for line in inputFile.readlines():
if line.startswith('#'):
continue
elif line=='\n':
readingState+=1
continue
if readingState==0:
subProblems.append(sortAndRemoveNonAlpha(line))
elif readingState==1:
p = subProblems[subProblemIndex]
resultSizeMap[p] = int(''.join(filter(str.isdigit, line)))
subProblemIndex+=1
elif readingState==2:
joined.add(sortAndRemoveNonAlpha(line))
costs = {}
lowestCosts = {}
involvedRelations = []
currentProblems = []
for entry in resultSizeMap:
if len(entry)==1:
costs[entry]=0
lowestCosts[entry]=0
currentProblems.append(entry)
involvedRelations.append(entry)
involvedRelations=sorted(involvedRelations)
currentProblems=sorted(currentProblems)
joinCostModel=lambda x, y : x + y
joinChar='⨝'
crossChar='×'
while len(currentProblems)>1:
prettyNewProblems = []
for prettySubProblem in currentProblems:
subProblem=sortAndRemoveNonAlpha(prettySubProblem)
if costs[prettySubProblem]==lowestCosts[subProblem]:
for single in involvedRelations:
alreadyJoined=list(sortAndRemoveNonAlpha(subProblem))
if not single in alreadyJoined:
cross=True
for p in alreadyJoined:
if sortAndRemoveNonAlpha(single+p) in joined:
cross=False
if cross:
prettyNewProblem='('+prettySubProblem+crossChar+single+')'
prettyNewProblemFlipped='('+single+crossChar+prettySubProblem+')'
else:
prettyNewProblem='('+prettySubProblem+joinChar+single+')'
prettyNewProblemFlipped='('+single+joinChar+prettySubProblem+')'
if not prettyNewProblemFlipped in prettyNewProblems:
cost=costs[prettySubProblem]+joinCostModel(resultSizeMap[subProblem], resultSizeMap[single])
costs[prettyNewProblem]=cost
prettyNewProblems.append(prettyNewProblem)
newProblem=sortAndRemoveNonAlpha(prettyNewProblem)
if newProblem in lowestCosts:
lowestCosts[newProblem]=min(lowestCosts[newProblem], cost)
else:
lowestCosts[newProblem]=cost
currentProblems=prettyNewProblems
print("{:^20} {:^20} {:^20}".format("Subplan", "Costs", "Resultsize"))
for e in costs:
print("{:>20} {:>20,} {:>20,}".format(e, costs[e], resultSizeMap[sortAndRemoveNonAlpha(e)]).replace(',', '.'))
inputFile.close()
# possible subproblems
# may be extracted directly from sizes script
A
B
C
D
AB
AC
AD
BC
BD
CD
ABC
ABD
ACD
BCD
ABCD
# corresponding result sizes mapped by index to the subproblems above
# may be extracted directly from sizes script
20000
30000
50000
30000
60000000
50000000
600000000
1500000000
18000000
60000000
150000000000
36000000000
60000000000
36000000000
3600000000000
# relations that are joined by some predicate (exclude those that are joined by cross product)
# should be the same as for the sizes script just without the selectivities
AB
AC
BD
CD
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment