Last active
September 2, 2020 23:28
-
-
Save matthewoliver/bb52d5ea6f125064d26535223141b68a to your computer and use it in GitHub Desktop.
Random untested very hacky version of a range scanner, best path picker and gap filler code.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from swift.common.utils import Timestamp, ShardRange | |
from swift.container.sharder import DEFAULT_SHARD_CONTAINER_THRESHOLD, \ | |
DEFAULT_SHARD_SHRINK_POINT | |
from itertools import chain | |
# This doesn't have to be in a class, was just trying to brainstorm and write something. | |
class RangesScanner(object): | |
def __init__(self, shard_account=None, root_conatiner=None, | |
ranges=None, shard_threshold=None, shrink_size=None): | |
# These are needed for building shardrange paths | |
self.shard_account = shard_account | |
self.root_container = root_conatiner | |
# This assumes just a list of ranges. I'd assume we'd be smart, and if we can't find a range with active shardranges | |
# we'd scan again including ranges that are FOUND or CLEAVED, to make sure we do have a valid complete range before | |
# we attempt to fix. | |
self.ranges = ranges | |
# These are used for the weight algorithm, because we don't like ranges that are too small or too big as an overlap | |
# might just be due to an cleave or shrink. | |
self.shard_threshold = shard_threshold or \ | |
DEFAULT_SHARD_CONTAINER_THRESHOLD | |
self.shrink_size = shrink_size or \ | |
DEFAULT_SHARD_CONTAINER_THRESHOLD * DEFAULT_SHARD_SHRINK_POINT | |
self.reset() | |
if ranges: | |
self.scan() | |
def reset(self): | |
self.complete = list() | |
self.frags = list() | |
self.lowers = defaultdict(list) | |
self.uppers = defaultdict(list) | |
self._scanned = False | |
def has_complete(self): | |
return self._scanned and len(self.complete) > 0 | |
def has_frags(self): | |
return self._scanned and len(self.frags) > 0 | |
def walk_range(self, lower, l=None, ranges=None, complete=True): | |
if not l: | |
l = [] | |
if not ranges: | |
ranges = list(self.ranges) | |
if lower in self.lowers: | |
multiple = len(self.lowers[lower]) > 1 | |
for sr in self.lowers[lower]: | |
if sr in ranges: | |
ranges.remove(sr) | |
if sr.upper_str == "": | |
l.append(sr) | |
if complete: | |
self.complete.append(l) | |
else: | |
self.fragments.append(l) | |
return | |
if multiple: | |
new_l = list(l) | |
new_l.append(sr) | |
self.walk_range(sr.upper_str, new_l, ranges, complete) | |
else: | |
l.append(sr) | |
self.walk_range(sr.upper_str, l, ranges, complete) | |
elif l: | |
self.fragments.append(l) | |
return | |
def scan(self, ranges=None): | |
self.reset() | |
if ranges: | |
self.ranges = ranges | |
overlaps = False | |
gaps = False | |
ranges_left = list(self.ranges) | |
for sr in self.ranges: | |
self.lowers[sr.lower_str].append(sr) | |
self.uppers[sr.upper_str].append(sr) | |
if len(self.lowers[sr.lower_str]) > 1 or len(self.uppers[sr.upper_str]) > 1: | |
overlaps = True | |
# Quick check for gaps | |
if set(self.lowers.keys()).difference(set(self.uppers.keys())): | |
gaps = True | |
# Short cut | |
if not overlaps and not gaps: | |
self.complete.append(self.ranges) | |
return | |
self.walk_range('', ranges=ranges_left) | |
while ranges_left: | |
ranges_left.sort() | |
self.walk_range(ranges_left[0].lower, ranges=ranges_left, complete=False) | |
self._scanned = True | |
def _ranges_to_remove(self, good_path): | |
remove_set = set() | |
good_set = set(good_path) | |
for path in chain(self.complete, self.frags): | |
path = set(path) | |
path.difference_update(good_set) | |
remove_set.update(path) | |
return list(remove_set) | |
def _calc_range_weight(self, shard_range): | |
# Ideally we want to choose a path with the least movement, so we | |
# and to choose a set of shard ranges that contain the most objects. | |
# But it isn't that simple. What if we've just cleaved, we don't want | |
# to pick and old un-cleaved range just to cleave it again. | |
# Also what if we've just shrunk? Then a range that is too small isn't | |
# one we actually want to pick either, because then we'd need to shrink | |
# again. | |
# Attempt 1: | |
# weight == object_count / shard_threshold | |
# Which should give us close to a percentage "full" anything over | |
# 1 really needs to shard. | |
# But, what if something has already sharded, or has just shrunk, then | |
# we don't want to pick that complete path, otherwise we need to shard | |
# or shrink again. So if it's > shard_threshold or smaller then shrink_size, | |
# we'll negate the sign. | |
# | |
# We do this because the we want to pick the path with the biggest weight | |
# as we want to cleave of the paths with less object (because data movement) | |
# so by giving them a negative they should become less attractive. | |
result = shard_range.object_count / float(self.shard_threshold) | |
if shard_range.object_count < self.shrink_size \ | |
or shard_range.object_count > self.shard_threshold: | |
return result * -1 | |
return result | |
def _find_best_complete_path(self): | |
if len(self.complete) == 1: | |
return self.complete[0] | |
# We have more then 1 complete range. This could be because: | |
# 1: We're in split brain | |
# 2: We've recently cleaved | |
# 3: We've recently started shrinking. | |
# In either case, let's walk and calulate the best path, noting | |
# that it's isn't just object count. | |
weighted_paths = [(path, sum(map(self._calc_range_weight, path))) | |
for path in self.complete] | |
return max(weighted_paths, key=lambda x: x[-1])[0] | |
def _find_and_build_best_fragment_path(self): | |
# we have 3 different types of fragments, mins, maxes and floating. | |
# That is, by definition, a fragment is a path that doesn't go from | |
# MIN to MAX. So: | |
# - MINS: fragment paths that start at MIN | |
# - MAXES: fragment paths that end at MAX | |
# - FLOATING: fragments that dont start or end at MIN or MAX | |
mins, maxes, floating = list(), list(), list() | |
for frag in self.frags: | |
if frag[0].lower == ShardRange.MIN: | |
mins.append(frag) | |
elif frag[-1].upper == ShardRange.MAX: | |
maxes.append(frag) | |
else: | |
floating.append(frag) | |
if not mins and not maxes and not floating: | |
return None | |
# When finding the best fragment path, let's make it simple, we want to | |
# build a complete path by inserting a new shard range. So the best path | |
# is a shard range that will fill the smallest gap. | |
# So let's make an assumption that we want to find the smallest gap | |
# in which case, let's start by finding the frag from mins that reaches | |
# the furthest.. or the max(mins). | |
# Then do the same for maxes, so the min(maxes). | |
def max_shard_range(frag): | |
return frag[-1].upper_str | |
def min_shard_range(frag): | |
return frag[0].lower_str | |
min_frag = max(mins, key=max_shard_range) if mins else None | |
max_frag = min(maxes, key=min_shard_range) if maxes else None | |
# Let's attempt to connect the frags up. | |
min_path = min_frag if min_frag else [] | |
while floating: | |
if not min_path: | |
min_floating = min(floating, key=min_shard_range) | |
min_path += self._make_new_range('', min_floating[0].lower_str) | |
min_path += min_floating | |
floating.remove(min_floating) | |
else: | |
# filter out everything < min_path | |
floating = [f for f in floating if f[-1] > min_path[-1]] | |
if not floating: | |
continue | |
min_floating = min(floating, key=min_shard_range) | |
# _connect_frags connects the 2, deals with overlaps and | |
# removes/add not required frags to self.frags so we can remove | |
# them later. | |
min_path = self._connect_frags(min_path, min_floating) | |
floating.remove(min_floating) | |
# Now we just need to connect the min_path to the max_frag | |
if not min_path: | |
min_path += self._make_new_range('', max_frag[0].lower_str) | |
min_path += max_frag | |
else: | |
min_path = self._connect_frags(min_path, max_frag) | |
return min_path | |
def _connect_frags(self, left_frag, right_frag): | |
# We've already filtered out fragments so the last one on the left, | |
# is < the last one on the right. Meaning the last range on left must | |
# either overlap or is well clear of the whole right range. (I think | |
# that logic makes sense) | |
overlap_map = map(left_frag[-1].overlaps, right_frag) | |
if any(overlap_map): | |
# we have overlaps so we want to find the last overlap. | |
overlap_map.reverse() | |
reverse_index = overlap_map.index(True) | |
last_match_index = len(right_frag) - 1 - reverse_index | |
# OK before me modify a SR, let's remove the existing frag from frags. | |
self.frags.remove(right_frag) | |
# and add the new smaller one | |
pre_overlap_frag = right_frag[:last_match_index] | |
if pre_overlap_frag: | |
self.frags.append(pre_overlap_frag) | |
right_frag[last_match_index].lower = left_frag[-1].upper | |
right_frag[last_match_index].state_timestamp = Timestamp.now() | |
left_frag += right_frag[last_match_index:] | |
else: | |
# just connect the 2 with a new shardrange. | |
left_frag += self._make_new_range(left_frag[-1].upper_str, | |
right_frag[0].lower_str) | |
left_frag += right_frag | |
return left_frag | |
def _make_new_range(self, lower, upper): | |
ts = Timestamp.now() | |
path = ShardRange.make_path(self.shard_account, self.root_container, | |
self.root_container, ts, 0) | |
return ShardRange(path, ts, lower, upper, state=ShardRange.FOUND) | |
def find_best_path(self): | |
ranges_to_remove = list() | |
# Rule 1: If there are complete paths these _always_ win over fragment | |
# Paths | |
if self.complete: | |
# Rule 1.1: find all the fragment ranges that aren't a part of | |
# the completed ranges to remove. | |
path = self._find_best_complete_path() | |
else: | |
# Rule 2: There are no completed paths, so we need to build | |
# the best path we can with the fragments we have. This | |
# will involve adding a shardrange(s) :S | |
path = self._find_and_build_best_fragment_path() | |
ranges_to_remove = self._ranges_to_remove(path) | |
return path, ranges_to_remove |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment