Skip to content

Instantly share code, notes, and snippets.

@schmohlio
Created June 3, 2015 02:17
Show Gist options
  • Save schmohlio/f3d6866b9b3174f1fb1a to your computer and use it in GitHub Desktop.
Save schmohlio/f3d6866b9b3174f1fb1a to your computer and use it in GitHub Desktop.
copy all missing servers based on instructions
#!/usr/bin/env python
'''
DataSync
Builds instructions for copying datasets to the data centers that are
missing them, based on the input data.
- Ensures that each data center has a copy of every dataset.
- Every dataset is included in at least one data center.
Makes use of set operations.
'''
import fileinput
class DataSync(object):
    '''Plan the copies needed so that every data center holds every dataset.

    Data centers are numbered 1..num_centers. The planner reads one line
    of dataset ids per data center, records where each dataset currently
    lives, and produces (dataset_id, from_center, to_center) instructions
    for every data center that is missing a dataset.
    '''

    # Not referenced anywhere inside this class; kept for external users.
    MAX_LINE_N = 10000

    # may want to expand to other types of input,
    # i.e. a list that can be sorted in place

    def __init__(self, num_centers):
        '''Create a planner for data centers numbered 1..num_centers.'''
        # could use this to assert that all inputs were read.
        self.num_centers = num_centers
        # set of datacenters, parameterized on 1..N
        self.datacenters = set(range(1, self.num_centers + 1))
        # hashmap of dataset id -> set of data centers holding a copy
        self.dataset_locs = {}
        # instructions to print. list of 3-tuples
        self.instructions = []

    @staticmethod
    def clean_line(line):
        '''Parse a line of whitespace-separated integers into a list of ints.

        Splitting on arbitrary whitespace tolerates trailing newlines,
        carriage returns and repeated spaces; a blank line yields [].

        Raises:
            ValueError: if a token is not a valid integer.
        '''
        return [int(token) for token in line.split()]

    def _persist_dataset_locations(self, lines):
        '''Record, for every dataset id, the set of data centers holding it.

        The i-th non-blank line (1-based) lists the dataset ids stored in
        data center i. Blank or whitespace-only lines are dropped before
        numbering, so they do not consume a data center index.

        Returns:
            True, as a status flag.
        '''
        rows = [self.clean_line(raw) for raw in lines if raw.strip()]
        for index, ds_list in enumerate(rows, 1):
            for ds_id in ds_list:
                self.dataset_locs.setdefault(ds_id, set()).add(index)
        return True  # status

    def create_instructions_from_log(self, lines):
        '''Build the copy instructions for the given per-center lines.

        Appends one (dataset_id, from_center, to_center) tuple to
        self.instructions for each data center missing a dataset.

        Returns:
            self, so calls can be chained.
        '''
        self._persist_dataset_locations(lines)
        for ds_id, dc_set in self.dataset_locs.items():
            missing = self.datacenters - dc_set
            if not missing:
                continue
            # Any holder is a valid source; the lowest-numbered one is
            # picked so the output is deterministic. may want to improve
            from_id = min(dc_set)
            self.instructions.extend(
                (ds_id, from_id, to_dc) for to_dc in sorted(missing))
        return self

    def show(self):
        '''Print one "dataset from to" line per instruction, then "done".'''
        template = "%d %d %d"
        # Single-argument print(...) behaves the same on Python 2 and 3.
        for ds_id, from_dc, to_dc in self.instructions:
            print(template % (ds_id, from_dc, to_dc))
        print('done')
def main():
    '''Read the input, plan the copies and print the instructions.

    Input comes from fileinput (files named on the command line, or
    stdin). The first line is the number of data centers; the remaining
    lines are handed to DataSync as the per-center dataset listing.
    '''
    raw_lines = list(fileinput.input())
    center_count = int(raw_lines[0])
    log_lines = raw_lines[1:]
    planner = DataSync(center_count)
    planner.create_instructions_from_log(log_lines).show()
# Run only when executed as a script, so that importing this module does
# not start reading input.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment