@alistairncoles
Last active August 29, 2015 14:19
remove the fart from hash_cleanup_listdir
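
The patch below reworks ECDiskFileManager.hash_cleanup_listdir: rather than special-casing a directory that holds a single file, it always sorts the listing, runs gather_ondisk_files over it, and then applies reclaim rules. An isolated .durable is removed immediately, while tombstones and .data files that lack a .durable are only removed once they are older than reclaim_age. The reclaim decision itself is the new is_reclaimable helper; what follows is a minimal standalone sketch of the same check (the filename parsing is illustrative only, the real code uses parse_on_disk_filename, and ONE_WEEK here simply stands in for the diskfile.ONE_WEEK value the tests reference):

import time

ONE_WEEK = 7 * 24 * 60 * 60  # one week in seconds, stand-in for diskfile.ONE_WEEK

def parse_timestamp(filename):
    # on-disk names look like '0000000007.00000.ts' or '0000000008.00000#1.data';
    # the leading '<timestamp>' portion is what we compare against reclaim_age
    stem = filename.rsplit('.', 1)[0]   # drop the extension
    return float(stem.split('#')[0])    # drop any '#<frag_index>' suffix

def is_reclaimable(filename, reclaim_age=ONE_WEEK):
    # True if the file's encoded timestamp is older than reclaim_age seconds
    return (time.time() - parse_timestamp(filename)) > reclaim_age

With that check in hand, the rewritten method reclaims an expired .ts, keeps a non-durable .data until it expires, and leaves whatever survives in the directory.
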
diff --git a/swift/obj/diskfile.py b/swift/obj/diskfile.py
index f1df4d9..e6f25ee 100644
--- a/swift/obj/diskfile.py
+++ b/swift/obj/diskfile.py
@@ -2063,10 +2063,16 @@ class ECDiskFileManager(DiskFileManager):
# been found so far as the valid fileset
discard()
set_valid_fileset()
- elif ext == '.data' and have_valid_fileset():
- # valid fileset without a .durable means we must have a newer .ts,
- # so discard the older .data file
- discard()
+ elif ext == '.data':
+ # not yet found a .durable
+ if have_valid_fileset():
+ # valid fileset means we must have a newer
+ # .ts, so discard the older .data file
+ discard()
+ else:
+ # .data newer than a .durable or .ts, don't discard yet
+ context.setdefault('fragments_without_durable', []).append(
+ filename)
elif ext == '.ts':
if have_valid_fileset() or not accept_first():
# newer .data, .durable or .ts already found so discard this
@@ -2080,7 +2086,7 @@ class ECDiskFileManager(DiskFileManager):
# newer .data, .durable or .ts already found so discard this
discard()
else:
- # ignore unexpected files e.g. .data newer than .durable or junk
+ # ignore unexpected files
pass
return have_valid_fileset()
@@ -2114,7 +2120,7 @@ class ECDiskFileManager(DiskFileManager):
and ts_file is None))
def gather_ondisk_files(self, files, include_obsolete=False,
- frag_index=None, **kwargs):
+ frag_index=None, verify=False, **kwargs):
"""
Given a simple list of files names, iterate over them to determine the
files that constitute a valid object, and optionally determine the
@@ -2153,10 +2159,11 @@ class ECDiskFileManager(DiskFileManager):
if not include_obsolete:
break
- assert self._verify_on_disk_files(
- results, frag_index=frag_index, **kwargs), \
- "On-disk file search algorithm contract is broken: %s" \
- % results.values()
+ if verify:
+ assert self._verify_on_disk_files(
+ results, frag_index=frag_index, **kwargs), \
+ "On-disk file search algorithm contract is broken: %s" \
+ % results.values()
return results
def get_ondisk_files(self, files, datadir, **kwargs):
@@ -2168,7 +2175,7 @@ class ECDiskFileManager(DiskFileManager):
:returns: a tuple of data, meta, and tombstone
"""
# maintain compatibility with 'legacy' get_ondisk_files return value
- accepted_files = self.gather_ondisk_files(files, **kwargs)
+ accepted_files = self.gather_ondisk_files(files, verify=True, **kwargs)
result = [(join(datadir, accepted_files.get(ext))
if accepted_files.get(ext) else None)
for ext in ('.data', '.meta', '.ts')]
@@ -2189,36 +2196,23 @@ class ECDiskFileManager(DiskFileManager):
key 'obsolete'; a list of files remaining in the directory,
reverse sorted, stored under the key 'files'.
"""
+ def is_reclaimable(filename):
+ timestamp = self.parse_on_disk_filename(filename)['timestamp']
+ return (time.time() - float(timestamp)) > reclaim_age
+
files = listdir(hsh_path)
- results = {}
- if len(files) == 1:
- # to remain compatible with legacy hash_cleanup_listdir, we have to
- # allow for a single junk file in dir that may violate ondisk
- # contract, so cannot call gather_ondisk_files and cannot assume
- # that the filename will parse.
- relcaim_rules = {
- '.ts': True,
- # TODO: cleanup old data file with no .durable
- # '.data': True,
- '.durable': False,
- }
- for ext, should_wait in relcaim_rules.items():
- if not files[0].endswith(ext):
- continue
- if should_wait:
- timestamp = self.parse_on_disk_filename(
- files[0])['timestamp']
- if (time.time() - float(timestamp)) > reclaim_age:
- # place reclaim-able tombstone on obsolete list
- results['obsolete'] = [files[0]]
- else:
- results[ext] = files[0]
- else:
- results['obsolete'] = [files[0]]
- elif files:
- files.sort(reverse=True)
- results = self.gather_ondisk_files(files, include_obsolete=True,
- frag_index=frag_index)
+ files.sort(reverse=True)
+ results = self.gather_ondisk_files(files, include_obsolete=True,
+ frag_index=frag_index)
+ if '.durable' in results and not results.get('fragments'):
+ # a .durable with no .data is deleted as soon as it is found
+ results.setdefault('obsolete', []).append(results.pop('.durable'))
+ if '.ts' in results and is_reclaimable(results['.ts']):
+ results.setdefault('obsolete', []).append(results.pop('.ts'))
+ for filename in results.get('fragments_without_durable', []):
+ # stray fragments are not deleted until reclaim-age
+ if is_reclaimable(filename):
+ results.setdefault('obsolete', []).append(filename)
for filename in results.get('obsolete', []):
remove_file(join(hsh_path, filename))
files.remove(filename)
diff --git a/test/unit/obj/test_diskfile.py b/test/unit/obj/test_diskfile.py
index 2fbf173..1c51a59 100644
--- a/test/unit/obj/test_diskfile.py
+++ b/test/unit/obj/test_diskfile.py
@@ -526,7 +526,8 @@ class DiskFileManagerMixin(BaseDiskFileTestMixin):
self.fail('%s with files %s' % (str(e), files))
shuffle(files)
- def _test_hash_cleanup_listdir_files(self, scenarios, policy):
+ def _test_hash_cleanup_listdir_files(self, scenarios, policy,
+ reclaim_age=None):
# check that expected files are left in hashdir after cleanup
for test in scenarios:
class_under_test = self.df_router[policy]
@@ -536,11 +537,15 @@ class DiskFileManagerMixin(BaseDiskFileTestMixin):
for fname in files:
open(os.path.join(hashdir, fname), 'w')
expected_after_cleanup = set([f[0] for f in test
- if f[1] or len(f) > 2 and f[2]])
- with mock.patch('swift.obj.diskfile.time') as mock_time:
- # don't reclaim anything
- mock_time.time.return_value = 0.0
- class_under_test.hash_cleanup_listdir(hashdir)
+ if (f[2] if len(f) > 2 else f[1])])
+ if reclaim_age:
+ class_under_test.hash_cleanup_listdir(
+ hashdir, reclaim_age=reclaim_age)
+ else:
+ with mock.patch('swift.obj.diskfile.time') as mock_time:
+ # don't reclaim anything
+ mock_time.time.return_value = 0.0
+ class_under_test.hash_cleanup_listdir(hashdir)
after_cleanup = set(os.listdir(hashdir))
errmsg = "expected %r, got %r for test %r" % (
sorted(expected_after_cleanup), sorted(after_cleanup), test
@@ -1117,9 +1122,11 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
mgr_cls = diskfile.ECDiskFileManager
def test_get_ondisk_files_with_ec_policy(self):
- # Each scenario specifies a list of (filename, extension) tuples. If
- # extension is set then that filename should be returned by the method
- # under test for that extension type.
+ # Each scenario specifies a list of (filename, extension, [survives])
+ # tuples. If extension is set then that filename should be returned by
+ # the method under test for that extension type. If the optional
+ # 'survives' is True, the filename should still be in the dir after
+ # cleanup.
scenarios = [[('0000000007.00000.ts', '.ts')],
[('0000000007.00000.ts', '.ts'),
@@ -1141,7 +1148,7 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
# data newer than durable ignored, even if its only data
[('0000000008.00000#1.data', False, True),
- ('0000000007.00000.durable', '.durable')],
+ ('0000000007.00000.durable', False, False)],
# data older than durable is ignored
[('0000000007.00000.durable', '.durable'),
@@ -1150,7 +1157,7 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
('0000000004.00000.ts', False)],
# data older than durable ignored, even if its only data
- [('0000000007.00000.durable', '.durable'),
+ [('0000000007.00000.durable', False, False),
('0000000006.00000#1.data', False),
('0000000004.00000.ts', False)],
@@ -1251,6 +1258,65 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
# note: not calling self._test_hash_cleanup_listdir_files(scenarios, 0)
# here due to the anomalous scenario as commented above
+ def test_hash_cleanup_listdir_reclaim(self):
+ # Each scenario specifies a list of (filename, extension, [survives])
+ # tuples. If extension is set or 'survives' is True, the filename
+ # should still be in the dir after cleanup.
+ much_older = Timestamp(time() - 2000).internal
+ older = Timestamp(time() - 1001).internal
+ newer = Timestamp(time() - 900).internal
+ scenarios = [[('%s.ts' % older, False, False)],
+
+ # fresh tombstone is preserved
+ [('%s.ts' % newer, '.ts', True)],
+
+ # isolated .durable is cleaned up immediately
+ [('%s.durable' % newer, False, False)],
+
+ # ...even when other older files are in dir
+ [('%s.durable' % older, False, False),
+ ('%s.ts' % much_older, False, False)],
+
+ # isolated .data files are cleaned up when stale
+ [('%s#2.data' % older, False, False),
+ ('%s#4.data' % older, False, False)],
+
+ # ...even when there is an older durable fileset
+ [('%s#2.data' % older, False, False),
+ ('%s#4.data' % older, False, False),
+ ('%s#2.data' % much_older, '.data', True),
+ ('%s#4.data' % much_older, False, True),
+ ('%s.durable' % much_older, '.durable', True)],
+
+ # ... but preserved if still fresh
+ [('%s#2.data' % newer, False, True),
+ ('%s#4.data' % newer, False, True)],
+
+ # ... and we could have a mixture of fresh and stale .data
+ [('%s#2.data' % newer, False, True),
+ ('%s#4.data' % older, False, False)],
+
+ # TODO these remaining scenarios exhibit different
+ # behavior than the legacy replication DiskFileManager
+ # behavior...
+
+ # tombstone reclaimed despite newer non-durable data
+ [('%s#2.data' % newer, False, True),
+ ('%s#4.data' % older, False, False),
+ ('%s.ts' % much_older, '.ts', False)],
+
+                     # tombstone reclaimed despite an older stray .durable
+ [('%s.ts' % older, '.ts', False),
+ ('%s.durable' % much_older, False, False)],
+
+ # tombstone reclaimed despite junk file
+ [('junk', False, True),
+ ('%s.ts' % much_older, '.ts', False)],
+ ]
+
+ self._test_hash_cleanup_listdir_files(scenarios, POLICIES.default,
+ reclaim_age=1000)
+
def test_get_ondisk_files_with_stray_meta(self):
# get_ondisk_files does not tolerate a stray .meta file
scenarios = [['0000000007.00000.meta'],
@@ -1478,7 +1544,11 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
# but meta timestamp is not returned if specified frag index
# is not found
expected = {
+ # TODO: differs from repl DiskFileManager which *will*
+ # return meta timestamp when only meta and ts on disk
'9373a92d072897b136b3fc06595b4abc': ts1,
+ '9373a92d072897b136b3fc06595b0456': ts3,
+ '9373a92d072897b136b3fc06595b7456': ts2,
}
self._check_yield_hashes(POLICIES.default, suffix_map, expected,
frag_index=3)
@@ -1599,6 +1669,8 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
}
expected = {
'9373a92d072897b136b3fc06595b0456': ts1,
+ '9373a92d072897b136b3fc06595ba456': ts1,
+ '9373a92d072897b136b3fc06595bb456': ts2,
}
self._check_yield_hashes(POLICIES.default, suffix_map, expected,
frag_index=2)
@@ -1831,15 +1903,17 @@ class DiskFileMixin(BaseDiskFileTestMixin):
self.fail("Unexpected swift exception raised: %r" % err)
def test_get_metadata(self):
- df = self._create_test_file('1234567890', timestamp=42)
+ timestamp = self.ts().internal
+ df = self._create_test_file('1234567890', timestamp=timestamp)
md = df.get_metadata()
- self.assertEqual(md['X-Timestamp'], Timestamp(42).internal)
+ self.assertEqual(md['X-Timestamp'], timestamp)
def test_read_metadata(self):
- self._create_test_file('1234567890', timestamp=42)
+ timestamp = self.ts().internal
+ self._create_test_file('1234567890', timestamp=timestamp)
df = self._simple_get_diskfile()
md = df.read_metadata()
- self.assertEqual(md['X-Timestamp'], Timestamp(42).internal)
+ self.assertEqual(md['X-Timestamp'], timestamp)
def test_read_metadata_no_xattr(self):
def mock_getxattr(*args, **kargs):
@@ -1867,15 +1941,16 @@ class DiskFileMixin(BaseDiskFileTestMixin):
self.fail("Expected DiskFileNotOpen exception")
def test_disk_file_default_disallowed_metadata(self):
- # build an object with some meta (ts 41)
+        # build an object with some meta (at t0)
orig_metadata = {'X-Object-Meta-Key1': 'Value1',
'Content-Type': 'text/garbage'}
- df = self._get_open_disk_file(ts='41', extra_metadata=orig_metadata)
+ df = self._get_open_disk_file(ts=self.ts().internal,
+ extra_metadata=orig_metadata)
with df.open():
self.assertEquals('1024', df._metadata['Content-Length'])
- # write some new metadata (fast POST, don't send orig meta, ts 42)
+        # write some new metadata (fast POST, don't send orig meta, at t0+1s)
df = self._simple_get_diskfile()
- df.write_metadata({'X-Timestamp': Timestamp(42).internal,
+ df.write_metadata({'X-Timestamp': self.ts().internal,
'X-Object-Meta-Key2': 'Value2'})
df = self._simple_get_diskfile()
with df.open():
@@ -1887,15 +1962,16 @@ class DiskFileMixin(BaseDiskFileTestMixin):
self.assertEquals('Value2', df._metadata['X-Object-Meta-Key2'])
def test_disk_file_preserves_sysmeta(self):
- # build an object with some meta (ts 41)
+ # build an object with some meta (at t0)
orig_metadata = {'X-Object-Sysmeta-Key1': 'Value1',
'Content-Type': 'text/garbage'}
- df = self._get_open_disk_file(ts='41', extra_metadata=orig_metadata)
+ df = self._get_open_disk_file(ts=self.ts().internal,
+ extra_metadata=orig_metadata)
with df.open():
self.assertEquals('1024', df._metadata['Content-Length'])
- # write some new metadata (fast POST, don't send orig meta, ts 42)
+ # write some new metadata (fast POST, don't send orig meta, at t0+1s)
df = self._simple_get_diskfile()
- df.write_metadata({'X-Timestamp': Timestamp(42).internal,
+ df.write_metadata({'X-Timestamp': self.ts().internal,
'X-Object-Sysmeta-Key1': 'Value2',
'X-Object-Meta-Key3': 'Value3'})
df = self._simple_get_diskfile()
@@ -2898,10 +2974,11 @@ class DiskFileMixin(BaseDiskFileTestMixin):
self.assertEqual(str(exc), '')
def test_diskfile_timestamp(self):
- self._get_open_disk_file(ts='1383181759.12345')
+ ts = Timestamp(time())
+ self._get_open_disk_file(ts=ts.internal)
df = self._simple_get_diskfile()
with df.open():
- self.assertEqual(df.timestamp, '1383181759.12345')
+ self.assertEqual(df.timestamp, ts.internal)
def test_error_in_hash_cleanup_listdir(self):
@@ -3727,11 +3804,15 @@ class TestSuffixHashes(unittest.TestCase):
def test_hash_cleanup_listdir_only_meta(self):
for policy in self.iter_policies():
- # the get_ondisk_files contract validation doesn't allow a
- # directory with only .meta files
file1, file2 = [self.ts().internal + '.meta' for i in range(2)]
file_list = [file1, file2]
- expected = AssertionError()
+ if policy.policy_type == EC_POLICY:
+                # EC policy tolerates only .meta files in dir when cleaning up
+ expected = [file2]
+ else:
+ # the get_ondisk_files contract validation doesn't allow a
+ # directory with only .meta files
+ expected = AssertionError()
self.check_hash_cleanup_listdir(policy, file_list, expected)
def test_hash_cleanup_listdir_ignore_orphaned_ts(self):
@@ -3762,20 +3843,31 @@ class TestSuffixHashes(unittest.TestCase):
def test_hash_cleanup_listdir_meta_keeps_old_ts(self):
for policy in self.iter_policies():
- # An orphaned .meta will not clean up a very old .ts
old_float = time() - (diskfile.ONE_WEEK + 1)
file1 = Timestamp(old_float).internal + '.ts'
file2 = Timestamp(time() + 2).internal + '.meta'
file_list = [file1, file2]
- self.check_hash_cleanup_listdir(policy, file_list, [file2, file1])
+ if policy.policy_type == EC_POLICY:
+ # EC will clean up old .ts despite a .meta
+ expected = [file2]
+ else:
+ # An orphaned .meta will not clean up a very old .ts
+ expected = [file2, file1]
+ self.check_hash_cleanup_listdir(policy, file_list, expected)
def test_hash_cleanup_listdir_keep_single_old_data(self):
for policy in self.iter_policies():
- # A single old .data file will not be removed
old_float = time() - (diskfile.ONE_WEEK + 1)
file1 = self._datafilename(Timestamp(old_float), policy)
file_list = [file1]
- self.check_hash_cleanup_listdir(policy, file_list, [file1])
+ if policy.policy_type == EC_POLICY:
+                # for EC an isolated old .data file is removed; it's useless
+                # without a .durable
+ expected = []
+ else:
+ # A single old .data file will not be removed
+ expected = file_list
+ self.check_hash_cleanup_listdir(policy, file_list, expected)
def test_hash_cleanup_listdir_drops_isolated_durable(self):
for policy in self.iter_policies():
@@ -3826,7 +3918,13 @@ class TestSuffixHashes(unittest.TestCase):
file1 = self._datafilename(Timestamp(1), policy)
file2 = '0000000002.00000.ts'
file_list = [file1, file2]
- self.check_hash_cleanup_listdir(policy, file_list, [file2])
+ if policy.policy_type == EC_POLICY:
+                # the .ts gets reclaimed despite the failed .data delete
+ expected = []
+ else:
+ # the .ts isn't reclaimed because there were two files in dir
+ expected = [file2]
+ self.check_hash_cleanup_listdir(policy, file_list, expected)
# invalidate_hash tests - behavior
diff --git a/test/unit/obj/test_reconstructor.py b/test/unit/obj/test_reconstructor.py
index ac875c9..00609b4 100755
--- a/test/unit/obj/test_reconstructor.py
+++ b/test/unit/obj/test_reconstructor.py
@@ -256,10 +256,17 @@ class TestGlobalSetupObjectReconstructor(unittest.TestCase):
# create a bunch of FA's to test
t = 1421181937.70054 # time.time()
- _create_frag_archives(self.ec_policy, self.ec_obj_path,
- self.ec_local_dev['id'], 0)
- _create_frag_archives(self.ec_policy, self.ec_obj_path,
- self.ec_local_dev['id'], 1)
+ with mock.patch('swift.obj.diskfile.time') as mock_time:
+ # since (a) we are using a fixed time here to create
+ # frags which corresponds to all the hardcoded hashes and
+ # (b) the EC diskfile will delete its .data file right
+            # after creating it if it has expired, use this horrible hack
+            # to prevent the reclaim from happening
+ mock_time.time.return_value = 0.0
+ _create_frag_archives(self.ec_policy, self.ec_obj_path,
+ self.ec_local_dev['id'], 0)
+ _create_frag_archives(self.ec_policy, self.ec_obj_path,
+ self.ec_local_dev['id'], 1)
break
break
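
For reference, the cleanup tests above describe each on-disk file as a (filename, extension_or_False[, survives]) tuple, and the updated _test_hash_cleanup_listdir_files reduces a scenario to the set of files expected to remain after cleanup. A small self-contained sketch of that reduction, using a fileset taken from one of the scenarios in the diff:

# each entry is (filename, extension_or_False[, survives]); the optional third
# element says whether the file should still be in the hash dir after cleanup
scenario = [
    ('0000000008.00000#1.data', False, True),    # non-durable .data, kept for now
    ('0000000007.00000.durable', False, False),  # isolated .durable, removed at once
]

# mirrors the expression used in _test_hash_cleanup_listdir_files
expected_after_cleanup = set(
    f[0] for f in scenario if (f[2] if len(f) > 2 else f[1]))

assert expected_after_cleanup == set(['0000000008.00000#1.data'])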