remove the fart from hash_cleanup_listdir
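The diff below reworks ECDiskFileManager.hash_cleanup_listdir: an isolated .durable (one with no matching .data fragment) is now deleted as soon as it is found, while tombstones and .data fragments that never gained a .durable are only removed once they are older than reclaim_age. The age test the patch applies is sketched standalone below (a hypothetical helper mirroring the is_reclaimable closure added in the diff, not importable Swift API):

import time

ONE_WEEK = 604800  # Swift's default reclaim_age, in seconds

def is_reclaimable(timestamp, reclaim_age=ONE_WEEK, now=None):
    # On-disk filenames encode a Swift timestamp, e.g.
    # '1421181937.70054.ts'; a file becomes reclaimable once that
    # timestamp is more than reclaim_age seconds in the past.
    now = time.time() if now is None else now
    return (now - float(timestamp)) > reclaim_age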
diff --git a/swift/obj/diskfile.py b/swift/obj/diskfile.py
index f1df4d9..e6f25ee 100644
--- a/swift/obj/diskfile.py
+++ b/swift/obj/diskfile.py
@@ -2063,10 +2063,16 @@ class ECDiskFileManager(DiskFileManager):
# been found so far as the valid fileset
discard()
set_valid_fileset()
- elif ext == '.data' and have_valid_fileset():
- # valid fileset without a .durable means we must have a newer .ts,
- # so discard the older .data file
- discard()
+ elif ext == '.data':
+ # not yet found a .durable
+ if have_valid_fileset():
+ # valid fileset means we must have a newer
+ # .ts, so discard the older .data file
+ discard()
+ else:
+ # .data newer than a .durable or .ts, don't discard yet
+ context.setdefault('fragments_without_durable', []).append(
+ filename)
elif ext == '.ts':
if have_valid_fileset() or not accept_first():
# newer .data, .durable or .ts already found so discard this
@@ -2080,7 +2086,7 @@ class ECDiskFileManager(DiskFileManager):
# newer .data, .durable or .ts already found so discard this
discard()
else:
- # ignore unexpected files e.g. .data newer than .durable or junk
+ # ignore unexpected files
pass
return have_valid_fileset()
@@ -2114,7 +2120,7 @@ class ECDiskFileManager(DiskFileManager):
and ts_file is None))
def gather_ondisk_files(self, files, include_obsolete=False,
- frag_index=None, **kwargs):
+ frag_index=None, verify=False, **kwargs):
"""
Given a simple list of files names, iterate over them to determine the
files that constitute a valid object, and optionally determine the
@@ -2153,10 +2159,11 @@ class ECDiskFileManager(DiskFileManager):
if not include_obsolete:
break
- assert self._verify_on_disk_files(
- results, frag_index=frag_index, **kwargs), \
- "On-disk file search algorithm contract is broken: %s" \
- % results.values()
+ if verify:
+ assert self._verify_on_disk_files(
+ results, frag_index=frag_index, **kwargs), \
+ "On-disk file search algorithm contract is broken: %s" \
+ % results.values()
return results
def get_ondisk_files(self, files, datadir, **kwargs):
@@ -2168,7 +2175,7 @@ class ECDiskFileManager(DiskFileManager):
:returns: a tuple of data, meta, and tombstone
"""
# maintain compatibility with 'legacy' get_ondisk_files return value
- accepted_files = self.gather_ondisk_files(files, **kwargs)
+ accepted_files = self.gather_ondisk_files(files, verify=True, **kwargs)
result = [(join(datadir, accepted_files.get(ext))
if accepted_files.get(ext) else None)
for ext in ('.data', '.meta', '.ts')]
@@ -2189,36 +2196,23 @@ class ECDiskFileManager(DiskFileManager):
key 'obsolete'; a list of files remaining in the directory,
reverse sorted, stored under the key 'files'.
"""
+ def is_reclaimable(filename):
+ timestamp = self.parse_on_disk_filename(filename)['timestamp']
+ return (time.time() - float(timestamp)) > reclaim_age
+
files = listdir(hsh_path)
- results = {}
- if len(files) == 1:
- # to remain compatible with legacy hash_cleanup_listdir, we have to
- # allow for a single junk file in dir that may violate ondisk
- # contract, so cannot call gather_ondisk_files and cannot assume
- # that the filename will parse.
- relcaim_rules = {
- '.ts': True,
- # TODO: cleanup old data file with no .durable
- # '.data': True,
- '.durable': False,
- }
- for ext, should_wait in relcaim_rules.items():
- if not files[0].endswith(ext):
- continue
- if should_wait:
- timestamp = self.parse_on_disk_filename(
- files[0])['timestamp']
- if (time.time() - float(timestamp)) > reclaim_age:
- # place reclaim-able tombstone on obsolete list
- results['obsolete'] = [files[0]]
- else:
- results[ext] = files[0]
- else:
- results['obsolete'] = [files[0]]
- elif files:
- files.sort(reverse=True)
- results = self.gather_ondisk_files(files, include_obsolete=True,
- frag_index=frag_index)
+ files.sort(reverse=True)
+ results = self.gather_ondisk_files(files, include_obsolete=True,
+ frag_index=frag_index)
+ if '.durable' in results and not results.get('fragments'):
+ # a .durable with no .data is deleted as soon as it is found
+ results.setdefault('obsolete', []).append(results.pop('.durable'))
+ if '.ts' in results and is_reclaimable(results['.ts']):
+ results.setdefault('obsolete', []).append(results.pop('.ts'))
+ for filename in results.get('fragments_without_durable', []):
+ # stray fragments are not deleted until reclaim-age
+ if is_reclaimable(filename):
+ results.setdefault('obsolete', []).append(filename)
for filename in results.get('obsolete', []):
remove_file(join(hsh_path, filename))
files.remove(filename)
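With the single-file special case gone, hash_cleanup_listdir always routes through gather_ondisk_files, so the on-disk contract assertion moves behind the new verify flag: only get_ondisk_files opts in, which lets the cleanup path tolerate directories (e.g. containing only a .meta or a junk file) that would previously have tripped the assert. A rough sketch of the two call patterns, assuming a manager instance mgr (names simplified from the diff):

files = sorted(os.listdir(hsh_path), reverse=True)
# cleanup path: no contract check, tolerate junk and lone .meta files
results = mgr.gather_ondisk_files(files, include_obsolete=True,
                                  frag_index=frag_index)
# read path: keep the strict contract assertion
accepted = mgr.gather_ondisk_files(files, verify=True)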
diff --git a/test/unit/obj/test_diskfile.py b/test/unit/obj/test_diskfile.py
index 2fbf173..1c51a59 100644
--- a/test/unit/obj/test_diskfile.py
+++ b/test/unit/obj/test_diskfile.py
@@ -526,7 +526,8 @@ class DiskFileManagerMixin(BaseDiskFileTestMixin):
self.fail('%s with files %s' % (str(e), files))
shuffle(files)
- def _test_hash_cleanup_listdir_files(self, scenarios, policy):
+ def _test_hash_cleanup_listdir_files(self, scenarios, policy,
+ reclaim_age=None):
# check that expected files are left in hashdir after cleanup
for test in scenarios:
class_under_test = self.df_router[policy]
@@ -536,11 +537,15 @@ class DiskFileManagerMixin(BaseDiskFileTestMixin):
for fname in files:
open(os.path.join(hashdir, fname), 'w')
expected_after_cleanup = set([f[0] for f in test
- if f[1] or len(f) > 2 and f[2]])
- with mock.patch('swift.obj.diskfile.time') as mock_time:
- # don't reclaim anything
- mock_time.time.return_value = 0.0
- class_under_test.hash_cleanup_listdir(hashdir)
+ if (f[2] if len(f) > 2 else f[1])])
+ if reclaim_age:
+ class_under_test.hash_cleanup_listdir(
+ hashdir, reclaim_age=reclaim_age)
+ else:
+ with mock.patch('swift.obj.diskfile.time') as mock_time:
+ # don't reclaim anything
+ mock_time.time.return_value = 0.0
+ class_under_test.hash_cleanup_listdir(hashdir)
after_cleanup = set(os.listdir(hashdir))
errmsg = "expected %r, got %r for test %r" % (
sorted(expected_after_cleanup), sorted(after_cleanup), test
@@ -1117,9 +1122,11 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
mgr_cls = diskfile.ECDiskFileManager
def test_get_ondisk_files_with_ec_policy(self):
- # Each scenario specifies a list of (filename, extension) tuples. If
- # extension is set then that filename should be returned by the method
- # under test for that extension type.
+ # Each scenario specifies a list of (filename, extension, [survives])
+ # tuples. If extension is set then that filename should be returned by
+ # the method under test for that extension type. If the optional
+ # 'survives' is True, the filename should still be in the dir after
+ # cleanup.
scenarios = [[('0000000007.00000.ts', '.ts')],
[('0000000007.00000.ts', '.ts'),
@@ -1141,7 +1148,7 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
# data newer than durable ignored, even if its only data
[('0000000008.00000#1.data', False, True),
- ('0000000007.00000.durable', '.durable')],
+ ('0000000007.00000.durable', False, False)],
# data older than durable is ignored
[('0000000007.00000.durable', '.durable'),
@@ -1150,7 +1157,7 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
('0000000004.00000.ts', False)],
# data older than durable ignored, even if its only data
- [('0000000007.00000.durable', '.durable'),
+ [('0000000007.00000.durable', False, False),
('0000000006.00000#1.data', False),
('0000000004.00000.ts', False)],
@@ -1251,6 +1258,65 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
# note: not calling self._test_hash_cleanup_listdir_files(scenarios, 0)
# here due to the anomalous scenario as commented above
+ def test_hash_cleanup_listdir_reclaim(self):
+ # Each scenario specifies a list of (filename, extension, [survives])
+ # tuples. If extension is set or 'survives' is True, the filename
+ # should still be in the dir after cleanup.
+ much_older = Timestamp(time() - 2000).internal
+ older = Timestamp(time() - 1001).internal
+ newer = Timestamp(time() - 900).internal
+ scenarios = [[('%s.ts' % older, False, False)],
+
+ # fresh tombstone is preserved
+ [('%s.ts' % newer, '.ts', True)],
+
+ # isolated .durable is cleaned up immediately
+ [('%s.durable' % newer, False, False)],
+
+ # ...even when other older files are in dir
+ [('%s.durable' % older, False, False),
+ ('%s.ts' % much_older, False, False)],
+
+ # isolated .data files are cleaned up when stale
+ [('%s#2.data' % older, False, False),
+ ('%s#4.data' % older, False, False)],
+
+ # ...even when there is an older durable fileset
+ [('%s#2.data' % older, False, False),
+ ('%s#4.data' % older, False, False),
+ ('%s#2.data' % much_older, '.data', True),
+ ('%s#4.data' % much_older, False, True),
+ ('%s.durable' % much_older, '.durable', True)],
+
+ # ... but preserved if still fresh
+ [('%s#2.data' % newer, False, True),
+ ('%s#4.data' % newer, False, True)],
+
+ # ... and we could have a mixture of fresh and stale .data
+ [('%s#2.data' % newer, False, True),
+ ('%s#4.data' % older, False, False)],
+
+ # TODO these remaining scenarios exhibit different
+ # behavior than the legacy replication DiskFileManager
+ # behavior...
+
+ # tombstone reclaimed despite newer non-durable data
+ [('%s#2.data' % newer, False, True),
+ ('%s#4.data' % older, False, False),
+ ('%s.ts' % much_older, '.ts', False)],
+
+ # tombstone reclaimed despite an older isolated .durable
+ [('%s.ts' % older, '.ts', False),
+ ('%s.durable' % much_older, False, False)],
+
+ # tombstone reclaimed despite junk file
+ [('junk', False, True),
+ ('%s.ts' % much_older, '.ts', False)],
+ ]
+
+ self._test_hash_cleanup_listdir_files(scenarios, POLICIES.default,
+ reclaim_age=1000)
+
def test_get_ondisk_files_with_stray_meta(self):
# get_ondisk_files does not tolerate a stray .meta file
scenarios = [['0000000007.00000.meta'],
@@ -1478,7 +1544,11 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
# but meta timestamp is not returned if specified frag index
# is not found
expected = {
+ # TODO: differs from repl DiskFileManager which *will*
+ # return meta timestamp when only meta and ts on disk
'9373a92d072897b136b3fc06595b4abc': ts1,
+ '9373a92d072897b136b3fc06595b0456': ts3,
+ '9373a92d072897b136b3fc06595b7456': ts2,
}
self._check_yield_hashes(POLICIES.default, suffix_map, expected,
frag_index=3)
@@ -1599,6 +1669,8 @@ class TestECDiskFileManager(DiskFileManagerMixin, unittest.TestCase):
}
expected = {
'9373a92d072897b136b3fc06595b0456': ts1,
+ '9373a92d072897b136b3fc06595ba456': ts1,
+ '9373a92d072897b136b3fc06595bb456': ts2,
}
self._check_yield_hashes(POLICIES.default, suffix_map, expected,
frag_index=2)
@@ -1831,15 +1903,17 @@ class DiskFileMixin(BaseDiskFileTestMixin):
self.fail("Unexpected swift exception raised: %r" % err)
def test_get_metadata(self):
- df = self._create_test_file('1234567890', timestamp=42)
+ timestamp = self.ts().internal
+ df = self._create_test_file('1234567890', timestamp=timestamp)
md = df.get_metadata()
- self.assertEqual(md['X-Timestamp'], Timestamp(42).internal)
+ self.assertEqual(md['X-Timestamp'], timestamp)
def test_read_metadata(self):
- self._create_test_file('1234567890', timestamp=42)
+ timestamp = self.ts().internal
+ self._create_test_file('1234567890', timestamp=timestamp)
df = self._simple_get_diskfile()
md = df.read_metadata()
- self.assertEqual(md['X-Timestamp'], Timestamp(42).internal)
+ self.assertEqual(md['X-Timestamp'], timestamp)
def test_read_metadata_no_xattr(self):
def mock_getxattr(*args, **kargs):
@@ -1867,15 +1941,16 @@ class DiskFileMixin(BaseDiskFileTestMixin):
self.fail("Expected DiskFileNotOpen exception")
def test_disk_file_default_disallowed_metadata(self):
- # build an object with some meta (ts 41)
+ # build an object with some meta (at t0)
orig_metadata = {'X-Object-Meta-Key1': 'Value1',
'Content-Type': 'text/garbage'}
- df = self._get_open_disk_file(ts='41', extra_metadata=orig_metadata)
+ df = self._get_open_disk_file(ts=self.ts().internal,
+ extra_metadata=orig_metadata)
with df.open():
self.assertEquals('1024', df._metadata['Content-Length'])
- # write some new metadata (fast POST, don't send orig meta, ts 42)
+ # write some new metadata (fast POST, don't send orig meta, at t0+1s)
df = self._simple_get_diskfile()
- df.write_metadata({'X-Timestamp': Timestamp(42).internal,
+ df.write_metadata({'X-Timestamp': self.ts().internal,
'X-Object-Meta-Key2': 'Value2'})
df = self._simple_get_diskfile()
with df.open():
@@ -1887,15 +1962,16 @@ class DiskFileMixin(BaseDiskFileTestMixin):
self.assertEquals('Value2', df._metadata['X-Object-Meta-Key2'])
def test_disk_file_preserves_sysmeta(self):
- # build an object with some meta (ts 41)
+ # build an object with some meta (at t0)
orig_metadata = {'X-Object-Sysmeta-Key1': 'Value1',
'Content-Type': 'text/garbage'}
- df = self._get_open_disk_file(ts='41', extra_metadata=orig_metadata)
+ df = self._get_open_disk_file(ts=self.ts().internal,
+ extra_metadata=orig_metadata)
with df.open():
self.assertEquals('1024', df._metadata['Content-Length'])
- # write some new metadata (fast POST, don't send orig meta, ts 42)
+ # write some new metadata (fast POST, don't send orig meta, at t0+1s)
df = self._simple_get_diskfile()
- df.write_metadata({'X-Timestamp': Timestamp(42).internal,
+ df.write_metadata({'X-Timestamp': self.ts().internal,
'X-Object-Sysmeta-Key1': 'Value2',
'X-Object-Meta-Key3': 'Value3'})
df = self._simple_get_diskfile()
@@ -2898,10 +2974,11 @@ class DiskFileMixin(BaseDiskFileTestMixin):
self.assertEqual(str(exc), '')
def test_diskfile_timestamp(self):
- self._get_open_disk_file(ts='1383181759.12345')
+ ts = Timestamp(time())
+ self._get_open_disk_file(ts=ts.internal)
df = self._simple_get_diskfile()
with df.open():
- self.assertEqual(df.timestamp, '1383181759.12345')
+ self.assertEqual(df.timestamp, ts.internal)
def test_error_in_hash_cleanup_listdir(self):
@@ -3727,11 +3804,15 @@ class TestSuffixHashes(unittest.TestCase):
def test_hash_cleanup_listdir_only_meta(self):
for policy in self.iter_policies():
- # the get_ondisk_files contract validation doesn't allow a
- # directory with only .meta files
file1, file2 = [self.ts().internal + '.meta' for i in range(2)]
file_list = [file1, file2]
- expected = AssertionError()
+ if policy.policy_type == EC_POLICY:
+ # EC policy tolerates a dir with only .meta files when cleaning up
+ expected = [file2]
+ else:
+ # the get_ondisk_files contract validation doesn't allow a
+ # directory with only .meta files
+ expected = AssertionError()
self.check_hash_cleanup_listdir(policy, file_list, expected)
def test_hash_cleanup_listdir_ignore_orphaned_ts(self):
@@ -3762,20 +3843,31 @@ class TestSuffixHashes(unittest.TestCase):
def test_hash_cleanup_listdir_meta_keeps_old_ts(self):
for policy in self.iter_policies():
- # An orphaned .meta will not clean up a very old .ts
old_float = time() - (diskfile.ONE_WEEK + 1)
file1 = Timestamp(old_float).internal + '.ts'
file2 = Timestamp(time() + 2).internal + '.meta'
file_list = [file1, file2]
- self.check_hash_cleanup_listdir(policy, file_list, [file2, file1])
+ if policy.policy_type == EC_POLICY:
+ # EC will clean up old .ts despite a .meta
+ expected = [file2]
+ else:
+ # An orphaned .meta will not clean up a very old .ts
+ expected = [file2, file1]
+ self.check_hash_cleanup_listdir(policy, file_list, expected)
def test_hash_cleanup_listdir_keep_single_old_data(self):
for policy in self.iter_policies():
- # A single old .data file will not be removed
old_float = time() - (diskfile.ONE_WEEK + 1)
file1 = self._datafilename(Timestamp(old_float), policy)
file_list = [file1]
- self.check_hash_cleanup_listdir(policy, file_list, [file1])
+ if policy.policy_type == EC_POLICY:
+ # for EC an isolated old .data file is removed; it's useless
+ # without a .durable
+ expected = []
+ else:
+ # A single old .data file will not be removed
+ expected = file_list
+ self.check_hash_cleanup_listdir(policy, file_list, expected)
def test_hash_cleanup_listdir_drops_isolated_durable(self):
for policy in self.iter_policies():
@@ -3826,7 +3918,13 @@ class TestSuffixHashes(unittest.TestCase):
file1 = self._datafilename(Timestamp(1), policy)
file2 = '0000000002.00000.ts'
file_list = [file1, file2]
- self.check_hash_cleanup_listdir(policy, file_list, [file2])
+ if policy.policy_type == EC_POLICY:
+ # the .ts gets reclaimed despite the failed .data delete
+ expected = []
+ else:
+ # the .ts isn't reclaimed because there were two files in dir
+ expected = [file2]
+ self.check_hash_cleanup_listdir(policy, file_list, expected)
# invalidate_hash tests - behavior
diff --git a/test/unit/obj/test_reconstructor.py b/test/unit/obj/test_reconstructor.py
index ac875c9..00609b4 100755
--- a/test/unit/obj/test_reconstructor.py
+++ b/test/unit/obj/test_reconstructor.py
@@ -256,10 +256,17 @@ class TestGlobalSetupObjectReconstructor(unittest.TestCase):
# create a bunch of FA's to test
t = 1421181937.70054 # time.time()
- _create_frag_archives(self.ec_policy, self.ec_obj_path,
- self.ec_local_dev['id'], 0)
- _create_frag_archives(self.ec_policy, self.ec_obj_path,
- self.ec_local_dev['id'], 1)
+ with mock.patch('swift.obj.diskfile.time') as mock_time:
+ # since (a) we are using a fixed time here to create
+ # frags which corresponds to all the hardcoded hashes and
+ # (b) the EC diskfile will delete its .data file right
+ # after creating if it has expired, use this horrible hack
+ # to prevent the reclaim happening
+ mock_time.time.return_value = 0.0
+ _create_frag_archives(self.ec_policy, self.ec_obj_path,
+ self.ec_local_dev['id'], 0)
+ _create_frag_archives(self.ec_policy, self.ec_obj_path,
+ self.ec_local_dev['id'], 1)
break
break
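For reference, the (filename, extension, [survives]) tuples used throughout the test scenarios above are evaluated exactly as in _test_hash_cleanup_listdir_files: when a third element is present it decides whether the file should survive cleanup, otherwise the extension field is used. A small worked example (filenames taken from the scenarios above):

scenario = [('0000000008.00000#1.data', False, True),
            ('0000000007.00000.durable', False, False)]
expected_after_cleanup = set(
    f[0] for f in scenario if (f[2] if len(f) > 2 else f[1]))
# -> {'0000000008.00000#1.data'}: the not-yet-durable .data is kept
# until reclaim_age, while the isolated .durable is removed immediately.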