@kanzure
Last active August 23, 2016 03:40
Find the most recent common block between two different, partially synchronized blockchain data stores. Attempt to do so without calling the _batch RPC call for 500,000 getblockhash requests. See https://botbot.me/freenode/bitcoin-core-dev/2016-04-28/?msg=65077020&page=3
"""
Toy demo for finding the most recent common block between a database and a
bitcoind instance. The goal is to correctly handle reorgs and long periods of
downtime, if any. The previous implementation used getblockhashes and the _batch
RPC call to get 300,000 blockhashes at a time. Unfortunately, bitcoind is not
really made to handle requests of that size. Instead, here is a more optimized
implementation that should take less time and make only a minimal number of RPC
requests.
The output of this is fed into a planner which generates a plan for processing
the blocks between a current blockhash and the last most recent common block.
"""
import hashlib
import unittest
import math
from copy import deepcopy
# ---- utility functions ----
def factors(n):
    step = 2 if n % 2 else 1
    return set(
        reduce(
            list.__add__,
            ([i, n//i] for i in range(1, int(math.sqrt(n))+1, step) if n % i == 0),
        )
    )
def sha256(value):
    return hashlib.sha256(value).hexdigest()
def certainlynotsha256(value):
    return value
def generate_many_hashes(hashcount, exclude_first_n=0):
    """
    Make a list of hashes with hashcount-many items. Use exclude_first_n to
    skip those hashes while still counting them towards the total despite their
    lack of appearance.
    """
    return [certainlynotsha256(str(counter)) for counter in xrange(exclude_first_n, hashcount)]
def generate_hash_dictionary(hashcount, exclude_first_n=0):
    """
    Make a dictionary of shape {counter: hash}.
    """
    some_dict = {}
    hashes = generate_many_hashes(hashcount, exclude_first_n=exclude_first_n)
    for (counter, some_hash) in enumerate(hashes):
        some_dict[counter + exclude_first_n] = some_hash
    # NOTE: This demo assumes that some_dict.keys() is sorted by the
    # construction given above.
    return some_dict
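# For illustration: generate_hash_dictionary(3) returns {0: "0", 1: "1", 2: "2"}
# (certainlynotsha256 just echoes its input), and
# generate_hash_dictionary(5, exclude_first_n=2) returns {2: "2", 3: "3", 4: "4"}:
# the first two counters are skipped but still count towards the total.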
# ---- classes ----
class BlockchainStorage(object):
    def __init__(self, hashdict={}):
        self.hashdict = hashdict
        self.hashes = hashdict.values() or []

    @property
    def height(self):
        return len(self.hashes)
class Database(BlockchainStorage):
    pass
class Bitcoind(BlockchainStorage):
    pass
# ---- exceptions ----
class SearchException(Exception):
    """
    General exception for the search algorithm.
    """
    pass
class NoDataException(SearchException):
    """
    No data provided.
    """
    pass
# ---- search ----
def _initial_checks(lagstore, truthstore):
    """
    Check the lagstore and truthstore for some basic details. These checks
    should not be performed on each iteration of the search function.
    """
    # TODO: Modify this function to work with a given "current_blockhash". That
    # is, find the most recent common block given "current_blockhash". Both
    # data stores might have more hashes than "current_blockhash".
    # The truthstore must be populated.
    if len(truthstore.hashes) == 0:
        raise NoDataException("truthstore has no data")
    # When there are no hashes in the lagstore, then we know that there are no
    # matching recent blocks.
    if len(lagstore.hashes) == 0:
        return None
    # When the first hash in the lagstore doesn't match anything in the
    # truthstore, then we know that the truthstore is not done synchronizing or
    # the lagstore truly doesn't have anything that matches.
    if lagstore.hashdict[lagstore.hashdict.keys()[0]] not in truthstore.hashdict.values():
        return None
    # When the latest blocks in the two storages match, return that matching
    # blockhash.
    if truthstore.hashdict[truthstore.hashdict.keys()[-1]] == lagstore.hashdict[lagstore.hashdict.keys()[-1]]:
        return (truthstore.hashdict.keys()[-1], truthstore.hashdict[truthstore.hashdict.keys()[-1]])
    # Both stores have a "block height" associated with each block. If the two
    # stores have a different "height" (or rather, if their most recent blocks
    # have different "heights"), then pick the lower height value as the point
    # from which to start the search.
    reversing_offset = 1
    most_recent_lagstore_block_height = lagstore.hashdict.keys()[0 - reversing_offset]
    most_recent_truthstore_block_height = truthstore.hashdict.keys()[0 - reversing_offset]
    block_height_search_start = min(most_recent_lagstore_block_height, most_recent_truthstore_block_height)
    # Do both stores have the same hash at that height? By definition at least
    # one of the two stores does not have any data beyond this point, therefore
    # this would be the most recent common block.
    if lagstore.hashdict[block_height_search_start] == truthstore.hashdict[block_height_search_start]:
        return (block_height_search_start, truthstore.hashdict[block_height_search_start])
    # Next, find the oldest common block between the two data stores. This by
    # definition should be the oldest block in the lagstore. If the lagstore's
    # initial block is not in the truthstore, then the function would have
    # exited above and returned None.
    oldest_common_block_height = lagstore.hashdict.keys()[0]
    return ("OK",
        {
            "block_height_search_start": block_height_search_start,
            "oldest_common_block_height": oldest_common_block_height,
        },
    )
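# _initial_checks has three result shapes: None (no common block can exist yet),
# a (height, hash) tuple (the answer was found without searching), or an
# ("OK", {...}) tuple carrying the bounds for the search function below. For
# example, with the toy stores used in the tests (where hash values are just
# the counter strings):
#
#   _initial_checks(Database(hashdict={}), Bitcoind(hashdict={0: "0"}))
#   # -> None (empty lagstore)
#   _initial_checks(Database(hashdict={0: "0"}), Bitcoind(hashdict={0: "0"}))
#   # -> (0, "0") (latest blocks already match)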
def find_most_recent_common_block(lagstore, truthstore, skip_initial_checks=False, block_height_search_start=None, oldest_common_block_height=None, depth=0, verbose_debug=True):
    """
    Find the most recent common block shared by both storage devices. The
    second storage is assumed to be the source of truth.
    When the lagstore is empty, this function returns None to indicate that
    there are no matching blocks between the two data storage devices.
    """
    # TODO: implement something about "block processor minimum height locking"
    # here?
    # some trivial parameter validation checking
    if skip_initial_checks:
        if block_height_search_start is None:
            raise Exception("invalid value for block_height_search_start")
        elif oldest_common_block_height is None:
            raise Exception("invalid value for oldest_common_block_height")
    # perform initial checks if allowed by flag
    if not skip_initial_checks:
        result = _initial_checks(lagstore, truthstore)
        if type(result) != tuple or (type(result) == tuple and result[0] != "OK"):
            return result
        block_height_search_start = result[1]["block_height_search_start"]
        oldest_common_block_height = result[1]["oldest_common_block_height"]
        # next iteration, skip those checks
        skip_initial_checks = True
    potential_blocks = block_height_search_start - oldest_common_block_height
    # When there are only 100 blocks (or fewer) left, give up on the search
    # algorithm and switch to using getblockhashes RPC calls.
    if potential_blocks <= 100:
        if verbose_debug:
            print(
                "-- Manually check {} blockhashes between {oldest_common_block_height} and {block_height_search_start}".format(
                    potential_blocks,
                    oldest_common_block_height=oldest_common_block_height,
                    block_height_search_start=block_height_search_start,
                )
            )
        # Manually fetch all the hashes of interest.
        target_range = range(oldest_common_block_height, block_height_search_start + 1)
        lagstorehashes = {index: lagstore.hashdict[index] for index in target_range}
        truthstorehashes = {index: truthstore.hashdict[index] for index in target_range}
        # find the most recent common hash
        for index in reversed(truthstorehashes.keys()):
            lagstorehash = lagstore.hashdict[index]
            truthstorehash = truthstore.hashdict[index]
            if truthstorehash == lagstorehash:
                break
        else:
            raise Exception("didn't find a matching hash; this should be impossible in this spot...")
        return (index, truthstorehash)
    # More than 100 blocks to go. Continue using the search algorithm.
    else:
        # how many blocks to sample in the range at a time?
        num_points = 10
        # Split the remaining potential blocks into different groups to check.
        # This is not quite a binary search. We can request multiple blocks at
        # a time from the truthstore.
        points = range(
            oldest_common_block_height,
            oldest_common_block_height + potential_blocks,
            potential_blocks // num_points,
        )
        points.append(oldest_common_block_height + potential_blocks)
        # Sort the points, then reverse them, because it's better to learn
        # about commonality near the end of the blockchains rather than at the
        # beginning.
        points = list(reversed(sorted(points)))
        # TODO: what should the initial value of last_point be?
        last_point = None
        # Request all of these points from the truthstore at the same time
        # using a single RPC request.
        truthstore_hash_values = {}
        for point in points:
            truthstore_hash_values[point] = truthstore.hashdict[point]
            if truthstore_hash_values[point] in lagstore.hashdict.values():
                break
            last_point = point
        else:
            raise Exception("no matching values found.. this should not happen? (double check)")
        if verbose_debug:
            print("points are: {}".format(points))
            print("point is: {}".format(point))
            print("last_point is: {}".format(last_point))
            print("block_height_search_start: {}".format(block_height_search_start))
            print("oldest_common_block_height: {}".format(oldest_common_block_height))
            print("potential_blocks: {}".format(potential_blocks))
            print("depth: {}".format(depth))
        # this is here to prevent a max recursion depth error, although
        # this should not be required if the implementation is working
        # correctly.
        if depth > 4:
            raise Exception("recursion depth exceeded, depth is {}".format(depth))
        return find_most_recent_common_block(
            lagstore,
            truthstore,
            skip_initial_checks=skip_initial_checks,
            block_height_search_start=last_point,
            oldest_common_block_height=point,
            depth=depth+1,
            verbose_debug=verbose_debug,
        )
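# In this toy, indexing truthstore.hashdict stands in for getblockhash RPC
# calls. Against a real bitcoind, the sampled heights could be fetched in one
# batched request instead of one request per point. The helper below is only a
# sketch of that idea; it assumes a python-bitcoinrpc AuthServiceProxy-style
# connection object whose batch_ method accepts a list of [rpc_name, argument]
# pairs, and it is not wired into the search function above.
def fetch_sampled_blockhashes(rpc_connection, points):
    """
    Fetch the blockhash for each sampled height with a single batched RPC
    request, returning a {height: blockhash} dictionary.
    """
    hashes = rpc_connection.batch_([["getblockhash", point] for point in points])
    return dict(zip(points, hashes))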
# ---- test setup functions ----
def make_shared_stores(hashcount):
    """
    Make two separate data stores with hashcount-many hashes.
    """
    hashdict = generate_hash_dictionary(hashcount=hashcount)
    allhashes = hashdict.values()
    database = Database(hashdict=deepcopy(hashdict))
    bitcoind = Bitcoind(hashdict=deepcopy(hashdict))
    return (allhashes, hashdict, database, bitcoind)
# ---- tests ----
class SomeTestCase(unittest.TestCase):
    def test_find_most_recent_common_block__no_truthstore_data(self):
        hashcount = 10
        hashdict = generate_hash_dictionary(hashcount=hashcount)
        database = Database(hashdict=hashdict)
        bitcoind = Bitcoind(hashdict={})
        with self.assertRaises(NoDataException):
            find_most_recent_common_block(lagstore=database, truthstore=bitcoind)

    def test_find_most_recent_common_block__empty_lagstore(self):
        hashcount = 10
        hashdict = generate_hash_dictionary(hashcount=hashcount)
        database = Database(hashdict={})
        bitcoind = Bitcoind(hashdict=hashdict)
        mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind)
        # There should be no "most recent common block" between the two stores.
        self.assertEqual(mrcb, None)

    def test_find_most_recent_common_block__shares_last_block(self):
        hashcount = 10
        (allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount)
        # execute
        mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind)
        self.assertNotEqual(mrcb, None)
        self.assertEqual(mrcb, (hashcount - 1, allhashes[-1]))

    def test_find_most_recent_common_block__mismatched_heights_but_same_hash_at_mismatch(self):
        hashcount = 10
        (allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount)
        # make a size mismatch
        del database.hashdict[hashcount - 1]
        self.assertNotEqual(database.hashdict.values(), bitcoind.hashdict.values())
        mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind)
        # hashcount - 2 because index 8 is the first matching element
        self.assertEqual(mrcb, (hashcount - 2, allhashes[-2]))

    def test_find_most_recent_common_block__nothing_in_common_by_lagstore_first_blockhash(self):
        hashcount = 10
        (allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount)
        database.hashdict[0] = "xyz"
        # The remainder of the hashes in the database don't matter. In fact, it
        # would be wrong for the database to keep its original later hashes
        # after that 0's hash value, because those hashes would be wrong and
        # impossible (a blockchain is a hashchain, after all).
        mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind)
        self.assertEqual(mrcb, None)

    def test_find_most_recent_common_block__two_missing(self):
        hashcount = 20
        (allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount)
        del database.hashdict[database.hashdict.keys()[-1]]
        del database.hashdict[database.hashdict.keys()[-1]]
        mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind)
        self.assertEqual(mrcb, (database.hashdict.keys()[hashcount - 3], database.hashdict[database.hashdict.keys()[hashcount - 3]]))

    def test_find_most_recent_common_block__two_different(self):
        hashcount = 20
        (allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount)
        del database.hashdict[database.hashdict.keys()[-1]]
        del database.hashdict[database.hashdict.keys()[-1]]
        database.hashdict[18] = "xyz"
        database.hashdict[19] = "abc"
        mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind)
        self.assertEqual(mrcb, (database.hashdict.keys()[hashcount - 3], database.hashdict[database.hashdict.keys()[hashcount - 3]]))

    # This tests at least one level of recursion of the search function.
    def test_find_most_recent_common_block__lotta_blocks_to_check(self):
        hashcount = 400
        hashdict = generate_hash_dictionary(hashcount=hashcount)
        bitcoind = Bitcoind(hashdict=deepcopy(hashdict))
        other_total_hashcount = 400
        other_different_hashcount = 200
        other_same_hashcount = 200
        other_hashdict = generate_hash_dictionary(hashcount=other_same_hashcount)
        for x in range(other_same_hashcount, other_total_hashcount + 1):
            other_hashdict[x] = certainlynotsha256("other {}".format(x))
        database = Database(hashdict=other_hashdict)
        mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind)
        expected_height = (hashcount / 2) - 1
        expected_hash = hashdict[expected_height]
        self.assertEqual(mrcb, (expected_height, expected_hash))

    def test_find_most_recent_common_block__many_many_different(self):
        hashcount = 25000
        hashdict = generate_hash_dictionary(hashcount=hashcount)
        bitcoind = Bitcoind(hashdict=deepcopy(hashdict))
        other_total_hashcount = hashcount + 1800
        other_different_hashcount = other_total_hashcount - hashcount
        other_same_hashcount = hashcount / 2
        other_hashdict = generate_hash_dictionary(hashcount=other_same_hashcount)
        for x in range(other_same_hashcount, other_total_hashcount + 1):
            other_hashdict[x] = certainlynotsha256("other {}".format(x))
        database = Database(hashdict=other_hashdict)
        mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind)
        expected_height = (hashcount / 2) - 1
        expected_hash = hashdict[expected_height]
        self.assertEqual(mrcb, (expected_height, expected_hash))
if __name__ == "__main__":
    unittest.main()