copy redis
# there are a few different ways to copy data from one redis instance to another.
# from what i've read, you can set up your new instance as a slave to the old
# and let redis replication do its thing (there's a sketch of that approach right after the imports below).
# or, you can copy the underlying dump.rdb file from one instance's file system to the other's.
# however, sometimes you don't manage your own redis instances, or maybe you just don't want to bother
# with any of that. you could just copy the data, key by key.
# this script does that.
# there were some other scripts online already
# - https://github.com/jeremyfa/node-redis-dump
# - https://github.com/yaauie/redis-copy
# the first one ran out of memory. it's a reported issue. the owner says 🤷🏽‍♀️ sorry, find another tool.
# the second one was really slow.
# so, i wrote this one.
# it uses redis pipelines, dump, restore, and larger-than-default batch sizes to reduce network latency effects.
# it also sets a default ttl on each key it copies over. for my use case these are all good things, but you can
# adjust the ttl and batch size pretty easily below.
# i run python 3.6 (the f-strings below need 3.6+), with the redis-py package installed (pip install redis).
# 🙏🏽
import os
import redis
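# a minimal sketch of the replication approach mentioned up top, using redis-py's
# slaveof() wrapper. illustrative only: nothing in this script calls it, the name
# replicate_then_detach is mine, and it assumes the destination actually lets you
# run SLAVEOF, which managed/hosted instances often don't (which is the whole
# reason this script exists).
import time
def replicate_then_detach(dest_conn, source_host, source_port):
    # point the destination at the source; redis replication copies the full dataset
    dest_conn.slaveof(source_host, source_port)
    # poll until the initial sync finishes
    while dest_conn.info('replication').get('master_link_status') != 'up':
        time.sleep(1)
    # detach so the destination goes back to being a standalone master
    dest_conn.slaveof()  # no args sends SLAVEOF NO ONE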
# copy data FROM source
source_redis_url = 'redis://user:pass@host:port'
# copy data TO destination
destination_redis_url = 'redis://user2:pass2@host2:port2'
# redis scan count default is 10.
# this variable is used for a couple of related things:
# 1. how many keys are requested from redis on each scan (COUNT is a hint, not a guarantee)
# 2. how many commands are dispatched to redis in each pipeline execution
# generally speaking, i've found i can significantly reduce network latency effects
# by batching things this way: pay the cost of a round trip across the network once
# every N commands instead of once every command. e.g. at 1,000 keys per batch,
# copying 1,000,000 keys costs roughly 2,000 round trips (one dump pipeline plus
# one restore pipeline per batch) instead of roughly 2,000,000. for contrast, there's
# an unbatched per-key sketch just below the connection setup.
scan_count = 1000
# 72 hour expiration (restore's ttl param is in ms, so this is 259,200,000). set this to 0 for no expiration on the copied keys.
default_ttl = 60 * 60 * 72 * 1000
source_conn = redis.StrictRedis.from_url(source_redis_url, decode_responses=False)
destination_conn = redis.StrictRedis.from_url(destination_redis_url, decode_responses=False)
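# for contrast with the pipelined functions below: a minimal unbatched sketch of
# the same dump/restore idea, one key at a time. copy_one_key is illustrative only
# and is never called in this script; it costs two network round trips per key,
# which is exactly what the batching below avoids.
def copy_one_key(key):
    data = source_conn.dump(key)  # serialized value, or None if the key vanished
    if data is not None:
        destination_conn.restore(name=key, ttl=default_ttl, value=data, replace=True)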
# pipelines dump calls for a batch of keys.
# returns an iterable of (key, serialized dump result) pairs:
# [(key1, serialized_result1), (key2, serialized_result2), ...]
def dump_keys(conn, keys):
    pipe = conn.pipeline()
    for key in keys:
        pipe.dump(key)
    results = pipe.execute()
    zipped_results = zip(keys, results)
    return zipped_results
# pipelines restore calls.
# takes (key, serialized dump value) pairs and restores each serialized value at
# its key, with replacement, using the default ttl set above.
def restore_dump_values(conn, zipped_results):
    pipe = conn.pipeline()
    for key, value in zipped_results:
        if value is None:
            # key disappeared (expired or deleted) between scan and dump; skip it
            continue
        pipe.restore(name=key, ttl=default_ttl, value=value, replace=True)
    result = pipe.execute()
    return result
def dump_and_restore(source_conn, destination_conn, keys):
    # dump all the keys from the source
    zipped_results = dump_keys(conn=source_conn, keys=keys)
    # restore the serialized results on the destination
    restore_result = restore_dump_values(conn=destination_conn, zipped_results=zipped_results)
    return restore_result
total_run = 0
total_success = 0
keys = []
for key in source_conn.scan_iter(count=scan_count):
    keys.append(key)
    bucket_count = len(keys)
    if bucket_count >= scan_count:
        res = dump_and_restore(source_conn=source_conn, destination_conn=destination_conn, keys=keys)
        total_run = total_run + len(res)
        successes = [result for result in res if result == b'OK']
        total_success += len(successes)
        keys = []
        print(f"total_run {total_run}")
        print(f"total_success {total_success}")
# flush whatever is left in the final partial batch
if keys:
    res = dump_and_restore(source_conn=source_conn, destination_conn=destination_conn, keys=keys)
    total_run = total_run + len(res)
    successes = [result for result in res if result == b'OK']
    total_success += len(successes)
print(f"total_run {total_run}")
print(f"total_success {total_success}")