Skip to content

Instantly share code, notes, and snippets.

@seansu4you87
Created June 17, 2020 01:37
Show Gist options
  • Save seansu4you87/8941a16bb23ba117456374b233de3675 to your computer and use it in GitHub Desktop.
Save seansu4you87/8941a16bb23ba117456374b233de3675 to your computer and use it in GitHub Desktop.
Chaos example
# Chaos spec: reads against two user rows keep returning results while one
# replica pod is failed and after it has been healed.
describe "Replica Reads Resilience" do
  it "can survive replica failure" do
    failed_pod = "vttablet-shard1-replica2"

    # NOTE(review): the variable names suggest ids 1 and 2 live on different
    # shards — confirm against the sharding scheme.
    shard_1_query = User.where(id: 1)
    shard_2_query = User.where(id: 2)

    # Baseline: both rows are readable before any failure is injected.
    expect(shard_1_query.count).to eq(1)
    expect(shard_2_query.count).to eq(1)

    # `join` is called on the returned task; presumably it blocks until the
    # failure has actually been applied — confirm against the helper.
    pod_failure(:create, failed_pod).join

    begin
      # Reads must still succeed while the replica pod is down.
      expect(shard_1_query.count).to eq(1)
      expect(shard_2_query.count).to eq(1)
    ensure
      # Always heal, even when an expectation above fails, so the injected
      # failure does not leak into later examples.
      pod_failure(:heal, failed_pod).join
    end

    # Reads keep working once the pod has been healed.
    expect(shard_1_query.count).to eq(1)
    expect(shard_2_query.count).to eq(1)
  end
end
# Chaos spec: writes under SSR keep succeeding while at least one replica can
# acknowledge them, and time out once every replica is partitioned away.
describe "MySQL SSR" do
  context "single region" do
    it "can survive SSR replica failure up to a certain point" do
      # NOTE(yu): Topology
      #
      # - 1 single region
      # - 3 SSR (one of which is master)
      # - requires 1 SSR ack to ack a write
      #
      # NOTE(review): replica1 is one side of both partitions below, so it is
      # presumably the master — confirm.
      # NOTE(review): the partitions created here are never removed in this
      # example — confirm the harness resets them between examples.

      # Works since both SSRs are in the network
      User.create!(id: 1)

      pod_partition(:create, "vttablet-shard1-replica1", "vttablet-shard1-replica2").join

      # Works since vttablet-shard1-replica3 is still in the network.
      # Uses its own id (2) so the final write below cannot fail on a
      # duplicate key instead of the expected timeout.
      User.create!(id: 2)

      pod_partition(:create, "vttablet-shard1-replica1", "vttablet-shard1-replica3").join

      # Fails since no SSRs are in the network!
      expect { User.create!(id: 3) }.to raise_error(ActiveRecord::Timeout)
    end
  end

  context "multi region" do
    # No block given: RSpec reports this example as pending.
    it "can survive SSR replica failure up to a certain point"
  end
end
# Chaos specs: query behaviour when Etcd nodes are partitioned away.
describe "Etcd" do
  context "local" do
    it "can tolerate local Etcd failure until topology is too stale" do
      healthy_gate = Connection.new("vtgate1")
      isolated_gate = Connection.new("vtgate2")
      user_lookup = User.where(id: 1)

      # Queries work for both vtgates
      expect(user_lookup.using(healthy_gate).count).to eq(1)
      expect(user_lookup.using(isolated_gate).count).to eq(1)

      # Partition vtgate2 from each of the three local Etcd nodes, then wait
      # on every returned task.
      partitions = 1.upto(3).map do |node|
        pod_partition(:create, "vtgate2", "local_etcd#{node}")
      end
      partitions.each(&:join)

      # Queries still work for both vtgates
      expect(user_lookup.using(healthy_gate).count).to eq(1)
      expect(user_lookup.using(isolated_gate).count).to eq(1)

      # Change the shard's topology: add three replicas, kill the three
      # original ones, then elect one of the new replicas as master.
      additions = %w[replica4 replica5 replica6].map do |replica|
        Vitess.add_replica("shard1", replica)
      end
      additions.each(&:join)

      removals = %w[replica3 replica2 replica1].map do |replica|
        Vitess.kill_replica("shard1", replica)
      end
      removals.each(&:join)

      Vitess.elect_master("shard1", "replica4").join

      # Queries for the partitioned vtgate stop working because Topology is too stale now
      expect(user_lookup.using(healthy_gate).count).to eq(1)
      expect { user_lookup.using(isolated_gate).count }.to raise_error(ActiveRecord::Timeout)
    end
  end

  context "global" do
    it "can tolerate global Etcd not having a quorum" do
      user_lookup = User.where(id: 1)
      expect(user_lookup.count).to eq(1)

      # Partition every ordered pair of distinct global Etcd nodes from each
      # other, in the same (from, to) order as a nested 1..3 loop.
      nodes = (1..3).to_a
      partitions = nodes.flat_map do |from|
        nodes.reject { |to| to == from }.map do |to|
          pod_partition(:create, "global_etcd#{from}", "global_etcd#{to}")
        end
      end
      partitions.each(&:join)

      # Query should still work since we are not dependent on global Etcd for live writes
      expect(user_lookup.count).to eq(1)
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment