Skip to content

Instantly share code, notes, and snippets.

@lutter

lutter/README.md Secret

Last active June 24, 2021 17:58
Show Gist options
  • Save lutter/b74ee0ae20d1ca2614b9a060ff6df2d5 to your computer and use it in GitHub Desktop.
Save lutter/b74ee0ae20d1ca2614b9a060ff6df2d5 to your computer and use it in GitHub Desktop.
Remove unused deployments

Do not use this with graph-node 0.23 or later; see docs/maintenance.md for how to do this for 0.23 or later

The SQL file and the bash script together make it possible to remove subgraph deployments that are not needed anymore. Not needed here means that they are not the current or pending version of any subgraph.

The script expects to load the SQL file from ../sql/remove-deployment.sql

Please read the comment at the beginning of the bash script for more details on setup and usage

create or replace function
remove_deployment_metadata(sid varchar)
returns integer as
$$
declare
metadata_count integer;
begin
-- Generated by store/postgres/examples/layout.rs in graph-node
with dds as (
select id
from subgraphs.dynamic_ethereum_contract_data_source
where deployment = sid),
md0 as (delete from subgraphs.dynamic_ethereum_contract_data_source e
where e.deployment = sid
returning e.id),
md1 as (delete from subgraphs.ethereum_block_handler_entity e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md2 as (delete from subgraphs.ethereum_block_handler_filter_entity e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md3 as (delete from subgraphs.ethereum_call_handler_entity e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md4 as (delete from subgraphs.ethereum_contract_abi e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md5 as (delete from subgraphs.ethereum_contract_data_source e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md6 as (delete from subgraphs.ethereum_contract_data_source_template e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md7 as (delete from subgraphs.ethereum_contract_data_source_template_source e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md8 as (delete from subgraphs.ethereum_contract_event_handler e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md9 as (delete from subgraphs.ethereum_contract_mapping e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md10 as (delete from subgraphs.ethereum_contract_source e
where left(e.id, 46) = sid or left(e.id, 40) in (select id from dds)
returning e.id),
md11 as (delete from subgraphs.subgraph_deployment e
where left(e.id, 46) = sid
returning e.id),
md12 as (delete from subgraphs.subgraph_deployment_assignment e
where left(e.id, 46) = sid
returning e.id),
md13 as (delete from subgraphs.subgraph_manifest e
where left(e.id, 46) = sid
returning e.id)
select sum(count) into metadata_count from (
select count(*) from md0
union all
select count(*) from md1
union all
select count(*) from md2
union all
select count(*) from md3
union all
select count(*) from md4
union all
select count(*) from md5
union all
select count(*) from md6
union all
select count(*) from md7
union all
select count(*) from md8
union all
select count(*) from md9
union all
select count(*) from md10
union all
select count(*) from md11
union all
select count(*) from md12
union all
select count(*) from md13
) a;
return metadata_count;
end;
$$ language plpgsql;
create or replace function
log_msg(msg varchar, start timestamptz)
returns void as
$$
declare
now timestamptz;
begin
now := clock_timestamp();
raise notice '% %', to_char(now - start, 'SSSS.MS'), msg;
end;
$$ language plpgsql;
-- Remove a deployment. This removes all the data and metadata for a
-- deployment. In particular, it will
-- * delete entries in event_meta_data belonging to the deployment
-- * delete the deployment metadata in subgraphs.entities
-- * drop the schema for the deployment, deleting all its data
-- * remove the row in deployment_schemas for the deployment
create or replace function
remove_deployment(sid varchar)
returns bigint as
$$
declare
sgd varchar;
sgd_version varchar;
num_entities bigint;
metadata_count int;
start timestamptz;
begin
start := clock_timestamp();
if exists (select 1
from subgraphs.subgraph_deployment_assignment
where id = sid) then
raise 'Can not remove subgraph % because it is currently assigned',
sid;
end if;
if exists (select 1
from subgraphs.subgraph s,
subgraphs.subgraph_version v
where (v.id = s.pending_version
or v.id = s.current_version)
and v.deployment = sid) then
raise 'Can not remove subgraph % because it is a current or pending version of a subgraph',
sid;
end if;
if sid not like 'Qm%' then
raise 'Can not remove subgraph %, it is not a normal subgraph',
sid;
end if;
select name, version
into sgd, sgd_version
from deployment_schemas
where subgraph = sid;
select entity_count into num_entities
from subgraphs.subgraph_deployment
where id = sid;
raise notice 'Removing subgraph % (%)', sgd, sgd_version;
raise notice ' subgraph_id: %', sid;
raise notice ' entities: %', num_entities;
if sgd_version = 'split' then
-- We do not remove those unneeded events from event_meta_data since
-- that takes an insanely long time; we just remember them so we can
-- clean them up later, if needed
perform log_msg(format('Saving unneeded event_ids'), start);
execute format('insert into unneeded_event_ids
(select distinct event_id from %I.entity_history)', sgd);
end if;
perform log_msg(format('Removing metadata for %s', sgd), start);
metadata_count := remove_deployment_metadata(sid);
perform log_msg(format('Removing schema %s', sgd), start);
execute format('drop schema %I cascade', sgd);
delete from deployment_schemas
where subgraph = sid;
delete from subgraphs.subgraph_version
where deployment = sid;
perform log_msg(format('Done removing %s(%s metadata entries)',
sgd, metadata_count), start);
return num_entities;
end;
$$ language plpgsql;
#! /bin/bash
# This script does something dangerous: deleting data irretrievably. Before
# running it, you should take a backup since there is absolutely no other
# way to get deleted data back. We have used this script internally for our
# purposes, but that is no guarantee that it will work in your
# infrastructure (though it just might).
# This script removes unused subgraph deployments (versions) A deployment
# is considered unused if it is not currently assigned for indexing; that
# usually happens when a deployment is no longer the current or pending
# version of any subgraph.
#
# To use this script, you have to set hings up so that you can log into the
# desired graph databases without supplying details on the psql command
# line, i.e., running just a plan `psql` should connect you to the database
# from which you want to remove deployments/subgraph versions
#
# The recommended way to configure your environment to do the above if to
# use a pg_service.conf file
# (https://www.postgresql.org/docs/9.6/libpq-pgservice.html) In a nuthsell,
# put the following into ~/.pg_service.conf, filling in the relevant values
# for your environment:
#
# [mygraph]
# host=<database host>
# hostaddr=<database ip address> # can be omitted if DNS to 'host' works
# port=<database port>
# dbname=<database name>
# user=<database user>
# password=<database password>
#
# Check that you set things up properly by running
#
# PGSERVICE=mygraph psql -c 'select count(*) from subgraphs.subgraph'
#
# That should return the number of named subgraphs in your
# installation. Once that works, you can now run
#
# PGSERVICE=mygraph ./remove-unused-deployments
#
# All output will be sent to /var/tmp/remove-unused-deployments.txt - you
# will not see any output from this script in your terminal.
#
# The script expects a file `../sql/remove-deployment.sql` to exist which it
# will load into the database
loop() {
local count="$1"
local cmd="$2"
for i in $(seq 1 "$count")
do
printf "============================== $i ==============================\n"
psql -Xt -c '\timing on' -c "$cmd"
sleep 1
done
}
# Various SQL commands we need
read -r -d '' count_unassigned <<"EOF"
select
(select count(*)
from subgraphs.subgraph_deployment)
- (select count(*)
from subgraphs.subgraph_deployment_assignment)
EOF
read -r -d '' remove_deployment <<EOF
select remove_deployment(x.subgraph)
from (select d.id as subgraph,
d.entity_count
from subgraphs.subgraph_deployment d
where not exists (select 1
from subgraphs.subgraph_deployment_assignment a
where a.id = d.id)
order by d.entity_count asc
limit 1) x;
EOF
read -r -d '' count_unneeded_events <<EOF
select ceil(count(*)/50000::float)
from unneeded_event_ids
EOF
read -r -d '' unneeded_events <<EOF
with t as
(select event_id from unneeded_event_ids order by event_id asc limit 50000),
x as
(delete from unneeded_event_ids
where event_id in (select event_id from t))
delete from event_meta_data
where id in (select event_id from t);
EOF
set -e
TOPDIR=$(readlink -f $(dirname "$0")/..)
echo "Loading stored procedures"
psql -Xt -f $TOPDIR/sql/remove-deployment.sql
# Send all output to a file to make sure the changes leave a record. When
# this script is done, this file should be put somewhere for safekeeping
exec >/var/tmp/remove-unused-deployments.txt 2>&1
count=$(psql -Xt -c "$count_unassigned")
echo "$count unassigned subgraphs"
loop "$count" "$remove_deployment"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment