philzook58/unionfind.v

## unionfind.v
(*|
A data structure that I've been more and more interested in recently is the disjoint set datastructure or union-find.
https://en.wikipedia.org/wiki/Disjoint-set_data_structure It's used in egraphs, unification, prolog, and graph connectivity.

I realized a cute representation that is easy to use and prover stuff about, although very inefficient compared to the usual version of disjoint set. It uses a simple functional representation based off the observation that the preimages of a function form disjoint sets. This representation is somewhat analogous to using functions to represent Maps in Coq like in Software Foundations. https://softwarefoundations.cis.upenn.edu/lf-current/Maps.html

The nice thing about this is that it really avoids any termination stickiness. Termination of the `find_root` operation of a union find in an explicit array or tree is not at all obvious and requires side proof or a refined type.
The termination ultimately comes from the fact that you used a finite number of unions to construct the disjoint sets.
|*)

Require Import Arith.

(*| For convenience we define disjoint sets of nats. See comments at end of post for some other options |*)
Definition ds := nat -> nat.

(*| The completely disjoint set is the identity function |*)
Definition init_ds : ds := fun x => x.

(*| `find_root` is just application |*)
Definition find_root (g : ds) x := g x.

(*| `in_same_set` is just `nat` equality checking |*)
Definition in_same_set (g : ds) x y :=
    Nat.eq_dec (g x) (g y).

(*| And finally the only interesting operation is `union`. Some comments: It is useful to lift the `find_root` operations out of the body of the returned function to compute them eagerly and share the result. This version unfortunately means that the cost of a `find_root` operation becomes proportional to the number of `union` operations used to construct the `ds`

|*)
Definition union (g : ds) x y : ds :=
    let px := find_root g x in
    let py := find_root g y in
    fun z =>
    let pz := find_root g z in
    if Nat.eq_dec px pz
    then py
    else pz.

(*|
-------------
Some proofs
-------------

I couldn't find this useful lemma in the standard library. Maybe I just missed it?
|*)


Lemma nat_eq_refl : forall x, exists p, Nat.eq_dec x x = left p.
intros x.
destruct (Nat.eq_dec x x).
eexists. reflexivity.
unfold "<>" in n. exfalso. apply n. reflexivity.
Qed.

(*|
A useful definition is `In_Same_Set`, which states that two nats `x` and `y` are in the same set if the `in_same_set` function returns a `left` |*)

Definition In_Same_Set g x y := exists p,
   in_same_set g x y = left p.

(*|
An element is always in the same set with itself
|*)

Theorem disjoint_refl : forall g x,
  In_Same_Set g x x.
  intros g x. unfold In_Same_Set, in_same_set.
  destruct (Nat.eq_dec (g x) (g x)).
  - eexists. reflexivity.
  - exfalso. apply n. reflexivity.
Qed.

(*|
The `In_Same_Set g` relation is symmettric.
|*)

Theorem disjoint_sym : forall g x y,
  In_Same_Set g x y ->
  In_Same_Set g y x.
  unfold In_Same_Set, in_same_set, find_root.
  intros g x y H.
  destruct H.
  destruct (Nat.eq_dec (g y) (g x)) eqn:E.
  - eexists. reflexivity.
  - exfalso. apply n. rewrite x0. reflexivity.
Qed.

(*|
The `In_Same_Set g` relation is transtive.
|*)

Theorem disjoint_trans : forall g x y z,
  In_Same_Set g x y ->
  In_Same_Set g y z ->
  In_Same_Set g x z.
  unfold In_Same_Set, in_same_set, find_root.
  intros g x y z H H0.
  destruct H.
  destruct H0.
  destruct (Nat.eq_dec (g x) (g z)) eqn:E.
  - eexists. reflexivity.
  - exfalso. apply n. rewrite x0. rewrite x1. reflexivity.
Qed.

(*|
The initial disjoint set only has equal elements in the same set
|*)

Theorem disjoint_init : forall x y, x <> y ->
   exists p, in_same_set init_ds x y = right p.
   intros.
   destruct in_same_set.
   exfalso.
   apply H.
   unfold init_ds in e.
   apply e.
   eexists. reflexivity. Qed.


(*|
After unioning x and y, they are now in the same set. This proof could be tightened up and automated more.
|*)

Theorem union_works : forall g x y,
  In_Same_Set (union g x y) x y.
  intros.
  unfold In_Same_Set, in_same_set, union, find_root.
  destruct (Nat.eq_dec (g x) (g y)) eqn:E1;
  destruct (Nat.eq_dec (g x) (g x)) eqn:E2;
  destruct (Nat.eq_dec (g y) (g y)) eqn:E3.
  - eexists. reflexivity.
  - exfalso. auto.
  - firstorder.
  - firstorder.
  - eexists. reflexivity.
  - exfalso. auto.
  - exfalso. auto.
  - exfalso. auto.
  Qed.

(*|Things remain in the same set after unioning any other elements|*)

Theorem union_mono : forall g x y z w,
   In_Same_Set g x y ->
   In_Same_Set (union g z w) x y.
   intros g x y z w H.
   destruct H.
   unfold union, In_Same_Set.
   unfold in_same_set in H.
   unfold in_same_set.
   unfold find_root.
   rewrite x0.
   apply nat_eq_refl. Qed.


(*|
Here's a small enhancement. We can define a new version of union that checks if the two elements being unioned are already in the same set.
|*)

Definition union2 (g : ds) x y : ds :=
        let px := find_root g x in
        let py := find_root g y in
        if Nat.eq_dec px py then
            g
        else
        fun z =>
        let pz := find_root g z in
        if Nat.eq_dec px pz
        then py
        else pz.
Check in_same_set.


Theorem union2_correct : forall g x y z,
          (union2 g x y) z = (union g x y) z.
intros. unfold union, union2, find_root.
    destruct (Nat.eq_dec (g x) (g y)) eqn:E2;
    destruct (Nat.eq_dec (g x) (g z)) eqn:E1;
    congruence.
    Qed.


(*|
---------------------
Possible Refinements
---------------------

You could abstract over the things that you're forming disjoint sets over
This means you need an enumeration to define init_ds

|*)

Definition ds' (a : Type) := a -> nat.

(*|

Or you can use a self map. This requires decidable equality in for checking canonical elements in the codomain

*)

Definition ds'' (a : Type) := a -> a.


(*|
Or you could do a fun proof relevant version for equivalence relations `R`. Now to perform a union you'll need to supply a proof `R x y`

|*)

Definition ds_pf (a : Type) (R : a -> a -> Prop) :=
    forall x : a,  {y : a | R x y}.

(*|

Another possibility is to use the fairly new feature of Coq, persistent arrays

* https://coq.inria.fr/refman/language/core/primitive.html#primitive-arrays
* https://coq.github.io/doc/master/stdlib/Coq.Array.PArray.html

This may indeed why these were added. They do require dealing with termination though. Here's one suggestion of an encoding. I dunno if this will bite you in the butt.

|*)

Require Import PArray.

Open Scope array_scope.
Fixpoint find_root' gas (a : array (sum nat Int63.int))
         (i : Int63.int) :=
    match gas with
    | O => None
    | S gas' => match a .[i] with
        | inl x => Some i
        | inr j => find_root' gas' a j
    end
    end.

Record disjoint_set := {
    gas : nat; (* unions, or height *)
    parents  : array (sum nat Int63.int) ;
    term_pf : forall i, exists n,
              (find_root' gas parents i = Some n)
}.
	(*\|
	A data structure that I've been more and more interested in recently is the disjoint set datastructure or union-find.
	https://en.wikipedia.org/wiki/Disjoint-set_data_structure It's used in egraphs, unification, prolog, and graph connectivity.

	I realized a cute representation that is easy to use and prover stuff about, although very inefficient compared to the usual version of disjoint set. It uses a simple functional representation based off the observation that the preimages of a function form disjoint sets. This representation is somewhat analogous to using functions to represent Maps in Coq like in Software Foundations. https://softwarefoundations.cis.upenn.edu/lf-current/Maps.html

	The nice thing about this is that it really avoids any termination stickiness. Termination of the `find_root` operation of a union find in an explicit array or tree is not at all obvious and requires side proof or a refined type.
	The termination ultimately comes from the fact that you used a finite number of unions to construct the disjoint sets.
	\|*)

	Require Import Arith.

	(\| For convenience we define disjoint sets of nats. See comments at end of post for some other options \|)
	Definition ds := nat -> nat.

	(\| The completely disjoint set is the identity function \|)
	Definition init_ds : ds := fun x => x.

	(\| `find_root` is just application \|)
	Definition find_root (g : ds) x := g x.

	(\| `in_same_set` is just `nat` equality checking \|)
	Definition in_same_set (g : ds) x y :=
	Nat.eq_dec (g x) (g y).

	(*\| And finally the only interesting operation is `union`. Some comments: It is useful to lift the `find_root` operations out of the body of the returned function to compute them eagerly and share the result. This version unfortunately means that the cost of a `find_root` operation becomes proportional to the number of `union` operations used to construct the `ds`

	\|*)
	Definition union (g : ds) x y : ds :=
	let px := find_root g x in
	let py := find_root g y in
	fun z =>
	let pz := find_root g z in
	if Nat.eq_dec px pz
	then py
	else pz.

	(*\|
	-------------
	Some proofs
	-------------

	I couldn't find this useful lemma in the standard library. Maybe I just missed it?
	\|*)


	Lemma nat_eq_refl : forall x, exists p, Nat.eq_dec x x = left p.
	intros x.
	destruct (Nat.eq_dec x x).
	eexists. reflexivity.
	unfold "<>" in n. exfalso. apply n. reflexivity.
	Qed.

	(*\|
	A useful definition is `In_Same_Set`, which states that two nats `x` and `y` are in the same set if the `in_same_set` function returns a `left` \|*)

	Definition In_Same_Set g x y := exists p,
	in_same_set g x y = left p.

	(*\|
	An element is always in the same set with itself
	\|*)

	Theorem disjoint_refl : forall g x,
	In_Same_Set g x x.
	intros g x. unfold In_Same_Set, in_same_set.
	destruct (Nat.eq_dec (g x) (g x)).
	- eexists. reflexivity.
	- exfalso. apply n. reflexivity.
	Qed.

	(*\|
	The `In_Same_Set g` relation is symmettric.
	\|*)

	Theorem disjoint_sym : forall g x y,
	In_Same_Set g x y ->
	In_Same_Set g y x.
	unfold In_Same_Set, in_same_set, find_root.
	intros g x y H.
	destruct H.
	destruct (Nat.eq_dec (g y) (g x)) eqn:E.
	- eexists. reflexivity.
	- exfalso. apply n. rewrite x0. reflexivity.
	Qed.

	(*\|
	The `In_Same_Set g` relation is transtive.
	\|*)

	Theorem disjoint_trans : forall g x y z,
	In_Same_Set g x y ->
	In_Same_Set g y z ->
	In_Same_Set g x z.
	unfold In_Same_Set, in_same_set, find_root.
	intros g x y z H H0.
	destruct H.
	destruct H0.
	destruct (Nat.eq_dec (g x) (g z)) eqn:E.
	- eexists. reflexivity.
	- exfalso. apply n. rewrite x0. rewrite x1. reflexivity.
	Qed.

	(*\|
	The initial disjoint set only has equal elements in the same set
	\|*)

	Theorem disjoint_init : forall x y, x <> y ->
	exists p, in_same_set init_ds x y = right p.
	intros.
	destruct in_same_set.
	exfalso.
	apply H.
	unfold init_ds in e.
	apply e.
	eexists. reflexivity. Qed.


	(*\|
	After unioning x and y, they are now in the same set. This proof could be tightened up and automated more.
	\|*)

	Theorem union_works : forall g x y,
	In_Same_Set (union g x y) x y.
	intros.
	unfold In_Same_Set, in_same_set, union, find_root.
	destruct (Nat.eq_dec (g x) (g y)) eqn:E1;
	destruct (Nat.eq_dec (g x) (g x)) eqn:E2;
	destruct (Nat.eq_dec (g y) (g y)) eqn:E3.
	- eexists. reflexivity.
	- exfalso. auto.
	- firstorder.
	- firstorder.
	- eexists. reflexivity.
	- exfalso. auto.
	- exfalso. auto.
	- exfalso. auto.
	Qed.

	(\|Things remain in the same set after unioning any other elements\|)

	Theorem union_mono : forall g x y z w,
	In_Same_Set g x y ->
	In_Same_Set (union g z w) x y.
	intros g x y z w H.
	destruct H.
	unfold union, In_Same_Set.
	unfold in_same_set in H.
	unfold in_same_set.
	unfold find_root.
	rewrite x0.
	apply nat_eq_refl. Qed.


	(*\|
	Here's a small enhancement. We can define a new version of union that checks if the two elements being unioned are already in the same set.
	\|*)

	Definition union2 (g : ds) x y : ds :=
	let px := find_root g x in
	let py := find_root g y in
	if Nat.eq_dec px py then
	g
	else
	fun z =>
	let pz := find_root g z in
	if Nat.eq_dec px pz
	then py
	else pz.
	Check in_same_set.


	Theorem union2_correct : forall g x y z,
	(union2 g x y) z = (union g x y) z.
	intros. unfold union, union2, find_root.
	destruct (Nat.eq_dec (g x) (g y)) eqn:E2;
	destruct (Nat.eq_dec (g x) (g z)) eqn:E1;
	congruence.
	Qed.


	(*\|
	---------------------
	Possible Refinements
	---------------------

	You could abstract over the things that you're forming disjoint sets over
	This means you need an enumeration to define init_ds

	\|*)

	Definition ds' (a : Type) := a -> nat.

	(*\|

	Or you can use a self map. This requires decidable equality in for checking canonical elements in the codomain

	*)

	Definition ds'' (a : Type) := a -> a.


	(*\|
	Or you could do a fun proof relevant version for equivalence relations `R`. Now to perform a union you'll need to supply a proof `R x y`

	\|*)

	Definition ds_pf (a : Type) (R : a -> a -> Prop) :=
	forall x : a, {y : a \| R x y}.

	(*\|

	Another possibility is to use the fairly new feature of Coq, persistent arrays

	* https://coq.inria.fr/refman/language/core/primitive.html#primitive-arrays
	* https://coq.github.io/doc/master/stdlib/Coq.Array.PArray.html

	This may indeed why these were added. They do require dealing with termination though. Here's one suggestion of an encoding. I dunno if this will bite you in the butt.

	\|*)

	Require Import PArray.

	Open Scope array_scope.
	Fixpoint find_root' gas (a : array (sum nat Int63.int))
	(i : Int63.int) :=
	match gas with
	\| O => None
	\| S gas' => match a .[i] with
	\| inl x => Some i
	\| inr j => find_root' gas' a j
	end
	end.

	Record disjoint_set := {
	gas : nat; (* unions, or height *)
	parents : array (sum nat Int63.int) ;
	term_pf : forall i, exists n,
	(find_root' gas parents i = Some n)
	}.