Skip to content

Instantly share code, notes, and snippets.

@az0
Created May 9, 2014 18:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save az0/e1ca3f18f06985398861 to your computer and use it in GitHub Desktop.
Save az0/e1ca3f18f06985398861 to your computer and use it in GitHub Desktop.
/*
https://stackoverflow.com/questions/23525187/identifying-connected-graphs-given-edges
*/
/*
The general methods for finding all connected components of a graph are Breadth-First-Search
or Depth-First-Search. SAS is not the best tool for implementing such algorithms since
they require using such data structures as queues.
Still it can be done with hash objects. Here's the code for BF-search.
*/
data people;
input person_id address_id $ household_id;
datalines;
1 A 1
2 B 2
3 B 2
4 C 3
5 C 3
5 D 3
6 D 3
;
run;
/*
Create adjacency list - all pairs of people with a common address. And empty variable cluster which will be populated later with groups' IDs:
*/
proc sql;
create table connections as
select distinct a.person_id as person_id_a, b.person_id as person_id_b, . as cluster
from people a
inner join people b
on a.address_id=b.address_id
;
quit;
/*Here goes the BF-search itself:*/
data _null_;
/*Declare hash object and its iterator for all unique people (vertices of the graph):*/
if 0 then set Connections;
dcl hash V(dataset:'Connections', ordered:'y');
V.defineKey('person_id_a');
V.defineData('person_id_a','cluster');
dcl hiter Vi('V');
V.defineDone();
/*Declare hash object for all connections (edges of the graph):*/
dcl hash E(dataset:'Connections', multidata:'y');
E.defineKey('person_id_a');
E.defineData('person_id_a','person_id_b');
E.defineDone();
/*Declare hash object and its iterator for the queue:*/
dcl hash Q(ordered:'y');
Q.defineKey('qnum','person_id_a');
Q.defineData('qnum','person_id_a');
dcl hiter Qi('Q');
Q.defineDone();
/*The outermost loop - for taking a new person without assigned cluster to be a root of the next cluster, when the queue is empty:*/
n = 0; /* ahz */
rc1=Vi.first();
do while(rc1=0);
if missing(cluster) then do;
qnum=1; Q.add(); *qnum-number of the person in the queue, to ensure that new people are added to the end of the queue.
n+1; cluster=n;
V.replace();*assign cluster number to a person;
/*In the following two nested loops we de-queue the first person in the queue and
look for all people connected to this person in adjacency list. Every found 'connection'
we add to the end of the queue. When done with the first person, we delete him/her and de-queue the next one (who became the first now). All of them will be in the same cluster. And so on, until the queue is empty. Then we take a new root person for a new cluster.*/
rc2=Qi.first();
do while(rc2=0);
qnum=qnum+Q.num_items-1;
rc3=E.find();
do while(rc3=0);
person_id_a=person_id_b;
rc4=V.find();
if rc4=0 and missing(cluster) then do;
qnum+1; Q.add();
cluster=n;
V.replace();
end;
rc3=E.find_next();
end;
Qi.first();
Qi.delete();
Q.remove();
Qi=_new_ hiter ('Q');
rc2=Qi.first();
end;
end;
rc1=Vi.next();
end;
/*Output list of people with assigned clusters.*/
V.output(dataset:'clusters');
run;
proc sort data=clusters; by cluster; run;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment