Skip to content

Instantly share code, notes, and snippets.

@dgleich
Created May 26, 2011 19:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dgleich/993867 to your computer and use it in GitHub Desktop.
Save dgleich/993867 to your computer and use it in GitHub Desktop.
A benchmark problem for enumeration of graph subsets
/**
* @file exact_modularity.cc
* Find the cluster of largest modularity in a small graph by exhaustive
* enumeration.
* @author David F. Gleich
*/
/**
* History
* -------
* :2011-05-17: Initial coding
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <pthread.h>
const char* usage() {
return ""
"exact_modularity graphfile.smat [-n nthreads] [-output outputfile] \n"
" [-status statusfile] [-nostatus] [-bench]\n"
"\n"
" graphfile.smat - the input graph file. Must be <= 64 vertices."
" nthreads - the number of computational threads to use, default=2."
" output - the resulting cluster as an indicator vector over\n"
" vertices, i.e. one line for each vertex with 1 if its in the cluster\n"
" and 0 otherwise. The default name is graphfile.smat.modularity"
" status - An interim status file to make it possible to restart.\n"
" If you don't want a statusfile, then use -nostatus\n"
" The statusfile is removed on completion.\n"
" bench - Benchmark the code with 2 threads on a fixed graph.\n"
" Just specify anything for the graphfile name.\n"
"\n";
}
struct options {
std::string graphfile;
std::string outputfile;
std::string statusfile;
int nthreads;
bool bench;
};
bool parse_command_line(int argc, char **argv, options& opts) {
if (argc < 2) {
std::cerr << usage() << std::endl;
return false;
}
opts.bench = false;
opts.graphfile = argv[1];
// set defaults
std::string base = opts.graphfile.substr(0, opts.graphfile.rfind('.'));
printf("basename = %s\n", base.c_str());
opts.outputfile = base + ".modularity";
opts.statusfile = base + ".modstatus";
opts.nthreads = 2;
for (int argi = 2; argi < argc; ++argi) {
if (argv[argi][0] != '-') {
std::cerr << "Unknown argument: " << argv[argi] << std::endl;
return false;
}
if (strcmp(argv[argi], "-nthreads")==0
|| strcmp(argv[argi], "--nthreads")==0) {
if ((argi+1) >= argc) { std::cerr << "Missing argument" << std::endl; }
opts.nthreads = atoi(argv[++argi]);
if (opts.nthreads < 1) {
std::cerr << "Invalid nthreads argument: " << argv[argi] << std::endl;
}
} else if (strcmp(argv[argi], "-output")==0
|| strcmp(argv[argi], "--output")==0) {
if ((argi+1) >= argc) { std::cerr << "Missing argument" << std::endl; }
opts.outputfile = argv[++argi];
} else if (strcmp(argv[argi], "-status")==0
|| strcmp(argv[argi], "--status")==0) {
if ((argi+1) >= argc) { std::cerr << "Missing argument" << std::endl; }
opts.statusfile = argv[++argi];
} else if (strcmp(argv[argi], "-nostatus")==0
|| strcmp(argv[argi], "--nostatus")==0) {
opts.statusfile = "";
} else if (strcmp(argv[argi], "-bench")==0
|| strcmp(argv[argi], "--bench")==0) {
opts.bench = true;
} else {
std::cerr << "Unknown option: " << argv[argi] << std::endl;
return false;
}
}
return true;
}
class smat_graph {
public:
std::vector<int> dstptr;
std::vector<int> dsts;
int nverts;
int nedges;
void reset() {
nverts = 0;
nedges = 0;
dstptr.empty();
dsts.empty();
}
/**
* EdgeIterator is any restartable iterator, where each item
* is a std::pair<int,int>
*
* This code uses the minimum memory approach. It uses an extra two
* passes over the data for this task.
*/
template <typename EdgeIterator>
bool load_from_edges(EdgeIterator begin, EdgeIterator end) {
// pass 1, determine n
int n=0;
for (EdgeIterator iter = begin; iter != end; ++iter) {
n = std::max(iter->first,n);
n = std::max(iter->second,n);
nedges ++;
}
nverts = n + 1;
// pass 2, compute degrees
dstptr.resize(nverts+1);
dsts.resize(nedges);
int edgeno=0;
for (EdgeIterator iter = begin; iter != end; ++iter) {
int s = iter->first;
int d = iter->second;
if (s < 0 || s >= nverts) {
std::cerr << "edge:" << edgeno
<< " source value " << s << " is out-of-range "
<< "[0," << nverts << ")" << std::endl;
return false;
}
if (d < 0 || d >= nverts) {
std::cerr << "edge:" << edgeno
<< " destination value " << d << " is out-of-range "
<< "[0," << nverts << ")" << std::endl;
return false;
}
dstptr[s+1] += 1;
edgeno ++;
}
// build pointer locations
for (int nzi=0, j=0; j<nverts+1; j++) {
nzi += dstptr[j];
dstptr[j] = nzi;
}
// pass 3, load edges
edgeno = 0;
for (EdgeIterator iter = begin; iter != end; ++iter) {
int s = iter->first;
int d = iter->second;
if (s < 0 || s >= nverts) {
std::cerr << "edge:" << edgeno
<< " source value " << s << " is out-of-range "
<< "[0," << nverts << ")" << std::endl;
return false;
}
if (d < 0 || d >= nverts) {
std::cerr << "edge:" << edgeno
<< " destination value " << d << " is out-of-range "
<< "[0," << nverts << ")" << std::endl;
return false;
}
dsts[dstptr[s]] = d;
dstptr[s] += 1;
}
// reset pointer locations
for (int j=nverts; j>0; j--) {
dstptr[j] = dstptr[j-1];
}
dstptr[0] = 0;
return true;
}
bool load(const char* filename) {
FILE *f = fopen(filename,"rt");
if (!f) {
return false;
}
int m=0, n=0, nz=0;
if (fscanf(f, "%i %i %i", &m, &n, &nz) != 3) {
std::cerr << "error reading header" << std::endl;
return false;
}
if (m != n) {
std::cerr << "error not a square matrix (" << m << " != " << n << ")"
<< std::endl;
return false;
}
nverts = n;
nedges = nz;
dstptr.resize(nverts+1);
dsts.resize(nedges);
// read for degree counts
for (int i=0; i<nz; ++i) {
int s, d;
double val;
if (fscanf(f, "%i %i %lf", &s, &d, &val) != 3) {
std::cerr << filename << ":" << i+2
<< " error reading non-zero " << i+1 << std::endl;
return false;
}
if (s < 0 || s >= nverts) {
std::cerr << filename << ":" << i+2
<< " source value " << s << " is out-of-range "
<< "[0," << nverts << ")" << std::endl;
return false;
}
if (d < 0 || d >= nverts) {
std::cerr << filename << ":" << i+2
<< " destination value " << d << " is out-of-range "
<< "[0," << nverts << ")" << std::endl;
return false;
}
dstptr[s+1] += 1;
}
// check for extra rows
{
char c;
if ((fscanf(f, "%1s", &c) == 1) || !feof(f)) {
std::cerr << filename << ":"
<< " extra lines present after " << nz << " non-zeros"
<< std::endl;
}
clearerr(f); // fix the feof flag
}
// build pointer locations
for (int nzi=0, j=0; j<nverts+1; j++) {
nzi += dstptr[j];
dstptr[j] = nzi;
}
// reset file pointer
if (fseek(f, 0, SEEK_SET) != 0) {
std::cerr << filename
<< ": error resetting file stream" << std::endl;
return false;
}
// read the header again
if (fscanf(f, "%i %i %i", &m, &n, &nz) != 3) {
std::cerr << "error re-reading header" << std::endl;
return false;
}
if (m != n) {
std::cerr << "error re-read is not a square matrix ("
<< m << " != " << n << ")" << std::endl;
return false;
}
if (n != nverts || nz != nedges) {
std::cerr << "error re-read changed vertices or edges "
<< "(nverts=" << nverts << " != " << n << ")"
<< "(nedges=" << nedges << " != " << nz << ")"
<< std::endl;
return false;
}
// read for degree counts
for (int i=0; i<nedges; ++i) {
int s, d;
double val;
if (fscanf(f, "%i %i %lf", &s, &d, &val) != 3) {
std::cerr << filename << ":" << i+2
<< " error re-reading non-zero " << i+1 << std::endl;
return false;
}
if (s < 0 || s >= nverts) {
std::cerr << filename << ":" << i+2
<< " source value " << s << " is out-of-range "
<< "[0," << nverts << ")" << std::endl;
return false;
}
if (d < 0 || d >= nverts) {
std::cerr << filename << ":" << i+2
<< " destination value " << d << " is out-of-range "
<< "[0," << nverts << ")" << std::endl;
return false;
}
dsts[dstptr[s]] = d;
dstptr[s] += 1;
}
// reset pointer locations
for (int j=nverts; j>0; j--) {
dstptr[j] = dstptr[j-1];
}
dstptr[0] = 0;
return true;
}
void print(FILE* f) {
for (int ri=0; ri<nverts; ++ri) {
for (int nzi=dstptr[ri]; nzi<dstptr[ri+1]; ++nzi) {
int cj=dsts[nzi];
fprintf(f, "%i %i\n", ri, cj);
}
}
}
bool is_symmetric() {
typedef std::map<int,int> vmap;
typedef std::vector< vmap > graph_type;
graph_type graph(nverts);
for (int ri=0; ri<nverts; ++ri) {
for (int nzi=dstptr[ri]; nzi<dstptr[ri+1]; ++nzi) {
int cj=dsts[nzi];
graph[ri].insert(std::make_pair(cj,0));
graph[cj].insert(std::make_pair(ri,0));
graph[ri][cj] += 1;
graph[cj][ri] -= 1;
}
}
bool rval = true;
for (int ri=0; ri<nverts; ++ri) {
for (vmap::const_iterator vit=graph[ri].begin();
vit != graph[ri].end(); ++vit) {
if (vit->second != 0) {
std::cerr << "Graph is not-symmetric, check element "
<< "(" << ri << "," << vit->first << ")" << std::endl;
rval = false;
return (rval);
}
}
}
return (rval);
}
/** Check to make sure that none of the edges are repeated. */
bool has_repeated_edges() const {
typedef std::map<int,int> vmap;
typedef std::vector< vmap > graph_type;
graph_type graph(nverts);
for (int ri=0; ri<nverts; ++ri) {
for (int nzi=dstptr[ri]; nzi<dstptr[ri+1]; ++nzi) {
int cj=dsts[nzi];
graph[ri].insert(std::make_pair(cj,0));
graph[ri][cj] += 1;
}
}
bool rval = false;
for (int ri=0; ri<nverts; ++ri) {
for (vmap::const_iterator vit=graph[ri].begin();
vit != graph[ri].end(); ++vit) {
if (vit->second != 1) {
rval = true;
std::cerr << "Graph has a repeated edge, check element "
<< "(" << ri << "," << vit->first << ")" << std::endl;
return (rval);
}
}
}
return (rval);
}
double modularity(int* set) {
// count edges
double edges = 0.;
double vol = 0.;
for (int ri=0; ri<nverts; ++ri) {
for (int nzi=dstptr[ri]; nzi<dstptr[ri+1]; ++nzi) {
int cj=dsts[nzi];
if (set[ri] == 1) {
vol += 1.;
if (set[cj] == 1) {
edges += 1.;
}
}
}
}
double igvol = 1./(double)nedges;
return edges*igvol - (vol*igvol)*(vol*igvol);
}
double modularity(std::vector<int> &set) {
modularity(&set[0]);
}
};
typedef unsigned long long int smallset;
struct thread_data {
smat_graph *g;
smallset start;
smallset end;
double max;
smallset set;
};
void* modularity_thread(void* tdata) {
thread_data *data = (thread_data*)tdata;
smat_graph& g = *(data->g);
data->max = -1.;
std::vector<int> set(g.nverts);
for (smallset cur=data->start; cur<data->end; ++cur) {
// parse the smallset
for (int i=0; i<g.nverts; ++i) {
set[i] = (cur&(1<<i))>0;
}
double m = g.modularity(&set[0]);
if (m > data->max) {
data->set = cur;
data->max = m;
}
}
return NULL;
}
bool write_modularity(const char* filename, smallset s, int nverts) {
FILE *f = fopen(filename, "w");
if (!f) {
std::cerr << "Could not open " << filename << " for writing" << std::endl;
return false;
}
for (int i=0; i<nverts; ++i) {
fprintf(f, "%i\n",(s&(1<<i))>0);
}
fclose(f);
return true;
}
std::pair<smallset,double> modularity_enumerate(smat_graph& g, options opts)
{
// create the thread loop
std::vector<pthread_t> threads(opts.nthreads);
std::vector<thread_data> tdata(opts.nthreads);
smallset increment = ((unsigned long long)1)<<22;
smallset cur = 0;
smallset last = ((smallset)1)<<(g.nverts+1);
smallset best = 0;
double max = -1.;
// check if we have an old status
if (!opts.statusfile.empty()) {
FILE *s = fopen(opts.statusfile.c_str(), "r");
if (s) {
int rval = fscanf(s,"%llu %llu %lf", &cur, &best, &max);
if (rval == 3) {
std::cerr << "read old data from " << opts.statusfile << std::endl;
std::cerr << "max-modularity: " << max << std::endl;
std::cerr << "best-set: " << best << std::endl;
std::cerr << "current-set: " << cur << std::endl;
} else {
std::cerr << "couldn't read old data from " << opts.statusfile << std::endl;
// reset data
cur = 0; best = 0; max = -1.;
}
fclose(s);
}
}
while (cur < last) {
for (int t=0; t<opts.nthreads; ++t) {
// create thread data
smallset end = cur + increment;
if (end > last) { end = last; }
tdata[t].g = &g;
tdata[t].start = cur;
tdata[t].end = end;
tdata[t].max = -1.;
pthread_create(&threads[t], NULL, modularity_thread, (void*)&tdata[t]);
cur = tdata[t].end;
}
for (int t=0; t<opts.nthreads; ++t) {
pthread_join(threads[t], NULL);
if (tdata[t].max > max) {
max = tdata[t].max;
best = tdata[t].set;
}
}
if (!opts.statusfile.empty()) {
FILE *s = fopen(opts.statusfile.c_str(), "w");
fprintf(s,"%llu\n", cur);
fprintf(s,"%llu\n", best);
fprintf(s,"%.18e\n", max);
fclose(s);
}
std::cerr << "Results from last thread-pool" << std::endl;
std::cerr << "max-modularity: " << max << std::endl;
std::cerr << "best-set: " << best << std::endl;
std::cerr << "current-set: " << cur << std::endl;
}
if (!opts.statusfile.empty()) {
remove(opts.statusfile.c_str());
}
return std::make_pair(best,max);
}
int bench() {
smat_graph g;
std::vector<std::pair<int,int> > edges;
edges.push_back(std::make_pair<int,int>(0,12));
edges.push_back(std::make_pair<int,int>(12,0));
edges.push_back(std::make_pair<int,int>(0,2));
edges.push_back(std::make_pair<int,int>(2,0));
edges.push_back(std::make_pair<int,int>(2,3));
edges.push_back(std::make_pair<int,int>(3,2));
edges.push_back(std::make_pair<int,int>(3,0));
edges.push_back(std::make_pair<int,int>(0,3));
edges.push_back(std::make_pair<int,int>(0,1));
edges.push_back(std::make_pair<int,int>(1,0));
edges.push_back(std::make_pair<int,int>(1,2));
edges.push_back(std::make_pair<int,int>(2,1));
edges.push_back(std::make_pair<int,int>(2,4));
edges.push_back(std::make_pair<int,int>(4,2));
edges.push_back(std::make_pair<int,int>(4,9));
edges.push_back(std::make_pair<int,int>(9,4));
edges.push_back(std::make_pair<int,int>(9,3));
edges.push_back(std::make_pair<int,int>(3,9));
edges.push_back(std::make_pair<int,int>(3,10));
edges.push_back(std::make_pair<int,int>(10,3));
edges.push_back(std::make_pair<int,int>(10,1));
edges.push_back(std::make_pair<int,int>(1,10));
edges.push_back(std::make_pair<int,int>(1,7));
edges.push_back(std::make_pair<int,int>(7,1));
edges.push_back(std::make_pair<int,int>(7,5));
edges.push_back(std::make_pair<int,int>(5,7));
edges.push_back(std::make_pair<int,int>(5,6));
edges.push_back(std::make_pair<int,int>(6,5));
edges.push_back(std::make_pair<int,int>(6,0));
edges.push_back(std::make_pair<int,int>(0,6));
edges.push_back(std::make_pair<int,int>(0,13));
edges.push_back(std::make_pair<int,int>(13,0));
edges.push_back(std::make_pair<int,int>(13,18));
edges.push_back(std::make_pair<int,int>(18,13));
edges.push_back(std::make_pair<int,int>(18,17));
edges.push_back(std::make_pair<int,int>(17,18));
edges.push_back(std::make_pair<int,int>(17,13));
edges.push_back(std::make_pair<int,int>(13,17));
edges.push_back(std::make_pair<int,int>(13,14));
edges.push_back(std::make_pair<int,int>(14,13));
edges.push_back(std::make_pair<int,int>(14,16));
edges.push_back(std::make_pair<int,int>(16,14));
edges.push_back(std::make_pair<int,int>(14,15));
edges.push_back(std::make_pair<int,int>(15,14));
edges.push_back(std::make_pair<int,int>(14,5));
edges.push_back(std::make_pair<int,int>(5,14));
edges.push_back(std::make_pair<int,int>(6,14));
edges.push_back(std::make_pair<int,int>(14,6));
edges.push_back(std::make_pair<int,int>(14,24));
edges.push_back(std::make_pair<int,int>(24,14));
edges.push_back(std::make_pair<int,int>(24,4));
edges.push_back(std::make_pair<int,int>(4,24));
edges.push_back(std::make_pair<int,int>(4,8));
edges.push_back(std::make_pair<int,int>(8,4));
edges.push_back(std::make_pair<int,int>(8,9));
edges.push_back(std::make_pair<int,int>(9,8));
edges.push_back(std::make_pair<int,int>(9,10));
edges.push_back(std::make_pair<int,int>(10,9));
edges.push_back(std::make_pair<int,int>(10,11));
edges.push_back(std::make_pair<int,int>(11,10));
edges.push_back(std::make_pair<int,int>(10,19));
edges.push_back(std::make_pair<int,int>(19,10));
edges.push_back(std::make_pair<int,int>(3,22));
edges.push_back(std::make_pair<int,int>(22,3));
edges.push_back(std::make_pair<int,int>(22,23));
edges.push_back(std::make_pair<int,int>(23,22));
edges.push_back(std::make_pair<int,int>(22,21));
edges.push_back(std::make_pair<int,int>(21,22));
edges.push_back(std::make_pair<int,int>(22,20));
edges.push_back(std::make_pair<int,int>(20,22));
edges.push_back(std::make_pair<int,int>(1,9));
edges.push_back(std::make_pair<int,int>(9,1));
g.load_from_edges(edges.begin(), edges.end());
assert(g.is_symmetric() == true);
assert(g.has_repeated_edges() == false);
options opts;
opts.nthreads = 3;
opts.statusfile = "";
opts.outputfile = "";
std::pair<smallset,double> result = modularity_enumerate(g, opts);
std::cout << result.second << std::endl;
std::cout << result.first << std::endl;
}
int main(int argc, char **argv) {
options opts;
if (!parse_command_line(argc, argv, opts)) {
return (1);
}
if (opts.bench) {
bench();
return (0);
}
std::cerr << "graphfile: " << opts.graphfile << std::endl;
std::cerr << "outputfile: " << opts.outputfile << std::endl;
std::cerr << "statusfile: " << opts.statusfile << std::endl;
std::cerr << "nthreads: " << opts.nthreads << std::endl;
std::cerr << "loading graph ..." << std::endl;
smat_graph g;
if (!g.load(opts.graphfile.c_str())) {
return (2);
}
std::cerr << "loaded graph" << std::endl;
std::cerr << "nverts: " << g.nverts << std::endl;
std::cerr << "nedges: " << g.nedges << std::endl;
std::cerr << "is_symmetric: " << g.is_symmetric() << std::endl;
std::cerr << "has_repeated_edges: " << g.has_repeated_edges() << std::endl;
std::pair<smallset,double> result = modularity_enumerate(g, opts);
std::cout<< result.second << std::endl;
std::cout << result.first << std::endl;
write_modularity(opts.outputfile.c_str(), result.first, g.nverts);
return (0);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment