Created
May 26, 2011 19:29
-
-
Save dgleich/993867 to your computer and use it in GitHub Desktop.
A benchmark problem for enumeration of graph subsets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* @file exact_modularity.cc | |
* Find the cluster of largest modularity in a small graph by exhaustive | |
* enumeration. | |
* @author David F. Gleich | |
*/ | |
/** | |
* History | |
* ------- | |
* :2011-05-17: Initial coding | |
*/ | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <assert.h> | |
#include <iostream> | |
#include <string> | |
#include <vector> | |
#include <map> | |
#include <pthread.h> | |
const char* usage() { | |
return "" | |
"exact_modularity graphfile.smat [-n nthreads] [-output outputfile] \n" | |
" [-status statusfile] [-nostatus] [-bench]\n" | |
"\n" | |
" graphfile.smat - the input graph file. Must be <= 64 vertices." | |
" nthreads - the number of computational threads to use, default=2." | |
" output - the resulting cluster as an indicator vector over\n" | |
" vertices, i.e. one line for each vertex with 1 if its in the cluster\n" | |
" and 0 otherwise. The default name is graphfile.smat.modularity" | |
" status - An interim status file to make it possible to restart.\n" | |
" If you don't want a statusfile, then use -nostatus\n" | |
" The statusfile is removed on completion.\n" | |
" bench - Benchmark the code with 2 threads on a fixed graph.\n" | |
" Just specify anything for the graphfile name.\n" | |
"\n"; | |
} | |
struct options { | |
std::string graphfile; | |
std::string outputfile; | |
std::string statusfile; | |
int nthreads; | |
bool bench; | |
}; | |
bool parse_command_line(int argc, char **argv, options& opts) { | |
if (argc < 2) { | |
std::cerr << usage() << std::endl; | |
return false; | |
} | |
opts.bench = false; | |
opts.graphfile = argv[1]; | |
// set defaults | |
std::string base = opts.graphfile.substr(0, opts.graphfile.rfind('.')); | |
printf("basename = %s\n", base.c_str()); | |
opts.outputfile = base + ".modularity"; | |
opts.statusfile = base + ".modstatus"; | |
opts.nthreads = 2; | |
for (int argi = 2; argi < argc; ++argi) { | |
if (argv[argi][0] != '-') { | |
std::cerr << "Unknown argument: " << argv[argi] << std::endl; | |
return false; | |
} | |
if (strcmp(argv[argi], "-nthreads")==0 | |
|| strcmp(argv[argi], "--nthreads")==0) { | |
if ((argi+1) >= argc) { std::cerr << "Missing argument" << std::endl; } | |
opts.nthreads = atoi(argv[++argi]); | |
if (opts.nthreads < 1) { | |
std::cerr << "Invalid nthreads argument: " << argv[argi] << std::endl; | |
} | |
} else if (strcmp(argv[argi], "-output")==0 | |
|| strcmp(argv[argi], "--output")==0) { | |
if ((argi+1) >= argc) { std::cerr << "Missing argument" << std::endl; } | |
opts.outputfile = argv[++argi]; | |
} else if (strcmp(argv[argi], "-status")==0 | |
|| strcmp(argv[argi], "--status")==0) { | |
if ((argi+1) >= argc) { std::cerr << "Missing argument" << std::endl; } | |
opts.statusfile = argv[++argi]; | |
} else if (strcmp(argv[argi], "-nostatus")==0 | |
|| strcmp(argv[argi], "--nostatus")==0) { | |
opts.statusfile = ""; | |
} else if (strcmp(argv[argi], "-bench")==0 | |
|| strcmp(argv[argi], "--bench")==0) { | |
opts.bench = true; | |
} else { | |
std::cerr << "Unknown option: " << argv[argi] << std::endl; | |
return false; | |
} | |
} | |
return true; | |
} | |
class smat_graph { | |
public: | |
std::vector<int> dstptr; | |
std::vector<int> dsts; | |
int nverts; | |
int nedges; | |
void reset() { | |
nverts = 0; | |
nedges = 0; | |
dstptr.empty(); | |
dsts.empty(); | |
} | |
/** | |
* EdgeIterator is any restartable iterator, where each item | |
* is a std::pair<int,int> | |
* | |
* This code uses the minimum memory approach. It uses an extra two | |
* passes over the data for this task. | |
*/ | |
template <typename EdgeIterator> | |
bool load_from_edges(EdgeIterator begin, EdgeIterator end) { | |
// pass 1, determine n | |
int n=0; | |
for (EdgeIterator iter = begin; iter != end; ++iter) { | |
n = std::max(iter->first,n); | |
n = std::max(iter->second,n); | |
nedges ++; | |
} | |
nverts = n + 1; | |
// pass 2, compute degrees | |
dstptr.resize(nverts+1); | |
dsts.resize(nedges); | |
int edgeno=0; | |
for (EdgeIterator iter = begin; iter != end; ++iter) { | |
int s = iter->first; | |
int d = iter->second; | |
if (s < 0 || s >= nverts) { | |
std::cerr << "edge:" << edgeno | |
<< " source value " << s << " is out-of-range " | |
<< "[0," << nverts << ")" << std::endl; | |
return false; | |
} | |
if (d < 0 || d >= nverts) { | |
std::cerr << "edge:" << edgeno | |
<< " destination value " << d << " is out-of-range " | |
<< "[0," << nverts << ")" << std::endl; | |
return false; | |
} | |
dstptr[s+1] += 1; | |
edgeno ++; | |
} | |
// build pointer locations | |
for (int nzi=0, j=0; j<nverts+1; j++) { | |
nzi += dstptr[j]; | |
dstptr[j] = nzi; | |
} | |
// pass 3, load edges | |
edgeno = 0; | |
for (EdgeIterator iter = begin; iter != end; ++iter) { | |
int s = iter->first; | |
int d = iter->second; | |
if (s < 0 || s >= nverts) { | |
std::cerr << "edge:" << edgeno | |
<< " source value " << s << " is out-of-range " | |
<< "[0," << nverts << ")" << std::endl; | |
return false; | |
} | |
if (d < 0 || d >= nverts) { | |
std::cerr << "edge:" << edgeno | |
<< " destination value " << d << " is out-of-range " | |
<< "[0," << nverts << ")" << std::endl; | |
return false; | |
} | |
dsts[dstptr[s]] = d; | |
dstptr[s] += 1; | |
} | |
// reset pointer locations | |
for (int j=nverts; j>0; j--) { | |
dstptr[j] = dstptr[j-1]; | |
} | |
dstptr[0] = 0; | |
return true; | |
} | |
bool load(const char* filename) { | |
FILE *f = fopen(filename,"rt"); | |
if (!f) { | |
return false; | |
} | |
int m=0, n=0, nz=0; | |
if (fscanf(f, "%i %i %i", &m, &n, &nz) != 3) { | |
std::cerr << "error reading header" << std::endl; | |
return false; | |
} | |
if (m != n) { | |
std::cerr << "error not a square matrix (" << m << " != " << n << ")" | |
<< std::endl; | |
return false; | |
} | |
nverts = n; | |
nedges = nz; | |
dstptr.resize(nverts+1); | |
dsts.resize(nedges); | |
// read for degree counts | |
for (int i=0; i<nz; ++i) { | |
int s, d; | |
double val; | |
if (fscanf(f, "%i %i %lf", &s, &d, &val) != 3) { | |
std::cerr << filename << ":" << i+2 | |
<< " error reading non-zero " << i+1 << std::endl; | |
return false; | |
} | |
if (s < 0 || s >= nverts) { | |
std::cerr << filename << ":" << i+2 | |
<< " source value " << s << " is out-of-range " | |
<< "[0," << nverts << ")" << std::endl; | |
return false; | |
} | |
if (d < 0 || d >= nverts) { | |
std::cerr << filename << ":" << i+2 | |
<< " destination value " << d << " is out-of-range " | |
<< "[0," << nverts << ")" << std::endl; | |
return false; | |
} | |
dstptr[s+1] += 1; | |
} | |
// check for extra rows | |
{ | |
char c; | |
if ((fscanf(f, "%1s", &c) == 1) || !feof(f)) { | |
std::cerr << filename << ":" | |
<< " extra lines present after " << nz << " non-zeros" | |
<< std::endl; | |
} | |
clearerr(f); // fix the feof flag | |
} | |
// build pointer locations | |
for (int nzi=0, j=0; j<nverts+1; j++) { | |
nzi += dstptr[j]; | |
dstptr[j] = nzi; | |
} | |
// reset file pointer | |
if (fseek(f, 0, SEEK_SET) != 0) { | |
std::cerr << filename | |
<< ": error resetting file stream" << std::endl; | |
return false; | |
} | |
// read the header again | |
if (fscanf(f, "%i %i %i", &m, &n, &nz) != 3) { | |
std::cerr << "error re-reading header" << std::endl; | |
return false; | |
} | |
if (m != n) { | |
std::cerr << "error re-read is not a square matrix (" | |
<< m << " != " << n << ")" << std::endl; | |
return false; | |
} | |
if (n != nverts || nz != nedges) { | |
std::cerr << "error re-read changed vertices or edges " | |
<< "(nverts=" << nverts << " != " << n << ")" | |
<< "(nedges=" << nedges << " != " << nz << ")" | |
<< std::endl; | |
return false; | |
} | |
// read for degree counts | |
for (int i=0; i<nedges; ++i) { | |
int s, d; | |
double val; | |
if (fscanf(f, "%i %i %lf", &s, &d, &val) != 3) { | |
std::cerr << filename << ":" << i+2 | |
<< " error re-reading non-zero " << i+1 << std::endl; | |
return false; | |
} | |
if (s < 0 || s >= nverts) { | |
std::cerr << filename << ":" << i+2 | |
<< " source value " << s << " is out-of-range " | |
<< "[0," << nverts << ")" << std::endl; | |
return false; | |
} | |
if (d < 0 || d >= nverts) { | |
std::cerr << filename << ":" << i+2 | |
<< " destination value " << d << " is out-of-range " | |
<< "[0," << nverts << ")" << std::endl; | |
return false; | |
} | |
dsts[dstptr[s]] = d; | |
dstptr[s] += 1; | |
} | |
// reset pointer locations | |
for (int j=nverts; j>0; j--) { | |
dstptr[j] = dstptr[j-1]; | |
} | |
dstptr[0] = 0; | |
return true; | |
} | |
void print(FILE* f) { | |
for (int ri=0; ri<nverts; ++ri) { | |
for (int nzi=dstptr[ri]; nzi<dstptr[ri+1]; ++nzi) { | |
int cj=dsts[nzi]; | |
fprintf(f, "%i %i\n", ri, cj); | |
} | |
} | |
} | |
bool is_symmetric() { | |
typedef std::map<int,int> vmap; | |
typedef std::vector< vmap > graph_type; | |
graph_type graph(nverts); | |
for (int ri=0; ri<nverts; ++ri) { | |
for (int nzi=dstptr[ri]; nzi<dstptr[ri+1]; ++nzi) { | |
int cj=dsts[nzi]; | |
graph[ri].insert(std::make_pair(cj,0)); | |
graph[cj].insert(std::make_pair(ri,0)); | |
graph[ri][cj] += 1; | |
graph[cj][ri] -= 1; | |
} | |
} | |
bool rval = true; | |
for (int ri=0; ri<nverts; ++ri) { | |
for (vmap::const_iterator vit=graph[ri].begin(); | |
vit != graph[ri].end(); ++vit) { | |
if (vit->second != 0) { | |
std::cerr << "Graph is not-symmetric, check element " | |
<< "(" << ri << "," << vit->first << ")" << std::endl; | |
rval = false; | |
return (rval); | |
} | |
} | |
} | |
return (rval); | |
} | |
/** Check to make sure that none of the edges are repeated. */ | |
bool has_repeated_edges() const { | |
typedef std::map<int,int> vmap; | |
typedef std::vector< vmap > graph_type; | |
graph_type graph(nverts); | |
for (int ri=0; ri<nverts; ++ri) { | |
for (int nzi=dstptr[ri]; nzi<dstptr[ri+1]; ++nzi) { | |
int cj=dsts[nzi]; | |
graph[ri].insert(std::make_pair(cj,0)); | |
graph[ri][cj] += 1; | |
} | |
} | |
bool rval = false; | |
for (int ri=0; ri<nverts; ++ri) { | |
for (vmap::const_iterator vit=graph[ri].begin(); | |
vit != graph[ri].end(); ++vit) { | |
if (vit->second != 1) { | |
rval = true; | |
std::cerr << "Graph has a repeated edge, check element " | |
<< "(" << ri << "," << vit->first << ")" << std::endl; | |
return (rval); | |
} | |
} | |
} | |
return (rval); | |
} | |
double modularity(int* set) { | |
// count edges | |
double edges = 0.; | |
double vol = 0.; | |
for (int ri=0; ri<nverts; ++ri) { | |
for (int nzi=dstptr[ri]; nzi<dstptr[ri+1]; ++nzi) { | |
int cj=dsts[nzi]; | |
if (set[ri] == 1) { | |
vol += 1.; | |
if (set[cj] == 1) { | |
edges += 1.; | |
} | |
} | |
} | |
} | |
double igvol = 1./(double)nedges; | |
return edges*igvol - (vol*igvol)*(vol*igvol); | |
} | |
double modularity(std::vector<int> &set) { | |
modularity(&set[0]); | |
} | |
}; | |
typedef unsigned long long int smallset; | |
struct thread_data { | |
smat_graph *g; | |
smallset start; | |
smallset end; | |
double max; | |
smallset set; | |
}; | |
void* modularity_thread(void* tdata) { | |
thread_data *data = (thread_data*)tdata; | |
smat_graph& g = *(data->g); | |
data->max = -1.; | |
std::vector<int> set(g.nverts); | |
for (smallset cur=data->start; cur<data->end; ++cur) { | |
// parse the smallset | |
for (int i=0; i<g.nverts; ++i) { | |
set[i] = (cur&(1<<i))>0; | |
} | |
double m = g.modularity(&set[0]); | |
if (m > data->max) { | |
data->set = cur; | |
data->max = m; | |
} | |
} | |
return NULL; | |
} | |
bool write_modularity(const char* filename, smallset s, int nverts) { | |
FILE *f = fopen(filename, "w"); | |
if (!f) { | |
std::cerr << "Could not open " << filename << " for writing" << std::endl; | |
return false; | |
} | |
for (int i=0; i<nverts; ++i) { | |
fprintf(f, "%i\n",(s&(1<<i))>0); | |
} | |
fclose(f); | |
return true; | |
} | |
std::pair<smallset,double> modularity_enumerate(smat_graph& g, options opts) | |
{ | |
// create the thread loop | |
std::vector<pthread_t> threads(opts.nthreads); | |
std::vector<thread_data> tdata(opts.nthreads); | |
smallset increment = ((unsigned long long)1)<<22; | |
smallset cur = 0; | |
smallset last = ((smallset)1)<<(g.nverts+1); | |
smallset best = 0; | |
double max = -1.; | |
// check if we have an old status | |
if (!opts.statusfile.empty()) { | |
FILE *s = fopen(opts.statusfile.c_str(), "r"); | |
if (s) { | |
int rval = fscanf(s,"%llu %llu %lf", &cur, &best, &max); | |
if (rval == 3) { | |
std::cerr << "read old data from " << opts.statusfile << std::endl; | |
std::cerr << "max-modularity: " << max << std::endl; | |
std::cerr << "best-set: " << best << std::endl; | |
std::cerr << "current-set: " << cur << std::endl; | |
} else { | |
std::cerr << "couldn't read old data from " << opts.statusfile << std::endl; | |
// reset data | |
cur = 0; best = 0; max = -1.; | |
} | |
fclose(s); | |
} | |
} | |
while (cur < last) { | |
for (int t=0; t<opts.nthreads; ++t) { | |
// create thread data | |
smallset end = cur + increment; | |
if (end > last) { end = last; } | |
tdata[t].g = &g; | |
tdata[t].start = cur; | |
tdata[t].end = end; | |
tdata[t].max = -1.; | |
pthread_create(&threads[t], NULL, modularity_thread, (void*)&tdata[t]); | |
cur = tdata[t].end; | |
} | |
for (int t=0; t<opts.nthreads; ++t) { | |
pthread_join(threads[t], NULL); | |
if (tdata[t].max > max) { | |
max = tdata[t].max; | |
best = tdata[t].set; | |
} | |
} | |
if (!opts.statusfile.empty()) { | |
FILE *s = fopen(opts.statusfile.c_str(), "w"); | |
fprintf(s,"%llu\n", cur); | |
fprintf(s,"%llu\n", best); | |
fprintf(s,"%.18e\n", max); | |
fclose(s); | |
} | |
std::cerr << "Results from last thread-pool" << std::endl; | |
std::cerr << "max-modularity: " << max << std::endl; | |
std::cerr << "best-set: " << best << std::endl; | |
std::cerr << "current-set: " << cur << std::endl; | |
} | |
if (!opts.statusfile.empty()) { | |
remove(opts.statusfile.c_str()); | |
} | |
return std::make_pair(best,max); | |
} | |
int bench() { | |
smat_graph g; | |
std::vector<std::pair<int,int> > edges; | |
edges.push_back(std::make_pair<int,int>(0,12)); | |
edges.push_back(std::make_pair<int,int>(12,0)); | |
edges.push_back(std::make_pair<int,int>(0,2)); | |
edges.push_back(std::make_pair<int,int>(2,0)); | |
edges.push_back(std::make_pair<int,int>(2,3)); | |
edges.push_back(std::make_pair<int,int>(3,2)); | |
edges.push_back(std::make_pair<int,int>(3,0)); | |
edges.push_back(std::make_pair<int,int>(0,3)); | |
edges.push_back(std::make_pair<int,int>(0,1)); | |
edges.push_back(std::make_pair<int,int>(1,0)); | |
edges.push_back(std::make_pair<int,int>(1,2)); | |
edges.push_back(std::make_pair<int,int>(2,1)); | |
edges.push_back(std::make_pair<int,int>(2,4)); | |
edges.push_back(std::make_pair<int,int>(4,2)); | |
edges.push_back(std::make_pair<int,int>(4,9)); | |
edges.push_back(std::make_pair<int,int>(9,4)); | |
edges.push_back(std::make_pair<int,int>(9,3)); | |
edges.push_back(std::make_pair<int,int>(3,9)); | |
edges.push_back(std::make_pair<int,int>(3,10)); | |
edges.push_back(std::make_pair<int,int>(10,3)); | |
edges.push_back(std::make_pair<int,int>(10,1)); | |
edges.push_back(std::make_pair<int,int>(1,10)); | |
edges.push_back(std::make_pair<int,int>(1,7)); | |
edges.push_back(std::make_pair<int,int>(7,1)); | |
edges.push_back(std::make_pair<int,int>(7,5)); | |
edges.push_back(std::make_pair<int,int>(5,7)); | |
edges.push_back(std::make_pair<int,int>(5,6)); | |
edges.push_back(std::make_pair<int,int>(6,5)); | |
edges.push_back(std::make_pair<int,int>(6,0)); | |
edges.push_back(std::make_pair<int,int>(0,6)); | |
edges.push_back(std::make_pair<int,int>(0,13)); | |
edges.push_back(std::make_pair<int,int>(13,0)); | |
edges.push_back(std::make_pair<int,int>(13,18)); | |
edges.push_back(std::make_pair<int,int>(18,13)); | |
edges.push_back(std::make_pair<int,int>(18,17)); | |
edges.push_back(std::make_pair<int,int>(17,18)); | |
edges.push_back(std::make_pair<int,int>(17,13)); | |
edges.push_back(std::make_pair<int,int>(13,17)); | |
edges.push_back(std::make_pair<int,int>(13,14)); | |
edges.push_back(std::make_pair<int,int>(14,13)); | |
edges.push_back(std::make_pair<int,int>(14,16)); | |
edges.push_back(std::make_pair<int,int>(16,14)); | |
edges.push_back(std::make_pair<int,int>(14,15)); | |
edges.push_back(std::make_pair<int,int>(15,14)); | |
edges.push_back(std::make_pair<int,int>(14,5)); | |
edges.push_back(std::make_pair<int,int>(5,14)); | |
edges.push_back(std::make_pair<int,int>(6,14)); | |
edges.push_back(std::make_pair<int,int>(14,6)); | |
edges.push_back(std::make_pair<int,int>(14,24)); | |
edges.push_back(std::make_pair<int,int>(24,14)); | |
edges.push_back(std::make_pair<int,int>(24,4)); | |
edges.push_back(std::make_pair<int,int>(4,24)); | |
edges.push_back(std::make_pair<int,int>(4,8)); | |
edges.push_back(std::make_pair<int,int>(8,4)); | |
edges.push_back(std::make_pair<int,int>(8,9)); | |
edges.push_back(std::make_pair<int,int>(9,8)); | |
edges.push_back(std::make_pair<int,int>(9,10)); | |
edges.push_back(std::make_pair<int,int>(10,9)); | |
edges.push_back(std::make_pair<int,int>(10,11)); | |
edges.push_back(std::make_pair<int,int>(11,10)); | |
edges.push_back(std::make_pair<int,int>(10,19)); | |
edges.push_back(std::make_pair<int,int>(19,10)); | |
edges.push_back(std::make_pair<int,int>(3,22)); | |
edges.push_back(std::make_pair<int,int>(22,3)); | |
edges.push_back(std::make_pair<int,int>(22,23)); | |
edges.push_back(std::make_pair<int,int>(23,22)); | |
edges.push_back(std::make_pair<int,int>(22,21)); | |
edges.push_back(std::make_pair<int,int>(21,22)); | |
edges.push_back(std::make_pair<int,int>(22,20)); | |
edges.push_back(std::make_pair<int,int>(20,22)); | |
edges.push_back(std::make_pair<int,int>(1,9)); | |
edges.push_back(std::make_pair<int,int>(9,1)); | |
g.load_from_edges(edges.begin(), edges.end()); | |
assert(g.is_symmetric() == true); | |
assert(g.has_repeated_edges() == false); | |
options opts; | |
opts.nthreads = 3; | |
opts.statusfile = ""; | |
opts.outputfile = ""; | |
std::pair<smallset,double> result = modularity_enumerate(g, opts); | |
std::cout << result.second << std::endl; | |
std::cout << result.first << std::endl; | |
} | |
int main(int argc, char **argv) { | |
options opts; | |
if (!parse_command_line(argc, argv, opts)) { | |
return (1); | |
} | |
if (opts.bench) { | |
bench(); | |
return (0); | |
} | |
std::cerr << "graphfile: " << opts.graphfile << std::endl; | |
std::cerr << "outputfile: " << opts.outputfile << std::endl; | |
std::cerr << "statusfile: " << opts.statusfile << std::endl; | |
std::cerr << "nthreads: " << opts.nthreads << std::endl; | |
std::cerr << "loading graph ..." << std::endl; | |
smat_graph g; | |
if (!g.load(opts.graphfile.c_str())) { | |
return (2); | |
} | |
std::cerr << "loaded graph" << std::endl; | |
std::cerr << "nverts: " << g.nverts << std::endl; | |
std::cerr << "nedges: " << g.nedges << std::endl; | |
std::cerr << "is_symmetric: " << g.is_symmetric() << std::endl; | |
std::cerr << "has_repeated_edges: " << g.has_repeated_edges() << std::endl; | |
std::pair<smallset,double> result = modularity_enumerate(g, opts); | |
std::cout<< result.second << std::endl; | |
std::cout << result.first << std::endl; | |
write_modularity(opts.outputfile.c_str(), result.first, g.nverts); | |
return (0); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment