Created
June 5, 2014 11:48
-
-
Save lpand/2aa702c10ad9603f866a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <math.h> | |
#include <unistd.h> | |
#include <time.h> | |
#define MAX 2000 | |
#define UNDEF "NA" | |
void usage(void) { | |
fprintf(stderr, "Usage hypg -g genome_file -a annotation_file -c cutoff"); | |
fprintf(stderr, " -s set_file\n"); | |
fprintf(stderr, " [-p prefix]\n"); | |
fprintf(stderr, " [-m maxsize]\n"); | |
fprintf(stderr, " [-S minimum prevalence for listing (default 0)]\n"); | |
fprintf(stderr, " [-b test for overrepresentation or underrepresentation depending on\n"); | |
fprintf(stderr, " result of comparison with expected\n"); | |
fprintf(stderr, " [-B use Bonferroni corrected P-values to compare with cutoff]\n"); | |
fprintf(stderr, " [-i to be used when sets and annotations are the same (does not test self-overlap and\n"); | |
fprintf(stderr, " compares each pair only once)]\n"); | |
return; | |
} | |
void addgene(const char *name); | |
void addanno(const char *gene, const char *anno); | |
void analyze(int set_S, int set_b, int set_B, int set_i); | |
void rmfirst(char *new, const char *old); | |
void usage(void); | |
double hypg(int N, int n, int M, int m); | |
double hypg_low(int N, int n, int M, int m); | |
int find(const char *name, char **list, int n); | |
double bonf(double pv, int ntest); | |
void nline(FILE *f); | |
char **gene_name = NULL, **anno_term = NULL; | |
int ngene = 0, nanno = 0, nset = 0, ntotset = 0, ntotanno = 0, ntest = 0; | |
char genfile[MAX], annfile[MAX], setfile[MAX], line[MAX], set[MAX]; | |
char prefix[MAX], pv_file[MAX], list_file[MAX], report_file[MAX]; | |
char check[MAX]; | |
int *used_ann = NULL; | |
int *n_anno; | |
int **anno_list; | |
int *gen_prev = NULL, *set_prev = NULL; | |
double cutoff; | |
double *cutoff_sz = NULL; | |
int maxsize, minprev; | |
int *def_cut = NULL; | |
int **gene_list; | |
FILE *fout_pv, *fout_list, *fout_report; | |
int main(int argc, char *argv[]) { | |
int i, j, labgen; | |
int *flag; | |
char gname[MAX], anno[MAX]; | |
char report[MAX][MAX]; | |
FILE *fgen, *fann, *fset; | |
int c; | |
int set_g = 0, set_a = 0, set_s = 0, set_c = 0, set_p = 0, set_h = 0; | |
int set_m = 0, set_S = 0, set_b = 0, set_B = 0, set_i = 0; | |
extern char *optarg; | |
extern int optind, opterr, optopt; | |
// input | |
strcpy(prefix, "hypg"); | |
minprev = 0; | |
while (1) { | |
c = getopt(argc, argv, "hS:g:a:s:c:p:m:bBi"); | |
if (c == -1) | |
break; | |
switch (c) { | |
case 'h': | |
set_h = 1; | |
break; | |
case 'g': | |
strcpy(genfile, optarg); | |
set_g = 1; | |
break; | |
case 'a': | |
strcpy(annfile, optarg); | |
set_a = 1; | |
break; | |
case 's': | |
strcpy(setfile, optarg); | |
set_s = 1; | |
break; | |
case 'c': | |
cutoff = atof(optarg); | |
set_c = 1; | |
break; | |
case 'm': | |
maxsize = atoi(optarg); | |
set_m = 1; | |
break; | |
case 'p': | |
strcpy(prefix, optarg); | |
set_p = 1; | |
break; | |
case 'S': | |
set_S = 1; | |
minprev = atoi(optarg); | |
break; | |
case 'b': | |
set_b = 1; | |
break; | |
case 'B': | |
set_B = 1; | |
break; | |
case 'i': | |
set_i = 1; | |
break; | |
case '?': | |
break; | |
default: | |
break; | |
} | |
} | |
// check input | |
if (set_h) { | |
usage(); | |
exit(EXIT_SUCCESS); | |
} | |
if (!(set_g * set_a * set_s * set_c)) { | |
usage(); | |
exit(EXIT_FAILURE); | |
} | |
// report input | |
// performance test | |
char *log_file = fopen("/tmp/hypg.log", "a"); | |
clock_t start = clock(), diff; | |
// read genome | |
fgen = fopen(genfile, "r"); | |
if (fgen == NULL) { | |
fprintf(stderr, "Error opening %s\n", genfile); | |
exit(EXIT_FAILURE); | |
} | |
while (fscanf(fgen, "%s", gname) != EOF) { | |
addgene(gname); | |
} | |
fclose(fgen); | |
diff = clock() - start; | |
int msec = diff * 1000 / CLOCKS_PER_SEC; | |
fprintf(log_file, "\nReading the genome took %d seconds %d milliseconds\n", msec/1000, msec%1000); | |
n_anno = (int *) malloc(ngene * sizeof(int)); | |
for (i = 0; i < ngene; ++i) { | |
n_anno[i] = 0; | |
} | |
anno_list = (int **) malloc(ngene * sizeof(int *)); | |
for (i = 0; i < ngene; ++i) { | |
anno_list[i] = NULL; | |
} | |
if (!set_m) { | |
maxsize = ngene; | |
} | |
start = clock(); | |
// read annotation | |
fann = fopen(annfile, "r"); | |
if (fann == NULL) { | |
fprintf(stderr, "Error opening %s\n", annfile); | |
exit(EXIT_FAILURE); | |
} | |
while (fscanf(fann, "%s %s", gname, anno) != EOF) { | |
addanno(gname, anno); | |
} | |
fclose(fann); | |
diff = clock() - start; | |
msec = diff * 1000 / CLOCKS_PER_SEC; | |
fprintf(log_file, "\nReading annotations took %d seconds %d milliseconds\n", msec/1000, msec%1000); | |
// compute the number of used annotations (for multiple testing) | |
ntotanno = 0; | |
for (i = 0; i < nanno; ++i) { | |
if (used_ann[i] && gen_prev[i] <= maxsize) { | |
++ntotanno; | |
} | |
} | |
// open output files | |
strcpy(pv_file, prefix); | |
strcat(pv_file, ".pv"); | |
strcpy(list_file, prefix); | |
strcat(list_file, ".list"); | |
strcpy(report_file, prefix); | |
strcat(report_file, ".rep"); | |
fout_pv = fopen(pv_file, "w"); | |
fout_list = fopen(list_file, "w"); | |
fout_report = fopen(report_file, "w"); | |
// read and analyze sets | |
set_prev = (int *) malloc(nanno * sizeof(int)); | |
gene_list = (int **) malloc(nanno * sizeof(int *)); | |
flag = (int *) malloc(nanno * sizeof(int)); | |
start = clock(); | |
fset = fopen(setfile, "r"); | |
if (fset == NULL) { | |
fprintf(stderr, "Error opening %s\n", setfile); | |
exit(EXIT_FAILURE); | |
} | |
// count number of sets (for multiple testing) | |
ntotset = 0; | |
while (fscanf(fset, "%s", line) != EOF) { | |
if (line[0] == '>') { | |
++ntotset; | |
} | |
} | |
ntest = ntotanno * ntotset; | |
// correct ntest when sets and annotations are identical | |
if (set_i) { | |
if (ntotset != ntotanno) { | |
fprintf(stderr, "Warning: option -i assumes sets and annotations to be identical\n"); | |
fprintf(stderr, "but here we have %d sets and %d used annotations \n", ntotset, ntotanno); | |
} | |
ntest = ntotanno * (ntotanno - 1) / 2; | |
} | |
rewind(fset); | |
diff = clock() - start; | |
msec = diff * 1000 / CLOCKS_PER_SEC; | |
fprintf(log_file, "\nFirst read of the set took %d seconds %d milliseconds\n", msec/1000, msec%1000); | |
start = clock(); | |
fprintf(fout_report, "annotations:\t%d\n", ntotanno); | |
fprintf(fout_report, "sets:\t%d\n", ntotset); | |
fprintf(fout_report, "no_of_tests:\t%d\n", ntest); | |
while (fscanf(fset, "%s", line) != EOF) { | |
if (line[0] == '>') { | |
rmfirst(set, line); | |
nset = 0; | |
for (i = 0; i < nanno; ++i) { | |
set_prev[i] = 0; | |
gene_list[i] = NULL; | |
flag[i] = 0; | |
} | |
} | |
else if (line[0] == '<') { | |
rmfirst(check, line); | |
if (strcmp(check, set)) { | |
fprintf(stderr, "Error: set %s terminated by <%s\n", set, check); | |
exit(EXIT_FAILURE); | |
} | |
analyze(set_S, set_b, set_B, set_i); | |
for (i = 0; i < nanno; ++i) { | |
if (flag[i]) { | |
free(gene_list[i]); | |
} | |
} | |
} | |
else { | |
labgen = find(line, gene_name, ngene); | |
if (labgen == -1) { | |
continue; | |
} | |
++nset; | |
for (i = 0; i < n_anno[labgen]; ++i) { | |
j = anno_list[labgen][i]; | |
++set_prev[j]; | |
gene_list[j] = (int *) realloc(gene_list[j], set_prev[j] | |
* sizeof(int)); | |
flag[j] = 1; | |
gene_list[j][set_prev[j] - 1] = labgen; | |
} | |
} | |
} | |
diff = clock() - start; | |
msec = diff * 1000 / CLOCKS_PER_SEC; | |
fprintf(log_file, "\nAnalysis took %d seconds %d milliseconds\n", msec/1000, msec%1000); | |
fclose(log_file); | |
fclose(fout_pv); | |
fclose(fout_list); | |
fclose(fout_report); | |
return 0; | |
} | |
void addgene(const char *name) { | |
int label; | |
label = find(name, gene_name, ngene); | |
if (label != -1) { | |
fprintf(stderr, "Warning: multiple instances of %s in genome file %s\n", name, genfile); | |
return; | |
} | |
else { | |
++ngene; | |
gene_name = (char **) realloc(gene_name, ngene * sizeof(char *)); | |
gene_name[ngene - 1] = (char *) malloc((strlen(name) + 1) * sizeof(char)); | |
strcpy(gene_name[ngene - 1], name); | |
} | |
} | |
void addanno(const char *gene, const char *anno) { | |
int labann, labgen, i, is_new; | |
// find the label of this annotation or assign a new label if new | |
labann = find(anno, anno_term, nanno); | |
if (labann == -1) { | |
is_new = 1; | |
++nanno; | |
used_ann = (int *) realloc(used_ann, nanno * sizeof(int)); | |
anno_term = (char **) realloc(anno_term, nanno * sizeof(char *)); | |
anno_term[nanno - 1] = (char *) malloc((strlen(anno) + 1) * sizeof(char)); | |
strcpy(anno_term[nanno - 1], anno); | |
gen_prev = (int *) realloc(gen_prev, nanno * sizeof(int)); | |
gen_prev[nanno - 1] = 0; | |
labann = nanno - 1; | |
used_ann[labann] = 0; | |
} | |
// find the label of the gene or discard if the gene is not in the genome | |
labgen = find(gene, gene_name, ngene); | |
if (labgen == -1) { | |
return; // not counted if gene is not in genome | |
} | |
// check whether this annotation had been previously assigned to this gene | |
for (i = 0; i < n_anno[labgen]; ++i) { | |
if (anno_list[labgen][i] == labann) { | |
fprintf(stderr, "Warning: multiple instances of association %s - %s", gene, anno); | |
fprintf(stderr, " in annotation file %s\n", annfile); | |
return; | |
} | |
} | |
// add the annotation to the gene | |
used_ann[labann] = 1; | |
++n_anno[labgen]; | |
anno_list[labgen] = (int *) realloc(anno_list[labgen], n_anno[labgen] * sizeof(int)); | |
anno_list[labgen][n_anno[labgen] - 1] = labann; | |
++gen_prev[labann]; | |
} | |
int find(const char *name, char **list, int n) { | |
int i; | |
for (i = 0; i < n; ++i) { | |
if (strcmp(name, list[i]) == 0) | |
return i; | |
} | |
return -1; | |
} | |
void rmfirst(char *new, const char *old) { | |
int i; | |
for (i = 1; i <= strlen(old); ++i) { | |
new[i - 1] = old[i]; | |
} | |
} | |
void analyze(int set_S, int set_b, int set_B, int set_i) { | |
int i, j; | |
double pv, expect, pvcut, adjpv; | |
char sign; | |
pvcut = cutoff; | |
for (i = 0; i < nanno; ++i) { | |
if ((set_S && set_prev[i] < minprev) || (gen_prev[i] > maxsize)) continue; | |
if (set_i && strcmp(anno_term[i], set) <= 0) continue; | |
expect = (double) (gen_prev[i] * nset) / (double) ngene; | |
if (set_prev[i] >= expect) { | |
sign = '+'; | |
} | |
else { | |
sign = '-'; | |
} | |
if (! set_b) { | |
pv = hypg(ngene, gen_prev[i], nset, set_prev[i]); | |
adjpv = bonf(pv, ntest); | |
if (adjpv <= cutoff || ! set_B && pv <= cutoff) { | |
// fprintf(fout_pv, "%s\t%s\t%d\t%d\t%d\t%d\t%g\t%c\t%g\t%g", set, anno_term[i], ngene, nset, gen_prev[i], set_prev[i], | |
// expect, sign, pv, adjpv); | |
// fprintf(fout_pv, "\n"); | |
// for (j = 0; j < set_prev[i]; ++j) { | |
// fprintf(fout_list, "%s\t%s\t%s\n", set, anno_term[i], gene_name[gene_list[i][j]]); | |
// } | |
// New format: | |
fprintf(fout_pv, "%s\t%g\t%g", anno_term[i], pv, adjpv); | |
if (set_prev[i]) fprintf(fout_pv, "\t"); | |
for (j = 0; j < set_prev[i]; ++j) { | |
fprintf(fout_pv, "%s", gene_name[gene_list[i][j]]); | |
if (j < set_prev[i] - 1) fprintf(fout_pv, "\t"); | |
} | |
fprintf(fout_pv, "\n"); | |
} | |
} | |
else { | |
if (sign == '+') { | |
pv = hypg(ngene, gen_prev[i], nset, set_prev[i]); | |
} | |
else { | |
pv = hypg_low(ngene, gen_prev[i], nset, set_prev[i]); | |
} | |
adjpv = bonf(pv, ntest); | |
if (adjpv <= cutoff || ! set_B && pv <= cutoff) { | |
// fprintf(fout_pv, "%s\t%s\t%d\t%d\t%d\t%d\t%g\t%c\t%g\t%g", set, anno_term[i], ngene, nset, gen_prev[i], set_prev[i], | |
// expect, sign, pv, adjpv); | |
// fprintf(fout_pv, "\n"); | |
// for (j = 0; j < set_prev[i]; ++j) { | |
// fprintf(fout_list, "%s\t%s\t%s\n", set, anno_term[i], gene_name[gene_list[i][j]]); | |
// } | |
// New format: | |
fprintf(fout_pv, "%s\t%g\t%g", anno_term[i], pv, adjpv); | |
if (set_prev[i]) fprintf(fout_pv, "\t"); | |
for (j = 0; j < set_prev[i]; ++j) { | |
fprintf(fout_pv, "%s", gene_name[gene_list[i][j]]); | |
if (j < set_prev[i] - 1) fprintf(fout_pv, "\t"); | |
} | |
fprintf(fout_pv, "\n"); | |
} | |
} | |
} | |
} | |
double gammaln(float xx) { | |
// Returns the value ln[Gamma(xx)] for xx > 0. | |
if (xx > 0) { | |
double x, y, tmp, ser; | |
static double cof[6] = { 76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5 }; | |
int j; | |
y = x = xx; | |
tmp = x + 5.5; | |
tmp -= (x + 0.5) * log(tmp); | |
ser = 1.000000000190015; | |
for (j = 0; j <= 5; j++) | |
ser += cof[j] / ++y; | |
return (-tmp + log(2.5066282746310005 * ser / x)); | |
} else { | |
return 1.0f; | |
} | |
} | |
double bicoln(int n, int k) | |
{ | |
// Returns the log of binomial coefficient ( n k ) as a floating-point number. | |
return gammaln(n + 1) - gammaln(k + 1) - gammaln(n - k + 1); | |
} | |
double hypg(int N, int n, int M, int m) | |
{ | |
int k, max; | |
double tmp_pvalue = 0; | |
max = n; | |
if (M < n) { | |
max = M; | |
} | |
for (k = m; k <= max; k++) { | |
tmp_pvalue += exp(bicoln(N - n, M - k) + bicoln(n, k) - bicoln(N, M)); | |
} | |
return tmp_pvalue; | |
} | |
double hypg_low(int N, int n, int M, int m) | |
{ | |
double tmp_pvalue = 0; | |
int min, k; | |
min = 0; | |
if (M > N - n) { | |
min = M - (N - n); | |
} | |
for (k = min; k <= m; ++k) { | |
tmp_pvalue += exp(bicoln(N - n, M - k) + bicoln(n, k) - bicoln(N, M)); | |
} | |
return tmp_pvalue; | |
} | |
double bonf(double pv, int ntest) | |
{ | |
if (ntest * pv < 1) { | |
return (double) ntest * pv; | |
} else { | |
return (double) 1; | |
} | |
} | |
void nline(FILE *f) | |
{ | |
while (getc(f) != '\n'); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment