lzamparo/dplyr_fisher_test.R

## dplyr_fisher_test.R
# Here's what the data looks like (the printed data.table wraps, so the chromosome column appears as a separate block below):
> all_chrs_dt
        lower   upper signif_label signif_unlabel not_signif_label not_signif_unlabel                    test_name
   1:       0  100000         1293           1121          1235366             224096 unnannotated to unnannotated
   2:  100000  200000          465           1335          1244847             227812 unnannotated to unnannotated
   3:  200000  300000         1296           2358          1244090             225916 unnannotated to unnannotated
   4:  300000  400000         1245           1840          1242731             226852 unnannotated to unnannotated
   5:  400000  500000          627            995          1241226             227550 unnannotated to unnannotated
  ---
5056: 1500000 1600000            0             76                0             895216               DNase to DNase
5057: 1600000 1700000            0            107                2             893980               DNase to DNase
5058: 1700000 1800000            0            162                1             891953               DNase to DNase
5059: 1800000 1900000            0             98                1             890585               DNase to DNase
5060: 1900000 2000000            0             78                1             889103               DNase to DNase
      chromosome
   1:       chr1
   2:       chr1
   3:       chr1
   4:       chr1
   5:       chr1
  ---
5056:       chrX
5057:       chrX
5058:       chrX
5059:       chrX
5060:       chrX

# I'm trying to run fisher.test on each band defined by lower & upper, e.g., lower = 0 & upper = 100000.
# For a given value of upper, I can get the per-test_name rows containing the counts I need in a df like so:

> counts <- all_chrs_dt %>% group_by(test_name) %>% filter(upper == 100000) %>% summarise(all_significant_label = sum(signif_label), all_significant_unlabel = sum(signif_unlabel), all_not_significant_label = sum(not_signif_label), all_not_significant_unlabel = sum(not_signif_unlabel))
> counts
Source: local data table [11 x 5]

                      test_name all_significant_label all_significant_unlabel all_not_significant_label
1  unnannotated to unnannotated                 15506                   10712                  15203321
2          unnannotated to CTCF                  2879                   23339                   1245936
3   unnannotated to DNase-K27ac                  5448                   20770                    804284
4         unnannotated to DNase                   267                   25951                    113358
5           CTCF to DNase-K27ac                   711                   25507                     57430
6                  CTCF to CTCF                   359                   25859                     41148
7    DNase-K27ac to DNase-K27ac                   967                   25251                     29548
8                 CTCF to DNase                    32                   26186                      7094
9          DNase to DNase-K27ac                    40                   26178                      4456
10         DNase-K27ac to DNase                     2                   26216                       811
11               DNase to DNase                     7                   26211                       618
Variables not shown: all_not_significant_unlabel (int)

 # I'd like to apply fisher.test to each row of this df (one 2x2 contingency table per row, built from the
 # four summed counts), but I don't seem to be able to form the contingency tables properly:

 > counts <- all_chrs_dt %>% group_by(test_name) %>% filter(upper == 100000) %>% summarise(all_significant_label = sum(signif_label), all_significant_unlabel = sum(signif_unlabel), all_not_significant_label = sum(not_signif_label), all_not_significant_unlabel = sum(not_signif_unlabel)) %>% group_by(test_name) %>% fisher.test(matrix(c(all_significant_label,all_significant_unlabel,all_not_significant_label, all_not_significant_unlabel),nrow=2,ncol=2))[[p.value]]
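Error in `[[.data.frame`(., fisher.test(matrix(c(all_significant_label,  :
  object 'p.value' not found
# NOTE: as the call in the error shows, `%>%` inserted the piped data frame (`.`) as the first argument of
# the `[[` call, so fisher.test(...) became an index into the data frame, and the unquoted p.value was
# looked up as an object (it would need to be "p.value" or $p.value on the test result itself).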

 >counts <- all_chrs_dt %>% group_by(test_name) %>% filter(upper == 100000) %>% summarise(all_significant_label = sum(signif_label), all_significant_unlabel = sum(signif_unlabel), all_not_significant_label = sum(not_signif_label), all_not_significant_unlabel = sum(not_signif_unlabel)) %>% group_by(test_name) %>% fisher.test(matrix(c(all_significant_label,all_significant_unlabel,all_not_significant_label, all_not_significant_unlabel),nrow=2,ncol=2))
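Error in fisher.test(., matrix(c(all_significant_label, all_significant_unlabel,  :
  all entries of 'x' must be nonnegative and finite
# NOTE: as the call in the error shows, `%>%` passed the whole summarised data frame (`.`) as fisher.test's
# first argument `x`, with the matrix as the second argument, so the test was attempted on the data frame
# (including the test_name column) rather than on a 2x2 table. fisher.test is not a dplyr verb, so it has
# to be called inside one to see the columns, e.g. (untested sketch):
#   ... %>% rowwise() %>% mutate(p_value = fisher.test(matrix(c(all_significant_label, all_significant_unlabel,
#       all_not_significant_label, all_not_significant_unlabel), nrow = 2))$p.value)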
	# Here's what the data looks like (the printed data.table wraps, so the chromosome column appears as a separate block below):
	> all_chrs_dt
	lower upper signif_label signif_unlabel not_signif_label not_signif_unlabel test_name
	1: 0 100000 1293 1121 1235366 224096 unnannotated to unnannotated
	2: 100000 200000 465 1335 1244847 227812 unnannotated to unnannotated
	3: 200000 300000 1296 2358 1244090 225916 unnannotated to unnannotated
	4: 300000 400000 1245 1840 1242731 226852 unnannotated to unnannotated
	5: 400000 500000 627 995 1241226 227550 unnannotated to unnannotated
	---
	5056: 1500000 1600000 0 76 0 895216 DNase to DNase
	5057: 1600000 1700000 0 107 2 893980 DNase to DNase
	5058: 1700000 1800000 0 162 1 891953 DNase to DNase
	5059: 1800000 1900000 0 98 1 890585 DNase to DNase
	5060: 1900000 2000000 0 78 1 889103 DNase to DNase
	chromosome
	1: chr1
	2: chr1
	3: chr1
	4: chr1
	5: chr1
	---
	5056: chrX
	5057: chrX
	5058: chrX
	5059: chrX
	5060: chrX

	# I'm trying to run fisher.test on each band defined by lower & upper, e.g., lower = 0 & upper = 100000.
	# For a given value of upper, I can get the per-test_name rows containing the counts I need in a df like so:

	> counts <- all_chrs_dt %>% group_by(test_name) %>% filter(upper == 100000) %>% summarise(all_significant_label = sum(signif_label), all_significant_unlabel = sum(signif_unlabel), all_not_significant_label = sum(not_signif_label), all_not_significant_unlabel = sum(not_signif_unlabel))
	> counts
	Source: local data table [11 x 5]

	test_name all_significant_label all_significant_unlabel all_not_significant_label
	1 unnannotated to unnannotated 15506 10712 15203321
	2 unnannotated to CTCF 2879 23339 1245936
	3 unnannotated to DNase-K27ac 5448 20770 804284
	4 unnannotated to DNase 267 25951 113358
	5 CTCF to DNase-K27ac 711 25507 57430
	6 CTCF to CTCF 359 25859 41148
	7 DNase-K27ac to DNase-K27ac 967 25251 29548
	8 CTCF to DNase 32 26186 7094
	9 DNase to DNase-K27ac 40 26178 4456
	10 DNase-K27ac to DNase 2 26216 811
	11 DNase to DNase 7 26211 618
	Variables not shown: all_not_significant_unlabel (int)

	# I'd like to apply fisher.test to each row of this df (one 2x2 contingency table per row, built from the
	# four summed counts), but I don't seem to be able to form the contingency tables properly:

	> counts <- all_chrs_dt %>% group_by(test_name) %>% filter(upper == 100000) %>% summarise(all_significant_label = sum(signif_label), all_significant_unlabel = sum(signif_unlabel), all_not_significant_label = sum(not_signif_label), all_not_significant_unlabel = sum(not_signif_unlabel)) %>% group_by(test_name) %>% fisher.test(matrix(c(all_significant_label,all_significant_unlabel,all_not_significant_label, all_not_significant_unlabel),nrow=2,ncol=2))[[p.value]]
	Error in `[[.data.frame`(., fisher.test(matrix(c(all_significant_label, :
	object 'p.value' not found

	>counts <- all_chrs_dt %>% group_by(test_name) %>% filter(upper == 100000) %>% summarise(all_significant_label = sum(signif_label), all_significant_unlabel = sum(signif_unlabel), all_not_significant_label = sum(not_signif_label), all_not_significant_unlabel = sum(not_signif_unlabel)) %>% group_by(test_name) %>% fisher.test(matrix(c(all_significant_label,all_significant_unlabel,all_not_significant_label, all_not_significant_unlabel),nrow=2,ncol=2))
	Error in fisher.test(., matrix(c(all_significant_label, all_significant_unlabel, :
	all entries of 'x' must be nonnegative and finite