Created
December 4, 2013 06:52
-
-
Save dalexander/7783403 to your computer and use it in GitHub Desktop.
Scatter-gather help for GenomicConsensus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pbcore.io import FastaTable | |
from nose.tools import eq_ | |
def chunk(keysAndSizes, numChunks):
    """
    Greedily split (key, size) pairs into consecutive, size-balanced
    sublists.

    The target size of a sublist is the total of all sizes floor-divided
    by `numChunks`.  Pairs are consumed in input order and accumulated
    into the current sublist, which is closed as soon as its running
    total strictly exceeds the target -- so sublists err on the side of
    being too large rather than too small.  Whatever is left over at the
    end becomes a final, possibly undersized, sublist.

    keysAndSizes -- iterable of (key, size) pairs; a one-shot iterator
                    (e.g. a generator) is fine, it is copied up front
    numChunks    -- divisor used to derive the target size; a value of
                    zero raises ZeroDivisionError

    Returns a list of lists of (key, size) tuples; empty input gives [].
    With non-negative integer sizes and numChunks >= 1, at most
    `numChunks` sublists are produced.
    """
    # The pairs are traversed twice (once to total the sizes, once to
    # partition), so materialize them first.  Without this a generator
    # would be exhausted by sum() and the result would silently be [].
    pairs = list(keysAndSizes)
    targetChunkSize = sum(size for (_, size) in pairs) // numChunks

    chunks = []
    thisChunk = []
    thisChunkSize = 0
    for key, size in pairs:
        thisChunk.append((key, size))
        thisChunkSize += size
        # Strictly greater: a sublist that merely reaches the target
        # keeps absorbing pairs ("better to go over than under").
        if thisChunkSize > targetChunkSize:
            chunks.append(thisChunk)
            thisChunk = []
            thisChunkSize = 0
    # Trailing pairs that never pushed the total past the target.
    if thisChunk:
        chunks.append(thisChunk)
    return chunks
def genScatter(numChunks, referenceFastaPath, padWithNone=False):
    """
    Group the contigs of a reference FASTA into lists of
    (contig name, contig length) pairs, for scattering quiver jobs
    across nodes.

    The grouping is delegated to `chunk`, which balances the groups by
    contig length.  When `padWithNone` is true and fewer than
    `numChunks` groups came back, the result is padded with `None`
    entries so that it holds `numChunks` items.
    """
    contigLengths = []
    for record in FastaTable(referenceFastaPath):
        contigLengths.append((record.name, len(record.sequence)))

    groups = chunk(contigLengths, numChunks)

    if padWithNone:
        shortfall = numChunks - len(groups)
        if shortfall > 0:
            # Placeholder entries for nodes that get no work.
            groups.extend([None] * shortfall)
    return groups
def test_chunks1():
    """
    Check `chunk` on a single-contig input and on human chromosome
    lengths.
    """
    # A lone contig always ends up as one chunk, however many chunks
    # were requested.
    eq_([[("foo", 100)]], chunk([("foo", 100)], 1))
    eq_([[("foo", 100)]], chunk([("foo", 100)], 10))
    eq_([[("foo", 100)]], chunk([("foo", 100)], 100))
    eq_([[("foo", 100)]], chunk([("foo", 100)], 200))

    hg = [('chr1', 249250621),
          ('chr2', 243199373),
          ('chr3', 198022430),
          ('chr4', 191154276),
          ('chr5', 180915260),
          ('chr6', 171115067),
          ('chr7', 159138663),
          ('chr8', 146364022),
          ('chr9', 141213431),
          ('chr10', 135534747),
          ('chr11', 135006516),
          ('chr12', 133851895),
          ('chr13', 115169878),
          ('chr14', 107349540),
          ('chr15', 102531392),
          ('chr16', 90354753),
          ('chr17', 81195210),
          ('chr18', 78077248),
          ('chr19', 59128983),
          ('chr20', 63025520),
          ('chr21', 48129895),
          ('chr22', 51304566),
          ('chrM', 16571),
          ('chrX', 155270560),
          ('chrY', 59373566)]

    # chunk() takes exactly (keysAndSizes, numChunks); the padding flag
    # belongs to genScatter, and passing a third argument here raised
    # TypeError.
    chunkedHg_10 = chunk(hg, 10)

    # Ten chunks requested, but the greedy split overshoots the target
    # on every chunk, so only eight come back.
    eq_([[('chr1', 249250621), ('chr2', 243199373)],
         [('chr3', 198022430), ('chr4', 191154276)],
         [('chr5', 180915260), ('chr6', 171115067)],
         [('chr7', 159138663), ('chr8', 146364022), ('chr9', 141213431)],
         [('chr10', 135534747), ('chr11', 135006516), ('chr12', 133851895)],
         [('chr13', 115169878), ('chr14', 107349540), ('chr15', 102531392)],
         [('chr16', 90354753), ('chr17', 81195210), ('chr18', 78077248),
          ('chr19', 59128983), ('chr20', 63025520)],
         [('chr21', 48129895), ('chr22', 51304566), ('chrM', 16571),
          ('chrX', 155270560), ('chrY', 59373566)]],
        chunkedHg_10)
if __name__ == '__main__':
    # Ad-hoc manual run against a local reference file.  The
    # parenthesized single-argument print is valid on Python 3 and
    # prints the same thing on Python 2, unlike the bare print statement.
    print(genScatter(4, "~/Data/fluidigm_amplicons/Old/fluidigm_amplicons.fasta", True))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment