Skip to content

Instantly share code, notes, and snippets.

@dalexander
Created December 4, 2013 06:52
Show Gist options
  • Save dalexander/7783403 to your computer and use it in GitHub Desktop.
Save dalexander/7783403 to your computer and use it in GitHub Desktop.
Scatter-gather help for GenomicConsensus
from pbcore.io import FastaTable
from nose.tools import eq_
def chunk(keysAndSizes, numChunks):
    """
    Split (key, size) pairs into consecutive, roughly size-balanced
    sublists.

    The pairs are walked in the order given and accumulated into the
    current chunk; a chunk is closed as soon as its running total size
    exceeds the target size, `sum(sizes) // numChunks`.  Chunks thus
    tend to overshoot the target rather than fall short of it (better
    to go over than under).  For non-negative integer sizes at most
    `numChunks` chunks are produced, and often fewer.

    keysAndSizes -- iterable of (key, size) pairs; one-shot iterators
                    and generators are accepted
    numChunks    -- desired number of chunks, must be >= 1

    Returns a list of non-empty lists of (key, size) tuples, in input
    order.  Raises ValueError if `numChunks` is less than 1.
    """
    if numChunks < 1:
        # Zero would divide by zero below, and a negative count would
        # yield a negative target that turns every pair into a chunk.
        raise ValueError("numChunks must be at least 1, got %r" % (numChunks,))

    # The pairs are traversed twice (once to total the sizes, once to
    # distribute them), so materialize them first; otherwise a generator
    # is exhausted by the sum and the result is silently empty.
    pairs = list(keysAndSizes)
    targetChunkSize = sum(size for (_, size) in pairs) // numChunks

    chunks = []
    thisChunk = []
    thisChunkSize = 0
    for key, size in pairs:
        thisChunk.append((key, size))
        thisChunkSize += size
        # Strictly greater: a chunk sitting exactly on the target keeps
        # growing, which is what biases chunks towards overshooting.
        if thisChunkSize > targetChunkSize:
            chunks.append(thisChunk)
            thisChunk = []
            thisChunkSize = 0
    # Whatever is left over never crossed the target; keep it as a
    # final, smaller chunk instead of dropping it.
    if thisChunk:
        chunks.append(thisChunk)
    return chunks
def genScatter(numChunks, referenceFastaPath, padWithNone=False):
    """
    Partition the contigs of a reference FASTA into groups for
    scattering jobs across nodes, load-balancing by contig length.

    Every record of the FastaTable opened on `referenceFastaPath` is
    reduced to a (name, sequence length) pair and the pairs are handed
    to `chunk`.  The result is therefore a list of lists of
    (name, length) tuples, not of bare contig names.

    When `padWithNone` is true and fewer than `numChunks` groups were
    produced, the list is padded with `None` entries up to a length of
    `numChunks`.
    """
    contigLengths = []
    for record in FastaTable(referenceFastaPath):
        contigLengths.append((record.name, len(record.sequence)))

    scattered = chunk(contigLengths, numChunks)

    if padWithNone:
        shortfall = numChunks - len(scattered)
        if shortfall > 0:
            # Placeholder slots so callers get one entry per requested chunk.
            scattered.extend([None] * shortfall)
    return scattered
def test_chunks1():
    """
    Check `chunk` on a single-contig input and on a human-genome-sized
    list of (chromosome, length) pairs.
    """
    # A lone contig always ends up as one chunk, whatever count is asked for.
    eq_([[("foo", 100)]], chunk([("foo", 100)], 1))
    eq_([[("foo", 100)]], chunk([("foo", 100)], 10))
    eq_([[("foo", 100)]], chunk([("foo", 100)], 100))
    eq_([[("foo", 100)]], chunk([("foo", 100)], 200))

    hg = [('chr1', 249250621),
          ('chr2', 243199373),
          ('chr3', 198022430),
          ('chr4', 191154276),
          ('chr5', 180915260),
          ('chr6', 171115067),
          ('chr7', 159138663),
          ('chr8', 146364022),
          ('chr9', 141213431),
          ('chr10', 135534747),
          ('chr11', 135006516),
          ('chr12', 133851895),
          ('chr13', 115169878),
          ('chr14', 107349540),
          ('chr15', 102531392),
          ('chr16', 90354753),
          ('chr17', 81195210),
          ('chr18', 78077248),
          ('chr19', 59128983),
          ('chr20', 63025520),
          ('chr21', 48129895),
          ('chr22', 51304566),
          ('chrM', 16571),
          ('chrX', 155270560),
          ('chrY', 59373566)]

    # `chunk` takes exactly (keysAndSizes, numChunks); the extra `True`
    # that used to be passed here made the call raise TypeError.
    # Asking for 10 chunks yields 8, because chunks overshoot the target.
    chunkedHg_10 = chunk(hg, 10)
    eq_([[('chr1', 249250621), ('chr2', 243199373)],
         [('chr3', 198022430), ('chr4', 191154276)],
         [('chr5', 180915260), ('chr6', 171115067)],
         [('chr7', 159138663), ('chr8', 146364022), ('chr9', 141213431)],
         [('chr10', 135534747), ('chr11', 135006516), ('chr12', 133851895)],
         [('chr13', 115169878), ('chr14', 107349540), ('chr15', 102531392)],
         [('chr16', 90354753), ('chr17', 81195210), ('chr18', 78077248),
          ('chr19', 59128983), ('chr20', 63025520)],
         [('chr21', 48129895), ('chr22', 51304566), ('chrM', 16571),
          ('chrX', 155270560), ('chrY', 59373566)]],
        chunkedHg_10)
if __name__ == '__main__':
    # Imported here because only this demo entry point needs it.
    import os.path

    # Manual smoke run: scatter a local amplicon reference four ways,
    # padding the result with None up to four entries.
    # "~" is a shell convention that ordinary file-opening calls do not
    # expand, so resolve it explicitly before passing the path on.
    # NOTE(review): assumes FastaTable does not expand "~" itself.
    referencePath = os.path.expanduser(
        "~/Data/fluidigm_amplicons/Old/fluidigm_amplicons.fasta")
    # The parenthesized single-argument form prints identically under
    # Python 2 and is valid syntax under Python 3.
    print(genScatter(4, referencePath, True))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment