Skip to content

Instantly share code, notes, and snippets.

@martijnvermaat
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save martijnvermaat/3b6658722de5bcf0610c to your computer and use it in GitHub Desktop.
Save martijnvermaat/3b6658722de5bcf0610c to your computer and use it in GitHub Desktop.
Profiling k-mer pairwise comparison
.ipynb_checkpoints
*.pdf
*.fa
[abcde].k1
[abcde].k2
[abcde].k3
[abcde].k4
[abcde].k5
[abcde].k6
[abcde].k7
[abcde].k8
[abcde].k9
[abcde].k10
[abcde].k11
[abcde].k12
[abcde].k13
[abcde].k14
[abcde].k15

Profiling k-mer pairwise comparison

We profile k-mer pairwise comparision for k=[1,12] using the kMer analysis toolkit and programming library for k-mer profiles.

See the IPython notebook for more information.

0.542464971542 26120
0.663958072662 42492
0.972708940506 91608
2.24748587608 288264
7.51165699959 1074688
27.2333760262 4220276
108.990545034 16803304
0.541092157364 26120
0.547954082489 26120
0.578961849213 26120
0.578741788864 26144
0.586236000061 26196
0.567816019058 26360
0.570757865906 27144
0.593155145645 30204
5.48644304276 26120
5.7480931282 42528
6.13077187538 91608
7.46066999435 288280
13.2450871468 1074684
34.4764480591 4220288
116.609779835 16803324
5.44484806061 26128
5.53405499458 26124
5.82787203789 26128
5.84015107155 26152
5.82632112503 26292
5.79324197769 26752
5.74274420738 27332
5.71768307686 30264
54.6297440529 26116
57.8999540806 43588
58.0587861538 91868
60.5846529007 288448
72.2292790413 1074680
93.4170598984 4220460
174.824126005 16803412
54.5379779339 26120
54.4764060974 26120
57.6308250427 26124
58.1498789787 26144
57.4757890701 26264
57.5336329937 26756
57.6689498425 28424
58.0697989464 32832
0.00376987457275 27948
0.248427867889 121492
1.09074687958 397148
4.51135492325 1499740
17.9062130451 5669460
61.405629158 20096264
199.224109173 63755916
0.00374603271484 27944
0.00378704071045 27944
0.0042028427124 28036
0.00458002090454 28204
0.00510907173157 28544
0.00836801528931 29532
0.0185878276825 34008
0.0714981555939 52820
0.00386595726013 27948
0.253719091415 122176
1.1847550869 397036
4.87614607811 1499924
19.3254239559 5670424
68.5020990372 20095424
216.661634207 63754052
0.00380110740662 27936
0.00382900238037 27936
0.00426483154297 28044
0.00455498695374 28208
0.00516986846924 28472
0.00827503204346 29584
0.0180509090424 33936
0.0715341567993 52776
0.00382304191589 27940
1946.50338984 13000560
5011.18195319 42178040
0.00398802757263 27940
0.00459003448486 27940
0.00705599784851 27952
0.0167629718781 28212
0.0539407730103 28568
0.0038058757782 27944
11.3999941349 121544
44.8743479252 392128
209.745757103 1367460
775.451279879 4249036
2038.04450011 13000368
4487.34653401 39471116
0.00396203994751 27940
0.00455594062805 27944
0.00734496116638 28048
0.0167670249939 28208
0.0530319213867 28560
0.208234071732 29528
0.775527000427 34012
3.10026812553 52828
#!/usr/bin/env python
import resource
import sys
import timeit
from k_mer import klib
def index(filename, k):
with open(filename) as f:
profile = klib.Profile.from_fasta(f, k)
def profile_index(filename, k):
repeats = timeit.repeat('index({0}, {1})'.format(repr(filename), k),
setup='from __main__ import index',
repeat=3, number=1)
usage_time = min(repeats)
usage_memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
return usage_time, usage_memory
if __name__ == '__main__':
if len(sys.argv) != 3:
sys.stdout.write('Usage: %s SAMPLE K\n' % sys.argv[0])
sys.exit(1)
usage_time, usage_memory = profile_index(sys.argv[1], int(sys.argv[2]))
print usage_time, usage_memory
#!/bin/bash
for fasta in a.fa b.fa c.fa; do
# Copy it to local storage
local_fasta=$(mktemp)
cp $fasta $local_fasta
# Read it once, so we have no cache effects
./profile_count.py $local_fasta 4 > /dev/null
for k in {1..15}; do
./profile_count.py $local_fasta $k > ${fasta%.fa}.count.k${k}
done
rm $local_fasta
done
#!/usr/bin/env python
import resource
import sys
import timeit
from k_mer import ProfileFileType, kdistlib, klib
def distance(filename_a, filename_b, **diff_args):
with ProfileFileType()(filename_a) as f_a, ProfileFileType()(filename_b) as f_b:
profile_a = klib.Profile.from_file(f_a)
profile_b = klib.Profile.from_file(f_b)
if profile_a.length != profile_b.length:
raise ValueError('trying to compare profiles for different k')
dist = kdistlib.ProfileDistance(**diff_args)
dist.distance(profile_a, profile_b)
def profile_distance(filename_a, filename_b, **diff_args):
repeats = timeit.repeat('distance({0}, {1}, **{2})'.format(repr(filename_a),
repr(filename_b),
repr(diff_args)),
setup='from __main__ import distance',
repeat=3, number=1)
usage_time = min(repeats)
usage_memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
return usage_time, usage_memory
if __name__ == '__main__':
if len(sys.argv) != 4:
sys.stdout.write('Usage: %s PROFILE_LEFT PROFILE_RIGHT MODE\n' % sys.argv[0])
sys.exit(1)
if sys.argv[3] not in ('default', 'scale', 'smooth', 'scale.smooth'):
sys.stdout.write('Not a valid mode: %s\n' % sys.argv[3])
sys.exit(1)
if sys.argv[3] == 'default':
usage_time, usage_memory = profile_distance(sys.argv[1], sys.argv[2])
elif sys.argv[3] == 'scale':
usage_time, usage_memory = profile_distance(sys.argv[1], sys.argv[2], do_scale=True)
elif sys.argv[3] == 'smooth':
usage_time, usage_memory = profile_distance(sys.argv[1], sys.argv[2], do_smooth=True, threshold=10)
elif sys.argv[3] == 'scale.smooth':
usage_time, usage_memory = profile_distance(sys.argv[1], sys.argv[2], do_scale=True, do_smooth=True, threshold=10)
print usage_time, usage_memory
#!/bin/bash
# Copy fasta files to local storage to speed things up
local_fasta_d=$(mktemp)
local_fasta_e=$(mktemp)
cp d.fa $local_fasta_d
cp e.fa $local_fasta_e
for k in {1..15}; do
# Create profiles
kMer count -k $k d.k${k} $local_fasta_d
kMer count -k $k e.k${k} $local_fasta_e
# Copy profiles to local storage
local_d=$(mktemp)
local_e=$(mktemp)
cp d.k${k} $local_d
cp e.k${k} $local_e
# Read them once, so we have no cache effects
kMer info $local_d > /dev/null
kMer info $local_e > /dev/null
./profile_distance.py $local_d $local_e default > distance.default.k${k}
./profile_distance.py $local_d $local_e scale > distance.scale.k${k}
./profile_distance.py $local_d $local_e smooth > distance.smooth.k${k}
./profile_distance.py $local_d $local_e scale.smooth > distance.scale.smooth.k${k}
rm $local_d $local_e
done
rm $local_fasta_d $local_fasta_e
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment