We profile k-mer pairwise comparision for k=[1,12] using the kMer analysis toolkit and programming library for k-mer profiles.
See the IPython notebook for more information.
.ipynb_checkpoints | |
*.fa | |
[abcde].k1 | |
[abcde].k2 | |
[abcde].k3 | |
[abcde].k4 | |
[abcde].k5 | |
[abcde].k6 | |
[abcde].k7 | |
[abcde].k8 | |
[abcde].k9 | |
[abcde].k10 | |
[abcde].k11 | |
[abcde].k12 | |
[abcde].k13 | |
[abcde].k14 | |
[abcde].k15 |
0.542464971542 26120 |
0.663958072662 42492 |
0.972708940506 91608 |
2.24748587608 288264 |
7.51165699959 1074688 |
27.2333760262 4220276 |
108.990545034 16803304 |
0.541092157364 26120 |
0.547954082489 26120 |
0.578961849213 26120 |
0.578741788864 26144 |
0.586236000061 26196 |
0.567816019058 26360 |
0.570757865906 27144 |
0.593155145645 30204 |
5.48644304276 26120 |
5.7480931282 42528 |
6.13077187538 91608 |
7.46066999435 288280 |
13.2450871468 1074684 |
34.4764480591 4220288 |
116.609779835 16803324 |
5.44484806061 26128 |
5.53405499458 26124 |
5.82787203789 26128 |
5.84015107155 26152 |
5.82632112503 26292 |
5.79324197769 26752 |
5.74274420738 27332 |
5.71768307686 30264 |
54.6297440529 26116 |
57.8999540806 43588 |
58.0587861538 91868 |
60.5846529007 288448 |
72.2292790413 1074680 |
93.4170598984 4220460 |
174.824126005 16803412 |
54.5379779339 26120 |
54.4764060974 26120 |
57.6308250427 26124 |
58.1498789787 26144 |
57.4757890701 26264 |
57.5336329937 26756 |
57.6689498425 28424 |
58.0697989464 32832 |
0.00376987457275 27948 |
0.248427867889 121492 |
1.09074687958 397148 |
4.51135492325 1499740 |
17.9062130451 5669460 |
61.405629158 20096264 |
199.224109173 63755916 |
0.00374603271484 27944 |
0.00378704071045 27944 |
0.0042028427124 28036 |
0.00458002090454 28204 |
0.00510907173157 28544 |
0.00836801528931 29532 |
0.0185878276825 34008 |
0.0714981555939 52820 |
0.00386595726013 27948 |
0.253719091415 122176 |
1.1847550869 397036 |
4.87614607811 1499924 |
19.3254239559 5670424 |
68.5020990372 20095424 |
216.661634207 63754052 |
0.00380110740662 27936 |
0.00382900238037 27936 |
0.00426483154297 28044 |
0.00455498695374 28208 |
0.00516986846924 28472 |
0.00827503204346 29584 |
0.0180509090424 33936 |
0.0715341567993 52776 |
0.00382304191589 27940 |
11.709302187 122136 |
46.8763818741 392168 |
191.472046852 1367092 |
694.941875935 4249648 |
1946.50338984 13000560 |
5011.18195319 42178040 |
0.00398802757263 27940 |
0.00459003448486 27940 |
0.00705599784851 27952 |
0.0167629718781 28212 |
0.0539407730103 28568 |
0.206032991409 29584 |
0.806213140488 33940 |
3.21424412727 52784 |
0.0038058757782 27944 |
11.3999941349 121544 |
44.8743479252 392128 |
209.745757103 1367460 |
775.451279879 4249036 |
2038.04450011 13000368 |
4487.34653401 39471116 |
0.00396203994751 27940 |
0.00455594062805 27944 |
0.00734496116638 28048 |
0.0167670249939 28208 |
0.0530319213867 28560 |
0.208234071732 29528 |
0.775527000427 34012 |
3.10026812553 52828 |
#!/usr/bin/env python | |
import resource | |
import sys | |
import timeit | |
from k_mer import klib | |
def index(filename, k): | |
with open(filename) as f: | |
profile = klib.Profile.from_fasta(f, k) | |
def profile_index(filename, k): | |
repeats = timeit.repeat('index({0}, {1})'.format(repr(filename), k), | |
setup='from __main__ import index', | |
repeat=3, number=1) | |
usage_time = min(repeats) | |
usage_memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss | |
return usage_time, usage_memory | |
if __name__ == '__main__': | |
if len(sys.argv) != 3: | |
sys.stdout.write('Usage: %s SAMPLE K\n' % sys.argv[0]) | |
sys.exit(1) | |
usage_time, usage_memory = profile_index(sys.argv[1], int(sys.argv[2])) | |
print usage_time, usage_memory |
#!/bin/bash | |
for fasta in a.fa b.fa c.fa; do | |
# Copy it to local storage | |
local_fasta=$(mktemp) | |
cp $fasta $local_fasta | |
# Read it once, so we have no cache effects | |
./profile_count.py $local_fasta 4 > /dev/null | |
for k in {1..15}; do | |
./profile_count.py $local_fasta $k > ${fasta%.fa}.count.k${k} | |
done | |
rm $local_fasta | |
done |
#!/usr/bin/env python | |
import resource | |
import sys | |
import timeit | |
from k_mer import ProfileFileType, kdistlib, klib | |
def distance(filename_a, filename_b, **diff_args): | |
with ProfileFileType()(filename_a) as f_a, ProfileFileType()(filename_b) as f_b: | |
profile_a = klib.Profile.from_file(f_a) | |
profile_b = klib.Profile.from_file(f_b) | |
if profile_a.length != profile_b.length: | |
raise ValueError('trying to compare profiles for different k') | |
dist = kdistlib.ProfileDistance(**diff_args) | |
dist.distance(profile_a, profile_b) | |
def profile_distance(filename_a, filename_b, **diff_args): | |
repeats = timeit.repeat('distance({0}, {1}, **{2})'.format(repr(filename_a), | |
repr(filename_b), | |
repr(diff_args)), | |
setup='from __main__ import distance', | |
repeat=3, number=1) | |
usage_time = min(repeats) | |
usage_memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss | |
return usage_time, usage_memory | |
if __name__ == '__main__': | |
if len(sys.argv) != 4: | |
sys.stdout.write('Usage: %s PROFILE_LEFT PROFILE_RIGHT MODE\n' % sys.argv[0]) | |
sys.exit(1) | |
if sys.argv[3] not in ('default', 'scale', 'smooth', 'scale.smooth'): | |
sys.stdout.write('Not a valid mode: %s\n' % sys.argv[3]) | |
sys.exit(1) | |
if sys.argv[3] == 'default': | |
usage_time, usage_memory = profile_distance(sys.argv[1], sys.argv[2]) | |
elif sys.argv[3] == 'scale': | |
usage_time, usage_memory = profile_distance(sys.argv[1], sys.argv[2], do_scale=True) | |
elif sys.argv[3] == 'smooth': | |
usage_time, usage_memory = profile_distance(sys.argv[1], sys.argv[2], do_smooth=True, threshold=10) | |
elif sys.argv[3] == 'scale.smooth': | |
usage_time, usage_memory = profile_distance(sys.argv[1], sys.argv[2], do_scale=True, do_smooth=True, threshold=10) | |
print usage_time, usage_memory |
#!/bin/bash | |
# Copy fasta files to local storage to speed things up | |
local_fasta_d=$(mktemp) | |
local_fasta_e=$(mktemp) | |
cp d.fa $local_fasta_d | |
cp e.fa $local_fasta_e | |
for k in {1..15}; do | |
# Create profiles | |
kMer count -k $k d.k${k} $local_fasta_d | |
kMer count -k $k e.k${k} $local_fasta_e | |
# Copy profiles to local storage | |
local_d=$(mktemp) | |
local_e=$(mktemp) | |
cp d.k${k} $local_d | |
cp e.k${k} $local_e | |
# Read them once, so we have no cache effects | |
kMer info $local_d > /dev/null | |
kMer info $local_e > /dev/null | |
./profile_distance.py $local_d $local_e default > distance.default.k${k} | |
./profile_distance.py $local_d $local_e scale > distance.scale.k${k} | |
./profile_distance.py $local_d $local_e smooth > distance.smooth.k${k} | |
./profile_distance.py $local_d $local_e scale.smooth > distance.scale.smooth.k${k} | |
rm $local_d $local_e | |
done | |
rm $local_fasta_d $local_fasta_e |