ctb/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Benchmarking k-mer load times against RAM

See hashsize_bench.ipynb.

  
## hashsize_bench.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              hashsize_bench.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## load-bench.py
#! /usr/bin/env python
from __future__ import print_function, unicode_literals

import json
import os
import sys
import threading
import textwrap
import khmer
from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, report_on_config, info,
                              add_threading_args, calculate_graphsize,
                              sanitize_help)
from khmer.kfile import check_file_writable
from khmer.kfile import check_input_files
from khmer.kfile import check_space_for_graph
from khmer.khmer_logger import (configure_logging, log_info, log_error,
                                log_warn)
import time


def get_parser():
    """Assemble the command-line parser for the load benchmark.

    Starts from the parser returned by ``build_counting_args``, adds the
    threading options via ``add_threading_args``, and then registers the
    positional input files plus the ``--no-bigcount``, ``--summary-info``,
    ``--force`` and ``--quiet`` options.

    Returns the configured parser.
    """
    description = "Build a k-mer countgraph from the given sequences."
    cli = build_counting_args(description, citations=['counting', 'SeqAn'])
    add_threading_args(cli)

    # Positional: one or more sequence files to load.
    cli.add_argument(
        'input_sequence_filename',
        nargs='+',
        help="The names of one or more FAST[AQ] input sequence files."
    )

    # Stored under ``bigcount``; passing the flag flips it to False.
    cli.add_argument(
        '-b', '--no-bigcount',
        dest='bigcount',
        action='store_false',
        default=True,
        help=("The default behaviour is to count past 255 using bigcount. "
              "This flag turns bigcount off, limiting counts to 255.")
    )

    # str(...) keeps the choices as native strings under unicode_literals.
    summary_formats = [str('json'), str('tsv')]
    cli.add_argument(
        '--summary-info', '-s',
        type=str,
        default=None,
        metavar="FORMAT",
        choices=summary_formats,
        help=("What format should the machine readable run summary be in? "
              "(`json` or `tsv`, disabled by default)")
    )

    cli.add_argument(
        '-f', '--force',
        action='store_true',
        default=False,
        help='Overwrite output file if it exists'
    )
    cli.add_argument(
        '-q', '--quiet',
        dest='quiet',
        action='store_true',
        default=False
    )
    return cli


def main():
    """Time how long it takes to build and fill a countgraph.

    Parses the command line, creates a countgraph from the parsed
    arguments, consumes every input sequence file into it, and prints two
    space-separated values: the sum of the countgraph's hash sizes and the
    elapsed time in seconds.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    log_info('making countgraph')
    # The clock starts before the countgraph is created, so the reported
    # time covers creation as well as consuming the input files.
    start = time.time()

    countgraph = khmer_args.create_countgraph(args)
    # NOTE(review): bigcount is switched off unconditionally here, so the
    # -b/--no-bigcount option (args.bigcount) has no effect on this script.
    # Left as-is to keep timings comparable -- confirm this is intended.
    countgraph.set_use_bigcount(False)

    for filename in args.input_sequence_filename:
        log_info('consuming input {input}', input=filename)
        rparser = khmer.ReadParser(filename)
        countgraph.consume_seqfile_with_reads_parser(rparser)

    log_info('DONE.')
    end = time.time()
    print(sum(countgraph.hashsizes()), end - start)

# Run the benchmark only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or timing.
if __name__ == '__main__':
    main()

# vim: set filetype=python tabstop=4 softtabstop=4 shiftwidth=4 expandtab:
# vim: set textwidth=79:

## ram-times.out
4999802 0.6113121509552002
9999936 0.6644558906555176
49999814 0.7513549327850342
99999894 0.9065320491790771
499999864 1.3282811641693115
999999738 1.7348880767822266
1999999730 2.948660135269165
	#! /usr/bin/env python
	from __future__ import print_function, unicode_literals

	import json
	import os
	import sys
	import threading
	import textwrap
	import khmer
	from khmer import khmer_args
	from khmer.khmer_args import (build_counting_args, report_on_config, info,
	add_threading_args, calculate_graphsize,
	sanitize_help)
	from khmer.kfile import check_file_writable
	from khmer.kfile import check_input_files
	from khmer.kfile import check_space_for_graph
	from khmer.khmer_logger import (configure_logging, log_info, log_error,
	log_warn)
	import time


	def get_parser():
	parser = build_counting_args("Build a k-mer countgraph from the given"
	" sequences.",
	citations=['counting', 'SeqAn'])
	add_threading_args(parser)
	parser.add_argument('input_sequence_filename', nargs='+',
	help="The names of one or more FAST[AQ] input "
	"sequence files.")
	parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
	action='store_false', help="The default behaviour is "
	"to count past 255 using bigcount. This flag turns "
	"bigcount off, limiting counts to 255.")
	parser.add_argument('--summary-info', '-s', type=str, default=None,
	metavar="FORMAT", choices=[str('json'), str('tsv')],
	help="What format should the machine readable run "
	"summary be in? (`json` or `tsv`, disabled by"
	" default)")
	parser.add_argument('-f', '--force', default=False, action='store_true',
	help='Overwrite output file if it exists')
	parser.add_argument('-q', '--quiet', dest='quiet', default=False,
	action='store_true')
	return parser


	def main():

	args = sanitize_help(get_parser()).parse_args()

	configure_logging(args.quiet)
	report_on_config(args)

	filenames = args.input_sequence_filename

	log_info('making countgraph')
	start = time.time()

	countgraph = khmer_args.create_countgraph(args)
	countgraph.set_use_bigcount(False)

	filename = None

	total_num_reads = 0

	for index, filename in enumerate(filenames):
	log_info('consuming input {input}', input=filename)
	rparser = khmer.ReadParser(filename)
	countgraph.consume_seqfile_with_reads_parser(rparser)

	total_num_reads += rparser.num_reads

	log_info('DONE.')
	end = time.time()
	print(sum(countgraph.hashsizes()), end - start)

	if __name__ == '__main__':
	main()

	# vim: set filetype=python tabstop=4 softtabstop=4 shiftwidth=4 expandtab:
	# vim: set textwidth=79:
	4999802 0.6113121509552002
	9999936 0.6644558906555176
	49999814 0.7513549327850342
	99999894 0.9065320491790771
	499999864 1.3282811641693115
	999999738 1.7348880767822266
	1999999730 2.948660135269165