aboSamoor/bench.py

## bench.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""template.py: Description of what the module does."""

from optparse import OptionParser
import logging
import os
from random import randint
from time import time

__author__ = "Rami Al-Rfou"
__email__ = "rmyeid@"

# Layout of each log line: timestamp, level, source file, line number, message.
# The ".19s" precision keeps only the first 19 characters of asctime, which
# with logging's default date format drops the trailing ",mmm" milliseconds.
LOG_FORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"

# Path of this process's status pseudo-file (Linux procfs).
_proc_status = '/proc/%d/status' % os.getpid()

# Multipliers turning the unit suffix found in the status file into bytes.
_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
          'KB': 1024.0, 'MB': 1024.0*1024.0}

def _VmB(VmKey):
  '''Private. Return one field of /proc/<pid>/status converted to bytes.

  VmKey is the field label to look up, e.g. 'VmRSS:'.

  Returns 0.0 when the status file cannot be read (e.g. non-Linux), when the
  field is absent, or when its line is not '<key> <number> <unit>' with a
  unit listed in _scale.
  '''
  # get pseudo file  /proc/<pid>/status
  try:
    with open(_proc_status) as status_file:
      text = status_file.read()
  except (IOError, OSError, UnicodeError):
    return 0.0  # non-Linux?
  # get VmKey line e.g. 'VmRSS:  9999  kB\n ...'
  start = text.find(VmKey)
  if start < 0:
    return 0.0  # field not reported for this process
  # Restrict parsing to the key's own line so that a short line cannot
  # borrow tokens from the field that follows it.
  fields = text[start:].split('\n', 1)[0].split()
  if len(fields) < 3:
    return 0.0  # invalid format?
  # convert Vm value to bytes
  try:
    return float(fields[1]) * _scale[fields[2]]
  except (ValueError, KeyError):
    return 0.0  # non-numeric value or unknown unit


def memory(since=0.0):
  '''Report the VmSize reading of this process in bytes.

  The `since` baseline is subtracted from the current reading, so passing an
  earlier result gives the change since that measurement.
  '''
  current = _VmB('VmSize:')
  return current - since


def resident(since=0.0):
  '''Report the VmRSS (resident set) reading of this process in bytes.

  The `since` baseline is subtracted from the current reading, so passing an
  earlier result gives the change since that measurement.
  '''
  current = _VmB('VmRSS:')
  return current - since


def stacksize(since=0.0):
  '''Report the VmStk (stack) reading of this process in bytes.

  The `since` baseline is subtracted from the current reading, so passing an
  earlier result gives the change since that measurement.
  '''
  current = _VmB('VmStk:')
  return current - since


class token0(object):
  '''One token held as a plain object: a word plus four tag values.'''

  def __init__(self, word, tag1, tag2, tag3, tag4):
    # Attributes are bound left to right, i.e. in argument order.
    (self.word, self.tag1, self.tag2,
     self.tag3, self.tag4) = (word, tag1, tag2, tag3, tag4)

class Document0(object):
  '''Document kept as a list of token0 objects, one per token.'''

  def __init__(self):
    self.tokens = []

  def add(self, tags):
    '''Append a token0 built from `tags`, unpacked as its five arguments.'''
    token = token0(*tags)
    self.tokens.append(token)


class Document1(object):
  '''Document kept column-wise: one list of words and one list per tag.'''

  def __init__(self):
    self.tokens = []
    self.tag1 = []
    self.tag2 = []
    self.tag3 = []
    self.tag4 = []

  def add(self, tags):
    '''Append one (word, tag1, tag2, tag3, tag4) record to the columns.'''
    # Unpack first so a wrong-sized record fails before any list changes.
    word, tag1, tag2, tag3, tag4 = tags
    columns = (self.tokens, self.tag1, self.tag2, self.tag3, self.tag4)
    for column, value in zip(columns, (word, tag1, tag2, tag3, tag4)):
      column.append(value)

class Document2(object):
  '''Document kept as a list of tuples, one tuple per token.'''

  def __init__(self):
    self.tokens = []

  def add(self, tags):
    '''Append `tags` converted to a tuple.'''
    record = tuple(tags)
    self.tokens.append(record)

class Document3(object):
  '''Document kept as a list of dicts, one dict per token.'''

  def __init__(self):
    self.tokens = []

  def add(self, tags):
    '''Append a dict with keys 't1'..'t4' and 'w' built from `tags`.'''
    word, tag1, tag2, tag3, tag4 = tags
    self.tokens.append(dict(t1=tag1, t2=tag2, t3=tag3, t4=tag4, w=word))

class Document4(object):
  '''Document kept as a list of sentences, each stored as the object given.'''

  def __init__(self):
    self.sentences = []

  def add(self, tokens):
    '''Store `tokens` as one sentence (the object itself, not a copy).'''
    self.sentences.append(tokens)

  def traverse(self):
    '''Yield the stored sentences in insertion order.'''
    for stored in self.sentences:
      yield stored

class Document5(object):
  '''Document kept as one flat list of tokens, with -1 ending each sentence.

  Because -1 is the sentence terminator, a token must never equal -1.
  '''

  def __init__(self):
    self.sentences = []

  def add(self, tokens):
    '''Append one sentence: its tokens followed by the -1 terminator.'''
    # The marker goes after the tokens. Writing it first made traverse()
    # yield an empty list up front and never yield the final sentence.
    self.sentences.extend(tokens)
    self.sentences.append(-1)

  def traverse(self):
    '''Yield each sentence, in insertion order, as a new list of tokens.'''
    sent = []
    for token in self.sentences:
      if token == -1:
        yield sent
        sent = []
      else:
        sent.append(token)


class Document6(object):
  '''Document kept as a flat token list plus cumulative sentence offsets.

  `sentences` starts as [0]; every added sentence appends the running token
  count, so sentence k occupies tokens[sentences[k]:sentences[k+1]].
  '''

  def __init__(self):
    self.sentences = [0]
    self.tokens = []

  def add(self, tokens):
    '''Record the end offset of a new sentence and append its tokens.'''
    end_offset = self.sentences[-1] + len(tokens)
    self.sentences.append(end_offset)
    self.tokens.extend(tokens)

  def traverse(self):
    '''Yield each sentence as a slice (new list) of the flat token list.'''
    last = len(self.sentences) - 1
    for idx in range(last):
      begin = self.sentences[idx]
      end = self.sentences[idx + 1]
      yield self.tokens[begin:end]


class Sentence(object):
  '''Thin wrapper that holds a sentence's tokens in the `tokens` attribute.'''

  def __init__(self, tokens):
    # The given object is stored as-is; no copy is made.
    self.tokens = tokens


class Document7(object):
  '''Document kept as a list of Sentence objects.'''

  def __init__(self):
    self.sentences = []

  def add(self, tokens):
    '''Wrap `tokens` in a Sentence and append it.'''
    wrapped = Sentence(tokens)
    self.sentences.append(wrapped)

  def traverse(self):
    '''Yield the stored Sentence objects in insertion order.'''
    for stored in self.sentences:
      yield stored

def bench_representation(documents=None, num_tokens=10000000):
  '''Time and log the cost of filling token-level document representations.

  For every class in `documents`, a fresh instance receives `num_tokens`
  random (word, tag1, tag2, tag3, tag4) tuples through its add() method.
  The elapsed wall-clock time and the process memory readings are then
  logged at INFO level.

  documents: iterable of document classes whose instances expose add(tags);
    defaults to [Document0].
  num_tokens: number of tokens added to each document (default 10000000).

  The measured time includes generating the random values.
  '''
  if documents is None:
    documents = [Document0]
  for document in documents:
    start = time()
    d = document()
    for _ in range(num_tokens):
      word = randint(1, 10000)
      tag1 = randint(1, 10000)
      tag2 = randint(1, 10000)
      tag3 = randint(1, 10000)
      tag4 = randint(1, 10000)
      d.add((word, tag1, tag2, tag3, tag4))
    finish = time()
    logging.info("Finished the task in %f seconds", finish-start)
    logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(), stacksize(),
                 memory())

def bench_representation_2(documents=None, num_sentences=1000000,
                           sentence_length=10):
  '''Time and log filling and traversing sentence-level representations.

  For every class in `documents`, a fresh instance receives `num_sentences`
  lists of `sentence_length` random integers through add(). The document is
  then walked with traverse() and the token counts are summed. Elapsed time
  and process memory readings are logged at INFO level after each phase.

  documents: iterable of document classes whose instances expose
    add(tokens) and traverse(); defaults to [Document7].
  num_sentences: sentences added per document (default 1000000).
  sentence_length: tokens per sentence (default 10).
  '''
  if documents is None:
    documents = [Document7]
  for document in documents:
    start = time()
    d = document()
    for _ in range(num_sentences):
      words = [randint(1, 10000) for _pos in range(sentence_length)]
      d.add(words)
    finish = time()
    logging.info("Finished the task in %f seconds", finish-start)
    logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(), stacksize(),
                 memory())

    start = time()
    lens = 0
    for sent in d.traverse():
      # Document7 yields Sentence objects (tokens live in `.tokens`); the
      # list-based documents yield the token sequence itself.
      lens += len(getattr(sent, 'tokens', sent))
    logging.info("Total length of sentences is %d", lens)
    finish = time()
    logging.info("Finished the traverse in %f seconds", finish-start)
    logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(), stacksize(),
                 memory())


def main(options, args):
  '''Script entry point: run the token-level benchmark.

  `options` and `args` are the parsed command-line values; neither is used.
  '''
  bench_representation()

if __name__ == "__main__":
  parser = OptionParser()
  parser.add_option("-f", "--file", dest="filename", help="Input file")
  parser.add_option("-l", "--log", dest="log", help="log verbosity level",
                    default="INFO")
  (options, args) = parser.parse_args()

  # Map the level name (e.g. "debug") to logging's numeric constant.
  numeric_level = getattr(logging, options.log.upper(), None)
  if not isinstance(numeric_level, int):
    # basicConfig ignores level=None, leaving the root logger at WARNING, so
    # a mistyped level would silently hide every benchmark result.
    parser.error("invalid log level: %s" % options.log)
  logging.basicConfig(level=numeric_level, format=LOG_FORMAT)
  main(options, args)
	#!/usr/bin/env python
	# -- coding: utf-8 --

	"""template.py: Description of what the module does."""

	from optparse import OptionParser
	import logging
	import os
	from random import randint
	from time import time

	__author__ = "Rami Al-Rfou"
	__email__ = "rmyeid@"

	# Layout of each log line: timestamp, level, source file, line number, message.
	# The ".19s" precision keeps only the first 19 characters of asctime, which
	# with logging's default date format drops the trailing ",mmm" milliseconds.
	LOG_FORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"

	# Path of this process's status pseudo-file (Linux procfs).
	_proc_status = '/proc/%d/status' % os.getpid()

	# Multipliers turning the unit suffix found in the status file into bytes.
	_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
	          'KB': 1024.0, 'MB': 1024.0*1024.0}

	def _VmB(VmKey):
	  '''Private. Return one field of /proc/<pid>/status converted to bytes.

	  VmKey is the field label to look up, e.g. 'VmRSS:'.

	  Returns 0.0 when the status file cannot be read (e.g. non-Linux), when
	  the field is absent, or when its line is not '<key> <number> <unit>'
	  with a unit listed in _scale.
	  '''
	  # get pseudo file /proc/<pid>/status
	  try:
	    with open(_proc_status) as status_file:
	      text = status_file.read()
	  except (IOError, OSError, UnicodeError):
	    return 0.0  # non-Linux?
	  # get VmKey line e.g. 'VmRSS: 9999 kB\n ...'
	  start = text.find(VmKey)
	  if start < 0:
	    return 0.0  # field not reported for this process
	  # Restrict parsing to the key's own line so that a short line cannot
	  # borrow tokens from the field that follows it.
	  fields = text[start:].split('\n', 1)[0].split()
	  if len(fields) < 3:
	    return 0.0  # invalid format?
	  # convert Vm value to bytes
	  try:
	    return float(fields[1]) * _scale[fields[2]]
	  except (ValueError, KeyError):
	    return 0.0  # non-numeric value or unknown unit


	def memory(since=0.0):
	  '''Return the VmSize reading of this process in bytes, minus `since`.'''
	  return _VmB('VmSize:') - since


	def resident(since=0.0):
	  '''Return the VmRSS reading of this process in bytes, minus `since`.'''
	  return _VmB('VmRSS:') - since


	def stacksize(since=0.0):
	  '''Return the VmStk reading of this process in bytes, minus `since`.'''
	  return _VmB('VmStk:') - since


	class token0(object):
	  '''One token held as a plain object: a word plus four tag values.'''

	  def __init__(self, word, tag1, tag2, tag3, tag4):
	    self.word = word
	    self.tag1 = tag1
	    self.tag2 = tag2
	    self.tag3 = tag3
	    self.tag4 = tag4

	class Document0(object):
	  '''Document kept as a list of token0 objects, one per token.'''

	  def __init__(self):
	    self.tokens = []

	  def add(self, tags):
	    '''Append a token0 built from `tags`, unpacked as its five arguments.'''
	    self.tokens.append(token0(*tags))


	class Document1(object):
	  '''Document kept column-wise: one list of words and one list per tag.'''

	  def __init__(self):
	    self.tokens = []
	    self.tag1 = []
	    self.tag2 = []
	    self.tag3 = []
	    self.tag4 = []

	  def add(self, tags):
	    '''Append one (word, tag1, tag2, tag3, tag4) record to the columns.'''
	    word, tag1, tag2, tag3, tag4 = tags
	    self.tokens.append(word)
	    self.tag1.append(tag1)
	    self.tag2.append(tag2)
	    self.tag3.append(tag3)
	    self.tag4.append(tag4)

	class Document2(object):
	  '''Document kept as a list of tuples, one tuple per token.'''

	  def __init__(self):
	    self.tokens = []

	  def add(self, tags):
	    '''Append `tags` converted to a tuple.'''
	    self.tokens.append(tuple(tags))

	class Document3(object):
	  '''Document kept as a list of dicts, one dict per token.'''

	  def __init__(self):
	    self.tokens = []

	  def add(self, tags):
	    '''Append a dict with keys 't1'..'t4' and 'w' built from `tags`.'''
	    word, tag1, tag2, tag3, tag4 = tags
	    d = {'t1': tag1, 't2': tag2, 't3': tag3, 't4': tag4, 'w': word}
	    self.tokens.append(d)

	class Document4(object):
	  '''Document kept as a list of sentences, each stored as the object given.'''

	  def __init__(self):
	    self.sentences = []

	  def add(self, tokens):
	    '''Store `tokens` as one sentence (the object itself, not a copy).'''
	    self.sentences.append(tokens)

	  def traverse(self):
	    '''Yield the stored sentences in insertion order.'''
	    for sentence in self.sentences:
	      yield sentence

	class Document5(object):
	  '''Document kept as one flat list of tokens, with -1 ending each sentence.

	  Because -1 is the sentence terminator, a token must never equal -1.
	  '''

	  def __init__(self):
	    self.sentences = []

	  def add(self, tokens):
	    '''Append one sentence: its tokens followed by the -1 terminator.'''
	    # The marker goes after the tokens. Writing it first would make
	    # traverse() yield an empty list up front and lose the final sentence.
	    self.sentences.extend(tokens)
	    self.sentences.append(-1)

	  def traverse(self):
	    '''Yield each sentence, in insertion order, as a new list of tokens.'''
	    sent = []
	    for token in self.sentences:
	      if token == -1:
	        yield sent
	        sent = []
	      else:
	        sent.append(token)


	class Document6(object):
	  '''Document kept as a flat token list plus cumulative sentence offsets.

	  `sentences` starts as [0]; every added sentence appends the running token
	  count, so sentence k occupies tokens[sentences[k]:sentences[k+1]].
	  '''

	  def __init__(self):
	    self.sentences = [0]
	    self.tokens = []

	  def add(self, tokens):
	    '''Record the end offset of a new sentence and append its tokens.'''
	    self.sentences.append(len(tokens)+self.sentences[-1])
	    self.tokens.extend(tokens)

	  def traverse(self):
	    '''Yield each sentence as a slice (new list) of the flat token list.'''
	    num_sents = len(self.sentences) - 1
	    for i in range(num_sents):
	      yield self.tokens[self.sentences[i]:self.sentences[i+1]]


	class Sentence(object):
	  '''Thin wrapper that holds a sentence's tokens in the `tokens` attribute.'''

	  def __init__(self, tokens):
	    # The given object is stored as-is; no copy is made.
	    self.tokens = tokens


	class Document7(object):
	  '''Document kept as a list of Sentence objects.'''

	  def __init__(self):
	    self.sentences = []

	  def add(self, tokens):
	    '''Wrap `tokens` in a Sentence and append it.'''
	    self.sentences.append(Sentence(tokens))

	  def traverse(self):
	    '''Yield the stored Sentence objects in insertion order.'''
	    for s in self.sentences:
	      yield s

	def bench_representation(documents=None, num_tokens=10000000):
	  '''Time and log the cost of filling token-level document representations.

	  For every class in `documents`, a fresh instance receives `num_tokens`
	  random (word, tag1, tag2, tag3, tag4) tuples through its add() method.
	  The elapsed wall-clock time and the process memory readings are then
	  logged at INFO level.

	  documents: iterable of document classes whose instances expose add(tags);
	    defaults to [Document0].
	  num_tokens: number of tokens added to each document (default 10000000).

	  The measured time includes generating the random values.
	  '''
	  if documents is None:
	    documents = [Document0]
	  for document in documents:
	    start = time()
	    d = document()
	    for _ in range(num_tokens):
	      word = randint(1, 10000)
	      tag1 = randint(1, 10000)
	      tag2 = randint(1, 10000)
	      tag3 = randint(1, 10000)
	      tag4 = randint(1, 10000)
	      d.add((word, tag1, tag2, tag3, tag4))
	    finish = time()
	    logging.info("Finished the task in %f seconds", finish-start)
	    logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(), stacksize(),
	                 memory())

	def bench_representation_2(documents=None, num_sentences=1000000,
	                           sentence_length=10):
	  '''Time and log filling and traversing sentence-level representations.

	  For every class in `documents`, a fresh instance receives `num_sentences`
	  lists of `sentence_length` random integers through add(). The document is
	  then walked with traverse() and the token counts are summed. Elapsed time
	  and process memory readings are logged at INFO level after each phase.

	  documents: iterable of document classes whose instances expose
	    add(tokens) and traverse(); defaults to [Document7].
	  num_sentences: sentences added per document (default 1000000).
	  sentence_length: tokens per sentence (default 10).
	  '''
	  if documents is None:
	    documents = [Document7]
	  for document in documents:
	    start = time()
	    d = document()
	    for _ in range(num_sentences):
	      words = [randint(1, 10000) for _pos in range(sentence_length)]
	      d.add(words)
	    finish = time()
	    logging.info("Finished the task in %f seconds", finish-start)
	    logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(), stacksize(),
	                 memory())

	    start = time()
	    lens = 0
	    for sent in d.traverse():
	      # Document7 yields Sentence objects (tokens live in `.tokens`); the
	      # list-based documents yield the token sequence itself.
	      lens += len(getattr(sent, 'tokens', sent))
	    logging.info("Total length of sentences is %d", lens)
	    finish = time()
	    logging.info("Finished the traverse in %f seconds", finish-start)
	    logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(), stacksize(),
	                 memory())


	def main(options, args):
	  '''Script entry point: run the token-level benchmark.

	  `options` and `args` are the parsed command-line values; neither is used.
	  '''
	  bench_representation()

	if __name__ == "__main__":
	  parser = OptionParser()
	  parser.add_option("-f", "--file", dest="filename", help="Input file")
	  parser.add_option("-l", "--log", dest="log", help="log verbosity level",
	                    default="INFO")
	  (options, args) = parser.parse_args()

	  # Map the level name (e.g. "debug") to logging's numeric constant.
	  numeric_level = getattr(logging, options.log.upper(), None)
	  if not isinstance(numeric_level, int):
	    # basicConfig ignores level=None, leaving the root logger at WARNING, so
	    # a mistyped level would silently hide every benchmark result.
	    parser.error("invalid log level: %s" % options.log)
	  logging.basicConfig(level=numeric_level, format=LOG_FORMAT)
	  main(options, args)
No results found