Skip to content

Instantly share code, notes, and snippets.

@aboSamoor
Created February 5, 2012 23:28
Show Gist options
  • Select an option

  • Save aboSamoor/1748395 to your computer and use it in GitHub Desktop.

Select an option

Save aboSamoor/1748395 to your computer and use it in GitHub Desktop.
Experiments to model text processing using Python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""template.py: Description of what the module does."""
from optparse import OptionParser
import logging
import os
from random import randint
from time import time
# Module metadata.
__author__ = "Rami Al-Rfou"
# NOTE(review): the address below looks truncated -- confirm with the author.
__email__ = "rmyeid@"
# Layout of every log line; ``%(asctime).19s`` keeps only the first 19
# characters of the formatted timestamp.
LOG_FORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
# Path of the status pseudo-file of the current process.
_proc_status = '/proc/%d/status' % os.getpid()
# Multipliers that convert the unit suffix of a status field into bytes.
_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
          'KB': 1024.0, 'MB': 1024.0*1024.0}


def _VmB(VmKey):
    '''Private. Return one field of the process status file, in bytes.

    VmKey is the field label including its trailing colon, e.g. 'VmRSS:'.

    Returns 0.0 when the status file cannot be read (non-Linux?), when the
    field is not present, or when its line does not look like
    '<key> <number> <unit>' with a unit listed in _scale.
    '''
    # get pseudo file /proc/<pid>/status; `with` closes it even if read fails
    try:
        with open(_proc_status) as status_file:
            status = status_file.read()
    except (IOError, OSError):
        return 0.0  # non-Linux?
    # get VmKey line e.g. 'VmRSS: 9999 kB\n ...'
    start = status.find(VmKey)
    if start < 0:
        return 0.0  # field not reported for this process
    fields = status[start:].split(None, 3)  # whitespace
    if len(fields) < 3:
        return 0.0  # invalid format?
    # convert Vm value to bytes
    try:
        return float(fields[1]) * _scale[fields[2]]
    except (KeyError, ValueError):
        return 0.0  # unknown unit or non-numeric value
def memory(since=0.0):
    '''Return memory usage in bytes (the 'VmSize:' field), less `since`.

    Pass an earlier reading as `since` to get the growth between two calls.
    '''
    current = _VmB('VmSize:')
    return current - since
def resident(since=0.0):
    '''Return resident memory usage in bytes (the 'VmRSS:' field), less `since`.

    Pass an earlier reading as `since` to get the growth between two calls.
    '''
    current = _VmB('VmRSS:')
    return current - since
def stacksize(since=0.0):
    '''Return stack size in bytes (the 'VmStk:' field), less `since`.

    Pass an earlier reading as `since` to get the growth between two calls.
    '''
    current = _VmB('VmStk:')
    return current - since
class token0(object):
    '''One token: a word together with its four tags, held as attributes.'''

    def __init__(self, word, tag1, tag2, tag3, tag4):
        # Attributes are bound in the order word, tag1 .. tag4.
        self.word, self.tag1, self.tag2 = word, tag1, tag2
        self.tag3, self.tag4 = tag3, tag4
class Document0(object):
    '''Token container that wraps every token in its own token0 object.'''

    def __init__(self):
        # token0 instances, in insertion order.
        self.tokens = []

    def add(self, tags):
        '''Append a token0 built by unpacking `tags` into its constructor.'''
        entry = token0(*tags)
        self.tokens.append(entry)
class Document1(object):
    '''Token container laid out as five parallel lists, one per field.'''

    def __init__(self):
        self.tokens = []
        self.tag1 = []
        self.tag2 = []
        self.tag3 = []
        self.tag4 = []

    def add(self, tags):
        '''Append the five fields of `tags` to their respective lists.

        `tags` must unpack into exactly (word, tag1, tag2, tag3, tag4).
        '''
        # Unpack first so a wrong-sized input fails before anything is stored.
        word, tag1, tag2, tag3, tag4 = tags
        columns = (self.tokens, self.tag1, self.tag2, self.tag3, self.tag4)
        values = (word, tag1, tag2, tag3, tag4)
        for column, value in zip(columns, values):
            column.append(value)
class Document2(object):
    '''Token container that stores every token as a plain tuple.'''

    def __init__(self):
        # Tuples, in insertion order.
        self.tokens = []

    def add(self, tags):
        '''Append `tags`, converted to a tuple, to the token list.'''
        record = tuple(tags)
        self.tokens.append(record)
class Document3(object):
    '''Token container that stores every token as a small dict.'''

    def __init__(self):
        # Dicts with keys 't1', 't2', 't3', 't4' and 'w', in insertion order.
        self.tokens = []

    def add(self, tags):
        '''Append a dict built from `tags` = (word, tag1, tag2, tag3, tag4).'''
        word, tag1, tag2, tag3, tag4 = tags
        entry = {}
        entry['t1'] = tag1
        entry['t2'] = tag2
        entry['t3'] = tag3
        entry['t4'] = tag4
        entry['w'] = word
        self.tokens.append(entry)
class Document4(object):
    '''Sentence container holding each sentence as its own list object.'''

    def __init__(self):
        # One entry per sentence, in insertion order.
        self.sentences = []

    def add(self, tokens):
        '''Store `tokens` as one sentence; the object is kept, not copied.'''
        self.sentences.append(tokens)

    def traverse(self):
        '''Yield the stored sentences one by one, in insertion order.'''
        stored = self.sentences
        for item in stored:
            yield item
class Document5(object):
    '''Sentences kept in one flat list, each one preceded by a -1 marker.

    Because -1 is the separator, a token equal to -1 cannot be stored.
    '''

    def __init__(self):
        # Flat stream of the form: -1, tok, tok, ..., -1, tok, ...
        self.sentences = []

    def add(self, tokens):
        '''Append one sentence: a -1 marker followed by its tokens.'''
        self.sentences.append(-1)
        self.sentences.extend(tokens)

    def traverse(self):
        '''Yield every stored sentence, in insertion order, as a new list.'''
        sent = None  # None until the first marker: no sentence is open yet
        for token in self.sentences:
            if token == -1:
                # A marker opens a sentence, so it closes the previous one
                # only if there is one; the first marker closes nothing.
                if sent is not None:
                    yield sent
                sent = []
            else:
                if sent is None:
                    # Tokens found before any marker still form a sentence.
                    sent = []
                sent.append(token)
        # No marker follows the last sentence, so emit it explicitly.
        if sent is not None:
            yield sent
class Document6(object):
    '''Sentences kept as one flat token list plus cumulative end offsets.'''

    def __init__(self):
        # sentences[k] is the index in `tokens` where sentence k starts;
        # the leading 0 is the start of the first sentence.
        self.sentences = [0]
        self.tokens = []

    def add(self, tokens):
        '''Append one sentence and record the offset at which it ends.'''
        count = len(tokens)
        end = self.sentences[-1] + count
        self.sentences.append(end)
        self.tokens.extend(tokens)

    def traverse(self):
        '''Yield each sentence as a slice (a new list) of the token list.'''
        bounds = self.sentences
        for upper in range(1, len(bounds)):
            yield self.tokens[bounds[upper - 1]:bounds[upper]]
class Sentence(object):
    '''Thin wrapper that holds the tokens of one sentence.'''

    def __init__(self, tokens):
        # The given object is stored as is; no copy is made.
        self.tokens = tokens
class Document7(object):
    '''Sentence container that wraps every sentence in a Sentence object.'''

    def __init__(self):
        # Sentence instances, in insertion order.
        self.sentences = []

    def add(self, tokens):
        '''Wrap `tokens` in a Sentence and append it.'''
        wrapped = Sentence(tokens)
        self.sentences.append(wrapped)

    def traverse(self):
        '''Yield the stored Sentence objects one by one, in insertion order.'''
        stored = self.sentences
        for sentence in stored:
            yield sentence
def bench_representation(documents=None, num_tokens=10000000):
    '''Time and measure filling token containers with random tokens.

    documents: iterable of zero-argument callables, each returning an object
        with an add(tags) method that accepts a 5-tuple. Defaults to
        [Document0].
    num_tokens: how many (word, tag1, tag2, tag3, tag4) tuples are added to
        each container. Defaults to 10000000.

    For each container the elapsed wall-clock time and the values of
    resident(), stacksize() and memory() are logged at INFO level. Those
    memory figures describe the whole process, not a single container.
    '''
    if documents is None:
        documents = [Document0]
    for document in documents:
        start = time()
        d = document()
        for _ in range(num_tokens):
            word = randint(1, 10000)
            tag1 = randint(1, 10000)
            tag2 = randint(1, 10000)
            tag3 = randint(1, 10000)
            tag4 = randint(1, 10000)
            d.add((word, tag1, tag2, tag3, tag4))
        finish = time()
        logging.info("Finished the task in %f seconds", finish-start)
        logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(),
                     stacksize(), memory())
def bench_representation_2(documents=None, num_sentences=1000000,
                           sentence_length=10):
    '''Time and measure filling, then traversing, sentence containers.

    documents: iterable of zero-argument callables, each returning an object
        with add(tokens) and traverse() methods. Defaults to [Document7].
    num_sentences: number of random sentences added to each container.
        Defaults to 1000000.
    sentence_length: number of random tokens per sentence. Defaults to 10.

    For each container two phases are timed and logged at INFO level,
    together with the values of resident(), stacksize() and memory():
    the fill, and a full traversal that sums the sentence lengths.
    '''
    if documents is None:
        documents = [Document7]
    for document in documents:
        start = time()
        d = document()
        for _ in range(num_sentences):
            words = [randint(1, 10000) for _ in range(sentence_length)]
            d.add(words)
        finish = time()
        logging.info("Finished the task in %f seconds", finish-start)
        logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(),
                     stacksize(), memory())
        start = time()
        lens = 0
        for sent in d.traverse():
            # Document7 yields Sentence objects, the other containers yield
            # plain lists; accept both so any of them can be benchmarked.
            lens += len(getattr(sent, 'tokens', sent))
        # Stop the clock before logging so only the traversal is timed.
        finish = time()
        logging.info("Total length of sentences is %d", lens)
        logging.info("Finished the traverse in %f seconds", finish-start)
        logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(),
                     stacksize(), memory())
def main(options, args):
    '''Script entry point: run the token-representation benchmark.

    `options` and `args` are accepted but not used by this function.
    '''
    bench_representation()
if __name__ == "__main__":
    # Command line: -f/--file is parsed but only handed on to main();
    # -l/--log selects the logging level by name (default INFO).
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filename", help="Input file")
    parser.add_option("-l", "--log", dest="log", help="log verbosity level",
                      default="INFO")
    (options, args) = parser.parse_args()
    numeric_level = getattr(logging, options.log.upper(), None)
    # An unknown name gives None, which basicConfig would silently ignore
    # and so hide every INFO message; reject it with a usage error instead.
    if not isinstance(numeric_level, int):
        parser.error("invalid log level: %s" % options.log)
    logging.basicConfig(level=numeric_level, format=LOG_FORMAT)
    main(options, args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment