Created
February 5, 2012 23:28
-
-
Save aboSamoor/1748395 to your computer and use it in GitHub Desktop.
Experiments to model text processing using Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """template.py: Description of what the module does.""" | |
| from optparse import OptionParser | |
| import logging | |
| import os | |
| from random import randint | |
| from time import time | |
__author__ = "Rami Al-Rfou"
__email__ = "rmyeid@"

# Log line layout: timestamp (truncated to 19 chars, i.e. seconds
# precision), level name, source file and line number, then the message.
LOG_FORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
| _proc_status = '/proc/%d/status' % os.getpid() | |
| _scale = {'kB': 1024.0, 'mB': 1024.0*1024.0, | |
| 'KB': 1024.0, 'MB': 1024.0*1024.0} | |
| def _VmB(VmKey): | |
| '''Private. | |
| ''' | |
| global _proc_status, _scale | |
| # get pseudo file /proc/<pid>/status | |
| try: | |
| t = open(_proc_status) | |
| v = t.read() | |
| t.close() | |
| except: | |
| return 0.0 # non-Linux? | |
| # get VmKey line e.g. 'VmRSS: 9999 kB\n ...' | |
| i = v.index(VmKey) | |
| v = v[i:].split(None, 3) # whitespace | |
| if len(v) < 3: | |
| return 0.0 # invalid format? | |
| # convert Vm value to bytes | |
| return float(v[1]) * _scale[v[2]] | |
def memory(since=0.0):
    """Total virtual memory (VmSize) in bytes, relative to *since*."""
    current = _VmB('VmSize:')
    return current - since
def resident(since=0.0):
    """Resident set size (VmRSS) in bytes, relative to *since*."""
    current = _VmB('VmRSS:')
    return current - since
def stacksize(since=0.0):
    """Stack size (VmStk) in bytes, relative to *since*."""
    current = _VmB('VmStk:')
    return current - since
class token0(object):
    """A single token stored as one object: the word plus four tags."""

    def __init__(self, word, tag1, tag2, tag3, tag4):
        # One plain attribute per field, assigned in a single unpack.
        self.word, self.tag1, self.tag2, self.tag3, self.tag4 = (
            word, tag1, tag2, tag3, tag4)
class Document0(object):
    """Document layout: one token0 object per token, in a flat list."""

    def __init__(self):
        # Flat list of token0 instances.
        self.tokens = []

    def add(self, tags):
        """Wrap the (word, tag1..tag4) tuple in a token0 and store it."""
        token = token0(*tags)
        self.tokens.append(token)
class Document1(object):
    """Document layout: five parallel lists (words plus four tag streams)."""

    def __init__(self):
        self.tokens = []
        self.tag1 = []
        self.tag2 = []
        self.tag3 = []
        self.tag4 = []

    def add(self, tags):
        """Distribute one (word, tag1..tag4) tuple across the parallel lists."""
        word, t1, t2, t3, t4 = tags
        for column, value in ((self.tokens, word), (self.tag1, t1),
                              (self.tag2, t2), (self.tag3, t3),
                              (self.tag4, t4)):
            column.append(value)
class Document2(object):
    """Document layout: each token is an immutable 5-tuple."""

    def __init__(self):
        # Flat list of (word, tag1..tag4) tuples.
        self.tokens = []

    def add(self, tags):
        """Freeze the tags into a tuple and store it."""
        record = tuple(tags)
        self.tokens.append(record)
class Document3(object):
    """Document layout: each token is a dict keyed 'w', 't1'..'t4'."""

    def __init__(self):
        self.tokens = []

    def add(self, tags):
        """Unpack (word, tag1..tag4) into a per-token dict and store it."""
        word, tag1, tag2, tag3, tag4 = tags
        self.tokens.append({
            'w': word, 't1': tag1, 't2': tag2, 't3': tag3, 't4': tag4,
        })
class Document4(object):
    """Document layout: a list of sentences, each a token list."""

    def __init__(self):
        self.sentences = []

    def add(self, tokens):
        """Store one sentence (its token list) as-is."""
        self.sentences.append(tokens)

    def traverse(self):
        """Yield every stored sentence in insertion order."""
        for sentence in self.sentences:
            yield sentence
class Document5(object):
    """Document layout: all tokens in one flat list, sentences separated
    by a -1 sentinel.

    NOTE: token values must never equal -1, since -1 marks sentence ends
    (the benchmark's tokens are randint(1, 10000), so this holds).
    """

    def __init__(self):
        self.sentences = []

    def add(self, tokens):
        """Append the sentence's tokens followed by a -1 end marker.

        Bug fix: the original appended the sentinel *before* the tokens,
        which made traverse() yield a spurious empty first sentence and
        silently drop the last sentence.
        """
        self.sentences.extend(tokens)
        self.sentences.append(-1)

    def traverse(self):
        """Yield each sentence as a list of its tokens."""
        sent = []
        for token in self.sentences:
            if token == -1:
                yield sent
                sent = []
            else:
                sent.append(token)
class Document6(object):
    """Document layout: one flat token list plus cumulative sentence offsets.

    sentences[i] is the index in `tokens` where sentence i begins; the
    final entry is the total token count.
    """

    def __init__(self):
        self.sentences = [0]
        self.tokens = []

    def add(self, tokens):
        """Append the tokens and record the new sentence-end offset."""
        self.tokens.extend(tokens)
        self.sentences.append(len(self.tokens))

    def traverse(self):
        """Yield each sentence as a slice of the flat token list."""
        offsets = self.sentences
        for begin, end in zip(offsets, offsets[1:]):
            yield self.tokens[begin:end]
class Sentence(object):
    """Thin wrapper object holding one sentence's token list."""

    def __init__(self, tokens):
        # Expose the raw token list directly as an attribute.
        self.tokens = tokens
class Document7(object):
    """Document layout: a list of Sentence wrapper objects."""

    def __init__(self):
        self.sentences = []

    def add(self, tokens):
        """Wrap the token list in a Sentence and store it."""
        self.sentences.append(Sentence(tokens))

    def traverse(self):
        """Yield each Sentence wrapper in insertion order."""
        for sentence in self.sentences:
            yield sentence
def bench_representation():
    """Time and measure memory for building a 10M-token document.

    Only Document0 (one object per token) is in the list here; swap in
    other Document* classes to compare layouts.
    """
    documents = [Document0]
    for document in documents:
        start = time()
        d = document()
        # Ten million synthetic tokens, each a (word, tag1..tag4)
        # tuple of random integer ids in [1, 10000].
        for _ in range(10000000):
            fields = tuple(randint(1, 10000) for _ in range(5))
            d.add(fields)
        finish = time()
        logging.info("Finished the task in %f seconds", finish-start)
        logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(), stacksize(),
                     memory())
def bench_representation_2():
    """Time building and then traversing a 1M-sentence document.

    Only Document7 (Sentence wrapper objects) is benchmarked here.
    """
    documents = [Document7]
    for document in documents:
        # Build phase: one million sentences of ten random word ids each.
        start = time()
        d = document()
        for _ in range(1000000):
            sentence = [randint(1, 10000) for _ in range(10)]
            d.add(sentence)
        finish = time()
        logging.info("Finished the task in %f seconds", finish-start)
        logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(), stacksize(),
                     memory())
        # Traverse phase: walk every sentence and total the token count.
        start = time()
        lens = sum(len(sent.tokens) for sent in d.traverse())
        logging.info("Total length of sentences is %d", lens)
        finish = time()
        logging.info("Finished the traverse in %f seconds", finish-start)
        logging.info("Resident: %d\tStack: %d\tMemory: %d", resident(), stacksize(),
                     memory())
def main(options, args):
    """Entry point: run the token-representation benchmark.

    `options` and `args` come from OptionParser but are currently unused.
    """
    bench_representation()
| if __name__ == "__main__": | |
| parser = OptionParser() | |
| parser.add_option("-f", "--file", dest="filename", help="Input file") | |
| parser.add_option("-l", "--log", dest="log", help="log verbosity level", | |
| default="INFO") | |
| (options, args) = parser.parse_args() | |
| numeric_level = getattr(logging, options.log.upper(), None) | |
| logging.basicConfig(level=numeric_level, format=LOG_FORMAT) | |
| main(options, args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment