Skip to content

Instantly share code, notes, and snippets.

@kmike
Created December 5, 2013 20:38
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kmike/7813450 to your computer and use it in GitHub Desktop.
Save kmike/7813450 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from __future__ import division, print_function
import os
import sys
import time
import resource
import psutil
from sklearn import datasets
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from marisa_vectorizers import MarisaCountVectorizer, MarisaTfidfVectorizer
# Registry of the vectorizers that can be benchmarked, keyed by the name
# expected as the first command-line argument.  Names ending in "2" use
# unigrams + bigrams (ngram_range=(1, 2)); the number in the "hashing"
# names is the log2 of n_features.
vectorizers = {
    "count": CountVectorizer(),
    "count2": CountVectorizer(ngram_range=(1, 2)),
    "tfidf": TfidfVectorizer(),
    "tfidf2": TfidfVectorizer(ngram_range=(1, 2)),
    "hashing18": HashingVectorizer(n_features=2 ** 18),
    "hashing20": HashingVectorizer(n_features=2 ** 20),
    "marisa_count": MarisaCountVectorizer(),
    "marisa_count2": MarisaCountVectorizer(ngram_range=(1, 2)),
    "marisa_tfidf": MarisaTfidfVectorizer(),
    "marisa_tfidf2": MarisaTfidfVectorizer(ngram_range=(1, 2)),
}
# Float divisors so the conversions below never truncate, even if this code
# is run without ``from __future__ import division``.
_BYTES_PER_MB = 2.0 ** 20
_KB_PER_MB = 2.0 ** 10


def maxrss_to_mb(ru_maxrss, platform=sys.platform):
    """Convert a raw ``ru_maxrss`` reading to megabytes.

    ``getrusage`` reports the peak resident set size in bytes on macOS but
    in kilobytes on Linux, so the divisor has to depend on the platform.
    Any non-macOS platform is assumed to follow the Linux convention --
    verify before relying on the numbers elsewhere.

    :param ru_maxrss: value of ``getrusage(...).ru_maxrss``.
    :param platform: ``sys.platform``-style string; defaults to the
        platform the script is running on.
    :returns: peak RSS in megabytes, as a float.
    """
    if platform.startswith("darwin"):
        return ru_maxrss / _BYTES_PER_MB
    return ru_maxrss / _KB_PER_MB


def peak_rss_mb():
    """Return the peak resident set size of this process so far, in MB."""
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return maxrss_to_mb(usage.ru_maxrss)


def rss_mb(proc):
    """Return the current resident set size of *proc* in MB.

    :param proc: a ``psutil.Process``-like object.  ``memory_info()`` is
        preferred; ``get_memory_info()`` is used as a fallback for old
        psutil releases that only provide that spelling.
    :returns: current RSS in megabytes, as a float.
    """
    read_info = getattr(proc, "memory_info", None)
    if read_info is None:
        read_info = proc.get_memory_info
    return read_info().rss / _BYTES_PER_MB


def main(argv=None):
    """Benchmark fitting and dumping one vectorizer on 20 newsgroups.

    Fits the vectorizer named by ``argv[1]`` on the 20 newsgroups training
    subset, dumps it with joblib to ``<name>.joblib`` in the current
    directory, and prints the wall-clock time and memory growth of both
    steps.

    Memory growth is "peak RSS after the step" minus "current RSS before
    the step".  The peak is process-wide, so if something earlier (e.g.
    loading the dataset) peaked higher than the step itself, the reported
    figure reflects that earlier peak.

    :param argv: argument list in ``sys.argv`` layout; defaults to
        ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    # No vectorizer requested: list the valid names and stop.
    if len(argv) < 2:
        print("Available vectorizers:\n")
        print("\n".join(sorted(vectorizers)))
        sys.exit()

    vecname = argv[1]
    if vecname not in vectorizers:
        # Exit with a readable message instead of a KeyError traceback.
        sys.exit("Unknown vectorizer %r; available: %s"
                 % (vecname, ", ".join(sorted(vectorizers))))
    vec = vectorizers[vecname]

    # Load the data before taking any baseline so it is not attributed
    # to the fit step.
    newsgroups_train = datasets.fetch_20newsgroups(subset='train')
    proc = psutil.Process(os.getpid())

    # --- fit ---
    before = rss_mb(proc)
    max_before = peak_rss_mb()
    start = time.time()
    vec.fit(newsgroups_train.data)
    end = time.time()
    max_after = peak_rss_mb()
    # Peak RSS is monotonic; a decrease means the measurement is broken.
    assert max_after >= max_before
    print("fit time: %0.1fs" % (end - start))
    print("fit memusage: %0.1fMB" % (max_after - before))

    # --- dump ---
    before2 = rss_mb(proc)
    # Dedicated timestamp so the prints and memory reads above are not
    # counted as dump time.
    dump_start = time.time()
    joblib.dump(vec, vecname + ".joblib")
    dump_end = time.time()
    max_after2 = peak_rss_mb()
    assert max_after2 >= max_after
    print("dump time: %0.1fs" % (dump_end - dump_start))
    print("dump memusage: %0.1fMB" % (max_after2 - before2))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment