@cjauvin
Created November 23, 2012 20:22
import time
from sklearn.feature_extraction.text import CountVectorizer
# brown20x.txt is the Brown corpus concatenated 20x to itself (~1M lines)
# Results obtained on a 24-core Linux machine
# Note: the n_jobs/batch_size arguments used below are not part of the stock
# scikit-learn CountVectorizer.fit_transform API; they presumably belong to an
# experimental (patched) parallel implementation being benchmarked here.
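# One possible way to build brown20x.txt with NLTK (an assumption; the gist
# does not show how the file was generated):
#
#   import nltk
#   nltk.download('brown')
#   from nltk.corpus import brown
#   with open('brown20x.txt', 'w') as f:
#       for _ in range(20):
#           for sent in brown.sents():
#               f.write(' '.join(sent) + '\n')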
start = time.time()
vect = CountVectorizer()
vect.fit_transform(open('brown20x.txt'), n_jobs=1)
print time.time() - start # 307 seconds

start = time.time()
vect = CountVectorizer()
vect.fit_transform(open('brown20x.txt'), n_jobs=-1)
print time.time() - start # 585 seconds (almost 2X worse)

start = time.time()
vect = CountVectorizer()
vect.fit_transform(open('brown20x.txt'), n_jobs=-1, batch_size=5)
print time.time() - start # 155 seconds

start = time.time()
vect = CountVectorizer()
vect.fit_transform(open('brown20x.txt'), n_jobs=-1, batch_size=10)
print time.time() - start # 119 seconds

start = time.time()
vect = CountVectorizer()
vect.fit_transform(open('brown20x.txt'), n_jobs=-1, batch_size=100)
print time.time() - start # 92 seconds

start = time.time()
vect = CountVectorizer()
vect.fit_transform(open('brown20x.txt'), n_jobs=-1, batch_size=500)
print time.time() - start # 88 seconds

start = time.time()
vect = CountVectorizer()
vect.fit_transform(open('brown20x.txt'), n_jobs=-1, batch_size=1000)
print time.time() - start # 86 seconds (3.5X better than n_jobs=1)

start = time.time()
vect = CountVectorizer()
vect.fit_transform(open('brown20x.txt'), n_jobs=2, batch_size=10000)
print time.time() - start # 92 seconds
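
# A more compact way to run the same parameter sweep (a sketch; it assumes the
# same experimental fit_transform(n_jobs, batch_size) signature used above):
def bench(**kwargs):
    start = time.time()
    CountVectorizer().fit_transform(open('brown20x.txt'), **kwargs)
    print kwargs, time.time() - start

bench(n_jobs=1)
for batch_size in (5, 10, 100, 500, 1000):
    bench(n_jobs=-1, batch_size=batch_size)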