@halfak
Created September 2, 2016 14:25
Hash vector speed demos (revscoring)

demo_hash_vector_pattern.py
import pickle
import sys
import time

import mwapi
import mysqltsv

from editquality.feature_lists import enwiki
from revscoring.datasources import revision_oriented as ro
from revscoring.datasources.meta import (frequencies, gramming, hashing,
                                         selectors)
from revscoring.dependencies import solve
from revscoring.extractors import api
from revscoring.features import wikitext
from revscoring.features.meta import vectorizers
from sklearn.feature_extraction.text import HashingVectorizer
# unigrams, bigrams, skipgrams and double-skipgrams
my_grams = [(0,), (0,1), (0,2), (0,3)]
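# Illustration (not in the original gist): each tuple lists token offsets
# relative to the current position, assuming gramming.gram's offset
# convention. For ["a", "b", "c"], (0,) yields ("a",), ("b",), ("c",);
# (0, 1) yields ("a", "b"), ("b", "c"); and (0, 2) yields the skipgram
# ("a", "c").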

parent_hash_table = frequencies.table(
    hashing.hash(
        gramming.gram(wikitext.revision.parent.datasources.words,
                      grams=my_grams),
        n=2**20))
revision_hash_table = frequencies.table(
    hashing.hash(
        gramming.gram(wikitext.revision.datasources.words, grams=my_grams),
        n=2**20))
hash_delta = frequencies.delta(parent_hash_table, revision_hash_table)
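# Note: hash_delta is defined but never timed in the run below; solving it
# should yield the per-hash count changes between the parent and current
# revision (see the commented-out sketch at the end of this script).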

session = mwapi.Session("https://en.wikipedia.org")
rev_text = session.get(
    action="query", prop="revisions", rvprop="content",
    formatversion=2, revids=727001271
)['query']['pages'][0]['revisions'][0]['content']
parent_text = session.get(
    action="query", prop="revisions", rvprop="content",
    formatversion=2, revids=724239632
)['query']['pages'][0]['revisions'][0]['content']

# Benchmark 1: raw text --> word tokens
rev_words = solve(wikitext.revision.datasources.words,
                  cache={ro.revision.text: rev_text})
start = time.time()
for i in range(25):
    solve(wikitext.revision.datasources.words,
          cache={ro.revision.text: rev_text})
print("Text -->", len(rev_words), "words:",
      round((time.time() - start) / 25, 4), "seconds")

# Benchmark 2: word tokens --> grams
rev_grams = solve(gramming.gram(wikitext.revision.datasources.words,
                                grams=my_grams),
                  cache={wikitext.revision.datasources.words: rev_words})
start = time.time()
for i in range(25):
    solve(gramming.gram(wikitext.revision.datasources.words, grams=my_grams),
          cache={wikitext.revision.datasources.words: rev_words})
print("Words -->", len(rev_grams), "grams:",
      round((time.time() - start) / 25, 4), "seconds")

# Benchmark 3: grams --> hashes (gram extraction served from the cache,
# so only the hashing step is timed)
start = time.time()
for i in range(25):
    solve(hashing.hash(gramming.gram(wikitext.revision.datasources.words,
                                     grams=my_grams), n=2**20),
          cache={gramming.gram(wikitext.revision.datasources.words,
                               grams=my_grams): rev_grams})
print("Grams -->", len(rev_grams), "hashes:",
      round((time.time() - start) / 25, 4), "seconds")

# Benchmark 4: word tokens --> hashed grams (gramming + hashing together)
start = time.time()
for i in range(25):
    solve(hashing.hash(gramming.gram(wikitext.revision.datasources.words,
                                     grams=my_grams), n=2**20),
          cache={wikitext.revision.datasources.words: rev_words})
print("Words -->", len(rev_grams), "hashed grams:",
      round((time.time() - start) / 25, 4), "seconds")

# Benchmark 5: word tokens --> hash frequency table (gram, hash, and count)
start = time.time()
for i in range(25):
    solve(revision_hash_table,
          cache={wikitext.revision.datasources.words: rev_words})
print("Words --> Hash table", round((time.time() - start) / 25, 4), "seconds")

'''
# Alternative (commented out): build the frequency table with sklearn's
# HashingVectorizer. tokenizer=lambda x: x replaces sklearn's tokenizer
# with a pass-through.
single_vectorizer = HashingVectorizer(tokenizer=lambda x: x, n_features=2**20)


def hash_vectorize_table(items):
    csr = single_vectorizer.transform([str(item) for item in items])
    coo = csr.tocoo()
    return dict(zip(coo.col, coo.data))


start = time.time()
for i in range(25):
    _ = hash_vectorize_table(rev_grams)
print("Words --> HashingVectorizer table",
      round((time.time() - start) / 25, 4), "seconds")
'''

# Benchmark 6: HashingVectorizer doing its own 1-4 gram extraction
four_vectorizer = HashingVectorizer(tokenizer=lambda x: x, n_features=2**20,
                                    ngram_range=(1, 4))
start = time.time()
for i in range(25):
    _ = four_vectorizer.transform(rev_words)
print("Words --> HashingVectorizer transform",
      round((time.time() - start) / 25, 4), "seconds")
$ python demo_hash_vector_pattern.py
Sending requests with default User-Agent. Set 'user_agent' on mwapi.Session to quiet this message.
Text --> 7880 words: 0.0847 seconds
Words --> 31514 grams: 0.0507 seconds
Grams --> 31514 hashes: 0.1669 seconds
Words --> 31514 hashed grams: 0.2471 seconds
Words --> Hash table 0.224 seconds
Words --> HashingVectorizer transform 0.212 seconds