Skip to content

Instantly share code, notes, and snippets.

@codez266
Created December 11, 2017 15:13
Show Gist options
  • Save codez266/67935f5e51cdaceabb4a4b6b61489aa1 to your computer and use it in GitHub Desktop.
Save codez266/67935f5e51cdaceabb4a4b6b61489aa1 to your computer and use it in GitHub Desktop.
Hash-based word features for drafttopic
from revscoring.languages import english
from revscoring.datasources.meta import (frequencies, gramming, hashing,
mappers)
import numpy as np
from revscoring import Feature
from revscoring.features import FeatureVector, wikitext
# Token offsets that make up each gram handed to the gramming datasource.
# NOTE(review): presumably unigrams, adjacent bigrams and skip-one bigrams —
# confirm against revscoring's gramming documentation.
grams = [(0,), (0, 1), (0, 2)]

# Hashed bag-of-words pipeline, built one stage at a time:
# non-stopwords -> lower-cased -> grams -> hashed into 2**10 buckets ->
# frequency table.
_lowered_words = mappers.lower_case(
    english.stopwords.revision.datasources.non_stopwords,
    name="lower_non_stop_words",
)
_word_grams = gramming.gram(_lowered_words, grams=grams)
_hashed_grams = hashing.hash(_word_grams, n=2**10)
hashed_bow = frequencies.table(_hashed_grams)
def get_doc_freq(hashes, size=2**10):
    """Expand a sparse ``{bucket: count}`` table into a dense vector.

    Args:
        hashes: Mapping from integer bucket index to the value stored for
            that bucket.
        size: Length of the returned vector. Defaults to ``2**10``, the
            bucket count used when hashing in this file. Every key in
            *hashes* must be a valid index into a vector of this length.

    Returns:
        A 1-D ``numpy.ndarray`` of ``float64`` with ``size`` entries; buckets
        absent from *hashes* are left at ``0.0``.

    Raises:
        IndexError: If a key in *hashes* is out of bounds for ``size``.
    """
    vector = np.zeros(size)
    for bucket, count in hashes.items():
        vector[bucket] = count
    return vector
# Feature vector named "doc_freq": get_doc_freq applied to the hashed
# bag-of-words table, with float64 declared as the return type.
doc_freq = FeatureVector(
    "doc_freq", get_doc_freq,
    returns=np.float64,
    depends_on=[hashed_bow],
)

# Feature list defined by this module; it holds the single vector feature.
hbow = [doc_freq]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment