Skip to content

Instantly share code, notes, and snippets.

@napsternxg
Last active April 15, 2023 06:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save napsternxg/fda7cd4001ba9cf0d63a8b3d5a7244fc to your computer and use it in GitHub Desktop.
Save napsternxg/fda7cd4001ba9cf0d63a8b3d5a7244fc to your computer and use it in GitHub Desktop.
spaCy sklearn Transformer - Use spaCy embeddings in sklearn model pipelines
"""spaCy embedding transformer for scikit-learn pipelines.

Install spaCy, floret, and scikit-learn:
```bash
pip install spacy floret scikit-learn
```
First download the vectors from:
```bash
wget -nc https://github.com/explosion/spacy-vectors-builder/releases/download/en-3.4.0/en_vectors_floret_lg.floret.gz
```
Next, create a spaCy pipeline from the downloaded vectors:
```bash
spacy init vectors en en_vectors_floret_lg.floret.gz en_vectors_floret_lg --mode floret
```
"""
import spacy
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
class SpacyTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns texts into spaCy document vectors.

    Parameters
    ----------
    pipeline : str, default="en_vectors_floret_lg"
        Name or path passed to ``spacy.load`` to obtain the spaCy pipeline.

    Notes
    -----
    ``__init__`` only stores ``pipeline``; the spaCy model is loaded lazily
    the first time ``nlp`` is accessed. This follows the scikit-learn
    estimator convention (no work in the constructor), so ``clone()`` stays
    cheap and ``set_params(pipeline=...)`` is honoured instead of leaving a
    stale model behind. A missing model therefore surfaces on first use of
    ``nlp`` rather than at construction time.
    """

    def __init__(self, pipeline="en_vectors_floret_lg"):
        # Store the hyper-parameter untouched; loading happens in ``nlp``.
        self.pipeline = pipeline

    @property
    def nlp(self):
        """Return the loaded spaCy pipeline for the current ``pipeline`` value.

        The model is loaded on first access and reloaded if ``pipeline`` has
        changed since the cached model was loaded (e.g. via ``set_params``).
        """
        cached = getattr(self, "_nlp", None)
        if cached is None or getattr(self, "_nlp_name", None) != self.pipeline:
            cached = spacy.load(self.pipeline)
            self._nlp = cached
            self._nlp_name = self.pipeline
        return cached

    @nlp.setter
    def nlp(self, value):
        # Keep direct assignment (``transformer.nlp = ...``) working and tie
        # the supplied object to the current ``pipeline`` so it is not
        # immediately replaced by a reload.
        self._nlp = value
        self._nlp_name = self.pipeline

    def fit(self, X, y=None):
        """No-op fit: there is nothing to learn. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Embed each text in ``X`` as its spaCy document vector.

        Parameters
        ----------
        X : iterable of str
            Texts to embed; streamed through ``nlp.pipe``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            One row per input text, stacked along axis 0.

        Raises
        ------
        ValueError
            If ``X`` yields no texts (``np.stack`` needs at least one array).
        """
        # Stack the per-document vectors into a single 2-D feature matrix.
        vectors = [doc.vector for doc in self.nlp.pipe(X)]
        return np.stack(vectors, axis=0)
if __name__ == "__main__":
    # Example wiring: spaCy document embeddings feeding a logistic-regression
    # classifier. The pipeline is only constructed here, not fitted.
    embedder = SpacyTransformer(pipeline="en_vectors_floret_lg")
    classifier = LogisticRegression(solver="liblinear")
    model = Pipeline(steps=[("spacy", embedder), ("model", classifier)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment