Last active
April 15, 2023 06:21
-
-
Save napsternxg/fda7cd4001ba9cf0d63a8b3d5a7244fc to your computer and use it in GitHub Desktop.
spaCy sklearn Transformer - Use spaCy embeddings in sklearn model pipelines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Spacy Embedding Transformer for Sklearn pipeline | |
Install spaCy, floret and scikit-learn:
```bash | |
pip install spacy floret scikit-learn | |
``` | |
First download the vectors from: | |
```bash | |
wget -nc https://github.com/explosion/spacy-vectors-builder/releases/download/en-3.4.0/en_vectors_floret_lg.floret.gz | |
``` | |
Next, create a spaCy vectors pipeline from the downloaded file:
```bash | |
spacy init vectors en en_vectors_floret_lg.floret.gz en_vectors_floret_lg --mode floret | |
``` | |
""" | |
import spacy | |
import numpy as np | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.pipeline import Pipeline | |
class SpacyTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that embeds texts with a spaCy pipeline.

    Every input text is run through the spaCy pipeline named by ``pipeline``
    and replaced by its ``Doc.vector``; the vectors are stacked row-wise.

    Parameters
    ----------
    pipeline : str, default="en_vectors_floret_lg"
        Name or path handed to ``spacy.load``.
    """

    def __init__(self, pipeline="en_vectors_floret_lg"):
        # Only store the hyper-parameter. Loading the model here would make
        # ``clone()`` reload it for every copy and would leave a stale model
        # behind after ``set_params(pipeline=...)``.
        self.pipeline = pipeline

    @property
    def nlp(self):
        """Return the spaCy pipeline, loading it on first access.

        The loaded object is cached together with the name it was loaded
        from, so changing ``self.pipeline`` triggers a reload.
        """
        cached = getattr(self, "_nlp", None)
        if cached is None or getattr(self, "_nlp_source", None) != self.pipeline:
            cached = spacy.load(self.pipeline)
            self._nlp = cached
            self._nlp_source = self.pipeline
        return cached

    @nlp.setter
    def nlp(self, value):
        # Kept so that code assigning ``transformer.nlp = ...`` still works;
        # the injected object is treated as matching the current ``pipeline``.
        self._nlp = value
        self._nlp_source = self.pipeline

    def fit(self, X, y=None):
        """Load the spaCy pipeline (nothing is learned) and return ``self``.

        Loading here makes a missing or misspelled pipeline fail at fit time
        rather than on the first ``transform`` call.
        """
        self.nlp
        return self

    def transform(self, X, y=None):
        """Embed each text in ``X`` and return the stacked document vectors.

        Parameters
        ----------
        X : iterable of str
            Texts to embed.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            One row per text, in input order. Empty input yields an array
            with zero rows instead of raising from ``np.stack``.
        """
        nlp = self.nlp
        vectors = [doc.vector for doc in nlp.pipe(X)]
        if not vectors:
            # np.stack refuses an empty sequence; keep the column count so
            # downstream shape checks still see the embedding width.
            return np.empty((0, nlp.vocab.vectors_length), dtype=np.float32)
        return np.stack(vectors, axis=0)
if __name__ == "__main__":
    # Example wiring: spaCy document vectors feeding a logistic regression.
    embedder = SpacyTransformer(pipeline="en_vectors_floret_lg")
    classifier = LogisticRegression(solver="liblinear")
    model = Pipeline(steps=[("spacy", embedder), ("model", classifier)])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment