Last active
June 14, 2023 15:57
-
-
Save jgoerner/7a363f9d3670418fc6c99f5f2ea53a7f to your computer and use it in GitHub Desktop.
Feature Engineering for NLP using sklearn FeatureUnion
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A small snippet showing how to use the sklearn transformer interface
# to extract NLP features.
# necessary imports | |
import pandas as pd | |
from sklearn.base import TransformerMixin | |
from sklearn.pipeline import FeatureUnion | |
# load your dataset | |
# Read the tab-separated training file; column names are supplied explicitly.
df_train = pd.read_csv(
    "./data/train/cwi_training.txt",
    sep="\t",
    names=["sentence", "word", "index", "label"],
)
# helper function to transform result of feature union to dataframe | |
def fu_to_df(feature_union, X):
    """Apply a feature union to ``X`` and return the result as a DataFrame.

    Args:
        feature_union: Object exposing ``transformer_list`` (a sequence of
            ``(name, transformer)`` pairs) and ``transform(X)``.
        X: Input passed unchanged to ``feature_union.transform``.

    Returns:
        A ``pandas.DataFrame`` with one column per transformer, named after
        the transformer.
    """
    # Use the union that was passed in; the previous version read a
    # module-level global and ignored this parameter.
    names = [name for name, _ in feature_union.transformer_list]
    stacked = feature_union.transform(X)
    # The reshape treats the output as the transformers' results laid out one
    # after another (one value per sample each): one row per transformer,
    # then transposed so that rows are samples and columns are features.
    per_sample = stacked.reshape(len(names), -1).T
    return pd.DataFrame(per_sample, columns=names)
# example feature: the word itself (optionally shifted by an offset)
class Word(TransformerMixin):
    """Extract the token at each row's target position, shifted by an offset.

    For every row of the input, the ``sentence`` column is tokenized and the
    token at position ``index + offset`` is returned. ``"NONE"`` is returned
    when that position lies outside the tokenized sentence.
    """

    def __init__(self, offset=0):
        # Shift applied to each row's ``index`` value; 0 selects that position.
        self.offset = offset

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Present so that ``fit`` and the inherited ``fit_transform`` can be
        called on this transformer.
        """
        return self

    def transform(self, X):
        """Return a list with one token (or ``"NONE"``) per row of ``X``.

        ``X`` must provide ``sentence`` and ``index`` columns.
        """
        # NOTE(review): ``word_tokenize`` is not imported in the visible part
        # of this file; presumably ``nltk.tokenize.word_tokenize`` -- confirm
        # and add the import.
        result = []
        for sentence, index in zip(X["sentence"], X["index"]):
            words = word_tokenize(sentence)
            position = index + self.offset
            # Explicit bounds check so negative positions yield "NONE"
            # instead of wrapping around to the end of the sentence.
            if 0 <= position < len(words):
                result.append(words[position])
            else:
                result.append("NONE")
        return result
# example feature word length | |
class WordLen(TransformerMixin):
    """Compute the character length of each row's ``word`` value."""

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Present so that ``fit`` and the inherited ``fit_transform`` can be
        called on this transformer.
        """
        return self

    def transform(self, X):
        """Return a list with ``len(word)`` for every entry of ``X["word"]``."""
        # Iterate the column directly; the per-row index was never used.
        return [len(word) for word in X["word"]]
# label as a feature | |
class Label(TransformerMixin):
    """Pass the ``label`` column through as a feature."""

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Present so that ``fit`` and the inherited ``fit_transform`` can be
        called on this transformer.
        """
        return self

    def transform(self, X):
        """Return the values of ``X["label"]`` as a list, in row order."""
        # Read the column directly rather than via ``iterrows``, which builds
        # a Series per row and may change value dtypes along the way.
        return X["label"].tolist()
# create a feature union to combine features | |
fu = FeatureUnion(
    [
        ("word", Word()),
        ("word_length", WordLen()),
        ("label", Label()),
    ],
    n_jobs=-1,  # very important!
)

# Run the union over the training data and collect the features in a DataFrame.
df_train_transformed = fu_to_df(fu, df_train)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment