Skip to content

Instantly share code, notes, and snippets.

@jgoerner
Last active June 14, 2023 15:57
Show Gist options
  • Save jgoerner/7a363f9d3670418fc6c99f5f2ea53a7f to your computer and use it in GitHub Desktop.
Save jgoerner/7a363f9d3670418fc6c99f5f2ea53a7f to your computer and use it in GitHub Desktop.
Feature Engineering for NLP using sklearn FeatureUnion
# A little snippet to show how to use sklearn transformer interface
# to extract NLP features
# necessary imports
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import FeatureUnion
# load your dataset
# Tab-separated training file without a header row; the column names are
# supplied here in file order.
df_train = pd.read_csv(
    "./data/train/cwi_training.txt",
    sep="\t",
    names=["sentence", "word", "index", "label"],
)
# helper function to transform result of feature union to dataframe
def fu_to_df(feature_union, X):
    """Apply ``feature_union`` to ``X`` and return the result as a DataFrame.

    Args:
        feature_union: Object exposing ``transformer_list`` (a sequence of
            ``(name, transformer)`` pairs) and ``transform(X)``.
        X: Input handed unchanged to ``feature_union.transform``.

    Returns:
        A ``pandas.DataFrame`` with one column per entry in
        ``transformer_list``, named after that entry.
    """
    # Use the union passed in by the caller, not a module-level global.
    column_names = [name for name, _ in feature_union.transformer_list]
    n_feat = len(column_names)

    stacked = feature_union.transform(X)
    # The output is split into ``n_feat`` equal rows and transposed so each
    # transformer ends up as one column.
    # NOTE(review): this assumes every transformer yields exactly one value
    # per sample and that the union concatenates them transformer by
    # transformer -- confirm for unions with multi-column transformers.
    return pd.DataFrame(stacked.reshape(n_feat, -1).T, columns=column_names)
# example feature word (opt. with offset)
class Word(TransformerMixin):
    """Feature: the token at each row's target position, shifted by an offset.

    NOTE(review): ``word_tokenize`` is not imported anywhere in this file;
    presumably it is ``nltk.tokenize.word_tokenize`` -- confirm and bring it
    into scope before using this transformer.
    """

    def __init__(self, offset=0):
        # Added to each row's "index" value to pick a neighbouring token.
        self.offset = offset

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn.

        Defined so that the inherited ``fit_transform`` (which calls
        ``fit`` and then ``transform``) works on this class.
        """
        return self

    def transform(self, X):
        """Return a list with one token per row of ``X``.

        ``X`` is iterated with ``iterrows`` and each row is read through its
        ``"sentence"`` and ``"index"`` entries. When the shifted position
        falls outside the tokenized sentence, the string ``"NONE"`` is used.
        """
        result = []
        for _, rowdata in X.iterrows():
            words = word_tokenize(rowdata["sentence"])
            position = rowdata["index"] + self.offset
            # Negative positions are rejected as well, so they never wrap
            # around to the end of the sentence.
            word = words[position] if 0 <= position < len(words) else "NONE"
            result.append(word)
        return result
# example feature word length
class WordLen(TransformerMixin):
    """Feature: length of each row's ``"word"`` entry."""

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn.

        Defined so that the inherited ``fit_transform`` (which calls
        ``fit`` and then ``transform``) works on this class.
        """
        return self

    def transform(self, X):
        """Return a list with ``len(row["word"])`` for every row of ``X``."""
        return [len(rowdata["word"]) for _, rowdata in X.iterrows()]
# label as a feature
class Label(TransformerMixin):
    """Pass-through feature: each row's ``"label"`` entry."""

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn.

        Defined so that the inherited ``fit_transform`` (which calls
        ``fit`` and then ``transform``) works on this class.
        """
        return self

    def transform(self, X):
        """Return a list with ``row["label"]`` for every row of ``X``."""
        return [rowdata["label"] for _, rowdata in X.iterrows()]
# create a feature union to combine features
fu = FeatureUnion(
    transformer_list=[
        ("word", Word()),
        ("word_length", WordLen()),
        ("label", Label()),
    ],
    # Flagged as very important by the original author.
    n_jobs=-1,
)
# Convert the dataset: one column per transformer named above.
df_train_transformed = fu_to_df(feature_union=fu, X=df_train)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment