Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# English stop-word list from the NLTK corpus; it is the default value of
# the ``stopwords`` parameter of preprocess_text.
STOPWORDS = nltk.corpus.stopwords.words("english")
# Substrings that preprocess_text replaces with a single space when this list
# is passed as its ``filters`` argument (matched literally, not as regexes).
# NOTE(review): '\\n' is the two-character sequence backslash + 'n', not a
# newline character -- presumably the corpus contains escaped newlines; confirm.
filters = [
    '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', '\\',
    ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`', '{', '|',
    '}',
    # Backslash followed by tilde. Previously spelled with a single backslash,
    # which is an invalid escape sequence; the runtime value is unchanged.
    '\\~',
    '\t', '\\n', "'", ",", '~', '—',
]
def preprocess_text(input_strs, filters=None, stopwords=STOPWORDS):
    """Clean a column of strings using cuDF / nvstrings string operations.

    Steps, in order:
        * replace every substring listed in ``filters`` with a space
        * convert to lower case
        * replace stop-word tokens with a space
        * collapse each run of whitespace into a single space
        * strip leading and trailing spaces

    Args:
        input_strs: string column exposing a ``.str`` accessor and a ``.data``
            attribute (presumably a ``cudf.Series`` of strings -- confirm
            against callers).
        filters: literal (non-regex) substrings to replace with a space.
            ``None`` (the default) skips this step.
        stopwords: tokens to remove; defaults to the module-level
            ``STOPWORDS`` list.

    Returns:
        A ``cudf.Series`` holding the cleaned strings.
    """
    # Punctuation filtering is optional: with the default ``filters=None``
    # there is nothing to replace, so the step is skipped rather than handing
    # ``None`` to ``replace_multi``.
    if filters is not None:
        input_strs = input_strs.str.replace_multi(filters, ' ', regex=False)
    input_strs = input_strs.str.lower()

    # Stop-word removal works on the underlying string data, so the result
    # is wrapped back into a Series before the remaining ``.str`` calls.
    stopwords_gpu = nvstrings.to_device(stopwords)
    without_stopwords = nvtext.replace_tokens(input_strs.data, stopwords_gpu, ' ')
    input_strs = cudf.Series(without_stopwords)

    # The replacements above leave runs of spaces behind; squeeze them to a
    # single space and trim both ends.
    input_strs = input_strs.str.replace(r"\s+", ' ', regex=True)
    return input_strs.str.strip(' ')
def preprocess_text_df(df, text_cols=('text',), **kwargs):
    """Apply ``preprocess_text`` to the selected columns of ``df``.

    Args:
        df: frame supporting ``df[col]`` reads and assignments. It is
            modified in place.
        text_cols: iterable of column names to clean. Defaults to the single
            column ``'text'`` (an immutable tuple, so the default cannot be
            altered between calls).
        **kwargs: forwarded unchanged to ``preprocess_text`` (for example
            ``filters`` or ``stopwords``).

    Returns:
        The same ``df`` object, with each listed column replaced by its
        cleaned version.
    """
    for col in text_cols:
        df[col] = preprocess_text(df[col], **kwargs)
    return df
# Notebook cell: clean the default 'text' column using the `filters` list and
# time the call with the IPython %time magic (not valid in plain Python).
%time df = preprocess_text_df(df, filters=filters)
# Show the first five cleaned rows (to_pandas() presumably copies them into a
# host-side pandas Series for display -- confirm).
df['text'].head(5).to_pandas()

CPU times: user 1.6 s, sys: 708 ms, total: 2.3 s Wall time: 2.31 s

0                          story champions round table
1                                  written illustrated
2                                          howard pyle
3    1902 distinguished american artist howard pyle...
4          illustrate legend king arthur knights round
Name: text, dtype: object
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment