Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# English stop-word list taken from the NLTK corpus; serves as the default
# value of the `stopwords` parameter of preprocess_text below.
STOPWORDS = nltk.corpus.stopwords.words('english')
# Single characters passed as punctuation filters to the preprocessing call:
# the 32 ASCII punctuation marks plus tab, newline and the em dash.
filters = list(
    "!\"#$%&()*+-./\\:;<=>"
    "?@[]^_`{|}\t\n',~—"
)
def preprocess_text(input_strs, filters=None, stopwords=STOPWORDS):
    """
    Clean a string Series, applying these steps in order:

    * replace every character listed in ``filters`` with a space
    * to_lower
    * replace stop words with a space
    * collapse multiple spaces into one
    * strip leading and trailing spaces

    Parameters
    ----------
    input_strs : string Series whose ``.str`` accessor provides
        ``translate``, ``lower``, ``replace_tokens``, ``normalize_spaces``
        and ``strip`` (presumably a cudf Series -- confirm against caller).
    filters : iterable of single characters, or None
        Characters to blank out. ``None`` (the default) or an empty
        iterable skips the punctuation step instead of raising.
    stopwords : sequence of str, or None
        Tokens to remove; defaults to the module-level ``STOPWORDS``.
        ``None`` or an empty sequence skips stop-word removal.

    Returns
    -------
    The cleaned string Series.
    """
    # filter punctuation and case conversion
    if filters:
        # Map each filtered code point to a space so that words separated
        # only by punctuation do not get glued together.
        translation_table = {ord(char): ord(' ') for char in filters}
        input_strs = input_strs.str.translate(translation_table)
    input_strs = input_strs.str.lower()

    # remove stopwords -- honour the caller-supplied list rather than the
    # global STOPWORDS, so the `stopwords` argument actually takes effect
    if stopwords is not None and len(stopwords) > 0:
        stopwords_gpu = cudf.Series(stopwords)
        input_strs = input_strs.str.replace_tokens(stopwords_gpu, ' ')

    # replace multiple spaces with single one and strip leading/trailing spaces
    input_strs = input_strs.str.normalize_spaces()
    input_strs = input_strs.str.strip(' ')
    return input_strs
def preprocess_text_df(df, text_cols=['text'], **kwargs):
    """
    Run ``preprocess_text`` over each column named in ``text_cols``.

    Every listed column of ``df`` is overwritten with its preprocessed
    version; extra keyword arguments are forwarded unchanged to
    ``preprocess_text``. The same ``df`` object is returned.
    """
    for column_name in text_cols:
        raw_column = df[column_name]
        df[column_name] = preprocess_text(raw_column, **kwargs)
    return df
# Notebook cell: time one preprocessing pass over the default 'text' column
# using the module-level punctuation `filters`, then preview the first 5 rows.
%time df = preprocess_text_df(df, filters=filters)
df.head(5)

CPU times: user 816 ms, sys: 240 ms, total: 1.06 s Wall time: 1.05 s

text	author	title
0	geological observations south america	Charles Darwin	Geological Observations On South America
1	charles darwin	Charles Darwin	Geological Observations On South America
2	editorial note	Charles Darwin	Geological Observations On South America
3	although respects technical subjects style	Charles Darwin	Geological Observations On South America
4	darwin journal books reprinted never lose value	Charles Darwin	Geological Observations On South America
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment