VibhuJawa/1_gv_100_gutenburg_pre_processing.py

## 1_gv_100_gutenburg_pre_processing.py
# English stop-word list taken from the NLTK stopwords corpus; used as the
# default `stopwords` argument of preprocess_text below.
STOPWORDS = nltk.corpus.stopwords.words('english')

# Substrings that preprocess_text replaces with a space. They are matched
# literally (the call site passes regex=False), not as regular expressions.
# NOTE(review): '\\~' and '\\n' are two-character strings (a backslash followed
# by '~' / 'n'), not an escaped tilde or a newline -- confirm whether '\n' was
# intended. The values are kept exactly as they were; only the invalid '\~'
# escape is now spelled explicitly so Python no longer warns about it.
filters = [
    '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', '\\',
    ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`', '{', '|', '}',
    '\\~', '\t', '\\n', "'", ",", '~', '—',
]

def preprocess_text(input_strs , filters=None , stopwords=STOPWORDS):
    """
    Clean a series of strings, applying these steps in order:

        * replace every substring in ``filters`` with a space
          (skipped when ``filters`` is None)
        * to_lower
        * replace stop words with a space
        * collapse runs of whitespace into a single space
        * strip leading and trailing spaces

    Parameters
    ----------
    input_strs : string series exposing ``.str`` and ``.data``
        (the result is rebuilt with ``cudf.Series``).
    filters : list of str, optional
        Substrings to blank out; matched literally, not as regex.
    stopwords : list of str
        Tokens to remove. Matching happens after lower-casing, so entries
        are expected to be lower case.

    Returns
    -------
    cudf.Series holding the cleaned strings.
    """

    # filter punctuation and case conversion
    # The default filters=None used to be handed straight to replace_multi;
    # with nothing to filter the step is now simply skipped.
    if filters is not None:
        input_strs = input_strs.str.replace_multi(filters, ' ', regex=False)
    input_strs = input_strs.str.lower()

    # remove stopwords (nvtext works on the underlying nvstrings data,
    # hence the round trip through .data and back into a cudf.Series)
    stopwords_gpu = nvstrings.to_device(stopwords)
    input_strs = nvtext.replace_tokens(input_strs.data, stopwords_gpu, ' ')
    input_strs = cudf.Series(input_strs)

    # replace multiple spaces with single one and strip leading/trailing spaces
    input_strs = input_strs.str.replace(r"\s+", ' ', regex=True)
    input_strs = input_strs.str.strip(' ')

    return input_strs

def preprocess_text_df(df, text_cols=('text',), **kwargs):
    """
    Apply ``preprocess_text`` to selected columns of ``df``.

    Each column named in ``text_cols`` is overwritten in ``df`` with its
    cleaned version, so the frame passed in is modified; that same object is
    returned.

    Parameters
    ----------
    df : frame supporting ``df[col]`` reads and assignments.
    text_cols : iterable of column names, default ``('text',)``.
        The default is a tuple rather than a list so it cannot be mutated
        between calls.
    **kwargs : forwarded unchanged to ``preprocess_text``
        (e.g. ``filters``, ``stopwords``).
    """
    for col in text_cols:
        df[col] = preprocess_text(df[col], **kwargs)
    return df

# IPython `%time` magic: run the preprocessing once and print its CPU and
# wall-clock time. `filters` reaches preprocess_text through **kwargs.
%time df = preprocess_text_df(df, filters=filters)

# Convert the first five cleaned rows of the 'text' column to pandas.
df['text'].head(5).to_pandas()

## 2_gv_100_gutenburg_pre_processing_output.md

      
    Raw
  

              2_gv_100_gutenburg_pre_processing_output.md
            
          
    CPU times: user 1.6 s, sys: 708 ms, total: 2.3 s
Wall time: 2.31 s
0                          story champions round table
1                                  written illustrated
2                                          howard pyle
3    1902 distinguished american artist howard pyle...
4          illustrate legend king arthur knights round
Name: text, dtype: object
	# English stop-word list taken from the NLTK stopwords corpus; used as the
	# default `stopwords` argument of preprocess_text below.
	STOPWORDS = nltk.corpus.stopwords.words('english')

	# Substrings that preprocess_text replaces with a space. They are matched
	# literally (the call site passes regex=False), not as regular expressions.
	# The pipe entry is a bare '|': the previous '\|' was an invalid escape that
	# evaluated to backslash + pipe, so a lone '|' was never filtered.
	# NOTE(review): '\\~' and '\\n' are two-character strings (a backslash
	# followed by '~' / 'n'), not an escaped tilde or a newline -- confirm
	# whether '\n' was intended.
	filters = [
	    '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', '\\',
	    ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`', '{', '|', '}',
	    '\\~', '\t', '\\n', "'", ",", '~', '—',
	]

	def preprocess_text(input_strs , filters=None , stopwords=STOPWORDS):
	    """
	    Clean a series of strings, applying these steps in order:

	        * replace every substring in ``filters`` with a space
	          (skipped when ``filters`` is None)
	        * to_lower
	        * replace stop words with a space
	        * collapse runs of whitespace into a single space
	        * strip leading and trailing spaces

	    Parameters
	    ----------
	    input_strs : string series exposing ``.str`` and ``.data``
	        (the result is rebuilt with ``cudf.Series``).
	    filters : list of str, optional
	        Substrings to blank out; matched literally, not as regex.
	    stopwords : list of str
	        Tokens to remove. Matching happens after lower-casing, so entries
	        are expected to be lower case.

	    Returns
	    -------
	    cudf.Series holding the cleaned strings.
	    """

	    # filter punctuation and case conversion
	    # The default filters=None used to be handed straight to replace_multi;
	    # with nothing to filter the step is now simply skipped.
	    if filters is not None:
	        input_strs = input_strs.str.replace_multi(filters, ' ', regex=False)
	    input_strs = input_strs.str.lower()

	    # remove stopwords (nvtext works on the underlying nvstrings data,
	    # hence the round trip through .data and back into a cudf.Series)
	    stopwords_gpu = nvstrings.to_device(stopwords)
	    input_strs = nvtext.replace_tokens(input_strs.data, stopwords_gpu, ' ')
	    input_strs = cudf.Series(input_strs)

	    # replace multiple spaces with single one and strip leading/trailing spaces
	    input_strs = input_strs.str.replace(r"\s+", ' ', regex=True)
	    input_strs = input_strs.str.strip(' ')

	    return input_strs

	def preprocess_text_df(df, text_cols=('text',), **kwargs):
	    """
	    Apply ``preprocess_text`` to selected columns of ``df``.

	    Each column named in ``text_cols`` is overwritten in ``df`` with its
	    cleaned version, so the frame passed in is modified; that same object
	    is returned.

	    Parameters
	    ----------
	    df : frame supporting ``df[col]`` reads and assignments.
	    text_cols : iterable of column names, default ``('text',)``.
	        The default is a tuple rather than a list so it cannot be mutated
	        between calls.
	    **kwargs : forwarded unchanged to ``preprocess_text``
	        (e.g. ``filters``, ``stopwords``).
	    """
	    for col in text_cols:
	        df[col] = preprocess_text(df[col], **kwargs)
	    return df

	# IPython `%time` magic: run the preprocessing once and print its CPU and
	# wall-clock time. `filters` reaches preprocess_text through **kwargs.
	%time df = preprocess_text_df(df, filters=filters)

	# Convert the first five cleaned rows of the 'text' column to pandas.
	df['text'].head(5).to_pandas()