VibhuJawa/1_gv_100_gutenburg_pre_processing_updated.py Secret

## 1_gv_100_gutenburg_pre_processing_updated.py
# English stop-word list taken from the NLTK corpus; it is the default value
# of the `stopwords` argument of preprocess_text below.
STOPWORDS = nltk.corpus.stopwords.words("english")

# Characters that preprocess_text replaces with a space when this list is
# passed as its `filters` argument: ASCII punctuation, tab, newline and the
# em dash.  Built from a single string, one list entry per character.
filters = list("!\"#$%&()*+-./\\:;<=>?@[]^_`{|}\t\n',~—")

def preprocess_text(input_strs, filters=None, stopwords=STOPWORDS):
    """
    Clean a string Series through its ``.str`` accessor.

    Steps, in order:
        * replace every character listed in ``filters`` with a space
        * to_lower
        * replace stop words with a space
        * collapse whitespace with ``normalize_spaces``
        * strip leading and trailing spaces

    Parameters
    ----------
    input_strs : Series of strings
        Its ``.str`` accessor must provide ``translate``, ``lower``,
        ``replace_tokens``, ``normalize_spaces`` and ``strip``.
    filters : iterable of single characters, optional
        Characters to blank out.  ``None`` (the default) skips this step.
    stopwords : sequence of str, optional
        Tokens to remove.  Defaults to the module-level ``STOPWORDS``;
        ``None`` or an empty sequence skips this step.

    Returns
    -------
    The cleaned Series.
    """

    # filter punctuation and case conversion
    # (the default filters=None used to raise TypeError in the comprehension)
    if filters is not None:
        translation_table = {ord(char): ord(' ') for char in filters}
        input_strs = input_strs.str.translate(translation_table)
    input_strs = input_strs.str.lower()

    # remove stopwords -- use the caller-supplied list; previously the
    # `stopwords` argument was ignored and the global STOPWORDS always applied
    if stopwords is not None and len(stopwords) > 0:
        stopwords_gpu = cudf.Series(stopwords)
        input_strs = input_strs.str.replace_tokens(stopwords_gpu, ' ')

    # replace multiple spaces with single one and strip leading/trailing spaces
    input_strs = input_strs.str.normalize_spaces()
    input_strs = input_strs.str.strip(' ')

    return input_strs

def preprocess_text_df(df, text_cols=['text'], **kwargs):
    """
    Run preprocess_text over the selected columns of ``df``.

    Each column named in ``text_cols`` is overwritten in place with its
    cleaned version; any extra keyword arguments are forwarded unchanged
    to preprocess_text.  The same ``df`` object is returned.
    """
    for column_name in text_cols:
        raw_column = df[column_name]
        df[column_name] = preprocess_text(raw_column, **kwargs)
    return df

# IPython line magic: time the cleaning of df (default text column 'text'),
# passing the punctuation list defined above as the characters to blank out.
%time df = preprocess_text_df(df, filters=filters)

# Show the first five rows of the processed frame.
df.head(5)

## 2_gv_100_gutenburg_pre_processing_output.md

      
    Raw
  

              2_gv_100_gutenburg_pre_processing_output.md
            
          
    CPU times: user 816 ms, sys: 240 ms, total: 1.06 s
Wall time: 1.05 s
text	author	title
0	geological observations south america	Charles Darwin	Geological Observations On South America
1	charles darwin	Charles Darwin	Geological Observations On South America
2	editorial note	Charles Darwin	Geological Observations On South America
3	although respects technical subjects style	Charles Darwin	Geological Observations On South America
4	darwin journal books reprinted never lose value	Charles Darwin	Geological Observations On South America
	STOPWORDS = nltk.corpus.stopwords.words('english')

	filters = [ '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', '\\', ':', ';', '<', '=', '>',
	'?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '\t','\n',"'",",",'~' , '—']

	def preprocess_text(input_strs , filters=None , stopwords=STOPWORDS):
	"""
	* filter punctuation
	* to_lower
	* remove stop words (from nltk corpus)
	* remove multiple spaces with one
	* remove leading spaces
	"""

	# filter punctuation and case conversion
	translation_table = {ord(char): ord(' ') for char in filters}
	input_strs = input_strs.str.translate(translation_table)
	input_strs = input_strs.str.lower()

	# remove stopwords
	stopwords_gpu = cudf.Series(stopwords)
	input_strs = input_strs.str.replace_tokens(STOPWORDS, ' ')

	# replace multiple spaces with single one and strip leading/trailing spaces
	input_strs = input_strs.str.normalize_spaces( )
	input_strs = input_strs.str.strip(' ')

	return input_strs

	def preprocess_text_df(df, text_cols=['text'], **kwargs):
	for col in text_cols:
	df[col] = preprocess_text(df[col], **kwargs)
	return df

	%time df = preprocess_text_df(df, filters=filters)

	df.head(5)