# egemenzeytinci/preprocessing.py

## preprocessing.py
from nltk.corpus import stopwords
from stemming.porter2 import stem
import nltk
import re
import string

# Fetch the NLTK resources used by this module, then load the English
# stop-word list once at import time so every call can reuse it.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
default_stopwords = stopwords.words('english')


def preproccessing(text):
    """Normalise raw text into a space-joined string of stemmed keywords.

    The text is stripped of HTML tags, backslashes, quotes and digits,
    lower-cased, split on punctuation and any whitespace, cleared of
    English stop words and finally reduced to stems.

    Args:
        text: Raw input string, possibly containing HTML markup.

    Returns:
        The surviving stems joined by single spaces, or an empty string
        when nothing is left after cleaning.
    """
    # remove html tags
    text = re.sub(r'<.*?>', '', text)

    # remove the characters [\], ['], ["] and all digits in one pass
    text = re.sub(r"[\\'\"\d]", '', text)

    # convert text to lowercase
    text = text.lower()

    # replace punctuation characters with spaces
    replace_punctuation = str.maketrans(string.punctuation,
                                        ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)

    # split on any run of whitespace (spaces, tabs, newlines) so words on
    # different lines are not fused together and no empty tokens appear
    tokens = text.split()
    if not tokens:
        return ''

    # set membership is O(1) per word instead of scanning the list
    stop_words = set(default_stopwords)

    # drop stop words BEFORE stemming: stemming rewrites some of them
    # (e.g. "very" -> "veri"), which would let them slip past the filter
    stems = (stem(word) for word in tokens if word not in stop_words)

    # keep the post-stemming filter as well, so a stem that collapses onto
    # a stop word (or to an empty string) is still removed as before
    words = [w for w in stems if w and w not in stop_words]

    return ' '.join(words)


def main():
    """Smoke-test ``preproccessing`` against a small HTML-wrapped sample."""
    sample = "<html>Added and cutted!!!!!!!!\\\\\'</html>"
    expected = 'ad cut'

    # tags, punctuation, backslashes and the quote are stripped, "and" is
    # dropped as a stop word, and the two remaining words are stemmed
    actual = preproccessing(sample)
    assert actual == expected


# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()