Agustinus Theodorus agustinustheo

## news_title_tokenization
def news_title_tokenization(message):
    """Tokenize a news title, drop English stop words and stem the rest.

    Args:
        message: Title text to tokenize. Stop words are filtered with an
            exact string comparison, so differently-cased variants of a
            stop word are not removed.

    Returns:
        A list with the Porter stem of every token that is not in NLTK's
        English stop-word list, in the order the tokens appear.
    """
    # Build a set once so each membership test is O(1) instead of a linear
    # scan of the stop-word list for every token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    ps = PorterStemmer()
    return [
        ps.stem(word)
        for word in word_tokenize(message)
        if word not in stopwords
    ]

## find_similar_articles
def find_similar_articles(news, similarity):
    """Derive stemmed title tokens from an article URL or a raw title string.

    NOTE(review): only the opening of this function is shown here. Within
    these lines ``similarity`` is never read and nothing is returned, so the
    remainder presumably lives beyond this excerpt - confirm before relying
    on a return value.

    Args:
        news: Either an http(s) URL of an article or the title text itself.
        similarity: Not used in the visible lines.
    """
    news_title_tokenized = ""

    # re.match anchors at the start and the pattern ends with ``$``, so the
    # whole string has to look like an http(s) URL to take this branch.
    if(re.match(r'^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)$', news)):
        # Download and parse the page, then preprocess and tokenize its title.
        news_article = Article(news)
        news_article.download()
        news_article.parse()
        news_title_tokenized = news_title_tokenization(preproccess_text(news_article.title))
    else:
        # Not a URL: treat ``news`` itself as the title text.
        news_title_tokenized = news_title_tokenization(preproccess_text(news))

## preproccess_text
def preproccess_text(text_messages):
    """Lowercase ``text_messages`` and blank out markup noise and punctuation.

    NOTE(review): no ``return`` is visible in these lines although callers
    in this file pass the result on, so the function presumably continues
    beyond this excerpt.
    """
    # change words to lower case - Hello, HELLO, hello are all the same word
    processed = text_messages.lower()

    # Remove unnecessary noise: bracketed markers such as [12] or [ab],
    # backslash escapes written out as text (\\, \r, \t, \n) and any other
    # single backslash. Each match becomes one space. The [A-Z] alternative
    # cannot match here because the text was lowercased above.
    processed = re.sub(r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\', ' ', processed)

    # Remove punctuation (every listed character becomes one space)
    processed = re.sub(r'[.,\/#!%\^&\*;\[\]:|+{}=\-\'"_”“`~(’)?]', ' ', processed)

## remove_unnecessary_noise
def remove_unnecessary_noise(text_messages):
    """Replace escaped-byte runs, bracketed markers and backslashes by spaces.

    Three substitutions run in order, each replacing a match with one space:

    1. three consecutive groups of a backslash plus three ASCII
       alphanumerics (text such as ``\\xe2\\x80\\x99``),
    2. two such consecutive groups,
    3. ``[digits]``, ``[lowercase]`` or ``[UPPERCASE]`` markers, a doubled
       backslash, a written-out ``\\r`` / ``\\t`` / ``\\n``, or any single
       backslash that is left.

    Args:
        text_messages: The text to clean.

    Returns:
        The cleaned text.
    """
    # One backslash followed by exactly three ASCII letters or digits.
    escaped_triplet = r'\\[a-zA-Z0-9]{3}'
    patterns = (
        # Longest run first so a three-group run is not split into 2 + 1.
        escaped_triplet * 3,
        escaped_triplet * 2,
        r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\',
    )
    for pattern in patterns:
        text_messages = re.sub(pattern, ' ', text_messages)

    return text_messages

## requirements.txt
nltk
beautifulsoup4
selenium>=2.44.0,<3.0.0
requests
unidecode
vcrpy
future
fake-useragent
newspaper3k
scikit-learn

## convertText.py
def convertText(text):
    """Walk the tokens of ``text`` character by character.

    NOTE(review): only the start of this function is visible. The lines
    shown initialise ``new_string``, ``new_word`` and ``digit_flag`` but
    never use them, and nothing is returned, so the remainder presumably
    lies beyond this excerpt.
    """
    words = word_tokenize(text)
    new_string = ''
    for msg in words:
        # Per-token state, reset for every token.
        new_word = ''
        alpha_flag = False
        digit_flag = False
        for c in msg:
            # Record that the token contains at least one alphabetic character.
            if c.isalpha():
                alpha_flag = True

## preprocessText.py
def preproccess_text(text_messages):
    """Lowercase ``text_messages`` and mask e-mail addresses and phone numbers.

    NOTE(review): no ``return`` is visible in these lines; the function
    presumably continues beyond this excerpt.
    """
    # change words to lower case - Hello, HELLO, hello are all the same word
    processed = text_messages.lower()

    # Replace email addresses with 'almtemail'
    # NOTE(review): the pattern is anchored with ^ and $ and no re.MULTILINE
    # flag is passed, so it only fires when the text as a whole is a single
    # address-like line; an address embedded in a longer message is kept.
    processed = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ', processed)

    # Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn'
    # A number must start with +62, 62 or 0 to be matched.
    processed = re.sub(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn ', processed)

## trainTokenizer.py
# Train the sentence tokenizer
# Read the promotion-text corpus. The context manager closes the file even
# if reading or preprocessing raises. No mode check is needed: a file opened
# with "r" always reports mode 'r'.
with open("indonesian_sent_tokenizer_corpus/indonesian-promotion-text.txt", "r") as f:
    train_text = preproccess_text(f.read())

# Directory whose entries are each listed in turn below; every entry is
# itself passed to os.listdir, so it is expected to be a folder.
path = 'indonesian_sent_tokenizer_corpus/tempo/txt'
for foldername in os.listdir(path):
    new_path = path + '/' + foldername
    # NOTE(review): the body of this inner loop lies beyond this excerpt.
    for filename in os.listdir(new_path):

## preprocessDataframe.py
def preproccess_df(text_messages):
    """Lowercase a Series of messages and mask e-mail addresses and phone numbers.

    NOTE(review): no ``return`` is visible in these lines; the function
    presumably continues beyond this excerpt.

    Args:
        text_messages: Object exposing the pandas ``.str`` accessor
            (a Series of strings).
    """
    # change words to lower case - Hello, HELLO, hello are all the same word
    processed = text_messages.str.lower()

    # regex=True is passed explicitly on both calls: since pandas 2.0
    # Series.str.replace treats the pattern as a literal string by default,
    # which would turn these substitutions into silent no-ops.

    # Replace email addresses with 'almtemail'
    # The ^...$ anchors apply per element, so a message is replaced only
    # when it consists of an address-like string as a whole.
    processed = processed.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ', regex=True)

    # Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn'
    processed = processed.str.replace(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn', regex=True)

    # Replace URLs with 'almtweb' (that step is not part of this excerpt)

## loadingDataset.py
# Load sms_classifier_corpus/data.txt without a header row (header=None).
# NOTE(review): per the pandas docs a separator longer than one character is
# interpreted as a regular expression and needs the Python parsing engine,
# which is presumably why engine='python' accompanies sep="<%>".
df = pd.read_csv('sms_classifier_corpus/data.txt', engine='python', sep="<%>", header=None)
	def news_title_tokenization(message):
		"""Tokenize a news title, drop English stop words and stem the rest.

		Args:
			message: Title text to tokenize. Stop words are filtered with an
				exact string comparison.

		Returns:
			A list with the Porter stem of every token that is not in NLTK's
			English stop-word list, in the order the tokens appear.
		"""
		# A set makes each membership test O(1) instead of a list scan.
		stopwords = set(nltk.corpus.stopwords.words('english'))
		ps = PorterStemmer()
		return [
			ps.stem(word)
			for word in word_tokenize(message)
			if word not in stopwords
		]
	def find_similar_articles(news, similarity):
		"""Derive stemmed title tokens from an article URL or a raw title string.

		NOTE(review): only the opening of this function is shown here. Within
		these lines ``similarity`` is never read and nothing is returned, so
		the remainder presumably lives beyond this excerpt.

		Args:
			news: Either an http(s) URL of an article or the title text itself.
			similarity: Not used in the visible lines.
		"""
		news_title_tokenized = ""

		# The whole string has to look like an http(s) URL to take this branch.
		if(re.match(r'^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)$', news)):
			# Download and parse the page, then tokenize its title.
			news_article = Article(news)
			news_article.download()
			news_article.parse()
			news_title_tokenized = news_title_tokenization(preproccess_text(news_article.title))
		else:
			# Not a URL: treat ``news`` itself as the title text.
			news_title_tokenized = news_title_tokenization(preproccess_text(news))
	def preproccess_text(text_messages):
		"""Lowercase ``text_messages`` and blank out markup noise and punctuation.

		NOTE(review): no ``return`` is visible in these lines; the function
		presumably continues beyond this excerpt.
		"""
		# change words to lower case - Hello, HELLO, hello are all the same word
		processed = text_messages.lower()

		# Remove unnecessary noise: bracketed markers such as [12] or [ab],
		# backslash escapes written out as text and any other single
		# backslash. The alternation uses plain ``|``; an escaped ``\|``
		# would match a literal pipe and fuse the alternatives into one
		# sequence.
		processed = re.sub(r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\', ' ', processed)

		# Remove punctuation (every listed character becomes one space)
		processed = re.sub(r'[.,\/#!%\^&\*;\[\]:|+{}=\-\'"_”“`~(’)?]', ' ', processed)
	def remove_unnecessary_noise(text_messages):
		"""Replace escaped-byte runs, bracketed markers and backslashes by spaces.

		Three substitutions run in order, each replacing a match with one space:
		runs of three, then two, backslash-plus-three-alphanumerics groups;
		then ``[digits]`` / ``[lowercase]`` / ``[UPPERCASE]`` markers, doubled
		backslashes, written-out ``\\r`` / ``\\t`` / ``\\n`` and any single
		backslash that is left.

		Args:
			text_messages: The text to clean.

		Returns:
			The cleaned text.
		"""
		# One backslash followed by exactly three ASCII letters or digits.
		escaped_triplet = r'\\[a-zA-Z0-9]{3}'
		patterns = (
			# Longest run first so a three-group run is not split into 2 + 1.
			escaped_triplet * 3,
			escaped_triplet * 2,
			# Plain ``|`` alternation; an escaped ``\|`` would match a literal pipe.
			r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\',
		)
		for pattern in patterns:
			text_messages = re.sub(pattern, ' ', text_messages)

		return text_messages
	nltk
	beautifulsoup4
	selenium>=2.44.0,<3.0.0
	requests
	unidecode
	vcrpy
	future
	fake-useragent
	newspaper3k
	scikit-learn
	def convertText(text):
		"""Walk the tokens of ``text`` character by character.

		NOTE(review): only the start of this function is visible. The lines
		shown initialise ``new_string``, ``new_word`` and ``digit_flag`` but
		never use them, and nothing is returned, so the remainder presumably
		lies beyond this excerpt.
		"""
		words = word_tokenize(text)
		new_string = ''
		for msg in words:
			# Per-token state, reset for every token.
			new_word = ''
			alpha_flag = False
			digit_flag = False
			for c in msg:
				# Record that the token contains at least one alphabetic character.
				if c.isalpha():
					alpha_flag = True
	# Train the sentence tokenizer
	# Read the promotion-text corpus. The context manager closes the file even
	# if reading or preprocessing raises. No mode check is needed: a file
	# opened with "r" always reports mode 'r'.
	with open("indonesian_sent_tokenizer_corpus/indonesian-promotion-text.txt", "r") as f:
		train_text = preproccess_text(f.read())

	path = 'indonesian_sent_tokenizer_corpus/tempo/txt'
	for foldername in os.listdir(path):
	new_path = path + '/' + foldername
	for filename in os.listdir(new_path):
	def preproccess_df(text_messages):
		"""Lowercase a Series of messages and mask e-mail addresses and phone numbers.

		NOTE(review): no ``return`` is visible in these lines; the function
		presumably continues beyond this excerpt.

		Args:
			text_messages: Object exposing the pandas ``.str`` accessor
				(a Series of strings).
		"""
		# change words to lower case - Hello, HELLO, hello are all the same word
		processed = text_messages.str.lower()

		# regex=True is passed explicitly on both calls: since pandas 2.0
		# Series.str.replace treats the pattern as a literal string by default.

		# Replace email addresses with 'almtemail'
		processed = processed.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ', regex=True)

		# Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn'
		# The prefix alternation uses plain ``|``; an escaped ``\|`` would
		# require the literal text "+62|62|0".
		processed = processed.str.replace(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn', regex=True)

		# Replace URLs with 'almtweb' (that step is not part of this excerpt)