This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def news_title_tokenization(message):
    """Tokenize a headline, drop English stop words and stem the remaining tokens.

    Args:
        message: Headline text to tokenize. The stop-word comparison is
            case-sensitive, so the text is presumably lower-cased by the
            caller beforehand (verify against call sites).

    Returns:
        A list of Porter-stemmed tokens, in their original order.
    """
    # A set gives O(1) membership tests; the corpus reader returns a list,
    # which would be scanned linearly for every token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    ps = PorterStemmer()
    return [
        ps.stem(word)
        for word in word_tokenize(message)
        if word not in stopwords
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the tokenized headline for a news item supplied either as an article URL or as raw title text.
# NOTE(review): only the head of this function is visible here - `similarity` is not used in these lines and no return statement is shown; confirm against the full source.
def find_similar_articles(news, similarity): | |
news_title_tokenized = "" | |
# If `news` looks like a complete http(s) URL, download and parse the article and use its title.
if(re.match(r'^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)$', news)): | |
news_article = Article(news) | |
news_article.download() | |
news_article.parse() | |
news_title_tokenized = news_title_tokenization(preproccess_text(news_article.title)) | |
# Otherwise `news` itself is treated as the title text.
else: | |
news_title_tokenized = news_title_tokenization(preproccess_text(news)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Normalize raw text: lower-case it, then replace noise tokens and punctuation with spaces.
# NOTE(review): fragment - no return statement is visible, so the function presumably continues beyond these lines.
def preproccess_text(text_messages): | |
# change words to lower case - Hello, HELLO, hello are all the same word | |
processed = text_messages.lower() | |
# Remove unnecessary noise: bracketed digit/letter runs such as [12] or [abc], and literal backslash sequences | |
# NOTE(review): the text is already lower-cased at this point, so the \[[A-Z]+\] alternative can never match.
processed = re.sub(r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\', ' ', processed) | |
# Remove punctuation (each listed character becomes a single space) | |
processed = re.sub(r'[.,\/#!%\^&\*;\[\]:|+{}=\-\'"_”“`~(’)?]', ' ', processed) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_unnecessary_noise(text_messages):
    """Replace escape-sequence residue and bracketed markers with spaces.

    Three passes run in order, each substituting a single space per match:

    1. three consecutive groups of a literal backslash followed by three
       alphanumerics (e.g. the text ``\\xe2\\x80\\x99``);
    2. two such consecutive groups (e.g. ``\\xc3\\xa9``);
    3. bracketed runs of digits or same-case letters (``[12]``, ``[abc]``,
       ``[ABC]``), a doubled backslash, a literal backslash followed by
       ``r``/``t``/``n``, or any remaining lone backslash.

    Args:
        text_messages: The text to clean.

    Returns:
        The cleaned text.
    """
    # Longer escape runs must be removed first, otherwise the two-group
    # pattern would split a three-group run and leave one group behind.
    text_messages = re.sub(r'(?:\\[a-zA-Z0-9]{3}){3}', ' ', text_messages)
    text_messages = re.sub(r'(?:\\[a-zA-Z0-9]{3}){2}', ' ', text_messages)
    text_messages = re.sub(
        r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\',
        ' ',
        text_messages,
    )
    return text_messages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nltk | |
beautifulsoup4 | |
selenium>=2.44.0,<3.0.0 | |
requests | |
unidecode | |
vcrpy | |
future | |
fake-useragent | |
newspaper3k | |
scikit-learn |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Walk the tokens of `text` and inspect each token character by character.
# NOTE(review): fragment - only the start of the per-character scan is visible; how new_string, new_word and digit_flag are used is not shown here.
def convertText(text): | |
words = word_tokenize(text) | |
new_string = '' | |
for msg in words: | |
# Per-token state, reset for every token.
new_word = '' | |
alpha_flag = False | |
digit_flag = False | |
for c in msg: | |
# Record that the token contains at least one alphabetic character.
if c.isalpha(): | |
alpha_flag = True |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Normalize raw text: lower-case it and replace e-mail addresses and phone numbers with placeholder tokens.
# NOTE(review): fragment - no return statement is visible, so the function presumably continues beyond these lines.
def preproccess_text(text_messages): | |
# change words to lower case - Hello, HELLO, hello are all the same word | |
processed = text_messages.lower() | |
# Replace email addresses with 'almtemail' | |
# NOTE(review): the pattern is anchored with ^...$ and no re.MULTILINE flag is passed, so it only matches when the whole string is a single address-like line - confirm this is intended for multi-line input.
processed = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ', processed) | |
# Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn' | |
# The pattern requires a +62, 62 or 0 prefix followed by three digit groups optionally separated by space, dot or dash.
processed = re.sub(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn ', processed) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train the sentence tokenizer
# Seed the training text from the promotion corpus. The context manager closes
# the file even if preprocessing raises; the former `f.mode == 'r'` check was
# always true for a file opened with mode "r", so it is dropped.
with open("indonesian_sent_tokenizer_corpus/indonesian-promotion-text.txt", "r") as f:
    train_text = preproccess_text(f.read())
# Walk the corpus directory two levels deep: every entry under `path`, then every entry inside it.
# NOTE(review): fragment - the body of the inner loop is not visible here.
path = 'indonesian_sent_tokenizer_corpus/tempo/txt' | |
for foldername in os.listdir(path): | |
# os.listdir returns bare names, so the sub-path is rebuilt by hand.
new_path = path + '/' + foldername | |
for filename in os.listdir(new_path): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Column-wise counterpart of the text normalization: lower-case every entry and replace e-mail addresses and phone numbers with placeholder tokens.
# Assumes text_messages is a pandas Series of strings (it is accessed through `.str`) - TODO confirm.
# NOTE(review): fragment - the URL replacement announced by the last comment and any return statement are not visible here.
def preproccess_df(text_messages):# change words to lower case - Hello, HELLO, hello are all the same word | |
processed = text_messages.str.lower() | |
# Replace email addresses with 'almtemail' | |
# NOTE(review): Series.str.replace treats the pattern as a literal string by default on pandas >= 2.0; these calls need regex=True there to behave as regular expressions.
processed = processed.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ') | |
# Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn' | |
# NOTE(review): the replacement here has a leading space only (' nmrtlpn'), unlike ' almtemail ' above - confirm whether the trailing space was dropped on purpose.
processed = processed.str.replace(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn' ) | |
# Replace URLs with 'almtweb' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the SMS corpus. "<%>" is a multi-character field separator, which pandas
# only supports through the Python parsing engine; the file has no header row.
df = pd.read_csv('sms_classifier_corpus/data.txt', engine='python', sep="<%>", header=None)
OlderNewer