Skip to content

Instantly share code, notes, and snippets.

@agustinustheo
agustinustheo / news_title_tokenization
Created February 22, 2019 12:43
News Title Tokenization function for Filtering Fake News Blog
def news_title_tokenization(message):
    """Tokenize a news title, drop English stopwords, and stem what remains.

    Args:
        message: The title text to tokenize.

    Returns:
        A list of Porter-stemmed tokens in their original order, excluding
        every token found in NLTK's English stopword list.

    Note:
        The stopword check is an exact, case-sensitive match against each
        token as produced by ``word_tokenize``; this function does not
        lowercase its input.
    """
    # Build the stopword set once: membership tests are O(1) instead of a
    # linear scan of the list for every token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in word_tokenize(message)
        if token not in stopwords
    ]
@agustinustheo
agustinustheo / find_similar_articles
Last active February 22, 2019 12:45
Find Similar Articles function for Filtering Fake News Blog
def find_similar_articles(news, similarity):
news_title_tokenized = ""
if(re.match(r'^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)$', news)):
news_article = Article(news)
news_article.download()
news_article.parse()
news_title_tokenized = news_title_tokenization(preproccess_text(news_article.title))
else:
news_title_tokenized = news_title_tokenization(preproccess_text(news))
@agustinustheo
agustinustheo / preproccess_text
Created February 22, 2019 12:45
Preprocess Text function for Filtering Fake News Blog
def preproccess_text(text_messages):
# change words to lower case - Hello, HELLO, hello are all the same word
processed = text_messages.lower()
# Remove unnecessary noise
processed = re.sub(r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\', ' ', processed)
# Remove punctuation
processed = re.sub(r'[.,\/#!%\^&\*;\[\]:|+{}=\-\'"_”“`~(’)?]', ' ', processed)
@agustinustheo
agustinustheo / remove_unnecessary_noise
Created February 22, 2019 12:46
Remove Unnecessary Noise function for Filtering Fake News Blog
def remove_unnecessary_noise(text_messages):
    """Replace escape-sequence residue and bracketed markers with spaces.

    Two passes are applied in order:

    1. Any run of two or more groups of the form "literal backslash followed
       by three alphanumeric characters" (for example the text
       ``\\xe2\\x80\\x99``) is replaced by a single space.
    2. Bracketed all-digit, all-lowercase or all-uppercase markers such as
       ``[12]``, ``[abc]`` or ``[ABC]``, the literal two-character sequences
       backslash-r / backslash-t / backslash-n, a doubled backslash, and any
       remaining single backslash are each replaced by a space.

    Args:
        text_messages: The text to clean.

    Returns:
        The cleaned string. Replacements insert spaces, so the result may
        contain consecutive or trailing whitespace.
    """
    # Match the whole run at once. Handling only runs of exactly three or two
    # groups leaves the tail of a longer run behind (a four-group run would
    # end up as a stray "x80"-style token after the backslash pass below).
    text_messages = re.sub(r'(?:\\[A-Za-z0-9]{3}){2,}', ' ', text_messages)
    # A lone backslash group is deliberately not matched above; only its
    # backslash is blanked here, so the characters after it are kept.
    text_messages = re.sub(r'\[[0-9]+\]|\[[a-z]+\]|\[[A-Z]+\]|\\\\|\\r|\\t|\\n|\\', ' ', text_messages)
    return text_messages
@agustinustheo
agustinustheo / requirements.txt
Created February 22, 2019 12:47
Required libraries for Filtering Fake News Blog
nltk
beautifulsoup4
selenium>=2.44.0,<3.0.0
requests
unidecode
vcrpy
future
fake-useragent
newspaper3k
sklearn
def convertText(text):
words = word_tokenize(text)
new_string = ''
for msg in words:
new_word = ''
alpha_flag = False
digit_flag = False
for c in msg:
if c.isalpha():
alpha_flag = True
@agustinustheo
agustinustheo / preprocessText.py
Created July 30, 2019 05:27
Preprocess Text function for SMS Classifier Blog
def preproccess_text(text_messages):
# change words to lower case - Hello, HELLO, hello are all the same word
processed = text_messages.lower()
# Replace email addresses with 'almtemail'
processed = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ', processed)
# Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn'
processed = re.sub(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn ', processed)
@agustinustheo
agustinustheo / trainTokenizer.py
Created July 30, 2019 05:42
Train Tokenizer code for SMS Classifier Blog
#Train the sentence tokenizer
f=open("indonesian_sent_tokenizer_corpus/indonesian-promotion-text.txt", "r")
if f.mode == 'r':
train_text = preproccess_text(f.read())
f.close()
path = 'indonesian_sent_tokenizer_corpus/tempo/txt'
for foldername in os.listdir(path):
new_path = path + '/' + foldername
for filename in os.listdir(new_path):
@agustinustheo
agustinustheo / preprocessDataframe.py
Created July 30, 2019 06:09
Preprocess Dataframe function for SMS Classifier Blog
def preproccess_df(text_messages):# change words to lower case - Hello, HELLO, hello are all the same word
processed = text_messages.str.lower()
# Replace email addresses with 'almtemail'
processed = processed.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', ' almtemail ')
# Replace phone numbers (formats include parentheses, spaces, no spaces, dashes) with 'nmrtlpn'
processed = processed.str.replace(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' nmrtlpn' )
# Replace URLs with 'almtweb'
@agustinustheo
agustinustheo / loadingDataset.py
Last active July 30, 2019 06:12
Loading dataset for SMS Classifier Blog
# Load the SMS corpus. The "<%>" separator is longer than one character, so
# pandas interprets it as a regular expression, which needs the Python
# parsing engine requested here. header=None: the first line of the file is
# read as data rather than as column names.
df = pd.read_csv(
    'sms_classifier_corpus/data.txt',
    sep="<%>",
    header=None,
    engine='python',
)