Created
May 4, 2019 12:27
-
-
Save ferdhika31/159df6751eadc3940b5f68c88ed04390 to your computer and use it in GitHub Desktop.
Preprocessing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from nltk.tokenize import RegexpTokenizer | |
from nltk.corpus import stopwords | |
from nltk.stem import PorterStemmer | |
_STOP_WORDS = None  # lazily-built cache of the NLTK English stop-word set


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Reading the corpus hits the disk; do it on first use only, not per call.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess(teks):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    Steps, in order: lowercase the text, split it into ``\\w+`` tokens
    (which discards punctuation), drop English stop words, and stem each
    remaining token with the Porter stemmer.

    Args:
        teks: The text to process. Anything that is not a ``str`` (for
            example ``None`` or a NaN float standing in for a missing
            value) is treated as having no text.

    Returns:
        A list of stemmed tokens in their original order; an empty list
        when ``teks`` is not a string or contains only whitespace.
    """
    # Guard first: calling .lower() on a non-string (None, NaN, ...) would
    # raise AttributeError, and blank text can never produce a token.
    if not isinstance(teks, str) or not teks.strip():
        return []

    # \w+ matches runs of letters, digits and underscores, so tokenizing
    # with it also strips punctuation.
    tokens = RegexpTokenizer(r'\w+').tokenize(teks.lower())

    stop_words = _english_stop_words()
    stemmer = PorterStemmer()
    # Filter stop words before stemming, so the comparison is made on the
    # unstemmed (lowercased) token.
    return [stemmer.stem(kata) for kata in tokens if kata not in stop_words]
# Load the dataset from CSV with pandas.
df = pd.read_csv("data/dataset.csv")

# Print the preprocessed tokens for the "news" text of every row.
# Iterating the column directly avoids the per-row Series that iterrows()
# builds and the row index that was never used.
for news_text in df["news"]:
    print(preprocess(news_text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment