@ferdhika31
Created May 4, 2019 12:27
Preprocessing
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def preprocess(teks):
    """Return a list of preprocessed word tokens."""
    # Lowercase
    teks = teks.lower()
    # Tokenize and strip punctuation:
    # \w matches digits, letters (upper and lower case) and underscore; + means one or more
    tokenizer = RegexpTokenizer(r'\w+')
    teks = tokenizer.tokenize(teks)
    # Remove English stopwords
    # (requires the NLTK 'stopwords' corpus, available via nltk.download('stopwords'))
    stop_words = set(stopwords.words('english'))
    teks = [w for w in teks if w not in stop_words]
    # (equivalent to an explicit for-loop that appends each non-stopword token)
    # Stem with the Porter stemmer
    ps = PorterStemmer()
    teks = [ps.stem(kata) for kata in teks]
    return teks


# Load the dataset from CSV with pandas
df = pd.read_csv("data/dataset.csv")
# Iterate over the rows and preprocess the "news" column
for index, row in df.iterrows():
    print(preprocess(row["news"]))
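A quick sanity check of preprocess on a short sentence (the sentence below is illustrative, not taken from data/dataset.csv): lowercasing, tokenization, stopword removal and Porter stemming should reduce it to a short list of stems.

# Illustrative input, not part of the gist's dataset
print(preprocess("The markets are rallying, and investors are celebrating!"))
# Expected output (roughly): ['market', 'ralli', 'investor', 'celebr']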