Skip to content

Instantly share code, notes, and snippets.

@ferdhika31
Created July 19, 2019 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ferdhika31/e2cfd5e38ec5b2e89405f7bfdc93774c to your computer and use it in GitHub Desktop.
Save ferdhika31/e2cfd5e38ec5b2e89405f7bfdc93774c to your computer and use it in GitHub Desktop.
Simple NER (named-entity recognition) using the Naive Bayes machine learning method
from mnb import MNB
# Demo: train the classifier on a few labelled samples, then classify one word.
nb = MNB()

# (text, entity label) pairs fed to the model, in training order.
contoh_latih = (
    ('Santika Supriadi Supriadi', 'person'),
    # Extra 'person' samples that are intentionally left disabled:
    # ('Supriadi', 'person'),
    # ('Supriadi Cahyadi', 'person'),
    ('Santika Santika Merapi', 'organisasi'),
    ('Merapi Merapi Bali Supriadi', 'lokasi'),
)
for kalimat, label in contoh_latih:
    nb.learn(kalimat, label)

# Print the winning entity together with the score of every entity.
print(nb.categorize("Santika"))
import math
class MNB:
    """Multinomial naive Bayes classifier with add-one (Laplace) smoothing.

    Texts are tokenized on whitespace. Every training text counts as one
    document of its entity (class); its tokens are appended to that entity's
    token list and added to the shared vocabulary.
    """

    def __init__(self, debug=False):
        """Create an empty, untrained model.

        Args:
            debug: when true, ``categorize`` prints every intermediate
                probability it computes.
        """
        # Debug logging switch.
        self.debug = debug
        # Vocabulary |V|: distinct tokens seen during training, in
        # first-seen order.
        self.vocabulary = []
        # Prior probability: P(c) = docCount(c) / Ndoc.
        self.docFrequencyCount = {}  # docCount(c), keyed by entity name
        self.totalNumberOfDocuments = 0  # Ndoc, over all entities
        # Known entities, each {"entitas": <name>, "tokens": [<token>, ...]}.
        self.daftarEntitas = []
        # Not used by any method of this class; kept so the attribute
        # remains available to existing callers.
        self.tokenFrequencyTable = {}

    def getOrCreateEntitasToken(self, teks, namaEntitas):
        """Register ``namaEntitas`` if needed and add the tokens of ``teks`` to it.

        Args:
            teks: training text; split on whitespace into tokens.
            namaEntitas: non-empty entity (class) name.

        Returns:
            ``namaEntitas``.

        Raises:
            TypeError: if ``namaEntitas`` is not a string.
            ValueError: if ``namaEntitas`` is the empty string.
        """
        # Validate before touching any state, so a rejected name can never
        # end up registered as an entity.
        if not isinstance(namaEntitas, str):
            raise TypeError(
                'Nama entitas tidak sesuai: %r. Harus String bro.' % (namaEntitas,)
            )
        if namaEntitas == "":
            raise ValueError(
                'Nama entitas tidak sesuai: %r. Harus String bro.' % (namaEntitas,)
            )

        tokens = teks.split()
        for entitas in self.daftarEntitas:
            if entitas["entitas"] == namaEntitas:
                # Known entity: count(c) grows by the new tokens.
                entitas["tokens"].extend(tokens)
                break
        else:
            # First time this entity is seen: start its document counter
            # and its token list.
            self.docFrequencyCount[namaEntitas] = 0
            self.daftarEntitas.append({
                "entitas": namaEntitas,
                "tokens": tokens,
            })
        return namaEntitas

    def tambahKeVocab(self, kata):
        """Add ``kata`` to the vocabulary unless it is already there.

        Returns:
            Always the empty string (kept for backward compatibility).
        """
        if kata not in self.vocabulary:
            self.vocabulary.append(kata)
        return ""

    def learn(self, teks, entitas):
        """Train the model with one document ``teks`` labelled ``entitas``.

        Raises:
            TypeError: if ``entitas`` is not a string.
            ValueError: if ``entitas`` is the empty string.
        """
        entitas = self.getOrCreateEntitasToken(teks, entitas)
        # Update the counts behind the prior P(c).
        self.docFrequencyCount[entitas] += 1  # documents of this entity
        self.totalNumberOfDocuments += 1  # documents overall
        # Same whitespace tokenization as the entity token lists, so the
        # vocabulary never receives empty-string tokens.
        for kata in teks.split():
            self.tambahKeVocab(kata)

    def categorize(self, teks):
        """Score ``teks`` against every known entity and pick the best one.

        The score of an entity is P(c) multiplied by P(w|c) for each token
        w of ``teks``, with P(w|c) = (count(w,c) + 1) / (count(c) + |V|).
        Scores are plain products, so they can underflow to 0.0 for very
        long inputs.

        Returns:
            A dict with keys ``"entitas"`` (name of the best entity, the
            first one on ties), ``"probabilitas"`` (its score) and
            ``"probabilitasEntitas"`` (a list with one
            ``{"entitas", "probabilitas"}`` dict per known entity). For a
            model without any entity, ``"entitas"`` is ``None``,
            ``"probabilitas"`` is 0 and the list is empty.
        """
        tokens = teks.split()
        ukuranVocab = len(self.vocabulary)  # |V|
        peluangEntitas = []

        for e in self.daftarEntitas:
            nama = e["entitas"]
            # P(c) = docCount(c) / Ndoc
            if self.debug:
                print("P(" + nama + ") = ", str(self.docFrequencyCount[nama]) + "/" + str(self.totalNumberOfDocuments))
            hitungPeluangEntitas = self.docFrequencyCount[nama] / self.totalNumberOfDocuments

            nTokAll = len(e["tokens"])  # count(c)
            for token in tokens:
                nTok = e["tokens"].count(token)  # count(w,c)
                if self.debug:
                    print("P(%s|%s) = (%d)+1/(%d+%d) = %d/%d" % (
                        token, nama, nTok, nTokAll, ukuranVocab,
                        nTok + 1, nTokAll + ukuranVocab,
                    ))
                # Laplace add-1 smoothing:
                # P(w|c) = (count(w,c) + 1) / (count(c) + |V|)
                hitungPeluangEntitas *= (nTok + 1) / (nTokAll + ukuranVocab)

            peluangEntitas.append({
                "entitas": nama,
                "probabilitas": hitungPeluangEntitas,
            })

        if not peluangEntitas:
            # Nothing has been learned yet, so there is nothing to rank.
            return {
                "entitas": None,
                "probabilitas": 0,
                "probabilitasEntitas": peluangEntitas,
            }

        # max() keeps the first of several equal scores.
        terbaik = max(peluangEntitas, key=lambda pe: pe["probabilitas"])
        return {
            "entitas": terbaik["entitas"],
            "probabilitas": terbaik["probabilitas"],
            "probabilitasEntitas": peluangEntitas,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment