Skip to content

Instantly share code, notes, and snippets.

@irfanandratama
Created April 12, 2018 03:29
Show Gist options
  • Save irfanandratama/d8ffb396ac752f1dae0b46f5397bd59c to your computer and use it in GitHub Desktop.
Save irfanandratama/d8ffb396ac752f1dae0b46f5397bd59c to your computer and use it in GitHub Desktop.
Representasi TF-IDF dengan Python
# The text must already have been word-tokenized before it is passed to these functions.
def tf(sudahDiTokenize, *, normalized=False):  # Term Frequency
    """Count how often each term occurs across all tokenized documents.

    Args:
        sudahDiTokenize: Iterable of documents, where each document is an
            iterable of hashable tokens (the text must already be
            word-tokenized).
        normalized: When true, divide every raw count by the total number
            of tokens in the whole corpus, giving relative frequencies.
            Defaults to False, which returns raw counts.

    Returns:
        A dict mapping each term to its raw count, or to its relative
        frequency when ``normalized`` is true. Empty input yields ``{}``.

    Side effects:
        Prints the resulting dict to stdout.
    """
    from collections import Counter
    from itertools import chain

    # Flatten the documents and count every token in a single pass; convert
    # back to a plain dict so the printed/returned value is an ordinary dict.
    wordfreq = dict(Counter(chain.from_iterable(sudahDiTokenize)))

    if normalized:
        # Total token count of the corpus. An empty corpus has no terms, so
        # the comprehension below never divides by zero.
        jumkata = sum(wordfreq.values())
        wordfreq = {term: count / jumkata for term, count in wordfreq.items()}

    print(wordfreq)
    return wordfreq
def idf(sudahDiTokenize):  # Inverse Document Frequency
    """Compute the inverse document frequency of every term in the corpus.

    For each distinct term the value is ``log10(N / df)``, where ``N`` is
    the number of documents and ``df`` is the number of documents that
    contain the term at least once.

    Args:
        sudahDiTokenize: Sized collection (e.g. a list) of documents, where
            each document is an iterable of hashable tokens.

    Returns:
        A dict mapping each term to its IDF value. Empty input yields ``{}``.

    Side effects:
        Prints the set of distinct terms, then the resulting dict, to stdout.
    """
    # Imported locally so the function works even though the file has no
    # top-level ``import math``.
    import math
    from collections import Counter

    jumdok = len(sudahDiTokenize)

    # Document frequency in one pass: each document contributes at most one
    # count per term, hence the set() around the document's tokens.
    doc_freq = Counter()
    for doc in sudahDiTokenize:
        doc_freq.update(set(doc))

    all_tokens_set = set(doc_freq)
    print(all_tokens_set)

    # Every term in the vocabulary occurs in at least one document, so the
    # denominator is never zero.
    idf_values = {
        tkn: math.log10(jumdok / doc_freq[tkn]) for tkn in all_tokens_set
    }
    print(idf_values)
    return idf_values
def tfxidf(tf, idf):
    """Multiply term frequencies by inverse document frequencies.

    Args:
        tf: Mapping of term -> term frequency.
        idf: Mapping of term -> inverse document frequency. It must contain
            every term that appears in ``tf``.

    Returns:
        A dict mapping each term in ``tf`` to ``tf[term] * idf[term]``.

    Raises:
        KeyError: If a term in ``tf`` is missing from ``idf``.

    Side effects:
        Prints the resulting dict to stdout.
    """
    hasil = {k: tf[k] * idf[k] for k in tf}
    print(hasil)
    # Return the weights so callers can actually use them; previously the
    # result was only printed and the function returned None.
    return hasil
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment