Skip to content

Instantly share code, notes, and snippets.

@irfanandratama
Last active April 12, 2018 03:45
Show Gist options
  • Save irfanandratama/72c4ae0963e2dafacba7c9d33691830e to your computer and use it in GitHub Desktop.
Save irfanandratama/72c4ae0963e2dafacba7c9d33691830e to your computer and use it in GitHub Desktop.
Tokenizer dengan Python
# Split the text into sentences
def senttoken(text=None):
    """Split a text into a list of sentences.

    Args:
        text: The text to split. When omitted (or ``None``), one line is
            read from standard input with ``input()``, which is what the
            original zero-argument call did.

    Returns:
        A list of sentence strings. The whitespace character that separated
        two sentences is consumed by the split; the terminating punctuation
        stays attached to its sentence.

    Note:
        Lower-casing (case folding) the text *before* calling this function
        disables the two upper-case based abbreviation guards below, so do
        the case folding on the returned sentences instead.
    """
    # Local import so the function works even if the module never imports re.
    import re

    if text is None:
        text = input()

    # Split on a whitespace character that directly follows '.', '?' or '!',
    # unless the text just before it looks like an abbreviation:
    #   (?<!\w\.\w.)       e.g. "i.e." / "e.g."
    #   (?<![A-Z][a-z]\.)  e.g. "Dr." / "Mr."
    #   (?<![A-Z]\.)       e.g. a single capital initial such as "J."
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'
    kalimat = re.split(boundary, text)
    print(kalimat)
    return kalimat
def wordtoken(kaltoken):
    """Tokenize each sentence into its alphabetic words.

    Args:
        kaltoken: An iterable of sentence strings, e.g. the list returned
            by ``senttoken``.

    Returns:
        A list with one entry per sentence; each entry is the list of words
        found in that sentence, in order. Only runs of letters delimited by
        word boundaries are kept, so punctuation and numbers are dropped.
    """
    # Local import so the function works even if the module never imports re.
    import re

    # Alternative patterns, depending on what the caller needs:
    #   r'\w+|\S\w*'                          keeps punctuation as tokens
    #   r'(?i)\b[a-z]+\b|\d+\.\d+|\d+'        also keeps numbers (intended
    #                                         for normalized term frequency)
    listkata = [re.findall(r'(?i)\b[a-z]+\b', kalimat) for kalimat in kaltoken]
    print(listkata)
    return listkata
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment