Skip to content

Instantly share code, notes, and snippets.

@RiansyahTohamba
Last active January 15, 2021 01:44
Show Gist options
  • Save RiansyahTohamba/06444ed18eb23ac8e8be0fcc238614b4 to your computer and use it in GitHub Desktop.
Save RiansyahTohamba/06444ed18eb23ac8e8be0fcc238614b4 to your computer and use it in GitHub Desktop.
import nltk
# Convert to an NLTK type
def convertToText(filename):
    """Read a text file and wrap its word tokens in an ``nltk.Text``.

    Pipeline: raw string -> list of tokens -> ``nltk.Text``.

    Args:
        filename: Path of the text file to read.

    Returns:
        The ``nltk.Text`` object built from the file's tokens.
    """
    # A context manager closes the file handle as soon as the contents are
    # read, instead of leaving it open until garbage collection.
    with open(filename) as source:
        raw = source.read()  # str
    # Tokens may be punctuation marks (?, ., ',' ...) as well as words.
    tokens = nltk.word_tokenize(raw)  # list
    return nltk.Text(tokens)
def get_context(keyword, filename):
    """Run an NLTK concordance lookup for *keyword* over the text in *filename*.

    Args:
        keyword: Word to look up.
        filename: Path of the text file, loaded through ``convertToText``.

    Returns:
        Whatever ``concordance`` returns.
        NOTE(review): NLTK's ``Text.concordance`` appears to print its
        matches and return None -- confirm before using the return value.
    """
    return convertToText(filename).concordance(keyword)
def convert_to_wordpos(rawstr):
    """Split raw text into sentences and POS-tag each sentence's tokens.

    Args:
        rawstr: The raw text as a single string.

    Returns:
        A list with one entry per sentence, each entry being the result of
        ``nltk.pos_tag`` applied to that sentence's word tokens.
    """
    tagged = []
    for sentence in nltk.sent_tokenize(rawstr):
        words = nltk.word_tokenize(sentence)
        tagged.append(nltk.pos_tag(words))
    return tagged
# Convert to words and their POS tags
def text_preprocess(filename):
    """Read a text file and return its sentences as POS-tagged tokens.

    Args:
        filename: Path of the text file to read.

    Returns:
        The result of ``convert_to_wordpos`` on the file's contents.
    """
    # A context manager closes the file handle as soon as the contents are
    # read, instead of leaving it open until garbage collection.
    with open(filename) as source:
        rawstr = source.read()
    return convert_to_wordpos(rawstr)
# chunk by np-chunk
def get_chunk_np(sentence):
    """Parse one sentence with a single-rule NP chunk grammar.

    The rule is an optional ``DT``, any number of ``JJ``, then one ``NN``.

    Args:
        sentence: The sentence to parse, passed straight to
            ``RegexpParser.parse`` (presumably a POS-tagged token list such
            as ``convert_to_wordpos`` produces -- confirm against callers).

    Returns:
        The parse result; spans matching the grammar are tagged ``NP``.
    """
    np_parser = nltk.RegexpParser("NP: {<DT>?<JJ>*<NN>}")
    return np_parser.parse(sentence)
# chunk by tag-patterns
def get_chunk_tagpattern(sentence):
    """Placeholder for chunking by tag patterns; not implemented yet.

    Args:
        sentence: Currently unused.

    Returns:
        None.
    """
    return None
# chunk by regex
def get_chunk_regex(sentence):
    """Placeholder for chunking by regular expression; not implemented yet.

    Args:
        sentence: Currently unused.

    Returns:
        None.
    """
    return None
def printAllNPChunk(sentences):
    """Print the NP-chunk parse of every sentence, in order.

    Args:
        sentences: Iterable of sentences; each is passed to ``get_chunk_np``.

    Returns:
        None; each parse result is written to stdout.
    """
    # Open question from the author: once the chunks are printed, the next
    # step is relation extraction -- how should that be done?
    for tagged_sentence in sentences:
        chunked = get_chunk_np(tagged_sentence)
        print(chunked)
def find_by_postag(sentences, tagkeyword):
    """Print every word whose POS tag equals *tagkeyword*.

    Args:
        sentences: Iterable of sentences, each an iterable of items whose
            index 0 is the word and index 1 is its tag.
        tagkeyword: The tag to match exactly (for example ``'CD'``).

    Returns:
        None; matching words are written to stdout, one per line.
    """
    # Lazy generator: words are printed in document order as they are found.
    matching_words = (
        pair[0]
        for sentence in sentences
        for pair in sentence
        if pair[1] == tagkeyword
    )
    for word in matching_words:
        print(word)
# find NP in sentences
def findNP(senteChunk):
    """Print every direct child of a chunked sentence that is labelled ``NP``.

    Args:
        senteChunk: Iterable of children. Tuple children (presumably plain
            ``(word, tag)`` tokens -- confirm against callers) are skipped;
            every other child must provide a ``label()`` method.

    Returns:
        None; matching children are written to stdout.
    """
    for child in senteChunk:
        # isinstance, unlike an exact type comparison, also skips tuple
        # subclasses such as namedtuples, which have no label() method.
        if not isinstance(child, tuple) and child.label() == 'NP':
            print(child)
@RiansyahTohamba
Copy link
Author

Example use of the function find_by_postag(sentences, tagkeyword):

sentences = text_preprocess('kim-method-4-5.txt')
tagkeyword = 'CD'
find_by_postag(sentences, tagkeyword)

The result is the list of cardinal-number (CD) words in the text.
Since these words contain numbers, they probably carry the information we are interested in.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment