RiansyahTohamba/simple-processing-text

## simple-processing-text
import nltk
# Convert a file's contents into an NLTK type (nltk.Text).
def convertToText(filename):
    """Read a text file and wrap its tokens in an ``nltk.Text``.

    Pipeline: raw string -> list of tokens -> ``nltk.Text``.

    Args:
        filename: Path of the text file to read.

    Returns:
        The ``nltk.Text`` built from the tokens of the file's contents.
    """
    # A context manager closes the file handle deterministically instead of
    # leaving it open until the garbage collector gets to it.
    with open(filename) as source:
        raw = source.read()
    # word_tokenize turns the raw string into a list of tokens; tokens may
    # also be punctuation marks (?, ., etc.).
    tokens = nltk.word_tokenize(raw)
    return nltk.Text(tokens)

def get_context(keyword, filename):
    """Run a concordance lookup for ``keyword`` over the text in ``filename``.

    Args:
        keyword: Word to look up.
        filename: Path of the text file, handed to ``convertToText``.

    Returns:
        Whatever ``nltk.Text.concordance`` returns.
        NOTE(review): NLTK documents ``concordance`` as printing its matches
        and returning None -- confirm whether ``concordance_list`` was meant.
    """
    text = convertToText(filename)
    context = text.concordance(keyword)
    return context

def convert_to_wordpos(rawstr):
    """Split a raw string into sentences of POS-tagged tokens.

    Args:
        rawstr: The raw text to process.

    Returns:
        A list with one entry per sentence; each entry is the result of
        ``nltk.pos_tag`` applied to that sentence's word tokens.
    """
    # sentence split -> word tokenize -> POS tag, one sentence at a time.
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(rawstr)
    ]

# Convert the text into words paired with their POS tags.
def text_preprocess(filename):
    """Read a text file and return its sentences as POS-tagged tokens.

    Args:
        filename: Path of the text file to read.

    Returns:
        The result of ``convert_to_wordpos`` on the file's contents.
    """
    # A context manager closes the file handle deterministically instead of
    # leaving it open until the garbage collector gets to it.
    with open(filename) as source:
        rawstr = source.read()
    return convert_to_wordpos(rawstr)

# chunk by np-chunk
def get_chunk_np(sentence):
    """Chunk one sentence with a noun-phrase (NP) regular-expression grammar.

    The grammar labels as NP an optional ``DT`` tag, followed by any number
    of ``JJ`` tags, followed by one ``NN`` tag.

    Args:
        sentence: The sentence to parse; passed straight to
            ``nltk.RegexpParser.parse``.

    Returns:
        The parse result produced by the chunk parser.
    """
    np_parser = nltk.RegexpParser("NP: {<DT>?<JJ>*<NN>}")
    # In the parse result, spans matching the grammar are tagged (NP ...).
    return np_parser.parse(sentence)

# chunk by tag-patterns
def get_chunk_tagpattern(sentence):
    """Placeholder for chunking by tag patterns; not implemented yet.

    Args:
        sentence: Currently ignored.

    Returns:
        Always None.
    """
    return None

# chunk by regex
def get_chunk_regex(sentence):
    """Placeholder for chunking by regular expression; not implemented yet.

    Args:
        sentence: Currently ignored.

    Returns:
        Always None.
    """
    return None

def printAllNPChunk(sentences):
    """Print the NP-chunk parse of every sentence in ``sentences``.

    Args:
        sentences: Iterable of sentences, each passed to ``get_chunk_np``.
    """
    # Open question from the original author: once the chunks are printed,
    # the next step is relation extraction -- how should that be done?
    for sentence in sentences:
        chunked = get_chunk_np(sentence)
        print(chunked)

def find_by_postag(sentences, tagkeyword):
    """Print every word whose POS tag equals ``tagkeyword``.

    Args:
        sentences: Iterable of sentences; each sentence is an iterable of
            items indexable as ``item[0]`` (word) and ``item[1]`` (tag).
        tagkeyword: The tag to match exactly.
    """
    for tagged_sentence in sentences:
        matching_words = (
            pair[0] for pair in tagged_sentence if pair[1] == tagkeyword
        )
        for matched in matching_words:
            print(matched)

# find NP in sentences
def findNP(senteChunk):
    """Print every child of ``senteChunk`` that is labelled ``'NP'``.

    Args:
        senteChunk: Iterable whose items are either tuples (skipped) or
            objects exposing a ``label()`` method.
    """
    for node in senteChunk:
        # isinstance (rather than an exact type comparison) also skips tuple
        # subclasses such as namedtuples, which have no label() method.
        if isinstance(node, tuple):
            continue
        if node.label() == 'NP':
            print(node)
	import nltk
	# Convert a file's contents into an NLTK type (nltk.Text).
	def convertToText(filename):
	    """Read a text file and wrap its tokens in an ``nltk.Text``.

	    Pipeline: raw string -> list of tokens -> ``nltk.Text``.

	    Args:
	        filename: Path of the text file to read.

	    Returns:
	        The ``nltk.Text`` built from the tokens of the file's contents.
	    """
	    # A context manager closes the file handle deterministically.
	    with open(filename) as source:
	        raw = source.read()
	    # Tokens may also be punctuation marks (?, ., etc.).
	    tokens = nltk.word_tokenize(raw)
	    return nltk.Text(tokens)

	def get_context(keyword, filename):
	    """Run a concordance lookup for ``keyword`` over the text in ``filename``.

	    Args:
	        keyword: Word to look up.
	        filename: Path of the text file, handed to ``convertToText``.

	    Returns:
	        Whatever ``nltk.Text.concordance`` returns.
	        NOTE(review): NLTK documents ``concordance`` as printing its
	        matches and returning None -- confirm the intended return value.
	    """
	    nltktxt = convertToText(filename)
	    return nltktxt.concordance(keyword)

	def convert_to_wordpos(rawstr):
	    """Split a raw string into sentences of POS-tagged tokens.

	    Args:
	        rawstr: The raw text to process.

	    Returns:
	        A list with one entry per sentence; each entry is the result of
	        ``nltk.pos_tag`` applied to that sentence's word tokens.
	    """
	    sentences = nltk.sent_tokenize(rawstr)
	    sentences = [nltk.word_tokenize(sent) for sent in sentences]
	    sentences = [nltk.pos_tag(sent) for sent in sentences]
	    return sentences

	# Convert the text into words paired with their POS tags.
	def text_preprocess(filename):
	    """Read a text file and return its sentences as POS-tagged tokens.

	    Args:
	        filename: Path of the text file to read.

	    Returns:
	        The result of ``convert_to_wordpos`` on the file's contents.
	    """
	    # A context manager closes the file handle deterministically.
	    with open(filename) as source:
	        rawstr = source.read()
	    return convert_to_wordpos(rawstr)

	# chunk by np-chunk
	def get_chunk_np(sentence):
	    """Chunk one sentence with a noun-phrase (NP) regular-expression grammar.

	    The grammar labels as NP an optional ``DT`` tag, followed by any
	    number of ``JJ`` tags, followed by one ``NN`` tag.

	    Args:
	        sentence: The sentence to parse; passed straight to
	            ``nltk.RegexpParser.parse``.

	    Returns:
	        The parse result produced by the chunk parser.
	    """
	    grammar = "NP: {<DT>?<JJ>*<NN>}"
	    cp = nltk.RegexpParser(grammar)
	    # In the parse result, spans matching the grammar are tagged (NP ...).
	    result = cp.parse(sentence)
	    return result

	# chunk by tag-patterns
	def get_chunk_tagpattern(sentence):
	    """Placeholder for chunking by tag patterns; not implemented yet.

	    Args:
	        sentence: Currently ignored.

	    Returns:
	        Always None.
	    """
	    pass

	# chunk by regex
	def get_chunk_regex(sentence):
	    """Placeholder for chunking by regular expression; not implemented yet.

	    Args:
	        sentence: Currently ignored.

	    Returns:
	        Always None.
	    """
	    pass

	def printAllNPChunk(sentences):
	    """Print the NP-chunk parse of every sentence in ``sentences``.

	    Args:
	        sentences: Iterable of sentences, each passed to ``get_chunk_np``.
	    """
	    # Open question from the original author: once the chunks are printed,
	    # the next step is relation extraction -- how should that be done?
	    for sen in sentences:
	        print(get_chunk_np(sen))

	def find_by_postag(sentences, tagkeyword):
	    """Print every word whose POS tag equals ``tagkeyword``.

	    Args:
	        sentences: Iterable of sentences; each sentence is an iterable of
	            items indexable as ``item[0]`` (word) and ``item[1]`` (tag).
	        tagkeyword: The tag to match exactly.
	    """
	    for sent in sentences:
	        for wt in sent:
	            word = wt[0]
	            tag = wt[1]
	            if tag == tagkeyword:
	                print(word)

	# find NP in sentences
	def findNP(senteChunk):
	    """Print every child of ``senteChunk`` that is labelled ``'NP'``.

	    Args:
	        senteChunk: Iterable whose items are either tuples (skipped) or
	            objects exposing a ``label()`` method.
	    """
	    for ch in senteChunk:
	        # isinstance also skips tuple subclasses such as namedtuples,
	        # which have no label() method.
	        if not isinstance(ch, tuple) and ch.label() == 'NP':
	            print(ch)