Skip to content

Instantly share code, notes, and snippets.

@ilius
Last active November 29, 2021 08:11
Show Gist options
  • Save ilius/3e0fee9520ac03094bcb to your computer and use it in GitHub Desktop.
Hazm Persian NLP: shallow-parser-test.py
# coding: utf8
# Python 2 script: shallow-parser (chunker) test for the Hazm Persian NLP toolkit.
from __future__ import unicode_literals
import sys, os
from os.path import join
import json
# Filesystem layout: local Hazm checkout plus a JSON cache of POS-tagged sentences.
homeDir = os.getenv('HOME')
hazmRootDir = '/data2/Persian-NLP/Open-Hazm'
posJsonFile = join(homeDir, 'Desktop/pos-data.json')
# Make the Hazm checkout and the python-wapiti bindings importable.
sys.path.append(hazmRootDir)
sys.path.append('/data2/Persian-NLP/python-wapiti')## https://github.com/adsva/python-wapiti
## sudo ln -s /data2/Persian-NLP/Open-Hazm/Resources/malt.jar /usr/local/bin/
## sudo ln -s /data2/Persian-NLP/Open-Hazm/Resources/stanford-postagger.jar /usr/local/bin/
from hazm.Normalizer import Normalizer
from hazm import sent_tokenize, word_tokenize
from hazm import POSTagger
from WapitiPOSTagger import WapitiPOSTagger
from WapitiChunker import WapitiChunker
# Python 2 text helpers: coerce any value to a UTF-8 byte string / to unicode.
toStr = lambda s: s.encode('utf8') if isinstance(s, unicode) else str(s)
toUnicode = lambda s: s if isinstance(s, unicode) else str(s).decode('utf8')
# Shared Hazm text normalizer used before tokenization.
normalizer = Normalizer()
def make_tagger():
    """Construct a Wapiti-based POS tagger using the bundled 150-feature model.

    The model path is resolved relative to the Hazm checkout (hazmRootDir);
    no feature pattern file is supplied since the model is pre-trained.
    """
    model_path = join(hazmRootDir, 'Resources/wapiti/WapitiPOSTagger-150Full.model')
    return WapitiPOSTagger(pattern=None, model=model_path)
def pformatTaggedTokens(tagged_tokens):
    """Format (word, tag) pairs as three aligned rows for terminal display.

    Row 1 holds the (right-to-left) Persian words, row 2 an underscore
    rule, row 3 the POS tags wrapped in LRE/PDF so they render
    left-to-right inside the RTL line.  Each row starts with an RLM so
    the terminal lays the whole line out right-to-left; the tag row is
    reversed to line up visually under its RTL words.  Returns a UTF-8
    byte string.
    """
    RLM = '\u200f'   # right-to-left mark
    ZWNJ = '\u200c'  # zero-width non-joiner (invisible, excluded from widths)
    LRE = '\u202a'   # left-to-right embedding
    PDF = '\u202c'   # pop directional formatting
    word_row = []
    rule_row = []
    tag_row = []
    for word, tag in tagged_tokens:
        # Column width is based on the *visible* word length; ZWNJs take
        # no terminal cells, so they are added back only when centering.
        visible_len = len(word.replace(ZWNJ, ''))
        invisible_count = len(word) - visible_len
        width = max(visible_len, len(tag))
        word_row.append(word.center(width + invisible_count))
        rule_row.append('_' * width)
        tag_row.append(LRE + tag.center(width, toStr('_')) + PDF)
    rows = (word_row, rule_row, reversed(tag_row))
    return toStr('\n'.join(RLM + ' '.join(row) for row in rows))
def textToPosData(text):
tagger = make_tagger()
text = normalizer.normalize(text)
sents = sent_tokenize(text)
data = []
for sent in sents:
#print 'tagging sentence: ', sent
sent = sent.strip()
sent_words = word_tokenize(sent)
try:
tagged_tokens = tagger.tag(sent_words)
except Exception as e:
print 'Exception when tagging sentence', toStr(sent)
print str(e)
else:
data.append(tagged_tokens)
return data
def printPosData(data):
for tagged_tokens in data:
print pformatTaggedTokens(tagged_tokens)
print
sys.stdout.flush()
def savePosData(data):
    """Serialize tagged-sentence *data* to posJsonFile as UTF-8 JSON.

    Fixes two defects in the original: it wrote the module-level global
    ``pos_data`` instead of its ``data`` argument (so the argument was
    silently ignored, raising NameError when no such global existed), and
    it never closed the file handle.  A ``with`` block now guarantees the
    file is flushed and closed.
    """
    with open(posJsonFile, 'w') as f:
        f.write(toStr(json.dumps(data)))
def loadPosData():
    """Load previously cached tagged-sentence data from posJsonFile.

    Returns the deserialized JSON structure (a list of tagged-token
    sequences, as written by savePosData).  The original leaked the file
    handle; a ``with`` block now closes it deterministically.
    """
    with open(posJsonFile) as f:
        return json.loads(toUnicode(f.read()))
def posDataToChunkData(pos_data):
print 'creating chunker'
chunker = WapitiChunker()
print 'chunker created'
###
chunk_data = []
for tagged_tokens in pos_data:
chunk_data.append(
chunker.parse(tagged_tokens)
)
return chunk_data
def printChunkData(chunk_data):
for chunks in chunk_data:
print toStr(' | '.join(chunks))
if __name__ == '__main__':
    # Two workflows are supported; the commented-out lines select between
    # tagging fresh text (textToPosData) and reusing the JSON cache written
    # by a previous run (loadPosData, the active path here).
    #text = 'زنگ‌ها برای که به صدا درمی‌آید؟'
    #text = toUnicode(open(sys.argv[1]).read())
    #####
    #pos_data = textToPosData(text)
    pos_data = loadPosData()
    #printPosData(pos_data)
    #####
    # Shallow-parse the tagged sentences and print the resulting chunks.
    chunk_data = posDataToChunkData(pos_data)
    printChunkData(chunk_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment