Skip to content

Instantly share code, notes, and snippets.

@fannix
Created November 17, 2012 00:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fannix/4092226 to your computer and use it in GitHub Desktop.
Stanford Tagger Jython Wrapper
#encoding:utf-8
from java.io import FileInputStream
from java.io import ObjectInputStream
import sys
jarfiles = ["/opt/lingpipe-segmenter/lingpipe-4.0.1.jar", "/opt/lingpipe-segmenter/zhToksDemo.jar"]
for jar in jarfiles:
if jar not in sys.path:
sys.path.append(jar)
modelfile="/opt/lingpipe-segmenter/mxf_segmenter"
fin = FileInputStream(modelfile);
objin = ObjectInputStream(fin);
mSpellChecker = objin.readObject()
mSpellChecker.setAllowInsert(True);
mSpellChecker.setAllowMatch(True);
mSpellChecker.setAllowDelete(False);
mSpellChecker.setAllowSubstitute(False);
mSpellChecker.setAllowTranspose(False);
mSpellChecker.setNumConsecutiveInsertionsAllowed(1);
mMaxNBest = 1024;
mSpellChecker.setNBest(mMaxNBest);
# segment a string
#sentence=u"你好世界"
#print mSpellChecker.didYouMean(sentence)
# segment a file
f = open("/home/lxb/corpus/taobao_mobile_utf8.txt")
for line in f:
line = line.decode("utf-8")
try:
print mSpellChecker.didYouMean(line).encode("utf-8")
except UnicodeEncodeError:
continue
f.close()
#encoding:utf-8
from java.util import Properties
from java.io import File, PrintStream, FileOutputStream
from java.lang import System
import sys
segmentjar ="/Users/mxf/Downloads/stanford-chinese-segmenter-2011-09-14/seg.jar"
if segmentjar not in sys.path:
sys.path.append(segmentjar )
from edu.stanford.nlp.ie.crf import CRFClassifier
props = Properties()
props.setProperty("sighanCorporaDict", "/Users/mxf/Downloads/stanford-chinese-segmenter-2011-09-14/data/")
props.setProperty("serDictionary", "/Users/mxf/Downloads/stanford-chinese-segmenter-2011-09-14/data/dict-chris6.ser.gz")
classifier = CRFClassifier(props)
#alternatively we could use ctb.gz
classifier.loadClassifierNoExceptions("/Users/mxf/Downloads/stanford-chinese-segmenter-2011-09-14/data/pku.gz", props)
classifier.flags.setProperties(props)
#segement a string
sentence = u'你好世界'
li = classifier.segmentString(sentence)
for e in li:
print e
#segment a file and write to a file
#outfile = File("/home/mxf/stdout.txt")
#System.setOut(PrintStream(FileOutputStream(outfile)))
## the arguments can be a list
#classifier.testAndWriteAnswers(["/home/mxf/mobile.txt"])
#encoding: utf-8
import sys
segmentjar ="/opt/stanford-postagger/stanford-postagger.jar"
if segmentjar not in sys.path:
sys.path.append(segmentjar )
from edu.stanford.nlp.tagger.maxent import MaxentTagger
# note that tagger and segmenter conflict
# they can not be both in the classpath
model = "/home/mxf/stanford-postagger-full-2010-05-19/models/chinese.tagger"
tagger = MaxentTagger(model)
sentence = u"你好 世界"
# tag a sentence
tagger.tagTokenizedString(sentence)
# tag a file
f = open("/home/mxf/test.seg", 'r')
for line in f:
print tagger.tagTokenizedString(line.decode("utf-8"))
f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment