Skip to content

Instantly share code, notes, and snippets.

@j2labs
Created October 25, 2009 18:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save j2labs/218174 to your computer and use it in GitHub Desktop.
Save j2labs/218174 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
This script is a python implementation of Danish Munir's perl urdu-segmenter.
http://www.cs.cmu.edu/afs/cs/user/lsl/Nice/Urdu-MT/code/Tools/Sentence_Segmenter/urdu-segmenter.pl
"""
import sys
import re
from optparse import OptionParser
DASH = u'\u06D4' # arabic full stop
QUESTION = u'\u061F'
ELLIPSIS = u'\u2026'
BULLET = u'\u2022'
CR = u'\u000D'
SPACE = u'\u0020'
FULL_STOP = u'\u002e'
def split_text(filename, docid, xml):
text = read_text(filename)
sentences = find_sentences(text)
write_output(sentences, filename, xml)
def find_sentences(text):
text = text.replace('\r','')
text = text.replace('\n','\n\n')
reg_bullet = u'\s*%s\s*' % BULLET
text = re.sub(reg_bullet, '\n\n\n\n\n', text)
text = text.replace('\t* +\t*$', ' ')
reg_cr = u'[\n%s][ ]+[\n%s]' % (CR, CR)
text = re.sub(reg_cr, '\n\n', text)
reg_space = u'^[\t%s]+$' % SPACE
text = re.sub(reg_space, '\n\n', text)
text = text.replace('|','')
#/(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/
# '\n{2,}|!|QUESTION|DASH |BULLET |CR |\s{2,}|ELLIPSIS|FULL_STOP'
regex = u'(\n{2,}|!|%s|%s|%s|%s|\s{2,}|%s|\%s)' % (QUESTION, DASH, BULLET, CR, ELLIPSIS, FULL_STOP)
p = re.compile(regex)
sentences = p.split(text)
return sentences
def write_output(sentences, docid, xml):
if(xml):
print '<DOC docid = "%s" lang = "URD">' % (docid)
segment_id = 1
follow_up_punctuation = re.compile('[\n%s%s]' % (CR, BULLET))
i = 0
while i < len(sentences):
sent = sentences[i]
sent = sent.strip() # remove whitespace
if len(sent) < 1: # skip empty lines
i = i+2
continue
if(xml):
sys.stdout.write('<SEG id="%s">' % (segment_id))
sys.stdout.write(sent)
# check punctuation in following sentence
# if not newline, CR or BULLET, print it
next_sent = sentences[i+1]
if not follow_up_punctuation.match(next_sent):
sys.stdout.write(next_sent)
if(xml):
sys.stdout.write('</SEG>')
print ''
segment_id = segment_id + 1
i = i + 2
def read_text(filename):
f = None
if filename == "-":
f = sys.stdin
else:
f = open(filename, "r")
text = f.read()
f.close()
return text
if __name__ == "__main__":
parser = OptionParser(usage="%prog -f file <-d=DOCID> <-x>")
parser.add_option('-d', '--docid', dest='docid', default=None,
help='DocID XXX')
parser.add_option('-f', '--filename', dest='filename',
help='Filename to parse')
parser.add_option('-x', '--xml',
action="store_true", dest='xml', default=False,
help='Print text in XML format')
if len(sys.argv) == 1:
parser.print_help()
sys.exit()
(options, args) = parser.parse_args()
split_text(options.filename, options.docid, options.xml)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment