j2labs/urdu-segmenter.pl

## urdu-segmenter.pl
#!/usr/bin/perl -w -CD
# Read and write UTF-8 on the standard streams, then switch warnings off
# for the remainder of the file.
for my $stream (\*STDOUT, \*STDIN) {
    binmode $stream, ":utf8";
}
no warnings;


#-----Description------------------------------------------------------
#
# Program:urdu-segmenter.pl
# Written by: Danish Munir
# Purpose:breaks urdu text into sentences
#
# Syntax: urdu-segmenter.pl [-x] [filename]
#     or: program_that_outputs_urdu_text | urdu-segmenter.pl -s [-x] [docid(optional)]
# This script takes a utf8 encoded file with Urdu text as input
# and outputs to STDOUT, the text after segmenting it into sentences.
#
# The xml format of the output is as follows
# <DOC docid = "Filename" lang = "URD">
# <SEG id = "1">Urdu Sentence 1</SEG>
# <SEG id = "2">Urdu Sentence 2</SEG>
# <SEG id = "3">Urdu Sentence 3</SEG>
# </DOC>
#
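# This script breaks urdu sentences based on the following punctuations:
# [dash]Unicode 06D4 (the Arabic full stop, which looks like a dash)
# [question]Unicode 061F
# [exclamation]!, [period]Unicode 002E, [ellipsis]Unicode 2026, [bullet]Unicode 2022
# multiple newline characters, or any run of two or more whitespace characters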
#-----------------------------------------------------------------------

# Show the usage text and exit when the first argument is -h or a
# dash-prefixed "help" (-help, --help, ...).
if (defined $ARGV[0] && $ARGV[0] =~ m/^(?:-h|-+help)$/) {
    # One entry per output line. The text is assembled with join() so that
    # it does not depend on how this source file happens to be indented.
    my @usage = (
        '',
        '',
        'urdu-segmenter.pl',
        '-----------------',
        'Syntax: urdu-segmenter.pl [filename]',
        ' or urdu-segmenter.pl -x [filename]',
        ' or program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]',
        ' or program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]',
        ' eg: more sourcefile1.txt | urdu-segmenter.pl -s Title',
        '',
        'XML tags are written if and only if the -x option is used.',
        '',
        'This script takes a utf8 encoded file with Urdu text as input and outputs to STDOUT, the text after segmenting it into sentences.',
        '',
        'The xml format of the output is as follows',
        '<DOC docid = "Filename" lang = "URD">',
        '<SEG id = "1">Urdu Sentence 1</SEG>',
        '<SEG id = "2">Urdu Sentence 2</SEG>',
        '</DOC>',
        '',
        'This script breaks urdu sentences based on the following punctuations:',
        '',
        ' multiple newline characters',
        ' [dash]Unicode 06D4',
        ' [question]Unicode 061F',
        ' [exclamation]!',
        ' [period]Unicode 002E',
        ' [ellipsis]Unicode 2026',
        ' [bullet]Unicode 2022',
    );
    print join("\n", @usage), "\n\n\n\n";
    exit;
}


#Code Starts here
# Work out where the text comes from and whether XML tags are wanted:
#
#   -s [-x] [docid]   read from STDIN; docid (optional) labels the <DOC> tag
#   [-x] filename     read the named file; the docid is the file name
#                     without its directory and final extension
#
# The leading -s and -x flags are accepted in either order, so the
# "[-x] -s" spelling works as well as "-s -x". Afterwards $printxml says
# whether to emit tags, $filename holds the docid and $_ the whole input.
my $printxml   = 0;
my $from_stdin = 0;
my @args       = @ARGV;    # work on a copy; @ARGV itself is left untouched
while (@args) {
    if (!$from_stdin && $args[0] =~ m/^-s$/) {
        $from_stdin = 1;
    }
    elsif (!$printxml && $args[0] =~ m/^-x$/) {
        $printxml = 1;
    }
    else {
        last;
    }
    shift @args;
}

my $filename = $args[0];
if ($from_stdin) {
    # Undefine the input delimiter locally to read everything at once.
    $_ = do { local $/; <STDIN> };
}
else {
    # Reject a missing name explicitly: three-argument open() given an
    # undefined path can succeed by opening an empty anonymous temporary
    # file, which would make the script silently produce no output.
    defined $filename
        or die "No input file given; run with -h for usage\n";

    open(my $in, "<:utf8", $filename)    # Open the file passed, or exit upon error
        or die "Cannot open file $filename: $!";
    $_ = do { local $/; <$in> };         # slurp the entire file at once
    close($in);

    # Reduce the path to a bare document id.
    $filename =~ s/.*\///;         # remove the directory part
    $filename =~ s/\.[^\.]*$//;    # remove the last extension
}
# Normalise the slurped text in $_ ahead of sentence splitting, and open
# the <DOC> element when XML output was requested.
tr/\r//d;         # Drop carriage returns.
s/\n/\n\n/g;      # Double every newline so each line break ends a sentence.
if ($printxml) {
    print "<DOC docid = \"$filename\" lang = \"URD\">\n";
}
s/\s*\x{2022}\s*/\n\n\n\n\n/g;    # Replace bullets with sentence breaks.

# Tidy white space line by line. The /m flag lets ^ and $ match at every
# line boundary; without it they only match at the two ends of the whole
# text, so interior lines were never cleaned.
s/\t* +\t*$/ /mg;                          # collapse trailing blanks
s/[\n\x{000D}][ ]+[\n\x{000d}]/\n\n/g;     # lines made of spaces only
s/^[\t\x{0020}]+$/\n\n/mg;                 # lines made of tabs/spaces only

# Remove pipe characters from the text. tr/// is used because a bare "|"
# inside a regex is an empty alternation that matches the empty string and
# therefore deletes nothing.
tr/|//d;


# Split the text on sentence delimiters. Because the pattern is wrapped in
# a capture group, split() returns the delimiters as well: even indexes
# hold sentence text and each following odd index holds the delimiter that
# ended it.
my @sentences = split(/(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/);

my $j = 1;    # segment counter, used for the SEG ids

# NOTE(review): sentence text is printed without XML escaping, so input
# containing & or < yields malformed XML in -x mode -- confirm what the
# downstream consumers expect before changing this.
for (my $i = 0; $i < @sentences; $i += 2) {
    my $string = $sentences[$i];
    $string =~ s/^\s*(.*?)\s*$/$1/g;    # trim white space at both ends

    # Discard fragments of three characters or fewer (and anything that is
    # still only white space) together with their delimiter.
    next if length($string) <= 3 || $string =~ m/^\s+$/;

    # The final sentence has no delimiter after it when the text does not
    # end with one.
    my $ending = $sentences[$i + 1];
    $ending = "" unless defined $ending;

    # Only real punctuation is echoed. Newlines, carriage returns and
    # bullets are dropped, and so are the runs of spaces/tabs matched by
    # the \s{2,} alternative, which would otherwise be printed as trailing
    # white space after the sentence.
    if ($ending =~ m/[\n\x{000d}\x{2022}]/ || $ending =~ m/\A\s+\z/) {
        $ending = "";
    }

    if ($printxml) {
        print "<SEG id=\"$j\">$string$ending</SEG>\n";
    }
    else {
        print "$string$ending\n";
    }
    $j++;    # Increment segment counter.
}

if ($printxml) {
    print "</DOC>\n";    # Close DOC tag.
}

# Close the input file. A bare close() acts on the currently selected
# output handle (STDOUT by default), not on the file that was read.
close(I);
	#!/usr/bin/perl -w -CD
	# Read and write UTF-8 on the standard streams, then switch warnings off
	# for the remainder of the file.
	for my $stream (\*STDOUT, \*STDIN) {
	    binmode $stream, ":utf8";
	}
	no warnings;


	#-----Description------------------------------------------------------
	#
	# Program:urdu-segmenter.pl
	# Written by: Danish Munir
	# Purpose:breaks urdu text into sentences
	#
	# Syntax: urdu-segmenter.pl [-x] [filename]
	#     or: program_that_outputs_urdu_text | urdu-segmenter.pl -s [-x] [docid(optional)]
	# This script takes a utf8 encoded file with Urdu text as input
	# and outputs to STDOUT, the text after segmenting it into sentences.
	#
	# The xml format of the output is as follows
	# <DOC docid = "Filename" lang = "URD">
	# <SEG id = "1">Urdu Sentence 1</SEG>
	# <SEG id = "2">Urdu Sentence 2</SEG>
	# <SEG id = "3">Urdu Sentence 3</SEG>
	# </DOC>
	#
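	# This script breaks urdu sentences based on the following punctuations:
	# [dash]Unicode 06D4 (the Arabic full stop, which looks like a dash)
	# [question]Unicode 061F
	# [exclamation]!, [period]Unicode 002E, [ellipsis]Unicode 2026, [bullet]Unicode 2022
	# multiple newline characters, or any run of two or more whitespace characters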
	#-----------------------------------------------------------------------

	# Show the usage text and exit when the first argument is -h or a
	# dash-prefixed "help" (-help, --help, ...).
	if (defined $ARGV[0] && $ARGV[0] =~ m/^(?:-h|-+help)$/) {
	    # One entry per output line. The text is assembled with join() so
	    # that it does not depend on how this source file is indented.
	    my @usage = (
	        '',
	        '',
	        'urdu-segmenter.pl',
	        '-----------------',
	        'Syntax: urdu-segmenter.pl [filename]',
	        ' or urdu-segmenter.pl -x [filename]',
	        ' or program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]',
	        ' or program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]',
	        ' eg: more sourcefile1.txt | urdu-segmenter.pl -s Title',
	        '',
	        'XML tags are written if and only if the -x option is used.',
	        '',
	        'This script takes a utf8 encoded file with Urdu text as input and outputs to STDOUT, the text after segmenting it into sentences.',
	        '',
	        'The xml format of the output is as follows',
	        '<DOC docid = "Filename" lang = "URD">',
	        '<SEG id = "1">Urdu Sentence 1</SEG>',
	        '<SEG id = "2">Urdu Sentence 2</SEG>',
	        '</DOC>',
	        '',
	        'This script breaks urdu sentences based on the following punctuations:',
	        '',
	        ' multiple newline characters',
	        ' [dash]Unicode 06D4',
	        ' [question]Unicode 061F',
	        ' [exclamation]!',
	        ' [period]Unicode 002E',
	        ' [ellipsis]Unicode 2026',
	        ' [bullet]Unicode 2022',
	    );
	    print join("\n", @usage), "\n\n\n\n";
	    exit;
	}


	#Code Starts here
	# Work out where the text comes from and whether XML tags are wanted:
	#
	#   -s [-x] [docid]   read from STDIN; docid (optional) labels the <DOC> tag
	#   [-x] filename     read the named file; the docid is the file name
	#                     without its directory and final extension
	#
	# The leading -s and -x flags are accepted in either order, so the
	# "[-x] -s" spelling works as well as "-s -x". Afterwards $printxml says
	# whether to emit tags, $filename holds the docid and $_ the whole input.
	my $printxml   = 0;
	my $from_stdin = 0;
	my @args       = @ARGV;    # work on a copy; @ARGV itself is left untouched
	while (@args) {
	    if (!$from_stdin && $args[0] =~ m/^-s$/) {
	        $from_stdin = 1;
	    }
	    elsif (!$printxml && $args[0] =~ m/^-x$/) {
	        $printxml = 1;
	    }
	    else {
	        last;
	    }
	    shift @args;
	}

	my $filename = $args[0];
	if ($from_stdin) {
	    # Undefine the input delimiter locally to read everything at once.
	    $_ = do { local $/; <STDIN> };
	}
	else {
	    # Reject a missing name explicitly: three-argument open() given an
	    # undefined path can succeed by opening an empty anonymous temporary
	    # file, which would make the script silently produce no output.
	    defined $filename
	        or die "No input file given; run with -h for usage\n";

	    open(my $in, "<:utf8", $filename)    # Open the file passed, or exit upon error
	        or die "Cannot open file $filename: $!";
	    $_ = do { local $/; <$in> };         # slurp the entire file at once
	    close($in);

	    # Reduce the path to a bare document id.
	    $filename =~ s/.*\///;         # remove the directory part
	    $filename =~ s/\.[^\.]*$//;    # remove the last extension
	}
	# Normalise the slurped text in $_ ahead of sentence splitting, and open
	# the <DOC> element when XML output was requested.
	tr/\r//d;         # Drop carriage returns.
	s/\n/\n\n/g;      # Double every newline so each line break ends a sentence.
	if ($printxml) {
	    print "<DOC docid = \"$filename\" lang = \"URD\">\n";
	}

	# Replace bullets with sentence breaks. The white space on either side
	# is optional (\s*), so a bullet that touches the text still counts.
	s/\s*\x{2022}\s*/\n\n\n\n\n/g;

	# Tidy white space line by line. The /m flag lets ^ and $ match at every
	# line boundary; without it they only match at the two ends of the whole
	# text, so interior lines were never cleaned.
	s/\t* +\t*$/ /mg;                          # collapse trailing blanks
	s/[\n\x{000D}][ ]+[\n\x{000d}]/\n\n/g;     # lines made of spaces only
	s/^[\t\x{0020}]+$/\n\n/mg;                 # lines made of tabs/spaces only

	tr/|//d;          # Remove pipe characters from the text.


	# Split the text on sentence delimiters. The alternatives are separated
	# by unescaped "|" (an escaped "\|" would match a literal pipe). Because
	# the pattern is wrapped in a capture group, split() returns the
	# delimiters as well: even indexes hold sentence text and each following
	# odd index holds the delimiter that ended it.
	my @sentences = split(/(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/);

	my $j = 1;    # segment counter, used for the SEG ids

	# NOTE(review): sentence text is printed without XML escaping, so input
	# containing & or < yields malformed XML in -x mode -- confirm what the
	# downstream consumers expect before changing this.
	for (my $i = 0; $i < @sentences; $i += 2) {
	    my $string = $sentences[$i];
	    $string =~ s/^\s*(.*?)\s*$/$1/g;    # trim white space at both ends

	    # Discard fragments of three characters or fewer (and anything that
	    # is still only white space) together with their delimiter.
	    next if length($string) <= 3 || $string =~ m/^\s+$/;

	    # The final sentence has no delimiter after it when the text does
	    # not end with one.
	    my $ending = $sentences[$i + 1];
	    $ending = "" unless defined $ending;

	    # Only real punctuation is echoed. Newlines, carriage returns and
	    # bullets are dropped, and so are the runs of spaces/tabs matched by
	    # the \s{2,} alternative, which would otherwise be printed as
	    # trailing white space after the sentence.
	    if ($ending =~ m/[\n\x{000d}\x{2022}]/ || $ending =~ m/\A\s+\z/) {
	        $ending = "";
	    }

	    if ($printxml) {
	        print "<SEG id=\"$j\">$string$ending</SEG>\n";
	    }
	    else {
	        print "$string$ending\n";
	    }
	    $j++;    # Increment segment counter.
	}

	if ($printxml) {
	    print "</DOC>\n";    # Close DOC tag.
	}

	# Close the input file. A bare close() acts on the currently selected
	# output handle (STDOUT by default), not on the file that was read.
	close(I);