takdavid/unicount.pl

## unicount.pl
#!/usr/bin/perl

use Unicode::UCD 'charinfo';
use Unicode::Normalize 'isNonStDecomp', 'NFD', 'NFC', 'isComp2nd', 'isNonStDecomp';
use charnames ':full';
use Encode;
use HTML::Entities;

#use Getopt::Std;
#our ($opt_f, $opt_r);
#getopts('fr');

# The report prints the counted characters themselves, so standard
# output has to carry UTF-8.
binmode STDOUT, ':utf8';

# Print a short usage summary to standard output.
# Takes no arguments; returns whatever print returns.
sub usage
{
	my @help = (
		"Usage:",
		"\t./unicount.pl <infile >outfile",
		"",
		"Ordered by counts:",
		"\t./unicount.pl <infile | sort -r -n >outfile",
	);
	# Emit the text as one string so the output is a single print call.
	print join("\n", @help) . "\n";
}

# %count maps each composition (a starter followed by the combining
# characters attached to it) to the number of times it was seen.
my %count;
my $composition;
my $line;
my @charcodes;	# not needed while counting; the reporting loop further down reuses it

# Read the input line by line, decode it and tally its compositions.
while ($line = <>)
{
	$line = Encode::decode("UTF-8", $line);	# raw bytes -> characters
	$line = decode_entities($line);		# resolve HTML entities to characters
	$line = NFD($line);			# decompose, so marks become separate code points

	# Start every line with an empty composition.  Carrying the previous
	# line's last composition over made it get counted twice: once at the
	# end of its own line and again when the next line's first starter
	# arrived.
	$composition = '';

	foreach my $chr (split(//, $line))
	{
		my $charcode = ord($chr);
		# combining characters together with the starter
		if (isComp2nd($charcode) || isNonStDecomp($charcode))
		{
			$composition .= $chr;
		}
		else
		{
			# A new starter closes the composition collected so far.
			# Nothing has been collected yet at the start of a line.
			$count{$composition}++ if length($composition);
			$composition = $chr;
		}
	}

	# Count whatever is still pending at the end of the line.
	$count{$composition}++ if length($composition);
}

# Discard any tally recorded under the empty string; it does not
# correspond to a real composition.
delete $count{''};

# Emit one tab-separated row per composition:
# count, the composition itself, its code points in hex, its character names.
foreach my $cluster (keys %count)
{
	my @codepoints = map { ord($_) } split(//, $cluster);
	my @hex        = map { sprintf('%04X', $_) } @codepoints;
	my @names      = map { charnames::viacode($_) } @codepoints;

	# A composition containing CR, TAB or LF would break the row layout,
	# so its text column is left empty; codes and names still identify it.
	my $shown = ($cluster =~ /[\r\t\n]/) ? '' : $cluster;

	printf "%d\t%s\t%s\t%s\n", $count{$cluster}, $shown, join(' + ', @hex), join(' + ', @names);
}
	#!/usr/bin/perl

	use Unicode::UCD 'charinfo';
	use Unicode::Normalize 'isNonStDecomp', 'NFD', 'NFC', 'isComp2nd', 'isNonStDecomp';
	use charnames ':full';
	use Encode;
	use HTML::Entities;

	#use Getopt::Std;
	#our ($opt_f, $opt_r);
	#getopts('fr');

	# The report prints the counted characters themselves, so standard
	# output has to carry UTF-8.
	binmode STDOUT, ':utf8';

	# Print a short usage summary to standard output.
	# Takes no arguments; returns whatever print returns.
	#
	# The text is assembled from explicit strings rather than a heredoc:
	# in this indented copy the heredoc terminator was never found, the
	# pipe was backslash-escaped, and every help line had gained a
	# leading tab.
	sub usage
	{
		my @help = (
			"Usage:",
			"\t./unicount.pl <infile >outfile",
			"",
			"Ordered by counts:",
			"\t./unicount.pl <infile | sort -r -n >outfile",
		);
		print join("\n", @help) . "\n";
	}

	# %count maps each composition (a starter followed by the combining
	# characters attached to it) to the number of times it was seen.
	my %count;
	my $composition;
	my $line;
	my @charcodes;	# not needed while counting; the reporting loop further down reuses it

	# Read the input line by line, decode it and tally its compositions.
	while ($line = <>)
	{
		$line = Encode::decode("UTF-8", $line);	# raw bytes -> characters
		$line = decode_entities($line);		# resolve HTML entities to characters
		$line = NFD($line);			# decompose, so marks become separate code points

		# Start every line with an empty composition.  Carrying the
		# previous line's last composition over made it get counted twice:
		# once at the end of its own line and again when the next line's
		# first starter arrived.
		$composition = '';

		foreach my $chr (split(//, $line))
		{
			my $charcode = ord($chr);
			# combining characters together with the starter
			# (the operator here must be a plain ||, not a backslash-escaped one)
			if (isComp2nd($charcode) || isNonStDecomp($charcode))
			{
				$composition .= $chr;
			}
			else
			{
				# A new starter closes the composition collected so far.
				# Nothing has been collected yet at the start of a line.
				$count{$composition}++ if length($composition);
				$composition = $chr;
			}
		}

		# Count whatever is still pending at the end of the line.
		$count{$composition}++ if length($composition);
	}

	# Discard any tally recorded under the empty string; it does not
	# correspond to a real composition.
	delete $count{''};

	# Emit one tab-separated row per composition:
	# count, the composition itself, its code points in hex, its character names.
	foreach my $cluster (keys %count)
	{
		my @codepoints = map { ord($_) } split(//, $cluster);
		my @hex        = map { sprintf('%04X', $_) } @codepoints;
		my @names      = map { charnames::viacode($_) } @codepoints;

		# A composition containing CR, TAB or LF would break the row layout,
		# so its text column is left empty; codes and names still identify it.
		my $shown = ($cluster =~ /[\r\t\n]/) ? '' : $cluster;

		printf "%d\t%s\t%s\t%s\n", $count{$cluster}, $shown, join(' + ', @hex), join(' + ', @names);
	}