Skip to content

Instantly share code, notes, and snippets.

@nikopol
Created November 2, 2011 17:53
Show Gist options
  • Save nikopol/1334349 to your computer and use it in GitHub Desktop.
Save nikopol/1334349 to your computer and use it in GitHub Desktop.
is_utf8 perl vs inline c
#!/usr/bin/perl -w
use Inline C;
#use Benchmark qw(:all);
# Main script: slurp the file named by the first command-line argument and
# report whether its bytes look like UTF-8 according to __is_utf8 (the
# Inline C implementation found after the __C__ marker).

# Use defined/length rather than truthiness so a file literally named "0"
# is still accepted.
my $file = $ARGV[0];
die("you must specify a file") unless defined $file && length $file;

# Fail loudly instead of silently checking undef when the file is missing
# or unreadable.
open my $fh, "<", $file or die("cannot open $file: $!");

# The checker works on raw bytes; make sure no I/O layer translates them.
binmode $fh;

# Slurp mode is limited to this one read instead of the whole program.
my $text = do { local $/; <$fh> };

# In slurp mode an empty file yields '', so undef here means the read failed
# (for instance when $file is a directory).
defined $text or die("cannot read $file: $!");
close $fh;

##$stat=timethese(100, {
## 'Perl' => sub { _is_utf8($text); },
## 'InlineC' => sub { __is_utf8($text); },
##});
##cmpthese($stat) ;

if ( __is_utf8($text) ) {
    print "\nseems to be utf8\n";
}
else {
    print "\ndoesn't seems to be utf8\n";
}
# _is_utf8($text)
#
# Pure-Perl structural check that the bytes of $text are laid out like UTF-8.
#
# Every byte with the high bit set must be a lead byte of the form
# 110xxxxx, 1110xxxx or 11110xxx, followed by exactly 1, 2 or 3 continuation
# bytes of the form 10xxxxxx.  Only these bit patterns are examined; the
# decoded code point values are not inspected.
#
# Parameters:
#   $text - string whose characters are unpacked as unsigned bytes ("C*").
#
# Returns 1 when every sequence is well formed (including the empty string),
# 0 at the first malformed or truncated sequence.
sub _is_utf8 {
    my $text  = shift;
    my @bytes = unpack( "C*", $text );
    my $len   = scalar @bytes;
    my $i     = 0;

    while ( $i < $len ) {
        my $c = $bytes[ $i++ ];

        # Plain 7-bit byte: nothing to validate.
        next unless $c & 0x80;

        # The lead byte tells how many continuation bytes must follow.
        my $trail;
        if    ( ( $c & 0xe0 ) == 0xc0 ) { $trail = 1; }
        elsif ( ( $c & 0xf0 ) == 0xe0 ) { $trail = 2; }
        elsif ( ( $c & 0xf8 ) == 0xf0 ) { $trail = 3; }
        else                            { return 0; }

        # A sequence cut off by the end of the input is invalid.  Checking
        # the bound first avoids reading past the array, which would yield
        # undef and "uninitialized value" warnings.
        return 0 if $i + $trail > $len;

        while ( $trail-- ) {
            return 0 if ( $bytes[ $i++ ] & 0xc0 ) != 0x80;
        }
    }

    return 1;
}
__END__
__C__
/*
 * __is_utf8 - structural UTF-8 check over a NUL-terminated byte string.
 *
 * Scans txt up to its terminating NUL.  Any byte with the high bit set has
 * to be a lead byte (110xxxxx, 1110xxxx or 11110xxx) followed by 1, 2 or 3
 * continuation bytes (10xxxxxx).  Only these bit patterns are tested.
 *
 * Returns 1 when the whole string passes (an empty string passes),
 * 0 as soon as a malformed sequence is met.  A NUL found where a
 * continuation byte is expected fails the continuation test, so the scan
 * never moves beyond the terminator.
 */
int __is_utf8(char *txt) {
    const unsigned char *cursor = (const unsigned char *)txt;

    for (;;) {
        unsigned char lead = *cursor++;
        int trailing;

        if (lead == 0)
            return 1;               /* reached the end without errors */

        if ((lead & 0x80) != 0x80)
            continue;               /* 7-bit byte, nothing to verify */

        /* Number of continuation bytes announced by the lead byte. */
        if ((lead & 0xe0) == 0xc0)
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0)
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0)
            trailing = 3;
        else
            return 0;

        for (; trailing > 0; --trailing) {
            unsigned char next = *cursor++;

            if ((next & 0xc0) != 0x80)
                return 0;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment