Created
July 7, 2011 01:38
-
-
Save thinkhy/1068749 to your computer and use it in GitHub Desktop.
Extract MathML from GIF image generated by MathType
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use 5.010; | |
use Switch; | |
use File::Copy; | |
use Win32::OLE qw(in with); | |
use strict; | |
use constant MSXMLDOM => "MSXML2.DOMDOCUMENT.4.0"; | |
# Main driver: scan the backup directory for GIF images produced by MathType,
# classify each one by the MathType signature version embedded in it, and
# print a summary of how many files of each version were seen and how many
# carried well-formed MathML.
print "Begin to extract formular:\n";
my @files = glob "e:/formular/backup/*.gif";
my $total = @files;    # array in scalar context: number of files found
print $total;

# Block until a line of input is read, so the file count can be inspected
# before the (potentially long) processing run starts.
my $nothingtodo = <>;

my $cnt01 = 0;
my $cnt02 = 0;
my $cnt03 = 0;

# Incremented inside TestFormularImage for every file whose MathML loads.
my $cntWellForm = 0;

# Maps the version number returned by TestFormularImage to its counter.
# Any other return value (0 = no signature found) is reported as invalid.
my %counter_for = (
    1 => \$cnt01,
    2 => \$cnt02,
    3 => \$cnt03,
);

foreach my $file (@files) {
    print "\nProcess image file:" . $file;
    my $ret = TestFormularImage($file);

    if (my $counter = $counter_for{$ret}) {
        ${$counter}++;
    }
    else {
        # Double quotes so that \n is emitted as a newline.
        print "Invalid mathtype formular!\n";
    }
}

my $notWellForm = $total - $cntWellForm;
print "\n\nSum:\tMathType01: $cnt01\n\tMathType02: $cnt02\n\tMathType03: $cnt03\n\tWellForm:$cntWellForm\n\tNOT WellForm:$notWellForm\nTotal:$total\n";
# TestFormularImage($inputFile)
#
# Reads a GIF image, detects which MathType signature ("MathType001",
# "MathType002" or "MathType003") it contains, extracts the embedded XML
# (from "<?xml" up to the "<!-- MathType@End" marker) and checks that the
# XML loads via LoadDocFromText.
#
# Side effects:
#   - increments the file-level $cntWellForm counter when the XML loads;
#   - otherwise copies the image to e:/formular/bad/ under its own name;
#   - prints a status line for the file.
#
# Returns the detected MathType version (1, 2 or 3), or 0 when no signature
# was found.
sub TestFormularImage
{
    my $inputFile = shift;

    # Split the path into directory, base name and extension. The extension
    # capture includes its leading dot.
    my (undef, $basename, $extname) =
        ($inputFile =~ /^((?:.*[:\\\/])?)(.*)(\.[^.]*$)/s);
    unless (defined $basename) {
        # No extension present: use the last path component as the name.
        ($basename = $inputFile) =~ s/^.*[:\\\/]//s;
        $extname = '';
    }

    my $math = ReadFromFile($inputFile);

    # No /g here: these are plain existence tests, and /g in scalar context
    # would leave pos($math) set and make later matches position-dependent.
    my $mathTypeVersion = 0;
    if ($math =~ m/MathType003/i) {
        $mathTypeVersion = 3;
    }
    elsif ($math =~ m/MathType002/i) {
        $mathTypeVersion = 2;
    }
    elsif ($math =~ m/MathType001/i) {
        $mathTypeVersion = 1;
    }

    # Remove scrambled code.
    $math =~ s/\x{FF}//gs;
    $math =~ s/\x{BB}\x{6D}/m/gs;

    # Strip leading whitespace.
    $math =~ s/\A\s+//;

    # Only use the capture when the match succeeded; otherwise $1 would
    # still hold a value from an earlier successful match.
    my $mathml;
    if ($math =~ m/(<\?xml.*)<!--\s*MathType\@End/s) {
        $mathml = $1;
    }

    my $bLoad = defined $mathml ? LoadDocFromText($mathml) : 0;
    if ($bLoad) {
        $cntWellForm++;
        print "$inputFile OK! Well formed MathML\n";
    }
    else {
        print "$inputFile FAILED! NOT Well formed MathML\n";

        # $extname already carries its dot, so do not insert another one.
        my $badCopy = "e:/formular/bad/$basename$extname";
        copy($inputFile, $badCopy)
            or warn "Failed to copy $inputFile to $badCopy. $!\n";
    }

    return $mathTypeVersion;
}
# ReadFromFile($file)
#
# Returns the entire content of $file as a single string of raw bytes.
# Dies with a message containing the file name and $! if the file cannot be
# opened.
sub ReadFromFile {
    my ($file) = @_;

    # Three-argument open: the file name is never interpreted for mode
    # characters and keeps any leading/trailing whitespace. The :raw layer
    # disables newline translation so binary image data is read unchanged.
    open my $fh, '<:raw', $file or die "Failed to open $file. $!";

    # Undefine the input record separator locally to read the whole file in
    # one go.
    my $content = do { local $/; <$fh> };
    close $fh;

    return $content // '';
}
# LoadDocFromText($text)
#
# Tries to load $text as XML into a new MSXML 4.0 DOM document, created
# through Win32::OLE, with asynchronous loading turned off and
# validateOnParse turned on.
#
# Returns the result of loadXML (true when the document loaded), or 0 when
# $text is undefined or empty. On failure a diagnostic line is printed.
# Dies if the DOM document object cannot be created.
sub LoadDocFromText
{
    my ($text) = @_;

    # Nothing to parse: report it and fail without touching COM.
    unless (defined $text && length $text) {
        print "Failed to load xml from text: no XML text supplied\n";
        return 0;
    }

    my $doc = Win32::OLE->new('MSXML2.DOMDocument.4.0')
        or die "Couldn't create DOM document: " . Win32::OLE->LastError();

    $doc->{async}           = "False";
    $doc->{validateOnParse} = "True";

    my $bLoad = $doc->loadXML($text);
    if (!$bLoad) {
        # The parser's own error object describes the failure; $! is
        # unrelated to XML parsing and would print a misleading OS error.
        my $err    = $doc->{parseError};
        my $reason = $err ? $err->{reason} : undef;
        $reason = 'unknown parse error'
            unless defined $reason && length $reason;
        $reason =~ s/\s+\z//;

        my $where = '';
        if ($err && defined $err->{line} && defined $err->{linepos}) {
            $where = " (line $err->{line}, column $err->{linepos})";
        }
        print "Failed to load xml from text: $reason$where\n";
    }

    return $bLoad;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment