Skip to content

Instantly share code, notes, and snippets.

@thinkhy
Created July 7, 2011 01:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thinkhy/1068749 to your computer and use it in GitHub Desktop.
Save thinkhy/1068749 to your computer and use it in GitHub Desktop.
Extract MathML from GIF image generated by MathType
use 5.010;
use Switch;
use File::Copy;
use Win32::OLE qw(in with);
use strict;
use constant MSXMLDOM => "MSXML2.DOMDOCUMENT.4.0";
# Driver: run TestFormularImage over every GIF in the backup directory, tally
# the MathType marker version each image reports, and print a summary.
print "Begin to extract formular:\n";
my @files = glob "e:/formular/backup/*.gif";
my $total = @files;
print $total;

# Block until a line is read (the value itself is never used), so the file
# count can be inspected before processing starts.
my $nothingtodo = <>;

# One counter per value TestFormularImage can return (1, 2 or 3).
my $cnt01 = 0;
my $cnt02 = 0;
my $cnt03 = 0;

# Incremented inside TestFormularImage for every image whose MathML loads, so
# it must stay a file-level lexical declared before that sub.
my $cntWellForm = 0;

foreach my $file (@files)
{
    print "\nProcess image file:".$file;
    my $ret = TestFormularImage($file);

    # Plain if/elsif instead of the experimental given/when.
    if ($ret == 1)
    {
        $cnt01++;    # was wrongly counted as MathType02
    }
    elsif ($ret == 2)
    {
        $cnt02++;
    }
    elsif ($ret == 3)
    {
        $cnt03++;
    }
    else
    {
        # Double quotes so that \n is a real newline.
        print "Invalid mathtype formular!\n";
    }
}

my $notWellForm = $total - $cntWellForm;
print "\n\nSum:\tMathType01: $cnt01\n\tMathType02: $cnt02\n\tMathType03: $cnt03\n\tWellForm:$cntWellForm\n\tNOT WellForm:$notWellForm\nTotal:$total\n";
# Extract the XML (MathML) text embedded in a MathType-generated image file
# and check that it can be loaded as an XML document.
#
# Parameters:
#   $inputFile - path of the image file to inspect.
#
# Returns the MathType marker version found in the raw file data: 3, 2 or 1
# (tried in that order), or 0 when no "MathType00N" marker is present.
#
# Side effects: prints a one-line verdict, increments the file-level
# $cntWellForm counter when the XML loads, and otherwise copies the image to
# e:/formular/bad/ under its original file name.
sub TestFormularImage
{
    my $inputFile = shift;

    # Split the path into directory (unused), base name and extension.
    # $extname keeps its leading dot, e.g. ".gif".
    my (undef, $basename, $extname) =
        ($inputFile =~ /^((?:.*[:\\\/])?)(.*)(\.[^.]*$)/s);
    if (!defined $basename)
    {
        # No extension in the name: fall back to the last path component.
        ($basename) = ($inputFile =~ /([^:\\\/]*)\z/s);
        $extname = '';
    }

    my $math = ReadFromFile($inputFile);

    # The highest version wins when several markers are present.  No /g on
    # these matches: in scalar context /g leaves pos() set after a success,
    # which could make a later match start in the middle of the data.
    my $mathTypeVersion = 0;
    foreach my $version (3, 2, 1)
    {
        if ($math =~ m/MathType00$version/i)
        {
            $mathTypeVersion = $version;
            last;
        }
    }

    # Remove scrambled code: drop every 0xFF byte and turn the byte pair
    # 0xBB 0x6D into a plain "m".
    # NOTE(review): presumably these are image-container framing bytes that
    # land inside the embedded text -- confirm against real MathType output.
    $math =~ s/\x{FF}//g;
    $math =~ s/\x{BB}\x{6D}/m/g;
    $math =~ s/\A\s+//;

    # Keep everything from the XML declaration up to the last
    # "<!-- MathType@End" marker.  The capture is only used when the match
    # succeeded, so a stale $1 can never be taken for the payload.
    my $xml;
    if ($math =~ m/(<\?xml.*)<!--\s*MathType\@End/s)
    {
        $xml = $1;
    }

    my $bLoad = defined $xml ? LoadDocFromText($xml) : 0;
    if ($bLoad)
    {
        $cntWellForm++;
        print "$inputFile OK! Well formed MathML\n";
    }
    else
    {
        print "$inputFile FAILED! NOT Well formed MathML\n";

        # $extname already carries the dot, so no extra "." between the parts
        # (the old "$basename.$extname" produced names like "foo..gif").
        copy($inputFile, "e:/formular/bad/$basename$extname")
            or warn "Failed to copy $inputFile to e:/formular/bad/: $!\n";
    }

    return $mathTypeVersion;
}
# Read a whole file into a single string.
#
# Parameters:
#   $file - path of the file to read.
#
# Returns the complete file content as a byte string.
# Dies with "Failed to open ..." when the file cannot be opened.
sub ReadFromFile {
    my ($file) = @_;

    # Three-argument open: the name is always treated as a plain path, never
    # as a mode or a command ("cmd |", ">file").  The :raw layer reads the
    # bytes untouched, which matters for binary input such as images.
    open my $fh, '<:raw', $file or die "Failed to open $file. $!";

    # Undefine the record separator locally so one read returns everything.
    my $content = do { local $/; <$fh> };
    close $fh;

    return $content;
}
# Load a string into an MSXML DOM document and report whether it loaded.
#
# Parameters:
#   $text - the XML document text.
#
# Returns the result of the document's loadXML call (true when the text was
# loaded), or 0 when $text is undefined or empty.
# Dies when the MSXML DOM COM object cannot be created.
sub LoadDocFromText
{
    my ($text) = @_;

    # Nothing to parse: report it here instead of handing undef to the parser.
    if (!defined $text || $text eq '')
    {
        print "Failed to load xml from text: no text given\n";
        return 0;
    }

    my $doc = Win32::OLE->new('MSXML2.DOMDocument.4.0')
        or die "Couldn't create DOM document: " . Win32::OLE->LastError();

    # Real booleans rather than the strings "False"/"True".
    $doc->{async}           = 0;
    $doc->{validateOnParse} = 1;

    my $bLoad = $doc->loadXML($text);
    if (!$bLoad)
    {
        # $! says nothing about a parser failure; the document's parseError
        # object carries the actual reason and position.
        my $err    = $doc->{parseError};
        my $reason = $err ? $err->{reason} : undef;
        $reason = 'unknown error' unless defined $reason && $reason ne '';
        $reason =~ s/\s+\z//;
        my $where = $err ? " (line $err->{line}, column $err->{linepos})" : '';
        print "Failed to load xml from text: $reason$where\n";
    }

    return $bLoad;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment