public
Created

Extract MathML from GIF images generated by MathType

  • Download Gist
MathType2MathML.pl
Perl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
use 5.010;
use Switch;
use File::Copy;
use Win32::OLE qw(in with);
use strict;
 
# ProgID of the MSXML DOM document class.
use constant MSXMLDOM => "MSXML2.DOMDOCUMENT.4.0";

print "Begin to extract formular:\n";

# Every GIF in the backup directory is treated as a candidate MathType image.
my @files = glob "e:/formular/backup/*.gif";
my $n = @files;
print $n;

# Block until one line of input has been read, so the file count printed
# above can be inspected before processing starts.
my $nothingtodo = <>;

# Tallies per detected MathType marker (the value TestFormularImage returns).
my $cnt01 = 0;
my $cnt02 = 0;
my $cnt03 = 0;

# Incremented by TestFormularImage for every file whose MathML loads cleanly.
my $cntWellForm = 0;
my $total = @files;

foreach my $file (@files)
{
    print "\nProcess image file:".$file;
    my $ret = TestFormularImage($file);

    # A plain if/elsif chain replaces the experimental given/when construct.
    # Result 1 is counted in $cnt01 (it was previously added to $cnt02, which
    # left the MathType01 total permanently at zero).
    if    ($ret == 1) { $cnt01++; }
    elsif ($ret == 2) { $cnt02++; }
    elsif ($ret == 3) { $cnt03++; }
    else
    {
        # Double quotes so that \n is emitted as a real newline.
        print "Invalid mathtype formular!\n";
    }
}

my $notWellForm = $total - $cntWellForm;
print "\n\nSum:\tMathType01: $cnt01\n\tMathType02: $cnt02\n\tMathType03: $cnt03\n\tWellForm:$cntWellForm\n\tNOT WellForm:$notWellForm\nTotal:$total\n";
 
 
 
# Examine one GIF file: detect which MathType marker it embeds, extract the
# embedded XML and check whether it loads as a well-formed document.
#
# Parameters:
#   $inputFile - path of the GIF file to examine.
#
# Returns:
#   3, 2 or 1 when the file content contains "MathType003", "MathType002" or
#   "MathType001" (case-insensitive, tested in that order); 0 otherwise.
#
# Side effects:
#   Prints a status line; increments the file-level $cntWellForm when the
#   extracted XML loads; otherwise copies the file to e:/formular/bad/.
sub TestFormularImage
{
    my $inputFile = shift;

    # Split the path into directory, base name and extension.  The extension
    # capture already contains its leading dot.
    my (undef, $basename, $extname) =
        ($inputFile =~ /^((?:.*[:\\\/])?)(.*)(\.[^.]*$)/s);
    if (!defined $basename)
    {
        # No extension present: use whatever follows the last path separator.
        ($basename) = ($inputFile =~ /([^:\\\/]*)\z/s);
        $extname = '';
    }

    my $math = ReadFromFile($inputFile);

    # Plain (non-/g) matches: a scalar-context /g match would leave pos()
    # set on $math and make later matches depend on that hidden state.
    my $mathTypeVersion = 0;
    if    ($math =~ m/MathType003/i) { $mathTypeVersion = 3; }
    elsif ($math =~ m/MathType002/i) { $mathTypeVersion = 2; }
    elsif ($math =~ m/MathType001/i) { $mathTypeVersion = 1; }

    # Remove scrambled code.
    $math =~ s/\x{FF}//gs;
    $math =~ s/\x{BB}\x{6D}/m/gs;

    $math =~ s/^\s*//gs;

    # Keep everything from the XML declaration up to the MathType end marker.
    # $1 is only read when the match succeeded, so a stale capture from an
    # earlier match can never be mistaken for the payload.
    my $xml;
    if ($math =~ m/(<\?xml.*)<!--\s*MathType\@End/s)
    {
        $xml = $1;
    }

    # A file without an extractable payload counts as not well formed.
    my $bLoad = defined $xml ? LoadDocFromText($xml) : 0;
    if ($bLoad)
    {
        $cntWellForm++;
        print "$inputFile OK! Well formed MathML\n";
    }
    else
    {
        print "$inputFile FAILED! NOT Well formed MathML\n";

        # $extname already carries the dot, so no extra "." is inserted
        # (the old "$basename.$extname" produced names like "foo..gif").
        copy($inputFile, "e:/formular/bad/$basename$extname")
            or warn "Failed to copy $inputFile to e:/formular/bad/: $!\n";
    }

    return $mathTypeVersion;
}
 
# Read a whole file into a single string, byte for byte.
#
# Parameters:
#   $file - path of the file to read.
#
# Returns:
#   The complete file content ('' for an empty file).
#
# Dies with "Failed to open ..." when the file cannot be opened.
sub ReadFromFile {
    my ($file) = @_;

    # Three-argument open keeps characters in the file name from being
    # interpreted as an open mode.  The :raw layer reads the bytes untouched;
    # the inputs are GIF images, and a text-mode layer on Windows would
    # rewrite CRLF pairs inside the binary data.
    open my $fh, '<:raw', $file or die "Failed to open $file. $!";

    # Undefine the record separator locally so one read returns everything.
    my $content = do { local $/; <$fh> };

    close $fh;

    return $content // '';
}
 
 
# Try to load a string as an XML document through MSXML.
#
# Parameters:
#   $text - the XML text to load (undef is treated as an empty string).
#
# Returns:
#   The result of the DOM document's loadXML call: true when the text was
#   loaded, false otherwise.  On failure the parser's reason is printed.
#
# Dies when the MSXML DOM document object cannot be created.
sub LoadDocFromText
{
    my ($text) = @_;
    $text = '' unless defined $text;

    my $doc = Win32::OLE->new('MSXML2.DOMDocument.4.0') or die "Couldn't create DOM document";

    # Real boolean values instead of the strings "False"/"True", so the
    # properties do not depend on string-to-boolean variant coercion.
    $doc->{async} = 0;
    $doc->{validateOnParse} = 1;

    my $bLoad = $doc->loadXML($text);
    if (!$bLoad)
    {
        # $! holds an unrelated OS error here; the parser's own diagnosis
        # is exposed through the document's parseError object.
        my $parseError = $doc->{parseError};
        my $reason = $parseError ? $parseError->{reason} : undef;
        $reason = 'unknown parse error' unless defined $reason && length $reason;
        $reason =~ s/\s+\z//;
        print "Failed to load xml from text: $reason\n";
    }

    return $bLoad;
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.