-
-
Save neko-fire/7038322 to your computer and use it in GitHub Desktop.
<?php
/*
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
This code is an improved version of what can be found at:
http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php
AUTHOR:
- Webcheatsheet.com (Original code)
- Joeri Stegeman (joeri210 [at] yahoo [dot] com) (Class conversion and fixes/adjustments)
DESCRIPTION:
This is a class to convert PDF files into ASCII text, i.e. so-called PDF text extraction.
It ignores any layout and anything in the PDF that is not addressed as text.
Currently supported filters are: ASCIIHexDecode, ASCII85Decode, FlateDecode
PURPOSE(S):
Most likely for people who want their PDFs to be searchable.
SYNTAX:
include('class.pdf2text.php');
$a = new PDF2Text();
$a->setFilename('test.pdf');
$a->decodePDF();
echo $a->output();
ALTERNATIVES:
Other excellent options to search within a PDF:
- Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution
- pdflib TET (http://www.pdflib.com/products/tet/)
- Online converter: http://snowtide.com/PDFTextStream
*/
/**
 * Extracts the text of a PDF file as a plain string.
 *
 * The file is scanned with regular expressions for objects that carry a
 * stream; each stream is decoded (ASCIIHexDecode, ASCII85Decode and
 * FlateDecode are supported) and then either mined for text operators
 * (BT ... ET sections) or for character maps (bfchar / bfrange sections).
 * Layout and anything that is not addressed as text are ignored.
 *
 * Usage:
 *   $a = new PDF2Text();
 *   $a->setFilename('test.pdf');
 *   $a->decodePDF();
 *   echo $a->output();
 */
class PDF2Text
{
    /**
     * Number of hex digits that form one character code inside a <...> hex
     * string: 2 by default, 4 after setUnicode(true).
     */
    public $multibyte = 2;

    /**
     * Quote flag handed to html_entity_decode() when octal escapes are
     * decoded: ENT_COMPAT (double quotes), ENT_QUOTES (both), ENT_NOQUOTES (none).
     */
    public $convertquotes = ENT_QUOTES;

    /** Path of the PDF file read by decodePDF(). */
    public $filename = '';

    /** Text produced by the last decodePDF() call. */
    public $decodedtext = '';

    /**
     * Selects the PDF file to decode and clears any previously decoded text.
     *
     * @param string $filename Path handed to file_get_contents().
     */
    public function setFilename($filename)
    {
        $this->decodedtext = '';
        $this->filename = $filename;
    }

    /**
     * Returns the decoded text, or echoes it when $echo is truthy.
     *
     * @param bool $echo When truthy the text is echoed and nothing is returned.
     * @return string|null
     */
    public function output($echo = false)
    {
        if ($echo) {
            echo $this->decodedtext;
        } else {
            return $this->decodedtext;
        }
    }

    /**
     * Switches between 2 and 4 hex digits per character code.
     *
     * @param mixed $input Truthy selects 4 digits, anything else 2.
     */
    public function setUnicode($input)
    {
        $this->multibyte = $input ? 4 : 2;
    }

    /**
     * Reads the configured file and stores its extracted text in $decodedtext.
     *
     * @return string|null "" when the file cannot be read or is empty; nothing otherwise.
     */
    public function decodePDF()
    {
        // Best effort: an unreadable file yields no text instead of a warning.
        // (The former FILE_BINARY argument landed in the use_include_path slot
        // and the constant itself is deprecated, so it is no longer passed.)
        $infile = @file_get_contents($this->filename);
        if (empty($infile)) {
            return "";
        }

        $transformations = array();
        $texts = array();

        // Get the list of all objects.
        preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $matches);
        $objects = isset($matches[1]) ? $matches[1] : array();

        foreach ($objects as $currentObject) {
            // Only objects that include a data stream are of interest.
            if (!preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) {
                continue;
            }
            $stream = ltrim($stream[1]);

            // Objects that declare Length1, Type or Subtype are skipped.
            $options = $this->getObjectOptions($currentObject);
            if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) {
                continue;
            }

            // Hack: the declared length is not always correct, so it is ignored.
            unset($options["Length"]);

            $data = $this->getDecodedStream($stream, $options);
            if (!is_string($data) || $data === "") {
                continue;
            }

            if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) {
                $this->getDirtyTexts($texts, $textContainers[1]);
            } else {
                $this->getCharTransformations($transformations, $data);
            }
        }

        // Analyze text blocks taking the character maps into account.
        $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations);
    }

    /**
     * Decodes ASCIIHexDecode data up to its '>' end marker.
     *
     * White-space is skipped and '%' starts a comment that runs to the end of
     * the line. If the data ends on an odd number of digits, the missing low
     * nibble is taken as 0.
     *
     * @param string $input Encoded data.
     * @return string Decoded bytes; "" on an invalid character or a missing '>'.
     */
    public function decodeAsciiHex($input)
    {
        $input = (string) $input;
        $length = strlen($input);
        $output = "";
        $highNibble = null; // pending first digit of a pair, null when none
        $inComment = false;
        $terminated = false;

        for ($i = 0; $i < $length; $i++) {
            $c = $input[$i];

            // '>' always ends the data, even inside a comment.
            if ($c === '>') {
                $terminated = true;
                break;
            }
            if ($inComment) {
                if ($c === "\r" || $c === "\n") {
                    $inComment = false;
                }
                continue;
            }
            // Real control characters: the single-quoted '\n' etc. used before
            // were two-character literals and never matched.
            if (strpos("\0\t\r\f\n ", $c) !== false) {
                continue;
            }
            if ($c === '%') {
                $inComment = true;
                continue;
            }
            if (strpos('0123456789abcdefABCDEF', $c) === false) {
                return "";
            }

            $code = hexdec($c);
            if ($highNibble === null) {
                $highNibble = $code;
            } else {
                $output .= chr($highNibble * 16 + $code);
                $highNibble = null;
            }
        }

        if (!$terminated) {
            return "";
        }
        // Only a genuinely pending digit produces a final byte.
        if ($highNibble !== null) {
            $output .= chr($highNibble * 16);
        }
        return $output;
    }

    /**
     * Decodes ASCII85Decode data up to its '~' end marker (or the end of input).
     *
     * White-space is skipped and 'z' at a group boundary stands for four zero
     * bytes. '%' is an ordinary base-85 digit and is decoded as data.
     *
     * @param string $input Encoded data.
     * @return string Decoded bytes; "" on an invalid character or a final
     *                group that consists of a single digit.
     */
    public function decodeAscii85($input)
    {
        $input = (string) $input;
        $length = strlen($input);
        $output = "";
        $ords = array();
        $state = 0; // number of digits collected for the current group

        for ($i = 0; $i < $length && $input[$i] !== '~'; $i++) {
            $c = $input[$i];

            if (strpos("\0\t\r\f\n ", $c) !== false) {
                continue;
            }
            if ($c === 'z' && $state === 0) {
                $output .= str_repeat(chr(0), 4);
                continue;
            }

            $code = ord($c);
            if ($code < 33 || $code > 117) { // outside '!' .. 'u'
                return "";
            }

            $ords[$state++] = $code - 33;
            if ($state === 5) {
                $output .= $this->ascii85GroupToBytes($ords, 4);
                $state = 0;
            }
        }

        if ($state === 1) {
            return "";
        }
        if ($state > 1) {
            // A final group of n digits encodes n - 1 bytes; the missing
            // digits are padded with the highest digit value (84, i.e. 'u').
            for ($k = $state; $k < 5; $k++) {
                $ords[$k] = 84;
            }
            $output .= $this->ascii85GroupToBytes($ords, $state - 1);
        }
        return $output;
    }

    /**
     * Converts five base-85 digit values to the leading bytes of their 32-bit value.
     *
     * @param array $ords      Five digit values (0..84), most significant first.
     * @param int   $byteCount Number of leading bytes to return (1..4).
     * @return string
     */
    private function ascii85GroupToBytes($ords, $byteCount)
    {
        $sum = 0;
        for ($k = 0; $k < 5; $k++) {
            $sum = $sum * 85 + $ords[$k];
        }

        $bytes = "";
        for ($k = 0; $k < $byteCount; $k++) {
            $bytes .= chr(($sum >> ((3 - $k) * 8)) & 0xFF);
        }
        return $bytes;
    }

    /**
     * Decodes FlateDecode data with gzuncompress().
     *
     * @param string $input Compressed data.
     * @return string|false The value returned by gzuncompress().
     */
    public function decodeFlate($input)
    {
        return gzuncompress($input);
    }

    /**
     * Parses the first "<< ... >>" dictionary of an object into an array.
     *
     * The dictionary text is split on "/". A segment of the form "Name value ..."
     * becomes Name => first value token; a bare "Name" becomes Name => true.
     * Array brackets glued to a name (as in "/Filter [/FlateDecode]") are
     * stripped from the key so the filter can be recognised by name.
     *
     * @param string $object Raw object text.
     * @return array Empty when no dictionary is found.
     */
    public function getObjectOptions($object)
    {
        if (!preg_match("#<<(.*)>>#ismU", $object, $match)) {
            return array();
        }

        $segments = explode("/", $match[1]);
        array_shift($segments); // text in front of the first "/"

        $options = array();
        foreach ($segments as $segment) {
            $segment = preg_replace("#\s+#", " ", trim($segment));
            if (strpos($segment, " ") !== false) {
                $parts = explode(" ", $segment);
                $options[trim($parts[0], "[]")] = $parts[1];
            } else {
                $options[trim($segment, "[]")] = true;
            }
        }
        return $options;
    }

    /**
     * Applies the filters named in $options to a raw stream.
     *
     * Filters are applied in the order in which their names occur as keys of
     * $options. "Crypt" is not implemented and is ignored.
     *
     * @param string $stream  Raw stream data.
     * @param array  $options Dictionary as returned by getObjectOptions().
     * @return string|false The stream itself when no "Filter" entry is set,
     *                      otherwise the decoded data (false if gzuncompress() fails last).
     */
    public function getDecodedStream($stream, $options)
    {
        if (empty($options["Filter"])) {
            return $stream;
        }

        // Honour a positive numeric Length; otherwise use the whole stream.
        $length = strlen($stream);
        if (!empty($options["Length"]) && is_numeric($options["Length"]) && (int) $options["Length"] > 0) {
            $length = (int) $options["Length"];
        }
        $data = substr($stream, 0, $length);

        foreach ($options as $key => $value) {
            // Strict comparison: with "==" an integer key 0 equals every
            // filter name on PHP < 8.
            $key = (string) $key;
            if ($key === "ASCIIHexDecode") {
                $data = $this->decodeAsciiHex($data);
            } elseif ($key === "ASCII85Decode") {
                $data = $this->decodeAscii85($data);
            } elseif ($key === "FlateDecode") {
                $data = $this->decodeFlate($data);
            }
            // "Crypt": TO DO
        }
        return $data;
    }

    /**
     * Appends the raw string operands found in text containers to $texts.
     *
     * For each container the first matching form wins: "[...] TJ" arrays,
     * then "(...) Tj" strings, then "[...] Tj" arrays following a
     * Td/Tw/Tm/Tf operator.
     *
     * @param array $texts          Collected operands, extended in place.
     * @param array $textContainers Contents of BT ... ET sections.
     */
    public function getDirtyTexts(&$texts, $textContainers)
    {
        $patterns = array(
            "#\[(.*)\]\s*TJ[\n|\r]#ismU",
            "#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU",
            "#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU",
        );

        foreach ($textContainers as $container) {
            foreach ($patterns as $pattern) {
                if (preg_match_all($pattern, $container, $parts)) {
                    $texts = array_merge($texts, $parts[1]);
                    break;
                }
            }
        }
    }

    /**
     * Collects character-code mappings from bfchar and bfrange sections.
     *
     * Keys are four upper-case hex digits (two-digit source codes are padded
     * on the right with "0", matching the padding applied when text is decoded);
     * values are hex strings.
     *
     * @param array  $transformations Map extended in place.
     * @param string $stream          Decoded stream data.
     */
    public function getCharTransformations(&$transformations, $stream)
    {
        preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
        preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);

        foreach ($chars as $section) {
            $count = (int) $section[1];
            // Accept CR, LF and CRLF line ends.
            $lines = preg_split("#\r\n|\r|\n#", trim($section[2]));
            $lineCount = count($lines);
            for ($k = 0; $k < $count && $k < $lineCount; $k++) {
                if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($lines[$k]), $map)) {
                    // Upper-case so the key agrees with the sprintf('%04X') keys below.
                    $transformations[strtoupper(str_pad($map[1], 4, "0"))] = $map[2];
                }
            }
        }

        foreach ($ranges as $section) {
            $count = (int) $section[1];
            $lines = preg_split("#\r\n|\r|\n#", trim($section[2]));
            $lineCount = count($lines);
            for ($k = 0; $k < $count && $k < $lineCount; $k++) {
                $line = trim($lines[$k]);
                if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $line, $map)) {
                    // <from> <to> <start>: consecutive targets.
                    $from = hexdec($map[1]);
                    $to = hexdec($map[2]);
                    $start = hexdec($map[3]);
                    for ($m = $from, $n = 0; $m <= $to; $m++, $n++) {
                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", $start + $n);
                    }
                } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $line, $map)) {
                    // <from> <to> [<t1> <t2> ...]: explicit targets.
                    $from = hexdec($map[1]);
                    $to = hexdec($map[2]);
                    $parts = preg_split("#\s+#", trim($map[3]));
                    $partCount = count($parts);
                    for ($m = $from, $n = 0; $m <= $to && $n < $partCount; $m++, $n++) {
                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
                    }
                }
            }
        }
    }

    /**
     * Turns raw string operands into text, one line per operand.
     *
     * "(...)" literal strings are unescaped; "<...>" hex strings are split
     * into codes of $multibyte digits, mapped through $transformations where
     * a mapping exists, and converted via numeric HTML entities.
     *
     * @param array $texts           Operands collected by getDirtyTexts().
     * @param array $transformations Map built by getCharTransformations().
     * @return string
     */
    public function getTextUsingTransformations($texts, $transformations)
    {
        $document = "";

        foreach ($texts as $text) {
            $text = (string) $text;
            $length = strlen($text);
            $isHex = false;
            $parenDepth = 0; // > 0 while inside a literal string
            $hex = "";
            $plain = "";

            for ($j = 0; $j < $length; $j++) {
                $c = $text[$j];

                if ($parenDepth > 0) {
                    // Inside "(...)": '<' and '>' are ordinary characters and
                    // balanced parentheses belong to the string.
                    if ($c === "\\") {
                        $plain .= $this->decodeStringEscape($text, $j);
                    } elseif ($c === "(") {
                        $parenDepth++;
                        $plain .= $c;
                    } elseif ($c === ")") {
                        $parenDepth--;
                        if ($parenDepth === 0) {
                            $document .= $plain;
                        } else {
                            $plain .= $c;
                        }
                    } else {
                        $plain .= $c;
                    }
                    continue;
                }

                if ($isHex) {
                    if ($c === ">") {
                        $document .= $this->decodeHexString($hex, $transformations);
                        $isHex = false;
                    } elseif ($c === "<") {
                        $hex = "";
                    } elseif (strpos("\0\t\r\f\n ", $c) === false) {
                        $hex .= $c; // white-space inside a hex string is skipped
                    }
                    continue;
                }

                if ($c === "<") {
                    $hex = "";
                    $isHex = true;
                } elseif ($c === "(") {
                    $plain = "";
                    $parenDepth = 1;
                }
            }

            $document .= "\n";
        }

        return $document;
    }

    /**
     * Decodes the contents of one hex string.
     *
     * NOTE: codes shorter than four digits are padded on the right with "0"
     * before lookup and conversion; this mirrors the key padding used by
     * getCharTransformations().
     *
     * @param string $hex             Hex digits between '<' and '>'.
     * @param array  $transformations Character map.
     * @return string
     */
    private function decodeHexString($hex, $transformations)
    {
        if ($hex === "") {
            return "";
        }

        $decoded = "";
        foreach (str_split($hex, $this->multibyte) as $chunk) {
            $code = str_pad($chunk, 4, "0");
            $upper = strtoupper($code);
            // Exact-case keys are still honoured for maps supplied by callers.
            if (isset($transformations[$code])) {
                $code = $transformations[$code];
            } elseif (isset($transformations[$upper])) {
                $code = $transformations[$upper];
            }
            $decoded .= html_entity_decode("&#x" . $code . ";");
        }
        return $decoded;
    }

    /**
     * Decodes the escape sequence whose backslash sits at $pos.
     *
     * Handles \\ \( \) \n \r \t \b \f, one to three octal digits, and a
     * backslash in front of a line end (line continuation, yields nothing).
     * For any other character the backslash is dropped and the character kept.
     *
     * @param string $text Operand being scanned.
     * @param int    $pos  Index of the backslash; advanced to the last
     *                     character consumed by the escape.
     * @return string Replacement text.
     */
    private function decodeStringEscape($text, &$pos)
    {
        $length = strlen($text);
        if ($pos + 1 >= $length) {
            return ""; // dangling backslash at the very end
        }

        $next = $text[++$pos];

        if ($next === "\\" || $next === "(" || $next === ")") {
            return $next;
        }
        if ($next === "n") {
            return "\n";
        }
        if ($next === "r") {
            return "\r";
        }
        if ($next === "t") {
            return "\t";
        }
        if ($next === "b") {
            return chr(8);
        }
        if ($next === "f") {
            return chr(12);
        }
        if (strpos("01234567", $next) !== false) {
            $oct = $next;
            while (strlen($oct) < 3 && $pos + 1 < $length && strpos("01234567", $text[$pos + 1]) !== false) {
                $oct .= $text[++$pos];
            }
            // High-order overflow beyond one byte is ignored.
            return html_entity_decode("&#" . (octdec($oct) & 0xFF) . ";", $this->convertquotes);
        }
        if ($next === "\r") {
            if ($pos + 1 < $length && $text[$pos + 1] === "\n") {
                $pos++;
            }
            return "";
        }
        if ($next === "\n") {
            return "";
        }
        return $next;
    }
}
?>
replace line 368
$document .= "\n";
to
$document .= "<br>";
How do I actually execute the code? I can't get it to work :/ I have the on my index.php and a onchange="" attribute inside a but it doesn't echo anything.
So I've been using this handy little class, but it has one severe drawback: it does not order the objects in the PDF according to their position in the document. Since I had to fetch a number at the top of PDF documents, I hacked the getDirtyTexts() method a bit, like this, to order them by vertical position:
// Commenter's replacement for getDirtyTexts(). Unlike the class method it never
// writes to $texts: it returns one integer-looking string taken from the first
// text container, or false.
function
getDirtyTexts(&$texts, $textContainers) {
// Only the first container is examined; it is cut into pieces at every 'Tj'.
$textContainers_pieces = explode('Tj',$textContainers[0]);
$text_holder = array();
foreach ($textContainers_pieces as $piece) {
// First space-delimited token of the piece.
$string = strtok($piece, " ");
// Compares the token's character at offset 5 with the piece currently at the
// front of $text_holder and moves the piece to the front when it is greater.
// NOTE(review): presumably offset 5 is meant to be a digit of the vertical
// coordinate — confirm. $text_holder[0] is not set on the first iteration.
if ($string[5] > $text_holder[0]) {
array_unshift($text_holder,$piece);
} else {
array_push($text_holder, $piece);
}
}
// The piece that ended up at the front is taken as the top-most one.
$highest_string = $text_holder[0];
// Extract the first parenthesised string of that piece.
preg_match('#\((.*?)\)#', $highest_string, $match);
// Return it only if it survives a round trip through an int cast unchanged.
if ( (string)(int)$match[1] == $match[1]) {
return $match[1];
}
return false;
`}`
@spiderwisp, I guess you made some more changes, because your function doesn't seem to fit into the current code.
I saw in some PDF files that the Tj tag at the end of the last text container can be missing. Therefore I have changed the function getDirtyTexts to the following:
/**
 * Appends one entry per text container to $texts.
 *
 * Each container is tested against three patterns in turn; for the first
 * pattern that matches, all captured operands are concatenated into a single
 * string and that string is appended. Containers that match none are skipped.
 *
 * @param array $texts          Collected strings, extended in place.
 * @param array $textContainers Text container contents, indexed from 0.
 */
function getDirtyTexts(&$texts, $textContainers) {
    // Tried in this order; the last one does not require a trailing Tj.
    $patterns = array(
        "#\[(.*)\]\s*TJ[\n|\r]#ismU",
        "#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU",
        "#T[d|w|m|f]\s*(\[.*\])\s*#ismU",
    );

    $total = count($textContainers);
    $index = 0;
    while ($index < $total) {
        $container = $textContainers[$index];
        $index++;

        foreach ($patterns as $pattern) {
            if (preg_match_all($pattern, $container, $found)) {
                $joined = @implode('', $found[1]);
                $texts = array_merge($texts, array($joined));
                break;
            }
        }
    }
}
Hello.
Your code is useful for me.
But in some documents the plain text result contains parts with random chars.
Example given
"WiIbi tYaeuWZttW/IRIWYtoCWVt[HVt WF33WeH[a WpoaHsWEeYWK33WeH[a WiIbiWtaHeu"
It seems like it tries to convert some binary objects (images) to plain text.
Hi,
The same thing happens to me as to derUli: some PDFs are read correctly, while others produce garbled text.
Example of failure: https://boe.es/boe/dias/2018/12/03/pdfs/BOE-A-2018-16469.pdf
Result:
\���������8(��GH����GH�IHEUHUR�GH��������UHVWULQJLGRV��GH�GLiORJR�FRPSHWLWLYR��GH� OLFLWDFLyQ�FRQ�QHJRFLDFLyQ�\�GH�DVRFLDFLyQ�SDUD�OD�LQQRYDFLyQ��/D�LQWHUYHQFLyQ�GH�OD�0HVD� VHUi�SRWHVWDWLYD�HQ�ORV�SURFHGLPLHQWRV�QHJRFLDGRV�HQ�TXH�QR�VHD�QHFHVDULR�SXEOLFDU� DQXQFLRV�GH�OLFLWDFLyQ��VDOYR�TXH�VH�IXQGDPHQWH�HQ�OD�H[LVWHQFLD�GH�XQD�LPSHULRVD� XUJHQFLD�SUHYLVWD�HQ�HO�DUWtFXOR���������E\f�GH�OD�/H\���������GH���GH�QRYLHPEUH��HQ�HO�TXH� VHUi�REOLJDWRULD��DVt�FRPR�HQ�ORV�SURFHGLPLHQWRV�DELHUWRV�VLPSOLILFDGRV�D�ORV�TXH�VH� UHILHUH�HO�DUWtFXOR�������GH�OD�FLWDGD�OH\��GH�PRGR�TXH�HQ�HVWRV�SURFHGLPLHQWRV�VROR�
I am using this code, but it can't detect new lines.
Can anyone help me with this problem?