-
-
Save neko-fire/7038322 to your computer and use it in GitHub Desktop.
<?php
/*
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
This code is an improved version of what can be found at:
http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php
AUTHOR:
- Webcheatsheet.com (Original code)
- Joeri Stegeman (joeri210 [at] yahoo [dot] com) (Class conversion and fixes/adjustments)
DESCRIPTION:
This is a class to convert PDF files into ASCII text or so called PDF text extraction.
It will ignore anything that is not addressed as text within the PDF and any layout.
Currently supported filters are: ASCIIHexDecode, ASCII85Decode, FlateDecode
PURPOSE(S):
Most likely for people that want their PDF to be searchable.
SYNTAX:
include('class.pdf2text.php');
$a = new PDF2Text();
$a->setFilename('test.pdf');
$a->decodePDF();
echo $a->output();
ALTERNATIVES:
Other excellent options to search within a PDF:
- Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution
- pdflib TET (http://www.pdflib.com/products/tet/)
- Online converter: http://snowtide.com/PDFTextStream
*/
/**
 * Extracts the text of a PDF file, ignoring layout and anything that is not
 * addressed as text.
 *
 * Supported stream filters: ASCIIHexDecode, ASCII85Decode, FlateDecode.
 * Character maps (bfchar / bfrange sections) found in any stream are merged
 * into one global table and applied to every hexadecimal string, regardless
 * of which font the string was drawn with.
 */
class PDF2Text {
    // Some settings

    /** Number of hex digits that form one character code inside <...> strings (2 or 4). Use setUnicode(TRUE|FALSE). */
    public $multibyte = 2;

    /** Quote flag handed to html_entity_decode(): ENT_COMPAT (double-quotes), ENT_QUOTES (both), ENT_NOQUOTES (none). */
    public $convertquotes = ENT_QUOTES;

    // Variables

    /** Path of the PDF file that decodePDF() reads. */
    public $filename = '';

    /** Text produced by the last decodePDF() call. */
    public $decodedtext = '';

    /**
     * Selects the PDF file to read and discards any previously decoded text.
     *
     * @param string $filename Path handed to file_get_contents().
     */
    function setFilename($filename) {
        // Reset
        $this->decodedtext = '';
        $this->filename = $filename;
    }

    /**
     * Gives access to the decoded text.
     *
     * @param bool $echo When true the text is echoed and nothing is returned.
     * @return string|null The decoded text, or null when it was echoed.
     */
    function output($echo = false) {
        if ($echo) {
            echo $this->decodedtext;
            return null;
        }
        return $this->decodedtext;
    }

    /**
     * Chooses how many hex digits make up one character code.
     *
     * @param bool $input True selects 4 digits, anything else 2 digits.
     */
    function setUnicode($input) {
        // 4 for unicode. But 2 should work in most cases just fine
        $this->multibyte = ($input == true) ? 4 : 2;
    }

    /**
     * Reads the configured file, decodes its text streams and stores the
     * result in $decodedtext.
     *
     * @return string|null "" when the file is missing or empty, otherwise nothing.
     */
    function decodePDF() {
        // The second argument of file_get_contents() is use_include_path, so
        // the former FILE_BINARY flag had no meaning here and is deprecated.
        $infile = @file_get_contents($this->filename);
        if (empty($infile))
            return "";

        // Get all text data.
        $transformations = array();
        $texts = array();

        // Get the list of all objects.
        preg_match_all("#obj[\r\n](.*)endobj[\r\n]#ismU", $infile, $objects);
        $objects = isset($objects[1]) ? $objects[1] : array();

        foreach ($objects as $currentObject) {
            // Only objects that carry a data stream are interesting.
            if (!preg_match("#stream[\r\n](.*)endstream[\r\n]#ismU", $currentObject, $stream))
                continue;
            $stream = ltrim($stream[1]);

            // Objects that declare Length1, Type or Subtype are not treated as text data.
            $options = $this->getObjectOptions($currentObject);
            if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])))
                continue;

            // Hack, length doesnt always seem to be correct
            unset($options["Length"]);

            // So, we have text data. Decode it.
            $data = $this->getDecodedStream($stream, $options);
            if (!strlen($data))
                continue;

            // A stream either holds BT...ET text containers or is searched for character maps.
            if (preg_match_all("#BT[\r\n](.*)ET[\r\n]#ismU", $data, $textContainers))
                $this->getDirtyTexts($texts, $textContainers[1]);
            else
                $this->getCharTransformations($transformations, $data);
        }

        // Analyze text blocks taking into account character transformations and return results.
        $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations);
    }

    /**
     * Decodes ASCIIHexDecode data.
     *
     * White space is skipped, text after '%' up to the end of the line is
     * ignored, and a final unpaired digit is completed with a zero nibble.
     *
     * @param string $input Hex digits terminated by '>'.
     * @return string Decoded bytes, or "" on an invalid character or a missing '>'.
     */
    function decodeAsciiHex($input) {
        $output = "";
        $haveHigh = false; // true while a high nibble waits for its low nibble
        $codeHigh = 0;
        $isComment = false;
        $length = strlen($input);

        for ($i = 0; $i < $length && $input[$i] != '>'; $i++) {
            $c = $input[$i];
            if ($isComment) {
                if ($c == "\r" || $c == "\n")
                    $isComment = false;
                continue;
            }
            switch ($c) {
                // Double quotes are required: '\n' would be a backslash followed by "n".
                case "\0": case "\t": case "\r": case "\f": case "\n": case " ":
                    break;
                case '%':
                    $isComment = true;
                    break;
                default:
                    if (strpos("0123456789abcdefABCDEF", $c) === false)
                        return "";
                    $code = hexdec($c);
                    if ($haveHigh)
                        $output .= chr($codeHigh * 16 + $code);
                    else
                        $codeHigh = $code;
                    $haveHigh = !$haveHigh;
                    break;
            }
        }

        // The loop only stops early on '>', so reaching the end means no terminator.
        if ($i >= $length)
            return "";

        // Odd number of digits: behave as if a 0 followed the last digit.
        if ($haveHigh)
            $output .= chr($codeHigh * 16);
        return $output;
    }

    /**
     * Decodes ASCII85Decode data.
     *
     * White space is skipped, 'z' outside a group stands for four zero bytes
     * and decoding stops at '~'. '%' is an ordinary base-85 digit here and is
     * not treated as a comment marker.
     *
     * @param string $input Base-85 text, normally terminated by "~>".
     * @return string Decoded bytes, or "" on an invalid character or a
     *                dangling single-character final group.
     */
    function decodeAscii85($input) {
        $output = "";
        $ords = array();
        $state = 0;
        $length = strlen($input);

        for ($i = 0; $i < $length && $input[$i] != '~'; $i++) {
            $c = $input[$i];
            if ($c == "\0" || $c == "\t" || $c == "\r" || $c == "\f" || $c == "\n" || $c == " ")
                continue;
            if ($c == 'z' && $state === 0) {
                $output .= str_repeat(chr(0), 4);
                continue;
            }
            $code = ord($c);
            if ($code < 33 || $code > 117) // outside '!'..'u'
                return "";
            $ords[$state++] = $code - 33;
            if ($state == 5) {
                $state = 0;
                for ($sum = 0, $j = 0; $j < 5; $j++)
                    $sum = $sum * 85 + $ords[$j];
                for ($j = 3; $j >= 0; $j--)
                    $output .= chr(($sum >> ($j * 8)) & 0xFF);
            }
        }

        if ($state === 1)
            return "";
        if ($state > 1) {
            // Pad the partial group with the highest digit ('u'), decode it
            // and keep one byte fewer than the number of characters read.
            for ($j = $state; $j < 5; $j++)
                $ords[$j] = 84;
            for ($sum = 0, $j = 0; $j < 5; $j++)
                $sum = $sum * 85 + $ords[$j];
            for ($j = 0; $j < $state - 1; $j++)
                $output .= chr(($sum >> ((3 - $j) * 8)) & 0xFF);
        }
        return $output;
    }

    /**
     * Decodes FlateDecode (zlib) data.
     *
     * @param string $input Compressed bytes.
     * @return string Inflated bytes, or "" when the data cannot be inflated.
     */
    function decodeFlate($input) {
        // gzuncompress() warns and returns false on corrupt data; callers expect a string.
        $decoded = @gzuncompress($input);
        return ($decoded === false) ? "" : $decoded;
    }

    /**
     * Parses the first "<< ... >>" dictionary of an object into an array.
     *
     * The dictionary text is split on "/". A piece with a space becomes
     * name => first following token; a piece without one becomes name => true.
     * That is why "/Filter /FlateDecode" yields both a "Filter" and a
     * "FlateDecode" key. Square brackets are dropped so that names listed in
     * an array, e.g. "[/ASCII85Decode /FlateDecode]", are recognised as well.
     *
     * @param string $object Raw object text.
     * @return array Parsed options; empty when no dictionary is found.
     */
    function getObjectOptions($object) {
        $options = array();
        if (preg_match("#<<(.*)>>#ismU", $object, $matches)) {
            $entries = explode("/", $matches[1]);
            array_shift($entries); // text in front of the first "/"
            foreach ($entries as $entry) {
                $entry = trim(preg_replace("#[\[\]\s]+#", " ", $entry));
                if ($entry === "")
                    continue;
                if (strpos($entry, " ") !== false) {
                    $parts = explode(" ", $entry);
                    $options[$parts[0]] = $parts[1];
                } else
                    $options[$entry] = true;
            }
        }
        return $options;
    }

    /**
     * Applies the filters named in $options to a raw stream.
     *
     * Filters run in the order their names appear among the option keys.
     *
     * @param string $stream  Raw stream bytes.
     * @param array  $options Result of getObjectOptions().
     * @return string The decoded data, or the stream itself when no Filter is set.
     */
    function getDecodedStream($stream, $options) {
        if (empty($options["Filter"]))
            return $stream;

        $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream);
        $_stream = substr($stream, 0, $length);

        foreach ($options as $key => $value) {
            // Strict comparison on the string form: numeric keys are stored
            // as integers and 0 == "anything" is true before PHP 8.
            switch ((string) $key) {
                case "ASCIIHexDecode":
                    $_stream = $this->decodeAsciiHex($_stream);
                    break;
                case "ASCII85Decode":
                    $_stream = $this->decodeAscii85($_stream);
                    break;
                case "FlateDecode":
                    $_stream = $this->decodeFlate($_stream);
                    break;
                case "Crypt": // TO DO
                    break;
            }
        }
        return $_stream;
    }

    /**
     * Collects the string operands of the text-showing operators found in
     * BT...ET containers.
     *
     * @param array $texts          Receives the raw operands (appended).
     * @param array $textContainers Contents of the BT...ET sections.
     */
    function getDirtyTexts(&$texts, $textContainers) {
        foreach ($textContainers as $container) {
            if (preg_match_all("#\[(.*)\]\s*TJ[\r\n]#ismU", $container, $parts))
                $texts = array_merge($texts, $parts[1]);
            elseif (preg_match_all("#T[dwmf]\s*(\(.*\))\s*Tj[\r\n]#ismU", $container, $parts))
                $texts = array_merge($texts, $parts[1]);
            elseif (preg_match_all("#T[dwmf]\s*(\[.*\])\s*Tj[\r\n]#ismU", $container, $parts))
                $texts = array_merge($texts, $parts[1]);
        }
    }

    /**
     * Reads the bfchar and bfrange sections of a stream into a lookup table.
     *
     * Keys are upper-case hex codes left-padded to four digits; values are
     * upper-case hex strings of one or more 4-digit units. This is the form
     * getTextUsingTransformations() looks up.
     *
     * @param array  $transformations Receives the mappings (added in place).
     * @param string $stream          Decoded stream content.
     */
    function getCharTransformations(&$transformations, $stream) {
        preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
        preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);

        foreach ($chars as $section) {
            $count = $section[1];
            $lines = preg_split("#[\r\n]+#", trim($section[2]));
            for ($k = 0; $k < $count && $k < count($lines); $k++) {
                if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($lines[$k]), $map))
                    $transformations[strtoupper(str_pad($map[1], 4, "0", STR_PAD_LEFT))] = strtoupper($map[2]);
            }
        }

        foreach ($ranges as $section) {
            $count = $section[1];
            $lines = preg_split("#[\r\n]+#", trim($section[2]));
            for ($k = 0; $k < $count && $k < count($lines); $k++) {
                $line = trim($lines[$k]);
                if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $line, $map)) {
                    // <from> <to> <start>: consecutive codes map to consecutive targets.
                    $from = hexdec($map[1]);
                    $to = hexdec($map[2]);
                    $_from = hexdec($map[3]);
                    for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
                } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $line, $map)) {
                    // <from> <to> [<t1> <t2> ...]: one explicit target per code.
                    $from = hexdec($map[1]);
                    $to = hexdec($map[2]);
                    $parts = preg_split("#\s+#", trim($map[3]));
                    for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) {
                        // Drop the angle brackets before the value is used as hex.
                        $target = strtoupper(trim($parts[$n], "<>"));
                        $transformations[sprintf("%04X", $m)] = str_pad($target, 4, "0", STR_PAD_LEFT);
                    }
                }
            }
        }
    }

    /**
     * Turns raw string operands into readable text.
     *
     * Literal strings "(...)" are copied with their escape sequences
     * resolved; hexadecimal strings "<...>" are split into character codes,
     * mapped through $transformations and converted to characters. Anything
     * outside a string (such as the spacing numbers of a TJ array) is
     * dropped. Each operand ends with a newline.
     *
     * @param array $texts           Operands gathered by getDirtyTexts().
     * @param array $transformations Table built by getCharTransformations().
     * @return string The extracted text.
     */
    function getTextUsingTransformations($texts, $transformations) {
        $document = "";
        for ($i = 0; $i < count($texts); $i++) {
            $text = $texts[$i];
            $length = strlen($text);
            $isHex = false;
            $isPlain = false;
            $hex = "";
            $plain = "";
            for ($j = 0; $j < $length; $j++) {
                $c = $text[$j];

                // Inside a literal string "<" and ">" are ordinary characters.
                if ($isPlain && ($c == "<" || $c == ">")) {
                    $plain .= $c;
                    continue;
                }

                switch ($c) {
                    case "<":
                        $hex = "";
                        $isHex = true;
                        break;
                    case ">":
                        // Only close a string that was actually opened; otherwise
                        // the previous hex string would be emitted a second time.
                        if ($isHex)
                            $document .= $this->decodeHexString($hex, $transformations);
                        $hex = "";
                        $isHex = false;
                        break;
                    case "(":
                        $plain = "";
                        $isPlain = true;
                        break;
                    case ")":
                        $document .= $plain;
                        $isPlain = false;
                        break;
                    case "\\":
                        if ($j + 1 >= $length)
                            break; // lone backslash at the very end
                        $c2 = $text[$j + 1];
                        $consumed = 1; // characters used up after the backslash
                        if ($c2 == "n") $plain .= "\n";
                        elseif ($c2 == "r") $plain .= "\r";
                        elseif ($c2 == "t") $plain .= "\t";
                        elseif ($c2 == "b") $plain .= chr(8);
                        elseif ($c2 == "f") $plain .= "\f";
                        elseif (preg_match("#^[0-9]{1,3}#", substr($text, $j + 1, 3), $oct)) {
                            // Up to three adjacent digits form a character code.
                            $consumed = strlen($oct[0]);
                            $plain .= html_entity_decode("&#".octdec($oct[0]).";", $this->convertquotes);
                        } elseif ($c2 != "\r" && $c2 != "\n") {
                            // "\\", "\(", "\)" and unknown escapes keep the character itself;
                            // a backslash before a line break is a line continuation.
                            $plain .= $c2;
                        }
                        $j += $consumed;
                        break;
                    default:
                        if ($isHex)
                            $hex .= $c;
                        if ($isPlain)
                            $plain .= $c;
                        break;
                }
            }
            $document .= "\n";
        }
        return $document;
    }

    /**
     * Converts the content of one hexadecimal string into text.
     *
     * @param string $hex             Characters found between "<" and ">".
     * @param array  $transformations Table built by getCharTransformations().
     * @return string The decoded characters.
     */
    function decodeHexString($hex, $transformations) {
        $hex = preg_replace("#[^0-9a-f]#i", "", $hex); // white space may appear between digits
        if ($hex === "")
            return "";
        if (strlen($hex) % 2)
            $hex .= "0"; // a missing final digit counts as zero

        $text = "";
        foreach (str_split($hex, $this->multibyte) as $chunk) {
            // Left padding keeps the numeric value of short codes: "48" is 0048, not 4800.
            $code = strtoupper(str_pad($chunk, 4, "0", STR_PAD_LEFT));
            if (isset($transformations[$code]))
                $code = $transformations[$code];
            $text .= $this->hexToText($code);
        }
        return $text;
    }

    /**
     * Converts a hex string made of 4-digit UTF-16 units into characters.
     *
     * @param string $hex One or more 4-digit units; a surrogate pair is combined.
     * @return string The characters, in the charset html_entity_decode() uses by default.
     */
    function hexToText($hex) {
        $text = "";
        $units = str_split($hex, 4);
        for ($k = 0; $k < count($units); $k++) {
            $code = hexdec($units[$k]);
            if ($code >= 0xD800 && $code <= 0xDBFF && isset($units[$k + 1])) {
                $low = hexdec($units[$k + 1]);
                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $k++;
                }
            }
            $text .= html_entity_decode(sprintf("&#x%04X;", $code), $this->convertquotes);
        }
        return $text;
    }
}
?>
Hi,
The same thing happens to me as to derUli: some PDFs are read correctly, while others produce garbled text.
Example of failure: https://boe.es/boe/dias/2018/12/03/pdfs/BOE-A-2018-16469.pdf
Result:
\���������8(��GH����GH�IHEUHUR�GH��������UHVWULQJLGRV��GH�GLiORJR�FRPSHWLWLYR��GH� OLFLWDFLyQ�FRQ�QHJRFLDFLyQ�\�GH�DVRFLDFLyQ�SDUD�OD�LQQRYDFLyQ��/D�LQWHUYHQFLyQ�GH�OD�0HVD� VHUi�SRWHVWDWLYD�HQ�ORV�SURFHGLPLHQWRV�QHJRFLDGRV�HQ�TXH�QR�VHD�QHFHVDULR�SXEOLFDU� DQXQFLRV�GH�OLFLWDFLyQ��VDOYR�TXH�VH�IXQGDPHQWH�HQ�OD�H[LVWHQFLD�GH�XQD�LPSHULRVD� XUJHQFLD�SUHYLVWD�HQ�HO�DUWtFXOR���������E\f�GH�OD�/H\���������GH���GH�QRYLHPEUH��HQ�HO�TXH� VHUi�REOLJDWRULD��DVt�FRPR�HQ�ORV�SURFHGLPLHQWRV�DELHUWRV�VLPSOLILFDGRV�D�ORV�TXH�VH� UHILHUH�HO�DUWtFXOR�������GH�OD�FLWDGD�OH\��GH�PRGR�TXH�HQ�HVWRV�SURFHGLPLHQWRV�VROR�
Hello.
Your code is useful for me.
But in some documents the plain text result contains parts with random chars.
For example:
"WiIbi tYaeuWZttW/IRIWYtoCWVt[HVt WF33WeH[a WpoaHsWEeYWK33WeH[a WiIbiWtaHeu"
It seems like it tries to convert some binary objects (images) to plain text.