Skip to content

Instantly share code, notes, and snippets.

@jetonr
Last active July 29, 2016 14:02
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save jetonr/5d7a74587fd53c3945464fdf37e017f5 to your computer and use it in GitHub Desktop.
pdf parser
<?php
include 'vendor/autoload.php';
// Raise PHP's memory ceiling to 1024M and the execution time limit to 300
// for this run (presumably because parsing a large PDF needs both).
ini_set('memory_limit','1024M');
ini_set('max_execution_time', 300);
// Disabled extraction step: would read page indices 50..201 of 42.pdf and
// write their cleaned text to 42.pdf.txt.
//var_dump(pages_text('42.pdf',50,202));
// Parse the text file 42.pdf.txt into records and dump the resulting array.
var_dump(text_to_array('42.pdf.txt'));
/**
 * Extracts the text of a range of PDF pages, cleans it up and writes it to
 * "<filename>.txt" (the PDF path with ".txt" appended).
 *
 * Pages are addressed by their index in the array returned by
 * Document::getPages(); $min is inclusive and $max is exclusive. Indices that
 * do not exist in the document are skipped, so a range that runs past the
 * last page no longer aborts with a fatal error.
 *
 * Clean-up applied to every page before it is appended:
 *  - a run of digits at the very start of the page text, followed by two
 *    whitespace-only line breaks, is replaced with a CRLF (presumably a page
 *    number header - confirm against the source PDF);
 *  - "(54" followed by one whitespace character is rewritten to "(546)"
 *    (presumably repairing a field code that was split during extraction);
 *  - every run of underscores is removed.
 *
 * @param string $filename Path of the PDF file to read.
 * @param int    $min      First page index to include.
 * @param int    $max      Page index to stop before.
 * @return void  Nothing is returned; the result is the written text file.
 */
function pages_text( $filename, $min, $max ) {
    $parser = new \Smalot\PdfParser\Parser();
    $pdf    = $parser->parseFile( $filename );
    $pages  = $pdf->getPages();

    $text = '';
    for ( $i = $min; $i < $max; $i++ ) {
        // Guard against ranges that reach outside the document: calling
        // getText() on a missing entry would be a fatal error.
        if ( !isset( $pages[$i] ) ) {
            continue;
        }

        $page_text = $pages[$i]->getText();
        // No "m" modifier: "^" only anchors at the start of the page text.
        $page_text = preg_replace( "/(^[0-9]+)\s+\n\s+\n/", "\r\n", $page_text );
        $page_text = preg_replace( '/\(54\s/', '(546)', $page_text );
        $page_text = preg_replace( '/_+/', '', $page_text );

        $text .= $page_text;
    }

    file_put_contents( $filename . ".txt", $text );
}
/**
 * Parses a text file made of numbered "(NNN)" field markers into records.
 *
 * The text is cut into records at every literal "(210)" marker. Inside a
 * record, every "(digits)" marker starts a new field: the digits become the
 * field key and the text up to the next marker becomes its trimmed value.
 * The text between "(210)" and the next marker is stored under key 210.
 *
 * Record 0 always holds whatever precedes the first "(210)" in the file
 * (possibly an empty string), stored under key 210. If a field code occurs
 * more than once inside one record, the last occurrence wins.
 *
 * @param string $filename Path of the text file to parse.
 * @return array List of records, each mapping field code => trimmed text.
 *               An empty array is returned when the file cannot be read.
 */
function text_to_array( $filename ) {
    $text = file_get_contents( $filename );
    if ( $text === false ) {
        // Unreadable file: report "no records" rather than one phantom
        // record with an empty 210 field.
        return array();
    }

    $records = array();
    foreach ( explode( '(210)', $text ) as $index => $chunk ) {
        // -1 means "no limit"; passing null here is deprecated since PHP 8.1.
        $parts = preg_split( '/\(([0-9]+)\)/s', $chunk, -1, PREG_SPLIT_DELIM_CAPTURE );
        if ( $parts === false ) {
            // Regex engine failure: keep the raw chunk as the 210 field.
            $parts = array( $chunk );
        }

        $records[$index] = array( 210 => trim( $parts[0] ) );

        // With PREG_SPLIT_DELIM_CAPTURE the result alternates
        // text, code, text, code, text...: odd slots are field codes and
        // the slot right after each one is that field's text.
        for ( $i = 1, $count = count( $parts ); $i + 1 < $count; $i += 2 ) {
            $records[$index][ trim( $parts[$i] ) ] = trim( $parts[$i + 1] );
        }
    }

    return $records;
}
/**
 * Collects the content of every image XObject in a parsed PDF, base64-encoded.
 *
 * The objects reported by getObjectsByType('XObject', 'Image') are walked in
 * reverse order, so the last reported image comes first in the result.
 *
 * @param object $pdf Parsed document exposing getObjectsByType().
 * @return array Zero-indexed list of base64 strings, one per image.
 */
function images_array( $pdf ) {
    // Drop the original keys so the images can be addressed by position.
    $images  = array_values( $pdf->getObjectsByType('XObject', 'Image') );
    $encoded = array();

    for ( $pos = count( $images ) - 1; $pos >= 0; $pos-- ) {
        $raw       = $images[$pos]->getContent();
        $encoded[] = base64_encode( $raw );
    }

    return $encoded;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment