smalot/PdfParser.php

## PdfParser.php
<?php

/**
 * @file
 * Class PdfParser
 *
 * @author : Sebastien MALOT <sebastien@malot.fr>
 * @date : 2013-08-08
 *
 * References :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 */
class PdfParser
{
  /**
   * Parse a PDF file and return the text it contains.
   *
   * @param string $filename Path of the PDF file to read.
   * @return string|null The extracted text, or null when the file cannot be
   *                     read or no text could be extracted.
   */
  public static function parseFile($filename)
  {
    $content = file_get_contents($filename);

    // file_get_contents() has already raised a warning; there is nothing to parse.
    if ($content === false) {
      return null;
    }

    return self::extractText($content);
  }

  /**
   * Parse PDF content that is already loaded in memory.
   *
   * @param string $content Raw bytes of the PDF document.
   * @return string|null The extracted text, or null when no text could be extracted.
   */
  public static function parseContent($content)
  {
    return self::extractText($content);
  }

  /**
   * Convert a PDF into text.
   *
   * The document is split into "obj ... endobj" sections. Every section that
   * has both a dictionary ("<< ... >>") and a stream is decoded (FlateDecode
   * is inflated, anything else is used as is) and scanned for text operators.
   *
   * @param string $data Raw bytes of the PDF document.
   * @return string|null The extracted text with runs of white space collapsed
   *                     to single spaces, or null when nothing was extracted.
   */
  protected static function extractText($data)
  {
    $data        = (string) $data;
    $result_data = '';

    foreach (self::getDataArray($data, 'obj', 'endobj') as $obj) {
      $a_filter = self::getDataArray($obj, '<<', '>>');
      $a_data   = self::getDataArray($obj, 'stream', 'endstream');

      // Objects without a dictionary or without a stream are ignored.
      if (!isset($a_filter[0]) || !isset($a_data[0])) {
        continue;
      }

      $stream = self::extractStreamData($a_data[0]);

      // Everything in front of the "stream" keyword is the stream dictionary.
      // Searching the whole header (instead of the first "<< ... >>" pair only)
      // also finds a /Filter entry that follows a nested dictionary.
      $header = (string) substr($obj, 0, strpos($obj, 'stream'));

      if (strpos($header, 'FlateDecode') !== false) {
        // gzuncompress() returns false on corrupt or unsupported data; such
        // streams are skipped. The warning is suppressed on purpose.
        $stream = @gzuncompress($stream);

        if ($stream === false) {
          continue;
        }
      } else {
        $stream = trim($stream);
      }

      if (trim($stream) != '') {
        // If we got data then attempt to extract it.
        $result_data .= ' ' . self::extractTextElements($stream);
      }
    }

    if (trim($result_data) == '') {
      return null;
    }

    // Re-join words that were hyphenated at a line end, then collapse white space.
    $result_data = preg_replace('/\s*-[\r\n]+\s*/', '', $result_data);
    $result_data = preg_replace('/\s+/', ' ', $result_data);

    return $result_data;
  }

  /**
   * Return the bytes located between the "stream" and "endstream" keywords.
   *
   * Only the end-of-line marker that follows "stream" and the one that
   * precedes "endstream" are removed. The payload may be binary (compressed)
   * data, so a general trim() must not be applied: it would strip payload
   * bytes that happen to be white space or NUL and make inflating fail.
   *
   * @param string $raw A section starting with "stream" and ending with "endstream".
   * @return string The stream payload.
   */
  protected static function extractStreamData($raw)
  {
    $body = (string) substr($raw, strlen('stream'), strlen($raw) - strlen('stream') - strlen('endstream'));

    // End-of-line marker after the "stream" keyword.
    if (substr($body, 0, 2) === "\r\n") {
      $body = (string) substr($body, 2);
    } elseif (substr($body, 0, 1) === "\n" || substr($body, 0, 1) === "\r") {
      $body = (string) substr($body, 1);
    }

    // End-of-line marker in front of the "endstream" keyword.
    if (substr($body, -2) === "\r\n") {
      $body = (string) substr($body, 0, -2);
    } elseif (substr($body, -1) === "\n" || substr($body, -1) === "\r") {
      $body = (string) substr($body, 0, -1);
    }

    return $body;
  }

  /**
   * Extract the visible text from a decoded content stream.
   *
   * The stream is processed line by line. A line is expected to look like
   * "<operands> <operator>"; only the text operators listed in the switch
   * below contribute to the result, every other line is ignored.
   *
   * @param string $content Decoded content stream.
   * @return string The text found in the stream (may be empty).
   */
  protected static function extractTextElements($content)
  {
    // Streams starting with /CIDInit are skipped entirely.
    if (strpos($content, '/CIDInit') === 0) {
      return '';
    }

    $text  = '';
    $lines = explode("\n", $content);

    foreach ($lines as $line) {
      $line    = trim($line);
      $matches = array();

      // Split the line into its operands ("command") and the trailing operator.
      if (preg_match('/^(?<command>.*[\)\] ])(?<operator>[a-z]+[\*]?)$/i', $line, $matches)) {
        $command  = self::decodeCommand(trim($matches['command']));
        $operator = trim($matches['operator']);
      } else {
        $command  = $line;
        $operator = '';
      }

      // Handle main operators
      switch ($operator) {
        // Move text current point.
        case 'Td':
          $values = explode(' ', $command);
          $y = array_pop($values);
          $x = array_pop($values);
          // Explicit casts: comparing a non-numeric string with an integer
          // gives different results on PHP 7 and PHP 8.
          if ((float) $x > 0) {
            $text .= ' ';
          }
          if ((float) $y < 0) {
            $text .= ' ';
          }
          break;

        // Move text current point and set leading.
        case 'TD':
          $values = explode(' ', $command);
          $y = array_pop($values);
          if ((float) $y < 0) {
            $text .= "\n";
          }
          break;

        // Set font name and size.
        case 'Tf':
          $text .= ' ';
          break;

        // Display text, allowing individual character positioning.
        case 'TJ':
          // A missing '[' yields false, which counts as 0 in the addition.
          $start = mb_strpos($command, '[', 0, 'UTF-8') + 1;
          $end   = mb_strrpos($command, ']', 0, 'UTF-8');
          $text .= self::parseTextCommand(mb_substr($command, $start, $end - $start, 'UTF-8'));
          break;

        // Display text.
        case 'Tj':
          $start = mb_strpos($command, '(', 0, 'UTF-8') + 1;
          $end   = mb_strrpos($command, ')', 0, 'UTF-8');
          $text .= mb_substr($command, $start, $end - $start, 'UTF-8'); // Removes round brackets
          break;

        // Move to start of next line.
        case 'T*':
          $text .= "\n";
          break;

        // Operators that do not produce text: character spacing, leading,
        // text matrix, rendering mode, text rise, word spacing, horizontal
        // scaling, graphics operators, begin/end text, and unparsed lines.
        case 'Tc':
        case 'TL':
        case 'Tm':
        case 'Tr':
        case 'Ts':
        case 'Tw':
        case 'Tz':
        case 'g':
        case 'gs':
        case 're':
        case 'f':
        case 'BT':
        case 'ET':
        case '':
        default:
          break;
      }
    }

    // Unescape round brackets that were escaped inside string operands.
    $text = str_replace(array('\\(', '\\)'), array('(', ')'), $text);

    return $text;
  }

  /**
   * Decode the escape sequences of an operand list and convert it to UTF-8.
   *
   * @param string $command Operands of one content stream line.
   * @return string The operands with octal escapes resolved, escaped line
   *                breaks removed, other white space escapes replaced by a
   *                space, and the result converted to UTF-8 when possible.
   */
  protected static function decodeCommand($command)
  {
    // Resolve octal escapes (\ddd) in one pass so that the output of one
    // replacement can never be mistaken for another escape sequence.
    $command = preg_replace_callback('/\\\\([0-9]{3})/', function ($match) {
      // Digits that read below 40 are octal codes below 040, i.e. control
      // characters in front of the space: those are dropped.
      if (intval($match[1]) < 40) {
        return '';
      }

      return chr(octdec($match[1]));
    }, $command);

    // Removes encoded new lines, tabs, ...
    $command = preg_replace('/\\\\[\r\n]/', '', $command);
    $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);

    // Force UTF-8 charset
    $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));
    if (strtoupper($encoding) != 'UTF-8') {
      // Keep the original bytes when the conversion fails or yields nothing.
      if ($decoded = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command)) {
        $command = $decoded;
      }
    }

    return $command;
  }

  /**
   * Strip out the text from the content of a TJ array.
   *
   * The array mixes strings in round brackets with numeric adjustments, e.g.
   * "(Hel) 20 (lo) -250 (World)". Strings are concatenated; a space is
   * inserted when the text between two strings is longer than 8 characters
   * or is a number below -50.
   *
   * @param string $text      Content of the TJ array without the square brackets.
   * @param int    $font_size Currently not used
   *
   * @return string
   */
  protected static function parseTextCommand($text, $font_size = 0) {

    $result = '';
    $cur_start_pos = 0;

    while (($cur_start_text = mb_strpos($text, '(', $cur_start_pos, 'UTF-8')) !== false) {
      // New text element found
      if ($cur_start_text - $cur_start_pos > 8) {
        $spacing = ' ';
      } else {
        $spacing_size = mb_substr($text, $cur_start_pos, $cur_start_text - $cur_start_pos, 'UTF-8');

        // The cast is required: on PHP 8 an empty or non-numeric string is
        // compared with -50 as a string, which would add a space in front of
        // every fragment.
        if ((float) $spacing_size < -50) {
          $spacing = ' ';
        } else {
          $spacing = '';
        }
      }
      $cur_start_text++;

      // Find the closing bracket, skipping brackets escaped with a backslash.
      $start_search_end = $cur_start_text;
      while (($cur_start_pos = mb_strpos($text, ')', $start_search_end, 'UTF-8')) !== false) {
        if (mb_substr($text, $cur_start_pos - 1, 1, 'UTF-8') != '\\') {
          break;
        }
        $start_search_end = $cur_start_pos + 1;
      }

      // Unbalanced brackets: stop and keep what was collected so far.
      if ($cur_start_pos === false) {
        break;
      }

      // Add to result
      $result .= $spacing . mb_substr($text, $cur_start_text, $cur_start_pos - $cur_start_text, 'UTF-8');
      $cur_start_pos++;
    }

    return $result;
  }

  /**
   * Convert a section of data into an array, separated by the start and end words.
   *
   * Each entry starts with $start_word and ends with the first $end_word that
   * follows it. Scanning resumes after the end word, so a start word that is
   * contained in the end word (e.g. "obj" in "endobj") is not matched again.
   *
   * @param  string $data       The data.
   * @param  string $start_word The start of each section of data.
   * @param  string $end_word   The end of each section of data.
   * @return array              The array of data.
   */
  protected static function getDataArray($data, $start_word, $end_word)
  {
    $data       = (string) $data;
    $a_results  = array();
    $offset     = 0;
    $end_length = strlen($end_word);

    while (($start = strpos($data, $start_word, $offset)) !== false) {
      $end = strpos($data, $end_word, $start);

      // An unterminated section ends the scan.
      if ($end === false) {
        break;
      }

      // data is between start and end
      $a_results[] = substr($data, $start, $end - $start + $end_length);
      $offset      = $end + $end_length;
    }

    return $a_results;
  }
}
	<?php

	/**
	* @file
	* Class PdfParser
	*
	* @author : Sebastien MALOT <sebastien@malot.fr>
	* @date : 2013-08-08
	*
	* References :
	* - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
	* - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
	* - http://www.php.net/manual/en/ref.pdf.php#74211
	*/
	class PdfParser
	{
	  /**
	   * Parse a PDF file and return the text it contains.
	   *
	   * @param string $filename Path of the PDF file to read.
	   * @return string|null The extracted text, or null when the file cannot be
	   *                     read or no text could be extracted.
	   */
	  public static function parseFile($filename)
	  {
	    $content = file_get_contents($filename);

	    // file_get_contents() has already raised a warning; there is nothing to parse.
	    if ($content === false) {
	      return null;
	    }

	    return self::extractText($content);
	  }

	  /**
	   * Parse PDF content that is already loaded in memory.
	   *
	   * @param string $content Raw bytes of the PDF document.
	   * @return string|null The extracted text, or null when no text could be extracted.
	   */
	  public static function parseContent($content)
	  {
	    return self::extractText($content);
	  }

	  /**
	   * Convert a PDF into text.
	   *
	   * The document is split into "obj ... endobj" sections. Every section that
	   * has both a dictionary ("<< ... >>") and a stream is decoded (FlateDecode
	   * is inflated, anything else is used as is) and scanned for text operators.
	   *
	   * @param string $data Raw bytes of the PDF document.
	   * @return string|null The extracted text with runs of white space collapsed
	   *                     to single spaces, or null when nothing was extracted.
	   */
	  protected static function extractText($data)
	  {
	    $data        = (string) $data;
	    $result_data = '';

	    foreach (self::getDataArray($data, 'obj', 'endobj') as $obj) {
	      $a_filter = self::getDataArray($obj, '<<', '>>');
	      $a_data   = self::getDataArray($obj, 'stream', 'endstream');

	      // Objects without a dictionary or without a stream are ignored.
	      if (!isset($a_filter[0]) || !isset($a_data[0])) {
	        continue;
	      }

	      $stream = self::extractStreamData($a_data[0]);

	      // Everything in front of the "stream" keyword is the stream dictionary.
	      // Searching the whole header (instead of the first "<< ... >>" pair only)
	      // also finds a /Filter entry that follows a nested dictionary.
	      $header = (string) substr($obj, 0, strpos($obj, 'stream'));

	      if (strpos($header, 'FlateDecode') !== false) {
	        // gzuncompress() returns false on corrupt or unsupported data; such
	        // streams are skipped. The warning is suppressed on purpose.
	        $stream = @gzuncompress($stream);

	        if ($stream === false) {
	          continue;
	        }
	      } else {
	        $stream = trim($stream);
	      }

	      if (trim($stream) != '') {
	        // If we got data then attempt to extract it.
	        $result_data .= ' ' . self::extractTextElements($stream);
	      }
	    }

	    if (trim($result_data) == '') {
	      return null;
	    }

	    // Re-join words that were hyphenated at a line end, then collapse white space.
	    $result_data = preg_replace('/\s*-[\r\n]+\s*/', '', $result_data);
	    $result_data = preg_replace('/\s+/', ' ', $result_data);

	    return $result_data;
	  }

	  /**
	   * Return the bytes located between the "stream" and "endstream" keywords.
	   *
	   * Only the end-of-line marker that follows "stream" and the one that
	   * precedes "endstream" are removed. The payload may be binary (compressed)
	   * data, so a general trim() must not be applied: it would strip payload
	   * bytes that happen to be white space or NUL and make inflating fail.
	   *
	   * @param string $raw A section starting with "stream" and ending with "endstream".
	   * @return string The stream payload.
	   */
	  protected static function extractStreamData($raw)
	  {
	    $body = (string) substr($raw, strlen('stream'), strlen($raw) - strlen('stream') - strlen('endstream'));

	    // End-of-line marker after the "stream" keyword.
	    if (substr($body, 0, 2) === "\r\n") {
	      $body = (string) substr($body, 2);
	    } elseif (substr($body, 0, 1) === "\n" || substr($body, 0, 1) === "\r") {
	      $body = (string) substr($body, 1);
	    }

	    // End-of-line marker in front of the "endstream" keyword.
	    if (substr($body, -2) === "\r\n") {
	      $body = (string) substr($body, 0, -2);
	    } elseif (substr($body, -1) === "\n" || substr($body, -1) === "\r") {
	      $body = (string) substr($body, 0, -1);
	    }

	    return $body;
	  }

	  /**
	   * Extract the visible text from a decoded content stream.
	   *
	   * The stream is processed line by line. A line is expected to look like
	   * "<operands> <operator>"; only the text operators listed in the switch
	   * below contribute to the result, every other line is ignored.
	   *
	   * @param string $content Decoded content stream.
	   * @return string The text found in the stream (may be empty).
	   */
	  protected static function extractTextElements($content)
	  {
	    // Streams starting with /CIDInit are skipped entirely.
	    if (strpos($content, '/CIDInit') === 0) {
	      return '';
	    }

	    $text  = '';
	    $lines = explode("\n", $content);

	    foreach ($lines as $line) {
	      $line    = trim($line);
	      $matches = array();

	      // Split the line into its operands ("command") and the trailing
	      // operator; the operator may end with an asterisk (e.g. "T*").
	      if (preg_match('/^(?<command>.*[\)\] ])(?<operator>[a-z]+[\*]?)$/i', $line, $matches)) {
	        $command  = self::decodeCommand(trim($matches['command']));
	        $operator = trim($matches['operator']);
	      } else {
	        $command  = $line;
	        $operator = '';
	      }

	      // Handle main operators
	      switch ($operator) {
	        // Move text current point.
	        case 'Td':
	          $values = explode(' ', $command);
	          $y = array_pop($values);
	          $x = array_pop($values);
	          // Explicit casts: comparing a non-numeric string with an integer
	          // gives different results on PHP 7 and PHP 8.
	          if ((float) $x > 0) {
	            $text .= ' ';
	          }
	          if ((float) $y < 0) {
	            $text .= ' ';
	          }
	          break;

	        // Move text current point and set leading.
	        case 'TD':
	          $values = explode(' ', $command);
	          $y = array_pop($values);
	          if ((float) $y < 0) {
	            $text .= "\n";
	          }
	          break;

	        // Set font name and size.
	        case 'Tf':
	          $text .= ' ';
	          break;

	        // Display text, allowing individual character positioning.
	        case 'TJ':
	          // A missing '[' yields false, which counts as 0 in the addition.
	          $start = mb_strpos($command, '[', 0, 'UTF-8') + 1;
	          $end   = mb_strrpos($command, ']', 0, 'UTF-8');
	          $text .= self::parseTextCommand(mb_substr($command, $start, $end - $start, 'UTF-8'));
	          break;

	        // Display text.
	        case 'Tj':
	          $start = mb_strpos($command, '(', 0, 'UTF-8') + 1;
	          $end   = mb_strrpos($command, ')', 0, 'UTF-8');
	          $text .= mb_substr($command, $start, $end - $start, 'UTF-8'); // Removes round brackets
	          break;

	        // Move to start of next line.
	        case 'T*':
	          $text .= "\n";
	          break;

	        // Operators that do not produce text: character spacing, leading,
	        // text matrix, rendering mode, text rise, word spacing, horizontal
	        // scaling, graphics operators, begin/end text, and unparsed lines.
	        case 'Tc':
	        case 'TL':
	        case 'Tm':
	        case 'Tr':
	        case 'Ts':
	        case 'Tw':
	        case 'Tz':
	        case 'g':
	        case 'gs':
	        case 're':
	        case 'f':
	        case 'BT':
	        case 'ET':
	        case '':
	        default:
	          break;
	      }
	    }

	    // Unescape round brackets that were escaped inside string operands.
	    $text = str_replace(array('\\(', '\\)'), array('(', ')'), $text);

	    return $text;
	  }

	  /**
	   * Decode the escape sequences of an operand list and convert it to UTF-8.
	   *
	   * @param string $command Operands of one content stream line.
	   * @return string The operands with octal escapes resolved, escaped line
	   *                breaks removed, other white space escapes replaced by a
	   *                space, and the result converted to UTF-8 when possible.
	   */
	  protected static function decodeCommand($command)
	  {
	    // Resolve octal escapes (\ddd) in one pass so that the output of one
	    // replacement can never be mistaken for another escape sequence.
	    $command = preg_replace_callback('/\\\\([0-9]{3})/', function ($match) {
	      // Digits that read below 40 are octal codes below 040, i.e. control
	      // characters in front of the space: those are dropped.
	      if (intval($match[1]) < 40) {
	        return '';
	      }

	      return chr(octdec($match[1]));
	    }, $command);

	    // Removes encoded new lines, tabs, ...
	    $command = preg_replace('/\\\\[\r\n]/', '', $command);
	    $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);

	    // Force UTF-8 charset
	    $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));
	    if (strtoupper($encoding) != 'UTF-8') {
	      // Keep the original bytes when the conversion fails or yields nothing.
	      if ($decoded = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command)) {
	        $command = $decoded;
	      }
	    }

	    return $command;
	  }

	  /**
	   * Strip out the text from the content of a TJ array.
	   *
	   * The array mixes strings in round brackets with numeric adjustments, e.g.
	   * "(Hel) 20 (lo) -250 (World)". Strings are concatenated; a space is
	   * inserted when the text between two strings is longer than 8 characters
	   * or is a number below -50.
	   *
	   * @param string $text      Content of the TJ array without the square brackets.
	   * @param int    $font_size Currently not used
	   *
	   * @return string
	   */
	  protected static function parseTextCommand($text, $font_size = 0) {

	    $result = '';
	    $cur_start_pos = 0;

	    while (($cur_start_text = mb_strpos($text, '(', $cur_start_pos, 'UTF-8')) !== false) {
	      // New text element found
	      if ($cur_start_text - $cur_start_pos > 8) {
	        $spacing = ' ';
	      } else {
	        $spacing_size = mb_substr($text, $cur_start_pos, $cur_start_text - $cur_start_pos, 'UTF-8');

	        // The cast is required: on PHP 8 an empty or non-numeric string is
	        // compared with -50 as a string, which would add a space in front of
	        // every fragment.
	        if ((float) $spacing_size < -50) {
	          $spacing = ' ';
	        } else {
	          $spacing = '';
	        }
	      }
	      $cur_start_text++;

	      // Find the closing bracket, skipping brackets escaped with a backslash.
	      $start_search_end = $cur_start_text;
	      while (($cur_start_pos = mb_strpos($text, ')', $start_search_end, 'UTF-8')) !== false) {
	        if (mb_substr($text, $cur_start_pos - 1, 1, 'UTF-8') != '\\') {
	          break;
	        }
	        $start_search_end = $cur_start_pos + 1;
	      }

	      // Unbalanced brackets: stop and keep what was collected so far.
	      if ($cur_start_pos === false) {
	        break;
	      }

	      // Add to result
	      $result .= $spacing . mb_substr($text, $cur_start_text, $cur_start_pos - $cur_start_text, 'UTF-8');
	      $cur_start_pos++;
	    }

	    return $result;
	  }

	  /**
	   * Convert a section of data into an array, separated by the start and end words.
	   *
	   * Each entry starts with $start_word and ends with the first $end_word that
	   * follows it. Scanning resumes after the end word, so a start word that is
	   * contained in the end word (e.g. "obj" in "endobj") is not matched again.
	   *
	   * @param  string $data       The data.
	   * @param  string $start_word The start of each section of data.
	   * @param  string $end_word   The end of each section of data.
	   * @return array              The array of data.
	   */
	  protected static function getDataArray($data, $start_word, $end_word)
	  {
	    $data       = (string) $data;
	    $a_results  = array();
	    $offset     = 0;
	    $end_length = strlen($end_word);

	    while (($start = strpos($data, $start_word, $offset)) !== false) {
	      $end = strpos($data, $end_word, $start);

	      // An unterminated section ends the scan.
	      if ($end === false) {
	        break;
	      }

	      // data is between start and end
	      $a_results[] = substr($data, $start, $end - $start + $end_length);
	      $offset      = $end + $end_length;
	    }

	    return $a_results;
	  }
	}