<?php

/**
 * @file
 * Class PdfParser
 *
 * @author : Sebastien MALOT <sebastien@malot.fr>
 * @date : 2013-08-08
 *
 * References :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 */
class PdfParser
{
    /**
     * Parse a PDF file and return the text found in its content streams.
     *
     * @param string $filename Path of the PDF file to read.
     * @return string|null The extracted text, or null when the file cannot be
     *                     read or contains no extractable text.
     */
    public static function parseFile($filename)
    {
        $content = file_get_contents($filename);

        // file_get_contents() already raised a warning; do not feed `false`
        // to the string functions used by the parser.
        if ($content === false) {
            return null;
        }

        return self::extractText($content);
    }

    /**
     * Parse raw PDF content and return the text found in its content streams.
     *
     * @param string $content The raw bytes of a PDF document.
     * @return string|null The extracted text, or null when nothing was found.
     */
    public static function parseContent($content)
    {
        return self::extractText($content);
    }

    /**
     * Convert raw PDF data into text.
     *
     * The document is split into "obj ... endobj" sections. Every section that
     * owns both a dictionary and a stream is decoded (inflated when its header
     * mentions FlateDecode) and handed to extractTextElements().
     *
     * @param string $data The raw bytes of the PDF document.
     * @return string|null The extracted text with white space collapsed, or
     *                     null when no text could be extracted.
     */
    protected static function extractText($data)
    {
        $data        = (string) $data;
        $result_data = '';

        foreach (self::getDataArray($data, 'obj', 'endobj') as $obj) {
            $a_filter = self::getDataArray($obj, '<<', '>>');
            $a_data   = self::getDataArray($obj, 'stream', 'endstream');

            // Only objects carrying both a dictionary and a stream are decodable.
            if (!isset($a_filter[0]) || !isset($a_data[0])) {
                continue;
            }

            // Everything before the "stream" keyword is the object header. The
            // whole header is inspected (not just the text up to the first
            // ">>") so that a filter declared after a nested dictionary is
            // still seen.
            $header = substr($obj, 0, strpos($obj, 'stream'));

            // Bytes located between the "stream" and "endstream" keywords.
            $payload = (string) substr(
                $a_data[0],
                strlen('stream'),
                strlen($a_data[0]) - strlen('stream') - strlen('endstream')
            );

            if (strpos($header, 'FlateDecode') !== false) {
                $decoded = self::inflate($payload);
            } else {
                $decoded = trim($payload);
            }

            if ($decoded === false || trim($decoded) === '') {
                continue;
            }

            $result_data .= ' ' . self::extractTextElements($decoded);
        }

        if (trim($result_data) === '') {
            return null;
        }

        // Re-join words hyphenated across a line break, then collapse every
        // run of white space into a single space.
        $result_data = preg_replace('/\s*-[\r\n]+\s*/', '', $result_data);

        return preg_replace('/\s+/', ' ', $result_data);
    }

    /**
     * Inflate the payload of a FlateDecode stream.
     *
     * The payload is binary, so it must not be blindly trimmed: the last byte
     * of the zlib checksum may itself be a white space byte. Several
     * candidates are tried, from the most to the least likely framing.
     *
     * @param string $payload Bytes found between "stream" and "endstream".
     * @return string|false The inflated data, or false when no candidate could
     *                      be inflated.
     */
    private static function inflate($payload)
    {
        // Leading white space belongs to the end-of-line marker that follows
        // the "stream" keyword; zlib data starts with a header byte whose low
        // nibble is 8 (0x78 in practice), which is never a white space byte.
        $payload = ltrim($payload);

        $candidates = array();

        // The end-of-line marker that precedes "endstream" is not stream data.
        if (substr($payload, -2) === "\r\n") {
            $candidates[] = substr($payload, 0, -2);
        }

        $last = substr($payload, -1);

        if ($last === "\n" || $last === "\r") {
            $candidates[] = substr($payload, 0, -1);
        }

        $candidates[] = $payload;
        // Historical behaviour of this parser, kept as a last resort.
        $candidates[] = rtrim($payload);

        foreach (array_unique($candidates) as $candidate) {
            if ($candidate === false || $candidate === '') {
                continue;
            }

            // Best effort: a wrongly framed candidate only triggers a warning.
            $decoded = @gzuncompress($candidate);

            if ($decoded !== false) {
                return $decoded;
            }
        }

        return false;
    }

    /**
     * Extract the text shown by the operators of a decoded content stream.
     *
     * The stream is processed line by line; each line is expected to hold
     * optional operands followed by a single operator.
     *
     * @param string $content A decoded content stream.
     * @return string The text shown by the stream.
     */
    protected static function extractTextElements($content)
    {
        // Streams starting with /CIDInit are skipped entirely.
        if (strpos($content, '/CIDInit') === 0) {
            return '';
        }

        $text = '';

        foreach (explode("\n", $content) as $line) {
            $line    = trim($line);
            $matches = array();

            // Operands are optional so that operators standing alone on their
            // line (T* for instance) are recognised too.
            if (!preg_match('/^(?<command>.*[\)\] ])?(?<operator>[a-z]+\*?)$/i', $line, $matches)) {
                continue;
            }

            $operator = $matches['operator'];
            $command  = isset($matches['command']) ? trim($matches['command']) : '';

            // Every other operator (Tc, TL, Tm, Tr, Ts, Tw, Tz, g, gs, re, f,
            // BT, ET, ...) has no influence on the extracted text.
            switch ($operator) {
                // Move text current point.
                case 'Td':
                    $values = explode(' ', self::decodeCommand($command));
                    $y      = (float) array_pop($values);
                    $x      = (float) array_pop($values);

                    if ($x > 0) {
                        $text .= ' ';
                    }
                    if ($y < 0) {
                        $text .= ' ';
                    }
                    break;

                // Move text current point and set leading.
                case 'TD':
                    $values = explode(' ', self::decodeCommand($command));
                    $y      = (float) array_pop($values);

                    if ($y < 0) {
                        $text .= "\n";
                    }
                    break;

                // Set font name and size.
                case 'Tf':
                    $text .= ' ';
                    break;

                // Display text, allowing individual character positioning.
                case 'TJ':
                    $command = self::decodeCommand($command);
                    $start   = mb_strpos($command, '[', 0, 'UTF-8');
                    $end     = mb_strrpos($command, ']', 0, 'UTF-8');

                    // Operands without a well formed array are ignored.
                    if ($start !== false && $end !== false && $end > $start) {
                        $text .= self::parseTextCommand(
                            mb_substr($command, $start + 1, $end - $start - 1, 'UTF-8')
                        );
                    }
                    break;

                // Display text.
                case 'Tj':
                    $command = self::decodeCommand($command);
                    $start   = mb_strpos($command, '(', 0, 'UTF-8');
                    $end     = mb_strrpos($command, ')', 0, 'UTF-8');

                    // Only literal strings are supported; anything else (hex
                    // strings for instance) is ignored instead of being
                    // emitted as garbage.
                    if ($start !== false && $end !== false && $end > $start) {
                        $text .= mb_substr($command, $start + 1, $end - $start - 1, 'UTF-8');
                    }
                    break;

                // Move to start of next line.
                case 'T*':
                    $text .= "\n";
                    break;
            }
        }

        // Unescape round brackets.
        return str_replace(array('\\(', '\\)'), array('(', ')'), $text);
    }

    /**
     * Decode the escape sequences of an operand string and force it to UTF-8.
     *
     * @param string $command The operands of an operator.
     * @return string The decoded operands.
     */
    private static function decodeCommand($command)
    {
        // Three-digit octal escapes, decoded in one pass so that a decoded
        // character can never be combined with the digits that follow it.
        $command = preg_replace_callback(
            '/\\\\([0-7]{3})/',
            function ($match) {
                $code = octdec($match[1]);

                // Control characters (below octal 040) are dropped; values
                // above 0377 keep their low-order byte only.
                return $code < 32 ? '' : chr($code & 0xFF);
            },
            $command
        );

        // Removes encoded new lines, tabs, ...
        $command = preg_replace('/\\\\[\r\n]/', '', $command);
        $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);

        // Force UTF-8 charset
        $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));

        if (strtoupper((string) $encoding) != 'UTF-8') {
            // Best effort: the operands are kept untouched when the conversion fails.
            $decoded = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command);

            if ($decoded !== false) {
                $command = $decoded;
            }
        }

        return $command;
    }

    /**
     * Strip out the text from the content of a TJ array.
     *
     * The array alternates literal strings "(...)" and numeric adjustments. A
     * space is inserted before a string when the gap that precedes it is
     * longer than 8 characters or holds a number lower than -50.
     *
     * @param string $text      The content of the array, without its square brackets.
     * @param int    $font_size Currently not used
     *
     * @return string
     */
    protected static function parseTextCommand($text, $font_size = 0)
    {
        $result        = '';
        $cur_start_pos = 0;

        while (($cur_start_text = mb_strpos($text, '(', $cur_start_pos, 'UTF-8')) !== false) {
            $gap_length = $cur_start_text - $cur_start_pos;

            if ($gap_length > 8) {
                $spacing = ' ';
            } else {
                // The cast forces a numeric comparison: an empty gap counts
                // as 0 instead of being compared as a string (PHP 8).
                $gap     = mb_substr($text, $cur_start_pos, $gap_length, 'UTF-8');
                $spacing = ((float) $gap < -50) ? ' ' : '';
            }

            $cur_start_text++;

            // Look for the closing bracket, skipping escaped ones.
            $search_from = $cur_start_text;

            while (($cur_start_pos = mb_strpos($text, ')', $search_from, 'UTF-8')) !== false) {
                // A bracket is escaped only when it is preceded by an odd
                // number of backslashes ("\\)" is an escaped backslash
                // followed by a real closing bracket).
                $backslashes = 0;

                for ($i = $cur_start_pos - 1; $i >= $cur_start_text; $i--) {
                    if (mb_substr($text, $i, 1, 'UTF-8') !== '\\') {
                        break;
                    }
                    $backslashes++;
                }

                if ($backslashes % 2 === 0) {
                    break;
                }

                $search_from = $cur_start_pos + 1;
            }

            // Unterminated string: give up and keep what was found so far.
            if ($cur_start_pos === false) {
                break;
            }

            $result .= $spacing . mb_substr($text, $cur_start_text, $cur_start_pos - $cur_start_text, 'UTF-8');
            $cur_start_pos++;
        }

        return $result;
    }

    /**
     * Convert a section of data into an array, separated by the start and end words.
     *
     * Each result starts with the start word and ends with the end word. The
     * search resumes after the end word, so an end word containing the start
     * word ("endobj" / "obj") cannot open the next section.
     *
     * @param  string $data       The data.
     * @param  string $start_word The start of each section of data.
     * @param  string $end_word   The end of each section of data.
     * @return array              The array of data.
     */
    protected static function getDataArray($data, $start_word, $end_word)
    {
        $a_results = array();
        $data      = (string) $data;

        // An empty delimiter can never make the search progress.
        if ($start_word === '' || $end_word === '') {
            return $a_results;
        }

        $offset     = 0;
        $end_length = strlen($end_word);

        while (($start = strpos($data, $start_word, $offset)) !== false) {
            $end = strpos($data, $end_word, $start);

            if ($end === false) {
                break;
            }

            $offset      = $end + $end_length;
            $a_results[] = substr($data, $start, $offset - $start);
        }

        return $a_results;
    }
}
