Last active
August 29, 2015 14:03
-
-
Save jschreuder/1e0f47ff2d6f0f293315 to your computer and use it in GitHub Desktop.
I read the article saying (among other things) that you shouldn't write your own CSV parser, so I decided to write one, and to get a bit of practice with PHP 5.5's generators in the process.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Webspot\Csv; | |
use Webspot\Csv\Parser\LineIn; | |
use Webspot\Csv\Parser\RowOut; | |
/**
 * CSV parser that reads a stream line by line and feeds each line, character
 * by character, through a small state machine.
 *
 * Every state except STATE_END_PARSING has a handler object registered in the
 * constructor; a handler consumes input from a LineIn, writes to a RowOut and
 * returns the next state.
 */
class Parser
{
    const STATE_NEW_FIELD = 10;
    const STATE_NEW_ROW = 11;
    const STATE_WITHIN_FIELD = 20;
    const STATE_WITHIN_FIELD_ENCLOSURE = 21;
    const STATE_WITHIN_FIELD_ENCLOSURE_ESCAPED = 22;
    const STATE_END_FIELD = 30;
    const STATE_END_PARSING = 31;

    /** @var resource */
    private $resource;

    /** @var string */
    private $delimiter;

    /** @var string */
    private $enclosure;

    /** @var string */
    private $escape;

    /** @var int using the state constants */
    private $state;

    /** @var RowOut */
    private $currentRow;

    /** @var int */
    private $parsedRows;

    /** @var StateHandler\StateHandlerInterface[] indexed by state constant */
    private $stateHandlers = [];

    /**
     * Takes resource and the field delimiter, enclosure and escape characters used to parse it
     *
     * @param  resource $resource
     * @param  string $delimiter
     * @param  string $enclosure
     * @param  string $escape
     * @throws \InvalidArgumentException when $resource is not a stream resource
     */
    public function __construct($resource, $delimiter = ';', $enclosure = '"', $escape = '\\')
    {
        // Check input validity
        if ( ! is_resource($resource) || get_resource_type($resource) !== 'stream') {
            throw new \InvalidArgumentException('Parser input must be a valid Stream resource.');
        }

        $this->resource = $resource;
        $this->delimiter = $delimiter;
        $this->enclosure = $enclosure;
        $this->escape = $escape;

        // A new row starts exactly like a new field, so both states share the same handler type
        $this->stateHandlers = [
            self::STATE_NEW_ROW => new StateHandler\NewFieldHandler($delimiter, $enclosure),
            self::STATE_NEW_FIELD => new StateHandler\NewFieldHandler($delimiter, $enclosure),
            self::STATE_WITHIN_FIELD => new StateHandler\WithinFieldHandler($delimiter),
            self::STATE_WITHIN_FIELD_ENCLOSURE => new StateHandler\WithinFieldEnclosureHandler($enclosure, $escape),
            self::STATE_WITHIN_FIELD_ENCLOSURE_ESCAPED => new StateHandler\WithinFieldEnclosureEscapeHandler(),
            self::STATE_END_FIELD => new StateHandler\EndFieldHandler($delimiter),
        ];
    }

    /**
     * Creates a parser for a string instead of a file, backed by tmpfile()
     *
     * @param  string $string
     * @param  string $delimiter
     * @param  string $enclosure
     * @param  string $escape
     * @return self
     * @throws \RuntimeException when the temporary file cannot be created
     */
    public static function string($string, $delimiter = ';', $enclosure = '"', $escape = '\\')
    {
        $tmpFile = tmpfile();
        if ($tmpFile === false) {
            throw new \RuntimeException('Unable to create a temporary file to hold the CSV string.');
        }

        fwrite($tmpFile, $string);
        rewind($tmpFile);

        return new self($tmpFile, $delimiter, $enclosure, $escape);
    }

    /**
     * Resets object to restart parsing by resetting the stream pointer and internal state properties
     *
     * @return void
     */
    private function reset()
    {
        // rewind resource before parsing starts
        rewind($this->resource);

        // initialize parser state
        $this->state = self::STATE_NEW_ROW;
        $this->currentRow = new RowOut();
        $this->parsedRows = 0;
    }

    /**
     * Fetches the state handler for the given state
     *
     * @param  int $state
     * @return StateHandler\StateHandlerInterface
     * @throws \RuntimeException when no handler is registered for $state
     */
    private function getStateHandler($state)
    {
        if ( ! isset($this->stateHandlers[$state])) {
            throw new \RuntimeException('No state handler found for state: '.$state);
        }
        return $this->stateHandlers[$state];
    }

    /**
     * Parses and returns the entire file's contents
     *
     * @return array list of rows as produced by parseRow()
     */
    public function parse()
    {
        // parseRow() is a generator: calling it only creates the Generator object, its
        // body (including reset()) runs when it is iterated. Drain it into a list.
        return iterator_to_array($this->parseRow(), false);
    }

    /**
     * Reads the CSV resource from the start and yields one parsed row at a time
     *
     * An enclosed field may span several physical lines; the line break is
     * kept as part of the field's content.
     *
     * @return \Generator yields each parsed row as returned by RowOut::getRow()
     * @throws \UnderflowException when the input ends inside an enclosure
     */
    public function parseRow()
    {
        $this->reset();

        // Compare against false explicitly: a final line consisting of just "0" is falsy
        while (($line = fgets($this->resource)) !== false) {
            $content = rtrim($line, "\n\r");
            // remember what rtrim() removed so it can be restored inside enclosed fields
            $lineEnding = (string) substr($line, strlen($content));
            $lineIn = new LineIn($content);

            while ($lineIn->length() > 0) {
                $this->state = $this->getStateHandler($this->state)->handle($lineIn, $this->currentRow);
            }

            // parse next line as part of the current field when still within enclosure
            if ($this->state === self::STATE_WITHIN_FIELD_ENCLOSURE) {
                $this->currentRow->addToField($lineEnding);
                continue;
            }

            // NOTE(review): a line that ends right after an escape character leaves the state at
            // STATE_WITHIN_FIELD_ENCLOSURE_ESCAPED and is still emitted as a row here - confirm
            // whether that should raise a parse error instead.

            // Handlers only commit a field when they consume a delimiter, so the
            // last field of the row is still pending and must be committed here.
            $this->currentRow->addFieldToRow();

            // fetch parsed row, update parsedRows and reset state for next row
            $row = $this->currentRow->getRow();
            $this->parsedRows++;
            $this->currentRow = new RowOut();
            $this->state = self::STATE_NEW_ROW;

            yield $row;
        }

        if ($this->state === self::STATE_WITHIN_FIELD_ENCLOSURE) {
            throw new \UnderflowException('Parse error: ended within enclosure state, parsing terminated prematurely.');
        }

        // parsing done
        $this->state = self::STATE_END_PARSING;
    }

    /**
     * Returns the number of rows parsed by parseRow()
     *
     * @return int
     */
    public function getParsedRowCount()
    {
        return $this->parsedRows;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Webspot\Csv\StateHandler; | |
use Webspot\Csv\Parser; | |
use Webspot\Csv\Parser\LineIn; | |
use Webspot\Csv\Parser\RowOut; | |
/**
 * Handler for the state reached once a field's content is complete: the next
 * character has to be the delimiter that separates it from the following field.
 */
class EndFieldHandler implements StateHandlerInterface
{
    /** @var string the character expected after a completed field */
    private $delimiter;

    /**
     * @param string $delimiter
     */
    public function __construct($delimiter)
    {
        $this->delimiter = $delimiter;
    }

    /**
     * Consumes one character; when it is the delimiter the current field is
     * committed to the row and a new field begins.
     *
     * @param  LineIn $lineIn
     * @param  RowOut $rowOut
     * @return int Parser::STATE_NEW_FIELD
     * @throws \DomainException when the consumed character is not the delimiter
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        $consumed = $lineIn->unshift();

        if ($consumed === $this->delimiter) {
            $rowOut->addFieldToRow();

            return Parser::STATE_NEW_FIELD;
        }

        throw new \DomainException('Parse Error: Field cannot contain characters outside enclosure.');
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Webspot\Csv\Parser; | |
/**
 * Read cursor over a single input line.
 *
 * The line itself is never modified; consuming a character only advances an
 * offset, so reading a whole line does not copy the remaining string for
 * every character.
 */
class LineIn
{
    /** @var string the complete line as given to the constructor */
    private $line;

    /** @var int total length of the line in bytes */
    private $size;

    /** @var int position of the first character that has not been consumed yet */
    private $offset = 0;

    /**
     * @param string $line
     */
    public function __construct($line)
    {
        $this->line = (string) $line;
        $this->size = strlen($this->line);
    }

    /**
     * Returns the current length of the line, i.e. the number of characters not yet consumed
     *
     * @return int
     */
    public function length()
    {
        return $this->size - $this->offset;
    }

    /**
     * Returns the current first character of the line without consuming it
     *
     * @return string the next character, or an empty string when the line is exhausted
     */
    public function getNext()
    {
        // Guard the offset: indexing past the end of a string raises a notice/warning
        if ($this->offset >= $this->size) {
            return '';
        }

        return $this->line[$this->offset];
    }

    /**
     * Removes and returns the first character of the line
     *
     * @return string the consumed character, or an empty string when the line is exhausted
     */
    public function unshift()
    {
        $next = $this->getNext();

        // only advance when a character was actually available
        if ($next !== '') {
            $this->offset++;
        }

        return $next;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Webspot\Csv\StateHandler; | |
use Webspot\Csv\Parser; | |
use Webspot\Csv\Parser\LineIn; | |
use Webspot\Csv\Parser\RowOut; | |
/**
 * Handler for the start of a field: looks at the next character to decide
 * what kind of field follows.
 */
class NewFieldHandler implements StateHandlerInterface
{
    /** @var string */
    private $delimiter;

    /** @var string */
    private $enclosure;

    /**
     * @param string $delimiter
     * @param string $enclosure
     */
    public function __construct($delimiter, $enclosure)
    {
        $this->delimiter = $delimiter;
        $this->enclosure = $enclosure;
    }

    /**
     * Peeks at the next character and picks the follow-up state. Only an
     * opening enclosure is consumed; any other character is left on the line
     * for the next handler.
     *
     * @param  LineIn $lineIn
     * @param  RowOut $rowOut not used by this handler
     * @return int one of the Parser::STATE_* constants
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        $upcoming = $lineIn->getNext();

        // enclosure is checked before delimiter, the order matters when both are the same character
        if ($upcoming === $this->enclosure) {
            $lineIn->unshift();

            return Parser::STATE_WITHIN_FIELD_ENCLOSURE;
        }

        if ($upcoming === $this->delimiter) {
            return Parser::STATE_END_FIELD;
        }

        return Parser::STATE_WITHIN_FIELD;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Webspot\Csv\Parser; | |
/**
 * Collects the output of the parser for a single row: characters are appended
 * to the current field, and completed fields are appended to the row.
 */
class RowOut
{
    /** @var string[] fields completed so far; starts empty so getRow() always returns an array */
    private $row = [];

    /** @var string content of the field currently being built; starts as an empty string, not null */
    private $field = '';

    /**
     * Adds the given string to the current field
     *
     * @param  string $string
     * @return void
     */
    public function addToField($string)
    {
        $this->field .= $string;
    }

    /**
     * Adds the current field to the row and starts on a new field
     *
     * @return void
     */
    public function addFieldToRow()
    {
        $this->row[] = $this->field;
        $this->field = '';
    }

    /**
     * Fetches the parsed row
     *
     * @return string[] the completed fields, an empty array when no field was added
     */
    public function getRow()
    {
        return $this->row;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Webspot\Csv\StateHandler; | |
use Webspot\Csv\Parser\LineIn; | |
use Webspot\Csv\Parser\RowOut; | |
/**
 * Contract for one state of the parser's state machine.
 */
interface StateHandlerInterface
{
    /**
     * Processes input for the state this handler is responsible for and
     * reports which state should be handled next.
     *
     * @param  LineIn $lineIn the line being read
     * @param  RowOut $rowOut the row being built
     * @return int the next state
     */
    public function handle(LineIn $lineIn, RowOut $rowOut);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Webspot\Csv\StateHandler; | |
use Webspot\Csv\Parser; | |
use Webspot\Csv\Parser\LineIn; | |
use Webspot\Csv\Parser\RowOut; | |
/**
 * Handler for the character that directly follows an escape character inside
 * an enclosed field.
 */
class WithinFieldEnclosureEscapeHandler implements StateHandlerInterface
{
    /**
     * Copies the next character to the current field as-is and returns to
     * the enclosed-field state.
     *
     * @param  LineIn $lineIn
     * @param  RowOut $rowOut
     * @return int Parser::STATE_WITHIN_FIELD_ENCLOSURE
     * @throws \DomainException when there is no character left on the line
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        if ($lineIn->length()) {
            $escaped = $lineIn->unshift();
            $rowOut->addToField($escaped);

            return Parser::STATE_WITHIN_FIELD_ENCLOSURE;
        }

        throw new \DomainException('Parse error: escape character cannot be the last character.');
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Webspot\Csv\StateHandler; | |
use Webspot\Csv\Parser; | |
use Webspot\Csv\Parser\LineIn; | |
use Webspot\Csv\Parser\RowOut; | |
/**
 * Handler for the content of an enclosed field: copies characters to the
 * current field until an escape or a closing enclosure character is found.
 */
class WithinFieldEnclosureHandler implements StateHandlerInterface
{
    /** @var string */
    private $enclosure;

    /** @var string */
    private $escape;

    /**
     * @param string $enclosure
     * @param string $escape
     */
    public function __construct($enclosure, $escape)
    {
        $this->enclosure = $enclosure;
        $this->escape = $escape;
    }

    /**
     * Consumes characters from the line until the enclosure ends, an escape
     * character is met, or the line runs out.
     *
     * @param  LineIn $lineIn
     * @param  RowOut $rowOut
     * @return int STATE_WITHIN_FIELD_ENCLOSURE_ESCAPED after an escape character,
     *             STATE_END_FIELD after the closing enclosure, or
     *             STATE_WITHIN_FIELD_ENCLOSURE when the line ended first
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        // Loop on the remaining length only. The consumed character must not be part of
        // the loop condition: the string "0" is falsy and would be dropped from the field.
        while ($lineIn->length() > 0) {
            $char = $lineIn->unshift();

            if ($char === $this->escape) {
                return Parser::STATE_WITHIN_FIELD_ENCLOSURE_ESCAPED;
            }

            if ($char === $this->enclosure) {
                return Parser::STATE_END_FIELD;
            }

            $rowOut->addToField($char);
        }

        return Parser::STATE_WITHIN_FIELD_ENCLOSURE;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Webspot\Csv\StateHandler; | |
use Webspot\Csv\Parser; | |
use Webspot\Csv\Parser\LineIn; | |
use Webspot\Csv\Parser\RowOut; | |
/**
 * Handler for the content of a field that is not wrapped in an enclosure.
 */
class WithinFieldHandler implements StateHandlerInterface
{
    /** @var string */
    private $delimiter;

    /**
     * @param string $delimiter
     */
    public function __construct($delimiter)
    {
        $this->delimiter = $delimiter;
    }

    /**
     * Moves characters from the line to the current field until the delimiter
     * is next or the line is used up; the delimiter itself is not consumed.
     *
     * @param  LineIn $lineIn
     * @param  RowOut $rowOut
     * @return int Parser::STATE_END_FIELD
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        while (true) {
            if ( ! $lineIn->length()) {
                break;
            }

            if ($lineIn->getNext() === $this->delimiter) {
                break;
            }

            $rowOut->addToField($lineIn->unshift());
        }

        return Parser::STATE_END_FIELD;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment