Skip to content

Instantly share code, notes, and snippets.

@jschreuder
Last active August 29, 2015 14:03
Show Gist options
  • Save jschreuder/1e0f47ff2d6f0f293315 to your computer and use it in GitHub Desktop.
Save jschreuder/1e0f47ff2d6f0f293315 to your computer and use it in GitHub Desktop.
I read the article saying (among other things) that you shouldn't write your own CSV parser, so I decided to write one and get a bit of practice with PHP 5.5's generators in the process.
<?php
namespace Webspot\Csv;
use Webspot\Csv\Parser\LineIn;
use Webspot\Csv\Parser\RowOut;
/**
 * Streaming CSV parser driven by a small state machine.
 *
 * Each physical line read from the stream is fed character by character to the
 * handler registered for the current state; the handler consumes input, writes
 * into the current RowOut and returns the next state.
 */
class Parser
{
    const STATE_NEW_FIELD = 10;
    const STATE_NEW_ROW = 11;
    const STATE_WITHIN_FIELD = 20;
    const STATE_WITHIN_FIELD_ENCLOSURE = 21;
    const STATE_WITHIN_FIELD_ENCLOSURE_ESCAPED = 22;
    const STATE_END_FIELD = 30;
    const STATE_END_PARSING = 31;

    /** @var resource */
    private $resource;

    /** @var string */
    private $delimiter;

    /** @var string */
    private $enclosure;

    /** @var string */
    private $escape;

    /** @var int using the state constants */
    private $state;

    /** @var RowOut */
    private $currentRow;

    /** @var int */
    private $parsedRows = 0;

    /** @var StateHandler\StateHandlerInterface[] */
    private $stateHandlers = [];

    /**
     * Takes resource and the field delimiter, enclosure and escape characters used to parse it
     *
     * @param resource $resource
     * @param string $delimiter
     * @param string $enclosure
     * @param string $escape
     * @throws \InvalidArgumentException when $resource is not a stream resource
     */
    public function __construct($resource, $delimiter = ';', $enclosure = '"', $escape = '\\')
    {
        // Check input validity
        if ( ! is_resource($resource) || get_resource_type($resource) !== 'stream') {
            throw new \InvalidArgumentException('Parser input must be a valid Stream resource.');
        }

        $this->resource = $resource;
        $this->delimiter = $delimiter;
        $this->enclosure = $enclosure;
        $this->escape = $escape;

        // A new row starts exactly like a new field, hence the shared handler type
        $this->stateHandlers = [
            self::STATE_NEW_ROW => new StateHandler\NewFieldHandler($delimiter, $enclosure),
            self::STATE_NEW_FIELD => new StateHandler\NewFieldHandler($delimiter, $enclosure),
            self::STATE_WITHIN_FIELD => new StateHandler\WithinFieldHandler($delimiter),
            self::STATE_WITHIN_FIELD_ENCLOSURE => new StateHandler\WithinFieldEnclosureHandler($enclosure, $escape),
            self::STATE_WITHIN_FIELD_ENCLOSURE_ESCAPED => new StateHandler\WithinFieldEnclosureEscapeHandler(),
            self::STATE_END_FIELD => new StateHandler\EndFieldHandler($delimiter),
        ];
    }

    /**
     * Creates a parser for a string instead of a file, backed by tmpfile()
     *
     * @param string $string
     * @param string $delimiter
     * @param string $enclosure
     * @param string $escape
     * @return self
     * @throws \RuntimeException when the temporary file cannot be created
     */
    public static function string($string, $delimiter = ';', $enclosure = '"', $escape = '\\')
    {
        $tmpFile = tmpfile();
        if ($tmpFile === false) {
            throw new \RuntimeException('Unable to create a temporary file for the CSV string.');
        }

        fwrite($tmpFile, $string);
        rewind($tmpFile);

        return new self($tmpFile, $delimiter, $enclosure, $escape);
    }

    /**
     * Resets object to restart parsing by resetting the stream pointer and internal state properties
     *
     * @return void
     */
    private function reset()
    {
        // rewind resource before parsing starts
        rewind($this->resource);

        // initialize parser state
        $this->state = self::STATE_NEW_ROW;
        $this->currentRow = new RowOut();
        $this->parsedRows = 0;
    }

    /**
     * Fetches the state handler for the given state
     *
     * @param int $state
     * @return StateHandler\StateHandlerInterface
     * @throws \RuntimeException when no handler is registered for the state
     */
    private function getStateHandler($state)
    {
        if ( ! isset($this->stateHandlers[$state])) {
            throw new \RuntimeException('No state handler found for state: '.$state);
        }

        return $this->stateHandlers[$state];
    }

    /**
     * Parses and returns the entire file's contents
     *
     * parseRow() is a generator, so it has to be iterated for any parsing to
     * happen; it rewinds the stream and resets the parser state itself.
     *
     * @return array list of parsed rows
     * @throws \UnderflowException
     * @throws \DomainException
     */
    public function parse()
    {
        return iterator_to_array($this->parseRow(), false);
    }

    /**
     * Reads the CSV resource from the start and yields one parsed row at a time
     *
     * Despite its name this is a generator: nothing is read until it is iterated.
     * Every line that ends outside an enclosure produces a row, so a blank line
     * yields a row holding a single (empty) field.
     *
     * @return \Generator yielding one array of fields per row
     * @throws \UnderflowException when the input ends inside an enclosed field
     * @throws \DomainException when a line ends directly after the escape character
     */
    public function parseRow()
    {
        $this->reset();

        // compare against false explicitly: a last line of just "0" is falsy but valid
        while (($line = fgets($this->resource)) !== false) {
            $content = rtrim($line, "\n\r");
            $lineIn = new LineIn($content);

            while ($lineIn->length() > 0) {
                $this->state = $this->getStateHandler($this->state)->handle($lineIn, $this->currentRow);
            }

            // still within an enclosure: the line break belongs to the field and the
            // next line continues the same field
            if ($this->state === self::STATE_WITHIN_FIELD_ENCLOSURE) {
                $this->currentRow->addToField((string) substr($line, strlen($content)));
                continue;
            }

            // the escape character needs a following character on the same line
            if ($this->state === self::STATE_WITHIN_FIELD_ENCLOSURE_ESCAPED) {
                throw new \DomainException('Parse error: escape character cannot be the last character.');
            }

            // the handlers only store a field once they see its trailing delimiter,
            // so the last field of the line still has to be added to the row
            $this->currentRow->addFieldToRow();

            // fetch parsed row, update parsedRows and reset state for next row
            $row = $this->currentRow->getRow();
            $this->parsedRows++;
            $this->currentRow = new RowOut();
            $this->state = self::STATE_NEW_ROW;

            yield $row;
        }

        if ($this->state === self::STATE_WITHIN_FIELD_ENCLOSURE) {
            throw new \UnderflowException('Parse error: ended within enclosure state, parsing terminated prematurely.');
        }

        // parsing done
        $this->state = self::STATE_END_PARSING;
    }

    /**
     * Returns the number of rows parsed by parseRow() since it was last started
     *
     * @return int
     */
    public function getParsedRowCount()
    {
        return $this->parsedRows;
    }
}
<?php
namespace Webspot\Csv\StateHandler;
use Webspot\Csv\Parser;
use Webspot\Csv\Parser\LineIn;
use Webspot\Csv\Parser\RowOut;
/**
 * Handles the position directly after a finished field: the next character
 * has to be the delimiter, anything else is a parse error.
 */
class EndFieldHandler implements StateHandlerInterface
{
    /** @var string character separating two fields */
    private $delimiter;

    /**
     * @param string $delimiter
     */
    public function __construct($delimiter)
    {
        $this->delimiter = $delimiter;
    }

    /**
     * Consumes one character; when it is the delimiter the pending field is
     * stored in the row and a new field starts.
     *
     * {@inheritdoc}
     *
     * @throws \DomainException when the consumed character is not the delimiter
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        if ($lineIn->unshift() === $this->delimiter) {
            $rowOut->addFieldToRow();

            return Parser::STATE_NEW_FIELD;
        }

        throw new \DomainException('Parse Error: Field cannot contain characters outside enclosure.');
    }
}
<?php
namespace Webspot\Csv\Parser;
/**
 * Read cursor over a single line of input.
 *
 * Characters are consumed from the front. The string itself is never modified;
 * an offset is advanced instead, so consuming a character does not copy the
 * remainder of the line.
 */
class LineIn
{
    /** @var string the complete line as handed to the constructor */
    private $line;

    /** @var int number of characters consumed so far */
    private $offset = 0;

    /**
     * @param string $line
     */
    public function __construct($line)
    {
        $this->line = (string) $line;
    }

    /**
     * Returns the current length of the line, i.e. the number of unread characters
     *
     * @return int
     */
    public function length()
    {
        return strlen($this->line) - $this->offset;
    }

    /**
     * Returns the current first character of the line without consuming it
     *
     * @return string the next character, or an empty string when the line is exhausted
     */
    public function getNext()
    {
        // guard the offset access: reading past the end would raise a notice/warning
        return $this->length() > 0 ? $this->line[$this->offset] : '';
    }

    /**
     * Removes and returns the first character of the line
     *
     * @return string the consumed character, or an empty string when the line is exhausted
     */
    public function unshift()
    {
        $next = $this->getNext();
        if ($next !== '') {
            $this->offset++;
        }

        return $next;
    }
}
<?php
namespace Webspot\Csv\StateHandler;
use Webspot\Csv\Parser;
use Webspot\Csv\Parser\LineIn;
use Webspot\Csv\Parser\RowOut;
/**
 * Decides how a field starts by looking at its first character.
 */
class NewFieldHandler implements StateHandlerInterface
{
    /** @var string character separating two fields */
    private $delimiter;

    /** @var string character opening an enclosed field */
    private $enclosure;

    /**
     * @param string $delimiter
     * @param string $enclosure
     */
    public function __construct($delimiter, $enclosure)
    {
        $this->delimiter = $delimiter;
        $this->enclosure = $enclosure;
    }

    /**
     * Peeks at the next character: an enclosure is consumed and opens an
     * enclosed field, a delimiter (left in place) means the field is already
     * over, anything else (left in place) starts a plain field.
     *
     * {@inheritdoc}
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        $first = $lineIn->getNext();

        // the enclosure check comes first, exactly as before
        if ($first === $this->enclosure) {
            $lineIn->unshift();

            return Parser::STATE_WITHIN_FIELD_ENCLOSURE;
        }

        return $first === $this->delimiter
            ? Parser::STATE_END_FIELD
            : Parser::STATE_WITHIN_FIELD;
    }
}
<?php
namespace Webspot\Csv\Parser;
/**
 * Collects the fields of the row currently being parsed.
 */
class RowOut
{
    /** @var string[] fields completed so far; starts empty so getRow() always returns an array */
    private $row = [];

    /** @var string content of the field being built; starts as '' so an empty field is a string, not null */
    private $field = '';

    /**
     * Adds the given string to the current field
     *
     * @param string $string
     * @return void
     */
    public function addToField($string)
    {
        $this->field .= $string;
    }

    /**
     * Adds the current field to the row and starts on a new field
     *
     * @return void
     */
    public function addFieldToRow()
    {
        $this->row[] = $this->field;
        $this->field = '';
    }

    /**
     * Fetches the parsed row
     *
     * @return string[] the completed fields, an empty array when none were added
     */
    public function getRow()
    {
        return $this->row;
    }
}
<?php
namespace Webspot\Csv\StateHandler;
use Webspot\Csv\Parser\LineIn;
use Webspot\Csv\Parser\RowOut;
/**
 * Contract for one step of the parser's state machine.
 *
 * The implementations in this file read or consume characters from the line,
 * may write into the row being built, and return one of the Parser::STATE_*
 * constants; some of them throw \DomainException on malformed input.
 */
interface StateHandlerInterface
{
/**
* Handles the current state and returns the new one
*
* @param LineIn $lineIn remaining input of the line being parsed
* @param RowOut $rowOut row (and field) currently being built
* @return int the state to continue with
*/
public function handle(LineIn $lineIn, RowOut $rowOut);
}
<?php
namespace Webspot\Csv\StateHandler;
use Webspot\Csv\Parser;
use Webspot\Csv\Parser\LineIn;
use Webspot\Csv\Parser\RowOut;
/**
 * Handles the character directly following an escape character inside an
 * enclosed field.
 */
class WithinFieldEnclosureEscapeHandler implements StateHandlerInterface
{
    /**
     * Copies the next character into the field as-is and goes back to reading
     * the enclosed field.
     *
     * {@inheritdoc}
     *
     * @throws \DomainException when there is no character left on the line
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        if ($lineIn->length()) {
            $rowOut->addToField($lineIn->unshift());

            return Parser::STATE_WITHIN_FIELD_ENCLOSURE;
        }

        throw new \DomainException('Parse error: escape character cannot be the last character.');
    }
}
<?php
namespace Webspot\Csv\StateHandler;
use Webspot\Csv\Parser;
use Webspot\Csv\Parser\LineIn;
use Webspot\Csv\Parser\RowOut;
/**
 * Reads the content of an enclosed field up to the closing enclosure, an
 * escape character or the end of the line.
 */
class WithinFieldEnclosureHandler implements StateHandlerInterface
{
    /** @var string character closing the enclosed field */
    private $enclosure;

    /** @var string character marking the next character as literal */
    private $escape;

    /**
     * @param string $enclosure
     * @param string $escape
     */
    public function __construct($enclosure, $escape)
    {
        $this->enclosure = $enclosure;
        $this->escape = $escape;
    }

    /**
     * Consumes characters into the field until the escape character (switch to
     * the escaped state), the enclosure (field finished) or the end of the line
     * (field continues on the next line) is reached.
     *
     * {@inheritdoc}
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        // Loop on the remaining length only. The consumed character must not be
        // part of the loop condition: "0" is falsy in PHP and would be dropped.
        while ($lineIn->length() > 0) {
            $char = $lineIn->unshift();

            if ($char === $this->escape) {
                return Parser::STATE_WITHIN_FIELD_ENCLOSURE_ESCAPED;
            }

            if ($char === $this->enclosure) {
                return Parser::STATE_END_FIELD;
            }

            $rowOut->addToField($char);
        }

        return Parser::STATE_WITHIN_FIELD_ENCLOSURE;
    }
}
<?php
namespace Webspot\Csv\StateHandler;
use Webspot\Csv\Parser;
use Webspot\Csv\Parser\LineIn;
use Webspot\Csv\Parser\RowOut;
/**
 * Reads a plain (non-enclosed) field up to the next delimiter or the end of
 * the line.
 */
class WithinFieldHandler implements StateHandlerInterface
{
    /** @var string character separating two fields */
    private $delimiter;

    /**
     * @param string $delimiter
     */
    public function __construct($delimiter)
    {
        $this->delimiter = $delimiter;
    }

    /**
     * Moves characters from the line into the field; the delimiter itself is
     * left on the line for the next handler.
     *
     * {@inheritdoc}
     */
    public function handle(LineIn $lineIn, RowOut $rowOut)
    {
        for (;;) {
            // stop on an exhausted line or right in front of the delimiter
            if ( ! $lineIn->length() || $lineIn->getNext() === $this->delimiter) {
                return Parser::STATE_END_FIELD;
            }

            $rowOut->addToField($lineIn->unshift());
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment