# pkoppstein/fromcsvfile.jq

## fromcsvfile.jq
# Copyright (C) 2018 peak@princeton.edu
# License: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
# See http://creativecommons.org/licenses/by-nc/3.0/
# Attribution shall include the copyright notice above.

# fromcsv.jq version: 0.4 of 2018-12-30
# Requires: jq with `inputs`
# Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2

# A PEG-inspired parser for reading CSV files without the need to "slurp" them.

# The parser is intended to handle a wide variety of "edge cases".
# Note that both \r\n and \r\r\n are interpreted as end-of-record.

# Error-reporting showing the relevant record number and troublesome string fragment is provided on STDERR.

# Main jq filters:
#  fromcsv/0     # JSON string input
#  fromcsvfile/0 # read from STDIN

# Example usage: jq -nRc 'include "fromcsvfile"; fromcsvfile' MYFILE.csv

######### PEG machinery

# consume($re): strip a match of the regular expression $re from the start of
# .remainder.
#   input : an object with a string-valued .remainder
#   output: the input with the matched prefix removed from .remainder;
#           nothing (empty) if $re does not match at the start.
# $re is wrapped in a non-capturing group so that the "^" anchor applies to the
# whole expression.  A bare "^" + $re anchors only the first branch of a
# top-level alternation such as "\n|\r\n", which lets the other branches match
# anywhere in the string while the prefix is still cut from the start.
def consume($re):
  # on failure, match yields empty
  (.remainder | match("^(?:" + $re + ")")) as $match
  | .remainder |= .[$match.length :] ;

# parse($re): like consume/1, but also records what was matched.
#   input : an object with a string-valued .remainder
#   output: the input with the matched prefix removed from .remainder and the
#           matched text appended to the array .result, every doubled
#           double-quote ("") in that text being replaced by a single one;
#           nothing (empty) if $re does not match at the start.
# $re is wrapped in a non-capturing group so that "^" anchors every branch of a
# top-level alternation, not just the first.
def parse($re):
  # on failure, match yields empty
  (.remainder | match("^(?:" + $re + ")")) as $match
  | .remainder |= .[$match.length :]
  | .result += [$match.string | gsub("\"\"";"\"")] ;

# at_eof: true when .remainder holds nothing more than an optional final line
# terminator ("", "\n" or "\r\n"), false otherwise.  Needed because the text
# being parsed carries no explicit EOF marker.
def at_eof:
  .remainder as $rest
  | any(("", "\n", "\r\n"); . == $rest) ;

############ Grammar for CSV

# EOR: end-of-record.
# Succeeds by consuming one record terminator ("\n", "\r\n" or "\r\r\n") from
# the start of .remainder or, failing that, by passing the input through
# unchanged when .remainder is already empty.  Otherwise it emits nothing.
# The alternation is grouped explicitly so that every branch is anchored at the
# start of .remainder; ungrouped, only the first branch would follow the "^"
# and a CRLF further along the text could be taken for a terminator.
def EOR:
  consume("(?:\n|\r\n|\r\r\n)")
  // (if .remainder=="" then . else empty end) ;

# Body of a quoted field: any run of characters in which each literal
# double-quote appears doubled ("").  CRs and LFs are accepted and the run may
# be empty.  The matched text is handed to parse/1.
def field_content_quoted:
  "((\"\")|([^\"]))*" as $quoted_body
  | parse($quoted_body) ;

# EXTENSION: when reading an unquoted field, CRLF ought to be recognized as
# end-of-record, i.e. an LF is only accepted if it is NOT preceded by CR.
# An unescaped double-quote is rejected (it ends the match).
#   first alternative : a non-empty run free of quote, comma, CR and LF which
#                       is immediately followed by CRLF; the CR is consumed so
#                       that only the LF is left at the head of .remainder.
#   second alternative: a possibly empty run free of quote, comma and LF.
def unquoted_field:
  ( parse("[^\",\r\n]+")
    | select(.remainder | test("^\r\n"))
    | consume("\r") )
  // parse("[^\",\n]*") ;

# quoted_field_continue: extend a quoted field whose closing quote has not yet
# been found.  Appends "\n" plus the next input (minus a trailing "\r" or
# "\r\r") to .remainder, then tries the quoted-field body and closing quote
# again; if that still fails it recurses to pull in another input.
# NOTE(review): `input` raises an error once no inputs are left, so a quote
# that is never closed ends in that error here -- confirm callers cope with it.
def quoted_field_continue:
  # drop the CR (or CR CR) left at the end of a line
  def trim: sub("(\r\r|\r)$";"");
  # "|" binds more loosely than "+", so trim is applied to ("\n" + input)
  .remainder += ("\n" + input | trim)
  # | (.record|debug) as $debug
  # body, then closing quote followed by optional blanks; otherwise read on
  | (field_content_quoted | consume("\" *"))
    // quoted_field_continue
    ;

# quoted_field: a field enclosed in double-quotes.
# EXTENSION: blanks before the opening quote and after the closing quote are
# ignored.  When the closing quote cannot be found in the text at hand,
# quoted_field_continue takes over.
def quoted_field:
  # field body followed by the closing quote and any trailing blanks
  def body_and_close: field_content_quoted | consume("\" *");
  consume(" *\"")
  | (body_and_close // quoted_field_continue) ;

# field: one CSV field.  A quoted field is tried first, then an unquoted one.
# Should neither produce anything, the filter emits nothing when at_eof holds;
# otherwise the current parser state is passed through `stderr`.
def field:
  quoted_field
  // unquoted_field
  // (select(at_eof | not) | stderr) ;

# fields := ("," field)+
# One comma-introduced field, followed by as many more as can be parsed.
def fields:
  # carry on with further fields if there are any, else stop at this state
  def or_done: fields // . ;
  consume(",") | field | or_done ;

# record := field ("," field)*
# i.e. a first field followed by `fields` when more are present.
def record:
  field
  | . as $first_only
  | (fields // $first_only) ;

# fromcsv: parse the input string as CSV, emitting the .result array of each
# record in turn.
# The parser state is an object: .remainder is the text still to be parsed,
# .result collects the fields of the record in progress, and .record is
# incremented after each record has been emitted.
def fromcsv:
  # Emit the next record, then continue with whatever text is left.
  def next_record:
    if (at_eof | not)
    then
      (EOR // .)               # step over a record terminator if one is pending
      | record
      | select(.result)        # only states that actually collected fields
      | ( .result
        , (.result = null | .record += 1 | next_record) )
    else empty
    end ;

  {record: 0, remainder: .} | next_record ;

# fromcsvfile: run fromcsv over every remaining input.
# Each item produced by `inputs` has a trailing "\r" or "\r\r" removed before
# it is parsed.
def fromcsvfile:
  inputs
  | sub("(\r\r|\r)$";"")
  | fromcsv ;
	# Copyright (C) 2018 peak@princeton.edu
	# License: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
	# See http://creativecommons.org/licenses/by-nc/3.0/
	# Attribution shall include the copyright notice above.

	# fromcsv.jq version: 0.4 of 2018-12-30
	# Requires: jq with `inputs`
	# Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2

	# A PEG-inspired parser for reading CSV files without the need to "slurp" them.

	# The parser is intended to handle a wide variety of "edge cases".
	# Note that both \r\n and \r\r\n are interpreted as end-of-record.

	# Error-reporting showing the relevant record number and troublesome string fragment is provided on STDERR.

	# Main jq filters:
	# fromcsv/0 # JSON string input
	# fromcsvfile/0 # read from STDIN

	# Example usage: jq -nRc 'include "fromcsvfile"; fromcsvfile' MYFILE.csv

	######### PEG machinery

	# consume($re): strip a match of the regular expression $re from the start
	# of .remainder; emits nothing if $re does not match there.
	# $re is wrapped in a non-capturing group so that "^" anchors every branch
	# of a top-level alternation, not just the first.
	def consume($re):
	  # on failure, match yields empty
	  (.remainder | match("^(?:" + $re + ")")) as $match
	  | .remainder |= .[$match.length :] ;

	# parse($re): like consume/1, but the matched text is also appended to the
	# array .result, with every doubled double-quote ("") replaced by a single
	# one.  Emits nothing if $re does not match at the start of .remainder.
	# $re is wrapped in a non-capturing group so that "^" anchors every branch
	# of a top-level alternation.
	def parse($re):
	  # on failure, match yields empty
	  (.remainder | match("^(?:" + $re + ")")) as $match
	  | .remainder |= .[$match.length :]
	  | .result += [$match.string | gsub("\"\"";"\"")] ;

	# Utility function as there is no EOF marker: true when .remainder is "",
	# "\n" or "\r\n", false otherwise.
	def at_eof: .remainder | (.=="" or . == "\n" or . == "\r\n");

	############ Grammar for CSV

	# end-of-record if end-of-record characters or else at eof:
	# consumes "\n", "\r\n" or "\r\r\n" from the start of .remainder, or passes
	# the input through unchanged when .remainder is empty; otherwise nothing.
	# The alternation is grouped so that every branch is anchored at the start.
	def EOR:
	  consume("(?:\n|\r\n|\r\r\n)")
	  // (if .remainder=="" then . else empty end) ;

	# Internal double-quotes must be doubled;
	# CRs and LFs are allowed, as are empty quoted fields.
	# The matched text is handed to parse/1.
	def field_content_quoted:
	  parse("((\"\")|([^\"]))*") ;

	# EXTENSION: When reading an unquoted field, we ought to recognize CRLF as
	# end-of-record, i.e. only accept LF if it is NOT preceded by CR.
	# Reject unescaped double-quote.
	# First alternative: a non-empty run followed directly by CRLF, whose CR is
	# consumed; second alternative: a possibly empty run up to quote, comma or LF.
	def unquoted_field:
	  (parse("[^\",\r\n]+") | (if .remainder|test("^\r\n") then consume("\r") else empty end))
	  // parse("[^\",\n]*") # possibly empty
	  ;

	# quoted_field_continue: extend a quoted field whose closing quote has not
	# yet been found.  Appends "\n" plus the next input (minus a trailing "\r"
	# or "\r\r") to .remainder, then tries the body and closing quote again,
	# recursing for yet another input if that fails.
	# NOTE(review): `input` raises an error once no inputs are left -- confirm
	# callers cope with a quote that is never closed.
	def quoted_field_continue:
	  def trim: sub("(\r\r|\r)$";"");
	  # "|" binds more loosely than "+", so trim is applied to ("\n" + input)
	  .remainder += ("\n" + input | trim)
	  # | (.record|debug) as $debug
	  | (field_content_quoted | consume("\" *"))
	    // quoted_field_continue
	    ;

	# Ignore blanks before and after the enclosing quotation marks # EXTENSION
	# When the closing quote is not found in the text at hand,
	# quoted_field_continue takes over.
	def quoted_field:
	  consume(" *\"")
	  | ( (field_content_quoted | consume("\" *"))
	      // quoted_field_continue ) ;

	# field: one CSV field -- a quoted field, else an unquoted one.  If neither
	# produces anything: nothing when at_eof holds, otherwise the parser state
	# is passed through `stderr`.
	def field:
	  quoted_field
	  // unquoted_field
	  // if at_eof then empty else stderr end ;

	# ("," field)+
	# One comma-introduced field, followed by as many more as can be parsed.
	def fields:
	  consume(",")
	  | field
	  | (fields // .) ;

	# field ("," field)*
	# i.e. field fields
	def record: field | (fields // .) ;

	# fromcsv: parse the input string as CSV, emitting the .result array of
	# each record in turn.  The parser state is an object with .remainder (text
	# still to be parsed), .result (fields of the record in progress) and
	# .record (incremented after each emitted record).
	def fromcsv:
	  ## Loop for processing all the records
	  def _fromcsv:
	    if at_eof then empty
	    else (EOR // .)
	    | record
	    | select(.result)
	    | .result,
	      (.result = null | .record+=1 | _fromcsv)
	    end ;

	  {record:0, remainder: .}
	  | _fromcsv ;

	# fromcsvfile: run fromcsv over every remaining input, after removing a
	# trailing "\r" or "\r\r" from each one.
	def fromcsvfile:
	  def trim: sub("(\r\r|\r)$";"");
	  inputs | trim | fromcsv;