aappddeevv/ImportUtilities.m

## ImportUtilities.m
(* ::Package:: *)

(* :Title: Import Delimited *)

(* :Summary: Contains declarations for importing a delimited text file into a session. *)

BeginPackage["ImportUtilities`"]


(* Public symbols of the package. Giving each symbol a usage message here,
   between BeginPackage and Begin, creates it in the exported context. *)

(* Canned functions that can be used as arguments. *)
WhitespaceSplitter::usage = "Function that splits string records on whitespace."

ForeachTrim::usage = "Option that trims the string argument."

SplitFieldOnComma::usage = "Option that splits a record on commas."

IncludeAllLines::usage = "Option that includes all lines by always indicating that the line should not be ignored. Always returns false."

ReadSingleRecord::usage = "Option that reads a single record from a stream."

NoHeader::usage = "Always 0 indicating that no records should be skipped at the start."

SingleLineHeader::usage = "Always 1 indicating that a single record should be skipped at the start."

IdentityRecordTransformer::usage="Option that always returns the argument directly."

CopyLine::usage = "Option that just returns the argument directly, thereby just copying the input to the output."

ImportDelimited::usage = "ImportDelimited[filename] imports the delimited file. Many useful options exist to control the import process."

MDYH24MS::usage = "Pattern spec to convert from date time string."

MDYH12MSMAM::usage = "Pattern spec to convert from date time string, very long version."

MDYH12MSMAMTransformer::usage = "Transformer date time to absolute time"

MDYH24MSTransformer::usage = "Transformer date time to absolute time"

(* Option names accepted by ImportDelimited. StartSkip and Transformers are
   declared here so that every option name is exported and documented. *)
StartSkip::usage = "Number of records to skip at the start of the stream. Skipped records are not tested with IgnoreLine. Default is NoHeader."
Transformers::usage = "Association of field index (1 based) \[Rule] f[field] applied to the matching field of each parsed record. Default is an empty association."
Splitter::usage = "Function to split a line."
MaxProcessed::usage ="All or number representing the number of processed lines to keep."
MaxLine::usage="All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed."
ForeachSplit::usage="A function to apply to each field. Default is to trim whitespace."
RecordTransformer::usage="A function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content."
ApplyAt::usage="Association of line index (1 based) \[Rule] f[line] applied to the unparsed and unprocessed record. This collects lines and are returned in ApplyAt in the returned values. It ignores skipping and ignoring."
IgnoreLine::usage="Return a boolean when a line should be ignored. Only applies to lines that are not skipped. Default is to include all lines."
Reader::usage="Read a line from the input stream. Return EndOfFile when end of file."


Begin["`Private`"]

  (* Date element specifications handed to DateList together with a string. *)
  MDYH12MSMAM = {"Month", "Day", "Year", "Hour12", "Minute", "Second", "Millisecond", "AMPM"};
  MDYH24MS = {"Month", "Day", "Year", "Hour24", "Minute", "Second"};

  (* Turn a date/time string into an absolute time via the matching specification. *)
  MDYH12MSMAMTransformer = Function[stamp, AbsoluteTime[DateList[{stamp, MDYH12MSMAM}]]];
  MDYH24MSTransformer = Function[stamp, AbsoluteTime[DateList[{stamp, MDYH24MS}]]];

  (* Record splitters and the default per-field processor. *)
  WhitespaceSplitter = Function[record, StringSplit[record, Whitespace..]];
  ForeachTrim = Function[field, StringTrim[field]];
  SplitFieldOnComma = Function[record, StringSplit[record, ","]];

  (* Ignore predicate that rejects nothing: it yields False for any argument. *)
  IncludeAllLines = Function[False];

  (* Default reader: one Record per call from the given stream. *)
  ReadSingleRecord = Function[stream, Read[stream, Record]];

  (* Header sizes, expressed as the number of leading records to skip. *)
  NoHeader = 0;
  SingleLineHeader = 1;

  (* Pass-through functions that hand back their argument untouched. *)
  IdentityRecordTransformer = Function[row, row];
  CopyLine = Function[record, record];

  (* ImportDelimited[file, opts] reads `file` one record at a time and lets the
     option functions customize every step of the import.

     Per record, in order:
       1. the record is read with Reader; reading stops at EndOfFile,
       2. an ApplyAt entry for the record number (if any) is applied to the raw record,
       3. the record is skipped when its number is <= StartSkip,
       4. the record is dropped when IgnoreLine returns True,
       5. otherwise it is split with Splitter, every field goes through ForeachSplit,
          the row goes through RecordTransformer, and Transformers are applied
          to the fields whose index they name.

     Returns an association with the keys
       "Processed"    - number of rows kept,
       "LinesRead"    - number of records read (the EndOfFile marker is not counted),
       "StartSkipped" - number of records skipped because of StartSkip,
       "Ignored"      - number of records rejected by IgnoreLine,
       "ApplyAt"      - list of {record number, value} pairs produced by ApplyAt,
       "Data"         - list of kept rows ({} when no row was kept).
     Returns $Failed when the file cannot be opened for reading. *)
  ImportDelimited[file_String?FileExistsQ,

  (* lines to skip at the start of the stream, ignores results of IgnoreLine *)
    opts: OptionsPattern[{StartSkip -> NoHeader,

    (* All or a number representing the number of processed lines to keep. A processed line is like a sample, it was not skipped or ignored *)
    MaxProcessed -> All,

    (* All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed *)
    MaxLine -> All,

    (* function to split a line *)
    Splitter -> SplitFieldOnComma,

    (* could use Identity[#]& to preserve surrounding whitespace. Default is to trim whitespace. *)
    ForeachSplit -> ForeachTrim,

    (* field index \[Rule] f[string] *)
    Transformers -> <||>,

    (* a function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content. The default is identity. *)
    RecordTransformer -> IdentityRecordTransformer,

    (* line index (1-based) \[Rule] f[line], applied to the unparsed and unprocessed record which is typically one line of the input stream.
       The resulting value is returned in an ApplyAt list that has {line, result value} tuples. ApplyAt
       uses simple AppendTo so use it sparingly to collect values. It ignores skipping and ignoring. *)
    ApplyAt -> <||>,

    (* a function returning a boolean that determines whether a line should be ignored, only applies to lines that are not skipped *)
    IgnoreLine -> IncludeAllLines,

    (* read a line from the input stream. Return EndOfFile when end of file. *)
    Reader -> ReadSingleRecord }]] :=

  Module[{str, result, line, parsed,
      (* tag is left unset so it evaluates to a unique symbol; it labels the rows we Sow *)
      tag,
      specials = {}, linecount = 0, ignoredcount = 0, startcount = 0, linesprocessed = 0,
      rr = OptionValue[Reader],
      sp = OptionValue[Splitter],
      applies = OptionValue[ApplyAt],
      recordTransformer = OptionValue[RecordTransformer],
      transformers = OptionValue[Transformers],
      fieldProcessor = OptionValue[ForeachSplit],
      ignoreLine = OptionValue[IgnoreLine],
      skips = OptionValue[StartSkip],
      (* a negative limit stands for All, i.e. no limit *)
      maxlines = Replace[OptionValue[MaxLine], All -> -1],
      maxProcessed = Replace[OptionValue[MaxProcessed], All -> -1]},

    str = OpenRead[file];
    (* FileExistsQ also accepts directories and unreadable files; without this
       guard the read loop below would never see EndOfFile. *)
    If[str === $Failed,
      $Failed,

      result = Reap[
        (* Both limits are tested before reading, so a limit of 0 reads nothing. *)
        While[(maxlines < 0 || linecount < maxlines) &&
              (maxProcessed < 0 || linesprocessed < maxProcessed),
          line = rr @ str;
          (* Test for the end before counting so the EndOfFile marker is not counted as a record. *)
          If[line === EndOfFile, Break[]];
          linecount++;

          (* If an ApplyAt function exists for this record, use it on the raw record. *)
          If[KeyExistsQ[applies, linecount],
            AppendTo[specials, {linecount, applies[linecount] @ line}]];

          If[linecount <= skips, startcount++; Continue[]];
          If[TrueQ[ignoreLine @ line], ignoredcount++; Continue[]];

          parsed = recordTransformer @ (fieldProcessor /@ (sp @ line));

          (* If a transformer is registered for a field index, replace that field with the transformed value. *)
          If[Length[transformers] > 0,
            MapIndexed[
              With[{index = First[#2]},
                If[KeyExistsQ[transformers, index],
                  parsed[[index]] = transformers[index] @ #1]] &,
              parsed]];

          Sow[parsed, tag];
          linesprocessed++
        ],
        tag];
      Close[str];

      <|"Processed" -> linesprocessed,
        "LinesRead" -> linecount,
        "StartSkipped" -> startcount,
        "Ignored" -> ignoredcount,
        "ApplyAt" -> specials,
        (* Reap yields {} when nothing was sown and {{row1, row2, ...}} otherwise. *)
        "Data" -> Replace[Last[result], {{rows_List} :> rows, _ -> {}}]|>
    ]
  ]
End[]
EndPackage[]
	(* ::Package:: *)

	(* :Title: Import Delimited *)

	(* :Summary: Contains declarations for importing a delimited text file into a session. *)

	BeginPackage["ImportUtilities`"]


	(* Public symbols of the package. Giving each symbol a usage message here,
	   between BeginPackage and Begin, creates it in the exported context. *)

	(* Canned functions that can be used as arguments. *)
	WhitespaceSplitter::usage = "Function that splits string records on whitespace."

	ForeachTrim::usage = "Option that trims the string argument."

	SplitFieldOnComma::usage = "Option that splits a record on commas."

	IncludeAllLines::usage = "Option that includes all lines by always indicating that the line should not be ignored. Always returns false."

	ReadSingleRecord::usage = "Option that reads a single record from a stream."

	NoHeader::usage = "Always 0 indicating that no records should be skipped at the start."

	SingleLineHeader::usage = "Always 1 indicating that a single record should be skipped at the start."

	IdentityRecordTransformer::usage="Option that always returns the argument directly."

	CopyLine::usage = "Option that just returns the argument directly, thereby just copying the input to the output."

	ImportDelimited::usage = "ImportDelimited[filename] imports the delimited file. Many useful options exist to control the import process."

	MDYH24MS::usage = "Pattern spec to convert from date time string."

	MDYH12MSMAM::usage = "Pattern spec to convert from date time string, very long version."

	MDYH12MSMAMTransformer::usage = "Transformer date time to absolute time"

	MDYH24MSTransformer::usage = "Transformer date time to absolute time"

	(* Option names accepted by ImportDelimited. StartSkip and Transformers are
	   declared here so that every option name is exported and documented. *)
	StartSkip::usage = "Number of records to skip at the start of the stream. Skipped records are not tested with IgnoreLine. Default is NoHeader."
	Transformers::usage = "Association of field index (1 based) \[Rule] f[field] applied to the matching field of each parsed record. Default is an empty association."
	Splitter::usage = "Function to split a line."
	MaxProcessed::usage ="All or number representing the number of processed lines to keep."
	MaxLine::usage="All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed."
	ForeachSplit::usage="A function to apply to each field. Default is to trim whitespace."
	RecordTransformer::usage="A function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content."
	ApplyAt::usage="Association of line index (1 based) \[Rule] f[line] applied to the unparsed and unprocessed record. This collects lines and are returned in ApplyAt in the returned values. It ignores skipping and ignoring."
	IgnoreLine::usage="Return a boolean when a line should be ignored. Only applies to lines that are not skipped. Default is to include all lines."
	Reader::usage="Read a line from the input stream. Return EndOfFile when end of file."


	Begin["`Private`"]

	(* Date element specifications handed to DateList together with a string. *)
	MDYH12MSMAM = {"Month", "Day", "Year", "Hour12", "Minute", "Second", "Millisecond", "AMPM"};
	MDYH24MS = {"Month", "Day", "Year", "Hour24", "Minute", "Second"};

	(* Turn a date/time string into an absolute time via the matching specification. *)
	MDYH12MSMAMTransformer = Function[stamp, AbsoluteTime[DateList[{stamp, MDYH12MSMAM}]]];
	MDYH24MSTransformer = Function[stamp, AbsoluteTime[DateList[{stamp, MDYH24MS}]]];

	(* Record splitters and the default per-field processor. *)
	WhitespaceSplitter = Function[record, StringSplit[record, Whitespace..]];
	ForeachTrim = Function[field, StringTrim[field]];
	SplitFieldOnComma = Function[record, StringSplit[record, ","]];

	(* Ignore predicate that rejects nothing: it yields False for any argument. *)
	IncludeAllLines = Function[False];

	(* Default reader: one Record per call from the given stream. *)
	ReadSingleRecord = Function[stream, Read[stream, Record]];

	(* Header sizes, expressed as the number of leading records to skip. *)
	NoHeader = 0;
	SingleLineHeader = 1;

	(* Pass-through functions that hand back their argument untouched. *)
	IdentityRecordTransformer = Function[row, row];
	CopyLine = Function[record, record];

	(* ImportDelimited[file, opts] reads `file` one record at a time and lets the
	   option functions customize every step of the import.

	   Per record, in order:
	     1. the record is read with Reader; reading stops at EndOfFile,
	     2. an ApplyAt entry for the record number (if any) is applied to the raw record,
	     3. the record is skipped when its number is <= StartSkip,
	     4. the record is dropped when IgnoreLine returns True,
	     5. otherwise it is split with Splitter, every field goes through ForeachSplit,
	        the row goes through RecordTransformer, and Transformers are applied
	        to the fields whose index they name.

	   Returns an association with the keys
	     "Processed"    - number of rows kept,
	     "LinesRead"    - number of records read (the EndOfFile marker is not counted),
	     "StartSkipped" - number of records skipped because of StartSkip,
	     "Ignored"      - number of records rejected by IgnoreLine,
	     "ApplyAt"      - list of {record number, value} pairs produced by ApplyAt,
	     "Data"         - list of kept rows ({} when no row was kept).
	   Returns $Failed when the file cannot be opened for reading. *)
	ImportDelimited[file_String?FileExistsQ,

	(* lines to skip at the start of the stream, ignores results of IgnoreLine *)
	opts: OptionsPattern[{StartSkip -> NoHeader,

	(* All or a number representing the number of processed lines to keep. A processed line is like a sample, it was not skipped or ignored *)
	MaxProcessed -> All,

	(* All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed *)
	MaxLine -> All,

	(* function to split a line *)
	Splitter -> SplitFieldOnComma,

	(* could use Identity[#]& to preserve surrounding whitespace. Default is to trim whitespace. *)
	ForeachSplit -> ForeachTrim,

	(* field index \[Rule] f[string] *)
	Transformers -> <||>,

	(* a function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content. The default is identity. *)
	RecordTransformer -> IdentityRecordTransformer,

	(* line index (1-based) \[Rule] f[line], applied to the unparsed and unprocessed record which is typically one line of the input stream.
	   The resulting value is returned in an ApplyAt list that has {line, result value} tuples. ApplyAt
	   uses simple AppendTo so use it sparingly to collect values. It ignores skipping and ignoring. *)
	ApplyAt -> <||>,

	(* a function returning a boolean that determines whether a line should be ignored, only applies to lines that are not skipped *)
	IgnoreLine -> IncludeAllLines,

	(* read a line from the input stream. Return EndOfFile when end of file. *)
	Reader -> ReadSingleRecord }]] :=

	Module[{str, result, line, parsed,
			(* tag is left unset so it evaluates to a unique symbol; it labels the rows we Sow *)
			tag,
			specials = {}, linecount = 0, ignoredcount = 0, startcount = 0, linesprocessed = 0,
			rr = OptionValue[Reader],
			sp = OptionValue[Splitter],
			applies = OptionValue[ApplyAt],
			recordTransformer = OptionValue[RecordTransformer],
			transformers = OptionValue[Transformers],
			fieldProcessor = OptionValue[ForeachSplit],
			ignoreLine = OptionValue[IgnoreLine],
			skips = OptionValue[StartSkip],
			(* a negative limit stands for All, i.e. no limit *)
			maxlines = Replace[OptionValue[MaxLine], All -> -1],
			maxProcessed = Replace[OptionValue[MaxProcessed], All -> -1]},

		str = OpenRead[file];
		(* FileExistsQ also accepts directories and unreadable files; without this
		   guard the read loop below would never see EndOfFile. *)
		If[str === $Failed,
			$Failed,

			result = Reap[
				(* Both limits are tested before reading, so a limit of 0 reads nothing. *)
				While[(maxlines < 0 || linecount < maxlines) &&
						(maxProcessed < 0 || linesprocessed < maxProcessed),
					line = rr @ str;
					(* Test for the end before counting so the EndOfFile marker is not counted as a record. *)
					If[line === EndOfFile, Break[]];
					linecount++;

					(* If an ApplyAt function exists for this record, use it on the raw record. *)
					If[KeyExistsQ[applies, linecount],
						AppendTo[specials, {linecount, applies[linecount] @ line}]];

					If[linecount <= skips, startcount++; Continue[]];
					If[TrueQ[ignoreLine @ line], ignoredcount++; Continue[]];

					parsed = recordTransformer @ (fieldProcessor /@ (sp @ line));

					(* If a transformer is registered for a field index, replace that field with the transformed value. *)
					If[Length[transformers] > 0,
						MapIndexed[
							With[{index = First[#2]},
								If[KeyExistsQ[transformers, index],
									parsed[[index]] = transformers[index] @ #1]] &,
							parsed]];

					Sow[parsed, tag];
					linesprocessed++
				],
				tag];
			Close[str];

			<|"Processed" -> linesprocessed,
				"LinesRead" -> linecount,
				"StartSkipped" -> startcount,
				"Ignored" -> ignoredcount,
				"ApplyAt" -> specials,
				(* Reap yields {} when nothing was sown and {{row1, row2, ...}} otherwise. *)
				"Data" -> Replace[Last[result], {{rows_List} :> rows, _ -> {}}]|>
		]
	]
	End[]
	EndPackage[]