Skip to content

Instantly share code, notes, and snippets.

@aappddeevv
Last active August 29, 2015 14:11
Show Gist options
  • Save aappddeevv/0203276b4c1c502336d0 to your computer and use it in GitHub Desktop.
Save aappddeevv/0203276b4c1c502336d0 to your computer and use it in GitHub Desktop.
mathematica delimited file importer
(* ::Package:: *)
(* :Title: Import Delimited *)
(* :Summary: Contains declarations for importing a delimited text file into a session. *)
BeginPackage["ImportUtilities`"]
(* Public symbols. Every symbol that receives a usage message here is created in the
   ImportUtilities` context and is therefore visible after the package is loaded. *)

(* Canned functions and constants that can be used as option values. *)
WhitespaceSplitter::usage = "Function that splits string records on whitespace."
ForeachTrim::usage = "Option that trims the string argument."
SplitFieldOnComma::usage = "Option that splits a record on commas."
IncludeAllLines::usage = "Option that includes all lines by always indicating that the line should not be ignored. Always returns false."
ReadSingleRecord::usage = "Option that reads a single record from a stream."
NoHeader::usage = "Always 0 indicating that no records should be skipped at the start."
SingleLineHeader::usage = "Always 1 indicating that a single record should be skipped at the start."
IdentityRecordTransformer::usage="Option that always returns the argument directly."
CopyLine::usage = "Option that just returns the argument directly, thereby just copying the input to the output."

(* The importer itself. *)
ImportDelimited::usage = "ImportDelimited[filename] imports the delimited file. Many useful options exist to control the import process."

(* Date/time helpers. *)
MDYH24MS::usage = "Pattern spec to convert from date time string."
MDYH12MSMAM::usage = "Pattern spec to convert from date time string, very long version."
MDYH12MSMAMTransformer::usage = "Transformer date time to absolute time"
MDYH24MSTransformer::usage = "Transformer date time to absolute time"

(* Option names accepted by ImportDelimited. StartSkip and Transformers are declared here as
   well so that they live in the public context and are documented like the other options. *)
StartSkip::usage = "Number of records to skip at the start of the stream. Skipped records are not tested with IgnoreLine. Default is NoHeader."
Splitter::usage = "Function to split a line."
MaxProcessed::usage ="All or number representing the number of processed lines to keep."
MaxLine::usage="All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed."
ForeachSplit::usage="A function to apply to each field. Default is to trim whitespace."
Transformers::usage = "Association of field index (1 based) \[Rule] f[string] applied to the matching field of each parsed record. Default is an empty association."
RecordTransformer::usage="A function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content."
ApplyAt::usage="Association of line index (1 based) \[Rule] f[line] applied to the unparsed and unprocessed record. This collects lines and are returned in ApplyAt in the returned values. It ignores skipping and ignoring."
IgnoreLine::usage="Return a boolean when a line should be ignored. Only applies to lines that are not skipped. Default is to include all lines."
Reader::usage="Read a line from the input stream. Return EndOfFile when end of file"
Begin["`Private`"]

(* Header sizes usable as the StartSkip value. *)
NoHeader = 0
SingleLineHeader = 1

(* Date element specifications consumed by DateList. *)
MDYH24MS = {"Month", "Day", "Year", "Hour24", "Minute", "Second"}
MDYH12MSMAM = {"Month", "Day", "Year", "Hour12", "Minute", "Second", "Millisecond", "AMPM"}

(* Convert a date/time string to absolute time using the matching element specification. *)
MDYH24MSTransformer = Function[stamp, AbsoluteTime[DateList[{stamp, MDYH24MS}]]]
MDYH12MSMAMTransformer = Function[stamp, AbsoluteTime[DateList[{stamp, MDYH12MSMAM}]]]

(* Record splitters: break one record string into a list of field strings. *)
SplitFieldOnComma = Function[record, StringSplit[record, ","]]
WhitespaceSplitter = Function[record, StringSplit[record, Whitespace..]]

(* Per-field processor: strip surrounding whitespace. *)
ForeachTrim = Function[field, StringTrim[field]]

(* Line filter that never asks for a line to be ignored. *)
IncludeAllLines = Function[False]

(* Reader: fetch the next record from an open input stream. *)
ReadSingleRecord = Function[stream, Read[stream, Record]]

(* Pass-through functions that hand back their argument unchanged. *)
IdentityRecordTransformer = Function[row, row]
CopyLine = Function[line, line]
(* ImportDelimited[file, opts] reads `file` one record at a time, using the option functions to
   customize the import, and returns an Association with these keys:
     "Processed"    - number of records that were parsed and kept
     "LinesRead"    - number of records read from the stream (the EndOfFile marker is not counted)
     "StartSkipped" - number of leading records skipped because of StartSkip
     "Ignored"      - number of records rejected by the IgnoreLine function
     "ApplyAt"      - list of {line number, value} pairs produced by the ApplyAt functions
     "Data"         - list of parsed records; {} when no record was kept
   The definition only matches when `file` is a string naming an existing file. *)
Options[ImportDelimited] = {
  (* number of records to skip at the start of the stream; skipped records are not tested with IgnoreLine *)
  StartSkip -> NoHeader,
  (* All or a number representing the number of processed lines to keep. A processed line is like a sample, it was not skipped or ignored *)
  MaxProcessed -> All,
  (* All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed *)
  MaxLine -> All,
  (* function to split a line *)
  Splitter -> SplitFieldOnComma,
  (* applied to every field; could use Identity to preserve surrounding whitespace. Default is to trim whitespace. *)
  ForeachSplit -> ForeachTrim,
  (* association of field index (1-based) \[Rule] f[string] *)
  Transformers -> <||>,
  (* a function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content. The default is identity. *)
  RecordTransformer -> IdentityRecordTransformer,
  (* association of line index (1-based) \[Rule] f[line], applied to the unparsed and unprocessed record which is typically
     one line of the input stream. The resulting value is returned in the "ApplyAt" list as {line, result value} tuples.
     Collection uses simple AppendTo so use it sparingly. It ignores skipping and ignoring. *)
  ApplyAt -> <||>,
  (* a function returning a boolean that determines whether a line should be ignored, only applies to lines that are not skipped *)
  IgnoreLine -> IncludeAllLines,
  (* read a line from the input stream. Return EndOfFile when end of file. *)
  Reader -> ReadSingleRecord
}

ImportDelimited[file_String?FileExistsQ, opts : OptionsPattern[]] :=
  Module[{str, result, line, parsed, tag,
      specials = {}, linecount = 0, ignoredcount = 0, startcount = 0, linesprocessed = 0,
      rr = OptionValue[Reader],
      sp = OptionValue[Splitter],
      applies = OptionValue[ApplyAt],
      recordTransformer = OptionValue[RecordTransformer],
      transformers = OptionValue[Transformers],
      fieldProcessor = OptionValue[ForeachSplit],
      ignoreLine = OptionValue[IgnoreLine],
      skips = OptionValue[StartSkip],
      (* a negative limit means "no limit" *)
      maxlines = Replace[OptionValue[MaxLine], All -> -1],
      maxProcessed = Replace[OptionValue[MaxProcessed], All -> -1]},
    str = OpenRead[file];
    (* `tag` is a Module-local symbol, so the rows sown here cannot be confused with
       anything sown by the user-supplied callbacks. *)
    result = Reap[
      (* Both limits are tested before reading, so MaxLine -> 0 or MaxProcessed -> 0 read nothing. *)
      While[(maxlines < 0 || linecount < maxlines) &&
            (maxProcessed < 0 || linesprocessed < maxProcessed),
        line = rr @ str;
        (* Stop before counting: the EndOfFile marker is not a record. *)
        If[line === EndOfFile, Break[]];
        linecount++;
        (* If an ApplyAt function exists for this record, use it on the raw record. *)
        If[KeyExistsQ[applies, linecount],
          AppendTo[specials, {linecount, applies[linecount] @ line}]];
        If[linecount <= skips, startcount++; Continue[]];
        If[TrueQ[ignoreLine @ line], ignoredcount++; Continue[]];
        parsed = recordTransformer @ (fieldProcessor /@ (sp @ line));
        (* Field transformers run after RecordTransformer. Walk the transformer keys rather
           than every field, and leave alone any index the record does not have. *)
        Scan[
          Function[idx,
            If[IntegerQ[idx] && 1 <= idx <= Length[parsed],
              parsed[[idx]] = transformers[idx] @ parsed[[idx]]]],
          Keys[transformers]];
        Sow[parsed, tag];
        linesprocessed++
      ],
      tag];
    Close[str];
    <|"Processed" -> linesprocessed,
      "LinesRead" -> linecount,
      "StartSkipped" -> startcount,
      "Ignored" -> ignoredcount,
      "ApplyAt" -> specials,
      (* Reap yields {} when nothing was sown and {{row1, row2, ...}} otherwise;
         flattening one level gives the list of rows in both cases. *)
      "Data" -> Flatten[Last[result], 1]|>
  ]
(* Leave the `Private` implementation context opened by Begin. *)
End[]
(* Finish the package started by BeginPackage. *)
EndPackage[]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment