Last active
May 5, 2017 21:17
-
-
Save dasjestyr/b33b1a24fcdb0c42e5ca5022f2f2cf58 to your computer and use it in GitHub Desktop.
A lexer that will pull fields from a delimited line of text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Lexer that pulls one field at a time from delimited text supplied by a <see cref="TextReader"/>.
/// </summary>
/// <remarks>
/// A field ends at any configured delimiter, at a line feed, or when the reader has no more characters.
/// Carriage returns outside of quotes are discarded. Text between a pair of double quotes is copied
/// verbatim (delimiters and line breaks included) and the quotes themselves are not part of the field.
/// A doubled quote is not treated as an escape sequence: every quote simply opens or closes a quoted run,
/// and consecutive quoted/unquoted runs are concatenated into the same field.
/// </remarks>
internal class DelimitedFieldLexer : IFieldLexer
{
    // Values fed to UpdateState. The negative ones are sentinels that can never be real characters.
    private const int NoData = -1;              // what TextReader.Read/Peek return when no character is available
    private const int Delimiter = -2;           // stand-in for "one of the configured delimiters"
    private const int Initialized = -3;         // resets the state machine at the start of each field
    private const int EndOfLine = 0x0A;         // line feed
    private const int WindowsEndOfLine = 0x0D;  // carriage return
    private const int Quote = 0x22;             // double quote

    // Any of these states terminates the field currently being read.
    private const LexingState BreakCondition =
        LexingState.NoData | LexingState.EndOfLine | LexingState.StartingNextField;

    private readonly TextReader _reader;
    private readonly StringBuilder _buffer;
    private int[] _delimiters;
    private LexingState _state;

    /// <summary>
    /// Creates a lexer over <paramref name="reader"/>. Delimiters must be supplied through
    /// <see cref="SetDelimiters"/> before the first call to <see cref="GetNextField"/>.
    /// </summary>
    /// <param name="reader">Source of the delimited text.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public DelimitedFieldLexer(TextReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        _reader = reader;
        _buffer = new StringBuilder();
    }

    /// <summary>
    /// Creates a lexer over <paramref name="reader"/> that splits on the given delimiters.
    /// </summary>
    /// <param name="reader">Source of the delimited text.</param>
    /// <param name="delimiters">Characters that separate fields.</param>
    public DelimitedFieldLexer(TextReader reader, IEnumerable<char> delimiters)
        : this(reader)
    {
        SetDelimiters(delimiters.ToArray());
    }

    /// <summary>
    /// Replaces the set of characters that separate fields.
    /// </summary>
    /// <param name="delimiters">Characters that separate fields.</param>
    public void SetDelimiters(char[] delimiters)
    {
        // Stored as ints so they can be compared directly with the values returned by TextReader.Read.
        _delimiters = delimiters
            .Select(delimiter => (int) delimiter)
            .ToArray();
    }

    /// <summary>
    /// Reads the next field from the underlying reader.
    /// </summary>
    /// <param name="field">Receives the text of the field (empty when nothing was read).</param>
    /// <returns>
    /// <c>true</c> when the field did not end at a line feed and the reader still reports a pending
    /// character; otherwise <c>false</c>.
    /// </returns>
    /// <exception cref="InvalidOperationException">No delimiters have been set.</exception>
    public bool GetNextField(out string field)
    {
        if (_delimiters == null || _delimiters.Length == 0)
        {
            throw new InvalidOperationException("No delimiters were set.");
        }

        // The sentinel matches no delimiter and no special character, so this resets the state to
        // ReadingField and guarantees the loop below runs at least once.
        UpdateState(Initialized);

        while ((_state & BreakCondition) != _state)
        {
            var currentValue = _reader.Read();
            UpdateState(currentValue);

            switch (_state)
            {
                case LexingState.WindowsEndOfLine:
                    // Carriage returns are dropped; a following line feed ends the line.
                    continue;

                case LexingState.StartingNextField:
                case LexingState.EndOfLine:
                case LexingState.NoData:
                    // Terminators are consumed but never stored; the loop condition ends the field.
                    break;

                case LexingState.ReadingField:
                    _buffer.Append((char)currentValue);
                    break;

                case LexingState.ReadingQuoted:
                    // The opening quote has been consumed. If the input ends before the closing quote,
                    // finish the field with what was read instead of spinning on end-of-input.
                    _state = AppendQuotedRun() ? LexingState.ReadingField : LexingState.NoData;
                    continue;

                default:
                    throw new ArgumentOutOfRangeException();
            }
        }

        field = _buffer.ToString();
        _buffer.Clear();
        return HasMoreFields();
    }

    /// <summary>
    /// Copies characters into the buffer up to, but not including, the closing quote.
    /// </summary>
    /// <returns>
    /// <c>true</c> if a closing quote was found and consumed; <c>false</c> if the reader ran out of
    /// characters first.
    /// </returns>
    private bool AppendQuotedRun()
    {
        while (true)
        {
            // Test each character before appending it, so an empty quoted run ("") adds nothing.
            var next = _reader.Read();
            if (next == Quote)
            {
                return true;
            }

            if (next == NoData)
            {
                return false;
            }

            _buffer.Append((char)next);
        }
    }

    /// <summary>
    /// Reports whether the current line may hold another field: the last field must not have ended at a
    /// line feed and the reader must still report a pending character.
    /// </summary>
    private bool HasMoreFields()
    {
        // NOTE(review): a delimiter that is the very last character of the input yields false here, so a
        // trailing empty field is not announced to the caller - confirm this is intended.
        return _state != LexingState.EndOfLine && _reader.Peek() != NoData;
    }

    /// <summary>
    /// Moves the state machine according to the value just read (or a sentinel).
    /// </summary>
    /// <param name="value">A character code from the reader, or one of the negative sentinels.</param>
    private void UpdateState(int value)
    {
        // Collapse every configured delimiter into one sentinel so the switch can handle them in one case.
        if (Array.IndexOf(_delimiters, value) >= 0)
        {
            value = Delimiter;
        }

        // This switch is marginally faster than a dictionary and significantly faster than Hashtable
        switch (value)
        {
            case NoData:
                _state = LexingState.NoData;
                break;
            case Delimiter:
                _state = LexingState.StartingNextField;
                break;
            case EndOfLine:
                _state = LexingState.EndOfLine;
                break;
            case WindowsEndOfLine:
                _state = LexingState.WindowsEndOfLine;
                break;
            case Quote:
                _state = LexingState.ReadingQuoted;
                break;
            default:
                _state = LexingState.ReadingField;
                break;
        }
    }

    /// <summary>
    /// States of the lexer. Declared as flags so several states can be combined into a mask
    /// (see <c>BreakCondition</c>); the lexer itself is only ever in one state at a time.
    /// </summary>
    [Flags]
    private enum LexingState
    {
        ReadingField = 1,
        ReadingQuoted = 1 << 1,
        StartingNextField = 1 << 2,
        WindowsEndOfLine = 1 << 3,
        EndOfLine = 1 << 4,
        NoData = 1 << 5
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment