Skip to content

Instantly share code, notes, and snippets.

@dasjestyr
Last active May 5, 2017 21:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dasjestyr/b33b1a24fcdb0c42e5ca5022f2f2cf58 to your computer and use it in GitHub Desktop.
Save dasjestyr/b33b1a24fcdb0c42e5ca5022f2f2cf58 to your computer and use it in GitHub Desktop.
A lexer that will pull fields from a delimited line of text
/// <summary>
/// Pulls delimiter-separated fields, one per call, from a <see cref="TextReader"/>.
/// </summary>
/// <remarks>
/// A field ends at any configured delimiter, at a line feed, or when the reader runs out of
/// characters. Carriage returns are dropped. Text enclosed in double quotes is copied verbatim
/// (delimiters and line breaks included) without the quotes themselves. A doubled quote is not
/// treated as an escape: it simply closes one quoted run and opens the next.
/// </remarks>
internal class DelimitedFieldLexer : IFieldLexer
{
    // Value returned by TextReader.Read()/Peek() when no more characters are available.
    private const int NoData = -1;

    // Internal sentinels; they are negative so they can never collide with a real character.
    private const int Delimiter = -2;
    private const int Initialized = -3;

    private const int EndOfLine = 0x0A;        // '\n'
    private const int WindowsEndOfLine = 0x0D; // '\r'
    private const int Quote = 0x22;            // '"'

    // States that terminate the field currently being read.
    private const LexingState BreakCondition =
        LexingState.NoData | LexingState.EndOfLine | LexingState.StartingNextField;

    private readonly TextReader _reader;
    private readonly StringBuilder _buffer;
    private int[] _delimiters;
    private LexingState _state;

    /// <summary>
    /// Creates a lexer over <paramref name="reader"/>. Delimiters must be supplied through
    /// <see cref="SetDelimiters"/> before <see cref="GetNextField"/> is called.
    /// </summary>
    /// <param name="reader">The character source to read fields from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public DelimitedFieldLexer(TextReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        _reader = reader;
        _buffer = new StringBuilder();
    }

    /// <summary>
    /// Creates a lexer over <paramref name="reader"/> that splits on the given delimiters.
    /// </summary>
    /// <param name="reader">The character source to read fields from.</param>
    /// <param name="delimiters">The characters that separate fields.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="reader"/> or <paramref name="delimiters"/> is null.
    /// </exception>
    public DelimitedFieldLexer(TextReader reader, IEnumerable<char> delimiters)
        : this(reader)
    {
        if (delimiters == null)
        {
            throw new ArgumentNullException("delimiters");
        }

        SetDelimiters(delimiters.ToArray());
    }

    /// <summary>
    /// Replaces the set of characters that separate fields.
    /// </summary>
    /// <param name="delimiters">The characters that separate fields.</param>
    /// <exception cref="ArgumentNullException"><paramref name="delimiters"/> is null.</exception>
    public void SetDelimiters(char[] delimiters)
    {
        if (delimiters == null)
        {
            throw new ArgumentNullException("delimiters");
        }

        // Stored as ints so they compare directly against the int returned by TextReader.Read().
        _delimiters = delimiters
            .Select(delimiter => (int)delimiter)
            .ToArray();
    }

    /// <summary>
    /// Reads the next field from the underlying reader.
    /// </summary>
    /// <param name="field">Receives the text of the field (possibly empty).</param>
    /// <returns>
    /// <c>true</c> if the field was not terminated by a line feed and the reader still has
    /// characters available; otherwise <c>false</c>.
    /// </returns>
    /// <exception cref="InvalidOperationException">No delimiters have been set.</exception>
    public bool GetNextField(out string field)
    {
        if (_delimiters == null || _delimiters.Length == 0)
        {
            throw new InvalidOperationException("No delimiters were set.");
        }

        // Reset to a non-terminal state so the loop below runs at least once.
        UpdateState(Initialized);

        while ((_state & BreakCondition) != _state)
        {
            var currentValue = _reader.Read();
            UpdateState(currentValue);

            switch (_state)
            {
                case LexingState.WindowsEndOfLine:
                    // Drop the carriage return and keep reading.
                    continue;

                case LexingState.StartingNextField:
                case LexingState.EndOfLine:
                case LexingState.NoData:
                    // Terminal states: the loop condition ends the field.
                    break;

                case LexingState.ReadingField:
                    _buffer.Append((char)currentValue);
                    break;

                case LexingState.ReadingQuoted:
                    // Copy everything up to the closing quote. The check happens BEFORE the
                    // append so that an empty quoted field ("") yields nothing, and running
                    // out of input inside the quotes ends the field instead of looping forever.
                    currentValue = _reader.Read();
                    while (currentValue != Quote && currentValue != NoData)
                    {
                        _buffer.Append((char)currentValue);
                        currentValue = _reader.Read();
                    }

                    _state = currentValue == NoData
                        ? LexingState.NoData
                        : LexingState.ReadingField;
                    continue;

                default:
                    throw new ArgumentOutOfRangeException();
            }
        }

        field = _buffer.ToString();
        _buffer.Clear();
        return HasMoreFields();
    }

    /// <summary>
    /// Reports whether the last field ended without a line feed and more input is available.
    /// </summary>
    private bool HasMoreFields()
    {
        return _state != LexingState.EndOfLine && _reader.Peek() != NoData;
    }

    /// <summary>
    /// Maps a value returned by the reader (or an internal sentinel) to the lexing state.
    /// </summary>
    /// <param name="value">A character code, <see cref="NoData"/>, or <see cref="Initialized"/>.</param>
    private void UpdateState(int value)
    {
        // Collapse every configured delimiter into one sentinel so the switch has a single case.
        value = _delimiters.Contains(value) ? Delimiter : value;

        // This switch is marginally faster than a dictionary and significantly faster than Hashtable
        switch (value)
        {
            case NoData:
                _state = LexingState.NoData;
                break;
            case Delimiter:
                _state = LexingState.StartingNextField;
                break;
            case EndOfLine:
                _state = LexingState.EndOfLine;
                break;
            case WindowsEndOfLine:
                _state = LexingState.WindowsEndOfLine;
                break;
            case Quote:
                _state = LexingState.ReadingQuoted;
                break;
            default:
                // Any other value (including the Initialized sentinel) is ordinary field content.
                _state = LexingState.ReadingField;
                break;
        }
    }

    /// <summary>
    /// Lexer states. Declared as flags so the terminal states can be combined into
    /// <see cref="BreakCondition"/>.
    /// </summary>
    [Flags]
    private enum LexingState
    {
        ReadingField = 1,
        ReadingQuoted = 1 << 1,
        StartingNextField = 1 << 2,
        WindowsEndOfLine = 1 << 3,
        EndOfLine = 1 << 4,
        NoData = 1 << 5
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment