Skip to content

Instantly share code, notes, and snippets.

@ryanohs
Last active July 25, 2021 15:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ryanohs/caf769dbe554ded0fec151a53a17f8ee to your computer and use it in GitHub Desktop.
Save ryanohs/caf769dbe554ded0fec151a53a17f8ee to your computer and use it in GitHub Desktop.
A bullet journal parser
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Xunit;
namespace Parser
{
public class Parser
{
public Node Parse(string input)
{
using var stream = new MemoryStream();
using var writer = new StreamWriter(stream);
writer.Write(input);
writer.Flush();
stream.Position = 0;
return Parse(stream);
}
public Node Parse(Stream stream)
{
using var streamReader = new StreamReader(stream);
var root = new Node();
var parser = new JournalStateMachine();
var state = new ParserState(root);
while(!streamReader.EndOfStream)
{
var c = (char)streamReader.Read(); // todo encoding?
if(c == '\r') continue; // ignore \r, only going to care about \n
state = parser.Next(state, c);
}
// ensure final trailing \n
parser.Next(state, '\n');
return root;
}
}
public class JournalStateMachine
{
public enum State
{
Start,
End,
Tab,
Spaces,
Bullet,
PreContent,
Content,
}
public enum Token
{
Space,
Tab,
Bullet,
Newline,
Other,
}
public record Transition(State State, Token Token);
public Dictionary<Transition, State> Transitions = new()
{
{ new(State.Start, Token.Tab), State.Tab },
{ new(State.Start, Token.Space), State.Spaces },
{ new(State.Start, Token.Bullet), State.Bullet },
{ new(State.Start, Token.Other), State.Content },
{ new(State.Start, Token.Newline), State.Start },
{ new(State.Tab, Token.Tab), State.Tab },
{ new(State.Tab, Token.Space), State.Spaces },
{ new(State.Tab, Token.Bullet), State.Bullet },
{ new(State.Tab, Token.Other), State.Content },
{ new(State.Tab, Token.Newline), State.Start },
{ new(State.Spaces, Token.Space), State.Spaces },
{ new(State.Spaces, Token.Tab), State.Tab },
{ new(State.Spaces, Token.Bullet), State.Bullet },
{ new(State.Spaces, Token.Other), State.Content },
{ new(State.Spaces, Token.Newline), State.Start },
{ new(State.Bullet, Token.Space), State.PreContent },
{ new(State.Bullet, Token.Tab), State.PreContent },
{ new(State.Bullet, Token.Bullet), State.Bullet },
{ new(State.Bullet, Token.Other), State.Content },
{ new(State.Bullet, Token.Newline), State.End },
// PreContent -- allow for any whitespace between bullet and content
{ new(State.PreContent, Token.Space), State.PreContent },
{ new(State.PreContent, Token.Tab), State.PreContent },
{ new(State.PreContent, Token.Bullet), State.Content },
{ new(State.PreContent, Token.Other), State.Content },
{ new(State.PreContent, Token.Newline), State.Start },
{ new(State.Content, Token.Space), State.Content },
{ new(State.Content, Token.Tab), State.Content },
{ new(State.Content, Token.Bullet), State.Content },
{ new(State.Content, Token.Other), State.Content },
{ new(State.Content, Token.Newline), State.End },
};
public ParserState Next(ParserState state, char c)
{
var token = GetToken(c);
if(!Transitions.TryGetValue(new Transition(state.CurrentState, token), out var newState))
{
throw new Exception($"Unhandled transition: ({state.CurrentState}, {token}) Previously parsed = '{state.RawLine}' Next = '{c}'");
}
return newState switch
{
State.Start => Start(state),
State.End => End(state),
State.Tab => Tab(state, c),
State.Spaces => Spaces(state, c),
State.Bullet => Bullet(state, c),
State.PreContent => PreContent(state, c),
State.Content => Content(state, c),
_ => throw new ArgumentOutOfRangeException($"Unhandled state {newState}")
};
}
private ParserState Start(ParserState state)
{
state.RawLine.Clear();
state.Content.Clear();
return state with
{
CurrentIndentDepth = 0,
CurrentState = State.Start,
NodeType = NodeType.Title
};
}
private ParserState Tab(ParserState state, char c)
{
state.RawLine.Append(c);
return state with
{
CurrentState = State.Tab,
CurrentIndentDepth = state.CurrentIndentDepth + 1,
CurrentSpacesCount = 0
};
}
private ParserState Spaces(ParserState state, char c)
{
state.RawLine.Append(c);
return state with
{
CurrentState = State.Spaces,
CurrentSpacesCount = (state.CurrentSpacesCount == 3) ? 0 : state.CurrentSpacesCount + 1,
CurrentIndentDepth = (state.CurrentSpacesCount == 3) ? state.CurrentIndentDepth + 1 : state.CurrentIndentDepth,
};
}
private ParserState Bullet(ParserState state, char c)
{
state.RawLine.Append(c);
return state with
{
CurrentState = State.Bullet,
NodeType = GetNodeType(c)
};
}
private ParserState PreContent(ParserState state, char c)
{
state.RawLine.Append(c);
return state with
{
CurrentState = State.PreContent
};
}
private ParserState Content(ParserState state, char c)
{
state.RawLine.Append(c);
state.Content.Append(c);
return state with
{
CurrentState = State.Content
};
}
private ParserState End(ParserState state)
{
var (indentDepth, parent) = ComputeIndentDepthAndParent(state);
var node = new Node()
{
Parent = parent,
IndentDepth = indentDepth,
Type = state.NodeType,
Content = state.Content.ToString(),
RawLine = state.RawLine.ToString()
};
parent.Children.Add(node);
state.RawLine.Clear();
state.Content.Clear();
return Start(state with {
PreviousNode = node
});
}
private (int indentDepth, Node parent) ComputeIndentDepthAndParent(ParserState state)
{
var indentDepth = state.CurrentIndentDepth;
if(state.NodeType == NodeType.Title)
{
indentDepth--;
}
if(state.CurrentSpacesCount > 0)
{
indentDepth++;
}
var parent = state.PreviousNode;
while(indentDepth <= parent.IndentDepth)
{
parent = parent.Parent;
}
return (indentDepth, parent);
}
private Token GetToken(char c)
{
return c switch
{
' ' => Token.Space,
'\t' => Token.Tab,
'-' => Token.Bullet,
'*' => Token.Bullet,
'o' => Token.Bullet,
'!' => Token.Bullet,
'\r' => Token.Newline,
'\n' => Token.Newline,
_ => Token.Other
};
}
private NodeType GetNodeType(char c)
{
return c switch
{
'-' => NodeType.Note,
'*' => NodeType.Task,
'o' => NodeType.Event,
'!' => NodeType.Inspiration,
_ => throw new ArgumentOutOfRangeException(nameof(c), c, null)
};
}
}
public record ParserState
{
public ParserState(Node root)
{
PreviousNode = root;
}
public JournalStateMachine.State CurrentState { get; init; } = JournalStateMachine.State.Start;
public int CurrentIndentDepth { get; init; }
public int CurrentSpacesCount { get; init; }
public Node PreviousNode { get; init; }
public NodeType NodeType { get; init; }
public StringBuilder RawLine { get; init; } = new();
public StringBuilder Content { get; init; } = new();
}
public record Node
{
public string RawLine { get; init; }
public string Content { get; init; }
public int IndentDepth { get; init; } = -2; // root is -2. Top level Title nodes are -1.
public NodeType Type { get; init; }
public Node Parent { get; init; }
public List<Node> Children { get; init; } = new ();
}
public enum NodeType
{
Title,
Note,
Task,
Event,
Inspiration
}
public class ParserTests
{
[Fact]
public void ParseLine()
{
var line = "- First";
var parser = new Parser();
var root = parser.Parse(line);
Assert.Equal(1, root.Children.Count);
Assert.Equal("First", root.Children.Single().Content);
}
[Fact]
public void ParseMultipleWords()
{
var line = "- Multiple Words";
var parser = new Parser();
var root = parser.Parse(line);
Assert.Equal("Multiple Words", root.Children.Single().Content);
}
[Fact]
public void SpecialTokensInsideContent()
{
var line = "- !@#$%^&*()\t-_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
var parser = new Parser();
var root = parser.Parse(line);
Assert.Equal("!@#$%^&*()\t-_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", root.Children.Single().Content);
}
[Theory]
[InlineData("- Parent\n\t- Child")]
[InlineData("- Parent\n\t\t- Child")] // excess tab depth
public void Children(string line)
{
var parser = new Parser();
var root = parser.Parse(line);
Assert.Equal(1, root.Children.Count);
var parent = root.Children.Single();
Assert.Equal("Parent", parent.Content);
Assert.Equal(1, parent.Children.Count);
Assert.Equal("Child", parent.Children.Single().Content);
}
[Theory]
[InlineData("- Parent\n - Child")]
[InlineData("- Parent\n - Child")] // excess spaces
public void Spaces(string line)
{
var parser = new Parser();
var root = parser.Parse(line);
Assert.Equal(1, root.Children.Count);
var parent = root.Children.Single();
Assert.Equal("Parent", parent.Content);
Assert.Equal(1, parent.Children.Count);
Assert.Equal("Child", parent.Children.Single().Content);
}
[Theory]
[InlineData("- Parent\n\t- Child\n- Sibling\n")]
[InlineData("- Parent\n\t\t- Child\n- Sibling\n")] // excess tab depth
public void Promotion(string line)
{
var parser = new Parser();
var root = parser.Parse(line);
Assert.Equal(2, root.Children.Count);
var parent = root.Children[0];
Assert.Equal("Parent", parent.Content);
Assert.Equal(1, parent.Children.Count);
Assert.Equal("Child", parent.Children.Single().Content);
var sibling = root.Children[1];
Assert.Equal("Sibling", sibling.Content);
}
[Theory]
[InlineData("Title Line", NodeType.Title)]
[InlineData("- Note", NodeType.Note)]
[InlineData("* Task", NodeType.Task)]
[InlineData("o Event", NodeType.Event)]
[InlineData("! Inspiration", NodeType.Inspiration)]
public void NodeTypes(string line, NodeType type)
{
var parser = new Parser();
var root = parser.Parse(line);
Assert.Equal(type, root.Children.Single().Type);
}
[Fact]
public void MultilineSample()
{
var input =
@"Node Types
- Note
* Task
o Event
! Inspiration
Children
- A
- B
- C
- D
- E
Crazy Tabs
- H
-I
-J
- K
-L
- M";
var parser = new Parser();
var root = parser.Parse(input);
var nodeTypes = root.Children[0];
Assert.Equal("Node Types", nodeTypes.Content);
Assert.Equal(NodeType.Note, nodeTypes.Children[0].Type);
Assert.Equal(NodeType.Task, nodeTypes.Children[1].Type);
Assert.Equal(NodeType.Event, nodeTypes.Children[2].Type);
Assert.Equal(NodeType.Inspiration, nodeTypes.Children[3].Type);
var childrenNode = root.Children[1];
Assert.Equal("A", childrenNode.Children[0].Content);
Assert.Equal("B", childrenNode.Children[0].Children[0].Content);
Assert.Equal("C", childrenNode.Children[0].Children[1].Content);
Assert.Equal("D", childrenNode.Children[1].Content);
Assert.Equal("E", childrenNode.Children[1].Children[0].Content);
var crazyTabs = root.Children[2];
Assert.Equal("H", crazyTabs.Children[0].Content);
Assert.Equal("I", crazyTabs.Children[0].Children[0].Content);
Assert.Equal("J", crazyTabs.Children[0].Children[1].Content);
Assert.Equal("K", crazyTabs.Children[1].Content);
Assert.Equal("L", crazyTabs.Children[1].Children[0].Content);
Assert.Equal("M", crazyTabs.Children[2].Content);
}
[Fact]
public void SpacesAndTabs()
{
var input =
@"
Mixed Whitespace
- H
-I
-J
- K
-L
- M";
var parser = new Parser();
var root = parser.Parse(input);
var mixed = root.Children[0];
Assert.Equal("H", mixed.Children[0].Content);
Assert.Equal("I", mixed.Children[0].Children[0].Content);
Assert.Equal("J", mixed.Children[0].Children[1].Content);
Assert.Equal("K", mixed.Children[1].Content);
Assert.Equal("L", mixed.Children[1].Children[0].Content);
Assert.Equal("M", mixed.Children[2].Content);
}
[Fact]
public void MultipleBullet()
{
var line = "--! Test";
var parser = new Parser();
var root = parser.Parse(line);
Assert.Equal(NodeType.Inspiration, root.Children.Single().Type);
}
[Fact]
public void IgnoreTrailingNewLines()
{
var line = "- Test\n\n\n";
var parser = new Parser();
var root = parser.Parse(line);
Assert.Equal(1, root.Children.Count);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment