Skip to content

Instantly share code, notes, and snippets.

@M0r13n
Created June 21, 2024 11:49
Show Gist options
  • Save M0r13n/3144d244aef4a0a40e8adad9154e5868 to your computer and use it in GitHub Desktop.
Save M0r13n/3144d244aef4a0a40e8adad9154e5868 to your computer and use it in GitHub Desktop.
word count like using `wc` using async state machine parsing. Inspired by: https://github.com/robertdavidgraham/wc2
WAS_SPACE = 0
NEW_LINE = 1
NEW_WORD = 2
WAS_WORD = 3
SPACES = [9,10,11,12,13,32]
NEWLINE = 10
def init_table():
# 0 => was space
# 1 => was line
# 2 => new word
# 3 => was word
table = [0,] * 1024
# Transitions when not a space
for c in range(256):
table[WAS_SPACE* 256 + c] = NEW_WORD
table[NEW_LINE * 256 + c] = NEW_WORD
table[NEW_WORD * 256 + c] = WAS_WORD
table[WAS_WORD * 256 + c] = WAS_WORD
# Transitions when space
for c in SPACES:
table[WAS_SPACE* 256 + c] = WAS_SPACE;
table[NEW_LINE * 256 + c] = WAS_SPACE;
table[NEW_WORD * 256 + c] = WAS_SPACE;
table[WAS_WORD * 256 + c] = WAS_SPACE;
# Transitions when newline
table[WAS_SPACE* 256 + NEWLINE] = NEW_LINE;
table[NEW_LINE * 256 + NEWLINE] = NEW_LINE;
table[NEW_WORD * 256 + NEWLINE] = NEW_LINE;
table[WAS_WORD * 256 + NEWLINE] = NEW_LINE;
return table
if __name__ == '__main__':
# State Machine
table = init_table()
# Counts: num spaces, num new lines, num new words, num chars of words
counts = [0,0,0,0]
# Initial state
state = WAS_SPACE
# Read & parse
# TODO: read in chunks for better efficiency
with open('./test.txt', 'rb') as fd:
for c in fd.read():
state = table[state * 256 + c]
counts[state] += 1
print(f'{counts[1]} {counts[2]} {sum(counts)}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment