Skip to content

Instantly share code, notes, and snippets.

@nayakrujul
Created June 11, 2023 16:28
Show Gist options
  • Save nayakrujul/ce246f605d0d2e865ff8253f1afe6631 to your computer and use it in GitHub Desktop.
Save nayakrujul/ce246f605d0d2e865ff8253f1afe6631 to your computer and use it in GitHub Desktop.
Code used to generate the Thunno 2 corpus
import collections
# Input file: list of all Thunno 2 programs collected by SEDE query,
# read as one program per line.
PROGRAMS_PATH = "programs.txt"

# (n-gram length, how many of the most common n-grams to print).
REPORT_SPEC = ((2, 30), (3, 25), (4, 20))


def count_ngrams(programs, n: int) -> collections.Counter:
    """Count every run of *n* consecutive characters in each program.

    Args:
        programs: Iterable of program strings. N-grams never span two
            programs; each string is scanned on its own.
        n: Length of the n-grams to count (2 = digraphs, 3 = trigraphs, ...).

    Returns:
        A Counter mapping each n-character substring to its number of
        occurrences. Programs shorter than *n* contribute nothing.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))
    # range() is empty when len(prog) < n, so short programs are skipped.
    # Keys are inserted in scan order, which keeps the tie order of
    # Counter.most_common() deterministic.
    return collections.Counter(
        prog[start:start + n]
        for prog in programs
        for start in range(len(prog) - n + 1)
    )


def format_section(n: int, counts: collections.Counter, limit: int) -> str:
    """Render one report section: a header, a blank line, then the entries.

    Args:
        n: N-gram length, used only for the "<n>-graphs:" header.
        counts: Counter of n-gram occurrences.
        limit: Maximum number of most-common entries to list.

    Returns:
        The section text without a trailing newline. Each entry is the count
        right-aligned in a 4-character column, a space, then the n-gram.
    """
    lines = ["%d-graphs:" % n, ""]
    lines.extend(
        "%4d %s" % (count, "".join(gram))
        for gram, count in counts.most_common(limit)
    )
    return "\n".join(lines)


def main() -> None:
    """Read the program list and print the most common 2-, 3- and 4-graphs."""
    # The counts are per character, so the file must be decoded the same way
    # on every platform rather than with the locale's default encoding.
    # NOTE(review): assumes programs.txt is saved as UTF-8 - confirm.
    with open(PROGRAMS_PATH, encoding="utf-8") as f:
        programs = f.read().splitlines()

    sections = [
        format_section(n, count_ngrams(programs, n), limit)
        for n, limit in REPORT_SPEC
    ]
    # One blank line between sections; every header (including the 3-graph
    # one) is followed by a blank line so the three sections look the same.
    print("\n\n".join(sections))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment