Skip to content

Instantly share code, notes, and snippets.

@lyxal
Created June 23, 2021 12:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lyxal/7a8d3b1f505f512ce0191312e9a45cba to your computer and use it in GitHub Desktop.
Save lyxal/7a8d3b1f505f512ce0191312e9a45cba to your computer and use it in GitHub Desktop.
import csv
import html
from collections import Counter
# Tally adjacent-character n-grams (sizes 2, 3 and 4) over the first <code>
# snippet found in each row of QueryResults.csv, and append every accepted
# snippet to corpus.txt followed by a newline.
# unigraphs = Counter()
digraphs = Counter()
trigraphs = Counter()
quadgraphs = Counter()

# Substrings whose presence marks a snippet as noise rather than a program.
# NOTE(review): the last marker is a single space, which rejects every
# snippet containing any space at all -- confirm that is intended (the
# literal may originally have been a longer run of spaces).
_NOISE_MARKERS = ('Gate', 'forgot the', '[>>+<<-]>', 'Given a set of', ' ')


def _is_noise(snippet):
    """Return True if *snippet* trips any of the hand-tuned noise filters."""
    if snippet.count('1') > 10 or snippet.count('Z') > 10:
        return True
    if len(snippet) > 100:
        return True
    return any(marker in snippet for marker in _NOISE_MARKERS)


with open('QueryResults.csv', newline='', encoding='utf-8') as csv_file, \
        open('corpus.txt', 'w', encoding='utf-8') as corpus_file:
    for row in csv.reader(csv_file):
        # Rows with fewer than two columns (e.g. a blank line) would raise
        # IndexError below, so skip them along with the header row.
        if len(row) < 2 or row[0] == 'Post Link':
            continue
        body = row[1]
        if '<code>' not in body:
            continue
        # Keep only the text between the first <code> and the next </code>.
        snippet = body.partition('<code>')[2].partition('</code>')[0].strip()
        # Decode every HTML character reference in one pass (named and
        # numeric), instead of hand-replacing a fixed handful of entities.
        snippet = html.unescape(snippet)
        if _is_noise(snippet):
            continue
        # Count digraphs, trigraphs and quadgraphs; n-grams never span a
        # line break because each line is windowed separately.
        for line in snippet.split('\n'):
            # unigraphs.update(line)
            # Counter.update tallies each tuple produced by zip, so no loop
            # variable can shadow the open file handles.
            digraphs.update(zip(line, line[1:]))
            trigraphs.update(zip(line, line[1:], line[2:]))
            quadgraphs.update(zip(line, line[1:], line[2:], line[3:]))
        corpus_file.write(snippet + '\n')
# Write the 30 most frequent n-grams of each size to most-common.txt, one
# section per n-gram length, each entry as a right-aligned count followed by
# the n-gram's characters joined into a string.
with open('most-common.txt', 'w', encoding='utf-8') as report:
    sections = (
        # ('Unigraphs:\n', unigraphs),
        ('Digraphs:\n', digraphs),
        ('\nTrigraphs:\n', trigraphs),
        ('\nQuadgraphs:\n', quadgraphs),
    )
    for heading, counter in sections:
        report.write(heading)
        for gram, count in counter.most_common(30):
            report.write('%4d %s\n' % (count, ''.join(gram)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment