Skip to content

Instantly share code, notes, and snippets.

@snowwm
Created September 21, 2019 09:41
Show Gist options
  • Save snowwm/b8c0b89f7e74c5c21588550145cf948f to your computer and use it in GitHub Desktop.
Save snowwm/b8c0b89f7e74c5c21588550145cf948f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import sys, random, textwrap, re
from collections import defaultdict
# Path of the source word list that read_dict() parses.
SRC_FILE = './multitab_vcb.src.ini'
# Path of the output file that write_dict() (re)creates on every run.
DST_FILE = './multitab_vcb.ini'
# Text encoding used when writing DST_FILE.
DST_ENCODING = 'cp1251'
# Upper bound on translations stored per word; checked in Dictionary.add_entry.
MAX_TRANSLATIONS = 2
# Upper bound on the number of entries write_dict() writes.
MAX_ENTRIES = 25 # max 100
# Words longer than this are not used as dictionary keys by read_dict().
MAX_WORDLEN = 11
class Dictionary(defaultdict):
    """Map each word to its list of translations, with a per-word weight.

    Looking up a missing word yields (and stores) an empty translation list.
    ``weights`` maps a word to its selection weight and defaults to 1.
    """

    def __init__(self):
        super().__init__(list)
        self.weights = defaultdict(lambda: 1)

    def add_entry(self, word, translations, weight=1):
        """Append *translations* to *word* and raise its weight to *weight*.

        The stored weight never decreases: the largest weight seen for the
        word wins.

        Args:
            word: The key to add translations to.
            translations: Iterable of translations to append.
            weight: Candidate weight for the word.

        Raises:
            AssertionError: If the word would end up with more than
                MAX_TRANSLATIONS translations. Nothing is modified in
                that case.
        """
        new_items = list(translations)
        # Validate before touching any state so a rejected entry leaves the
        # dictionary exactly as it was (``get`` does not create the key).
        if len(self.get(word, ())) + len(new_items) > MAX_TRANSLATIONS:
            # Raised explicitly rather than via ``assert`` so the limit is
            # still enforced under ``python -O``; the exception type and
            # message are the ones the assert statement produced.
            raise AssertionError(f'MAX_TRANSLATIONS exceeded for "{word}"')
        self[word].extend(new_items)
        self.weights[word] = max(self.weights[word], weight)

    def del_entry(self, word):
        """Remove *word* together with its weight.

        Raises:
            KeyError: If *word* is not in the dictionary.
        """
        self.pop(word)
        # A word created by plain indexing may never have received an
        # explicit weight, so a missing weight is not an error.
        self.weights.pop(word, None)
def _split_words(text):
    """Split a comma-separated list into stripped, non-empty words."""
    stripped = (token.strip() for token in text.split(','))
    return [word for word in stripped if word]


def read_dict():
    """Parse SRC_FILE into English->Russian and Russian->English dictionaries.

    File format, as handled here:
      * everything after ``;`` on a line is a comment;
      * a line starting with ``[`` is a section header; when it looks like
        ``[N ...`` (digits followed by a space) N becomes the weight of the
        entries that follow, otherwise the weight resets to 1;
      * any other line is ``en1, en2 = ru1, ru2``.

    Words longer than MAX_WORDLEN are not used as keys, but they still appear
    as translations of the words on the other side. Empty words (stray or
    trailing commas) are dropped, and lines with no word on either side of
    ``=`` are skipped instead of producing empty keys or translations.

    Returns:
        A ``(e2r, r2e)`` tuple of Dictionary objects.

    Raises:
        AssertionError: Propagated from Dictionary.add_entry when a word
            collects more than MAX_TRANSLATIONS translations.
    """
    section_weight = re.compile(r'\[(\d+) ')
    e2r = Dictionary()  # Eng to Rus
    r2e = Dictionary()  # Rus to Eng
    cur_weight = 1
    # NOTE(review): no encoding is passed, so the platform default decides how
    # the source file is decoded - confirm the file's actual encoding.
    with open(SRC_FILE, 'r') as src:
        for raw_line in src:
            line = raw_line.partition(';')[0].strip()  # ignore comments
            if not line:
                continue
            if line.startswith('['):
                match = section_weight.match(line)
                cur_weight = int(match.group(1)) if match else 1
                continue
            left, _, right = line.partition('=')
            en = _split_words(left)
            ru = _split_words(right)
            if not en or not ru:
                # Malformed line (no '=' or an empty side): nothing to pair.
                continue
            for ew in en:
                if len(ew) <= MAX_WORDLEN:
                    e2r.add_entry(ew, ru, cur_weight)
            for rw in ru:
                if len(rw) <= MAX_WORDLEN:
                    r2e.add_entry(rw, en, cur_weight)
    return e2r, r2e
def write_dict(d):
    """Write a weighted random selection of entries from *d* to DST_FILE.

    Up to MAX_ENTRIES distinct words are drawn without replacement, each draw
    weighted by ``d.weights``. Every drawn word is written as
    ``word = t1,t2`` with the translations lower-cased. The file is encoded
    with DST_ENCODING and uses CRLF line endings.

    Args:
        d: A Dictionary. Drawn entries are removed from it via
            ``del_entry``, so the caller's object is modified.
    """
    with open(DST_FILE, 'w', encoding=DST_ENCODING, newline='\r\n') as dst:
        for _ in range(min(MAX_ENTRIES, len(d))):
            words = list(d)
            # Look each weight up by key instead of relying on ``d`` and
            # ``d.weights`` iterating in the same order and having the
            # same length.
            weights = [d.weights[word] for word in words]
            if sum(weights) <= 0:
                # Only zero-weight entries remain; random.choices would
                # raise, and such entries are never meant to be picked.
                break
            word = random.choices(words, weights)[0]
            translations = ','.join(t.lower() for t in d[word])
            # '\n' is translated to '\r\n' by the newline setting above.
            dst.write(f'{word} = {translations}\n')
            d.del_entry(word)
if __name__ == '__main__':
    # Build both translation directions from the source word list.
    e2r, r2e = read_dict()

    # Optional first CLI argument: probability of producing a rus->eng file.
    if len(sys.argv) > 1:
        threshold = float(sys.argv[1])
    else:
        threshold = 0.5
    use_r2e = random.random() < threshold

    if use_r2e:
        direction = 'rus->eng'
        chosen = r2e
    else:
        direction = 'eng->rus'
        chosen = e2r

    report = textwrap.dedent(f'''\
        Dictionary Statistics:
        English words: {len(e2r)}
        Russian words: {len(r2e)}
        Writing {direction} translation''')
    print(report)
    write_dict(chosen)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment