Find the simplest Google Translate request that generates the maximum number of unique words
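# Brute-force approach: build each query by repeating a short lowercase token
# n_words times, send it to Google Translate, and count how many unique words
# come back in the translation.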
from googletrans import Translator
from itertools import product
from pandas import DataFrame, read_csv, concat
import numpy as np
import string
import time
import os
# Get a Google Translate client object
translator = Translator()
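# Reload previous results if they exist so an interrupted run can resume
# instead of re-querying everything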
if os.path.isfile('results.csv'):
    print('load...')
    results = read_csv('results.csv')
else:
    results = DataFrame(columns=('input', 'output', 'size'))
# lowercase ASCII alphabet
az = string.ascii_lowercase
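# n_words: how many times the token is repeated in each query
# n_letters: exclusive upper bound on the token length explored below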
n_words = 20
n_letters = 4
# Brute-force search over all tokens shorter than n_letters letters
for n_letter in range(1, n_letters):
    for chars in product(az, repeat=n_letter):
        input_string = ' '.join([''.join(chars)] * n_words)
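        # e.g. n_letter=2 and chars=('a', 'b') give 'ab ab ab ... ab' (n_words copies)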
        if input_string in results['input'].values:
            continue
        # Deal with Google's anti-flood policy: retry until the request succeeds
        translated = False
        while not translated:
            try:
                output_string = translator.translate(input_string).text
                translated = True
            except ValueError:
                time.sleep(1)
        # count the number of unique words in the translation
        size = len(np.unique(output_string.split(' ')))
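        # record the query so it is skipped if the script is restarted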
        results = concat([results,
                          DataFrame([dict(input=input_string,
                                          output=output_string,
                                          size=size)])],
                         ignore_index=True)
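        # show the current top 10 so progress can be monitored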
        disp = results.sort_values(['size', 'output'], ascending=False)
        print(disp[:10])
        # save
        results.to_csv('results.csv', header=True, index=False, encoding='utf-8')
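# To look up the overall winner from a saved run without repeating the search,
# a minimal sketch, assuming a results.csv produced by the loop above:
saved = read_csv('results.csv')
best = saved.loc[saved['size'].idxmax()]
print(best['input'], '->', best['output'], '({} unique words)'.format(best['size']))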