Skip to content

Instantly share code, notes, and snippets.

@RobinvanderVliet
Created September 10, 2019 13:01
Show Gist options
  • Save RobinvanderVliet/4f9d5a3142badc4252d11a2b60fdfca1 to your computer and use it in GitHub Desktop.
Save RobinvanderVliet/4f9d5a3142badc4252d11a2b60fdfca1 to your computer and use it in GitHub Desktop.
Next word suggestor
# Download the Tatoeba sentence export and unpack it into the current
# directory, then delete the archive.
#
# Abort at the first failing command: without this a failed download would
# still be handed to tar, removed, and reported as "Done!".
set -e

archive="sentences.tar.bz2"

echo "[!] Downloading sentences from Tatoeba."
wget -O "$archive" "https://downloads.tatoeba.org/exports/sentences.tar.bz2"
echo "[!] Extracting sentences from Tatoeba."
# -x extract, -v list extracted files, -j bzip2-compressed, -f archive name
tar -xvjf "$archive"
rm "$archive"
echo "[!] Done!"
import re
def cleanSentence(sentence):
    """Normalise a sentence and return its words in reverse order.

    The listed punctuation characters are replaced by spaces, the text is
    lower-cased and stripped, and runs of spaces are collapsed to one. The
    result is split on single spaces and reversed, so the last word of the
    sentence comes first. An empty (or punctuation-only) sentence yields
    a list containing one empty string.
    """
    # Punctuation becomes a space so neighbouring words stay separated.
    withoutPunctuation = re.sub("[!¡?¿@\",:;().]", " ", sentence)
    normalised = withoutPunctuation.lower().strip()
    singleSpaced = re.sub(" +", " ", normalised)
    reversedWords = singleSpaced.split(" ")
    reversedWords.reverse()
    return reversedWords
language = input("Enter a three-letter language code: ")

# Build `lines`: the cleaned, reversed word list of every sentence whose
# language column equals the requested code. Rows are tab-separated; column 1
# is read as the language code and column 2 as the sentence text.
#
# The file is streamed row by row inside a `with` block so the handle is
# always closed and the whole export never has to sit in memory at once.
# The encoding is pinned to UTF-8 (the encoding of the Tatoeba exports)
# instead of relying on the platform default.
lines = []
with open("sentences.csv", "r", encoding="utf-8") as sentenceFile:
    for row in sentenceFile:
        fields = row.rstrip("\n").split("\t")
        # Skip blank or malformed rows instead of dying with an IndexError.
        if len(fields) < 3:
            continue
        if fields[1] == language:
            lines.append(cleanSentence(fields[2]))
def _matchLine(sentence, line):
    """Match the typed words against one database line.

    Both arguments are reversed word lists as produced by cleanSentence, so
    sentence[0] is the last word typed and line[j - 1] is the word that
    comes right after line[j] in the original sentence.

    Returns a (wordsFound, nextWord) pair: the number of consecutive typed
    words that matched, and the word following the match in the database
    sentence ("" when nothing matched).
    """
    nextWord = ""
    wordsFound = 0
    for j, word in enumerate(line):
        if sentence[wordsFound] == word:
            if wordsFound == 0:
                if j == 0:
                    # The database sentence ends with this word, so nothing
                    # follows it. line[j - 1] would wrap around to the first
                    # word of the sentence; keep scanning instead.
                    continue
                nextWord = line[j - 1]
            wordsFound += 1
            if wordsFound == len(sentence):
                break
        elif wordsFound > 0:
            # The run of matching words was interrupted.
            break
    return wordsFound, nextWord


def _suggestNextWords(sentence, lines):
    """Return (word, count) pairs for the best-matching lines.

    Only lines that matched the largest number of typed words contribute.
    The pairs are sorted by ascending count, so the most frequent suggestion
    is last. An empty list is returned when no line matched at all.
    """
    from collections import Counter

    matches = [_matchLine(sentence, line) for line in lines]
    precision = max((found for found, _ in matches), default=0)
    if precision == 0:
        # Nothing matched: there is no next word to suggest.
        return []
    wordAmounts = Counter(
        nextWord for found, nextWord in matches if found == precision
    )
    return sorted(wordAmounts.items(), key=lambda kv: kv[1])


# Interactive loop: read a sentence and print the suggested next words.
while True:
    sentence = cleanSentence(input("Enter a sentence: "))
    print(_suggestNextWords(sentence, lines))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment