Skip to content

Instantly share code, notes, and snippets.

@AShedko
Created November 19, 2020 21:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AShedko/b10b20c4ca9b55354308f24034f6a4e9 to your computer and use it in GitHub Desktop.
Save AShedko/b10b20c4ca9b55354308f24034f6a4e9 to your computer and use it in GitHub Desktop.
Simplifies the vocabulary collection task immensely.
import nltk
import re
from nltk.stem import PorterStemmer
import sys
import googletrans
import pandas as pd
import time
import random
import argparse
# Collapses any run of whitespace (including newlines inside a sentence)
# into a single space so each example fits on one spreadsheet line.
WHITESPACE_RE = re.compile(r'\s+')

# Spellings of the --translate value that mean "off".  Everything else is
# treated as "on", which keeps previously working invocations working.
FALSE_STRINGS = frozenset({"", "0", "false", "f", "no", "n", "off"})


def str_to_bool(value):
    """Convert a command-line string to a boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is
    ``True`` because any non-empty string is truthy.  This converter maps
    the usual "false" spellings (case-insensitively, ignoring surrounding
    whitespace) to ``False`` and every other value to ``True``.

    Args:
        value: Raw option value as typed on the command line.

    Returns:
        ``False`` for an empty string or a recognised "false" spelling,
        ``True`` otherwise.
    """
    return value.strip().lower() not in FALSE_STRINGS


def pick_example(sentences, word, index):
    """Pick a random sentence that contains *word* and tag it with *index*.

    Matching is a case-insensitive substring test, so inflected forms
    (and longer words that merely contain *word*) match as well.

    Args:
        sentences: Sentences of one text.
        word: Word to look for.
        index: 1-based number of the text, appended as `` [index]``.

    Returns:
        The chosen sentence with whitespace runs collapsed to single
        spaces and the text number appended, or ``None`` if no sentence
        contains *word*.
    """
    needle = word.lower()
    matches = [s for s in sentences if needle in s.lower()]
    if not matches:
        return None
    return WHITESPACE_RE.sub(' ', random.choice(matches)) + " [{}]".format(index)


def main():
    """Collect one example sentence per word and write them to Excel.

    Reads the word list (one word per line) and the given texts, finds for
    each word a sentence from the first text that contains it, optionally
    translates that sentence to Russian, and writes the result to
    ``translated.xlsx`` in the current directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--texts", type=argparse.FileType('r'),
                        nargs='+', required=True)
    parser.add_argument(
        "-w", "--words", type=argparse.FileType('r'), required=True)
    # nargs='?' + const=True lets a bare "-t" enable translation while
    # "-t True" / "-t False" keep working (and "False" now really is off).
    parser.add_argument('-t', '--translate', type=str_to_bool,
                        nargs='?', const=True, default=False)
    args = parser.parse_args()
    print(args.texts)
    print(args)

    translator = googletrans.Translator() if args.translate else None

    # Skip blank lines: an empty word is a substring of every sentence and
    # would otherwise produce a junk entry keyed by "".
    with args.words as words_file:
        words = [w for w in (line.strip() for line in words_file) if w]

    examples = {}
    for index, text_file in enumerate(args.texts, 1):
        with text_file:
            text = text_file.read()
        sentences = nltk.sent_tokenize(text)
        for word in words:
            # A word keeps the example from the first text that has one.
            if word in examples:
                continue
            result = pick_example(sentences, word, index)
            if result is None:
                continue
            if translator is not None:
                transl = translator.translate(result, dest="ru").text
                print(word, word.lower(), result, transl)
                # Short pause between requests; presumably to avoid
                # hammering the translation service.
                time.sleep(.06)
                examples[word] = (result, transl)
            else:
                print(word, word.lower(), result)
                examples[word] = result

    df = pd.DataFrame.from_dict(examples, orient='index')
    df.to_excel("translated.xlsx")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment