ymoslem/filter-monolingual.py

## filter-monolingual.py
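# Shell pre-processing: remove duplicates and lines with bad characters, then shuffle.
# Find the number of CPUs/cores to pass to --parallel with: nproc --all
# sort -S 50% --parallel=4 dataset.es | uniq -u > dataset.unique.es
# NOTE: `uniq -u` drops every copy of a repeated line; use `sort -u` to keep one copy of each.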
# shuf dataset.unique.es > dataset.unique.shuf.es
# !perl -ne '/�/ or print' dataset.unique.shuf.es > dataset.unique.shuf.cleaner.es


import re
import fasttext

# Filter a monolingual corpus line by line.
#
# Each input line is cleaned (markup tags, a few HTML entities and literal
# "{}" are replaced by a space, runs of spaces are collapsed, surrounding
# whitespace is stripped) and written to the output file only when it:
#   * has more than 3 and at most 70 space-separated tokens,
#   * starts with one of the characters listed in `characters`,
#   * is identified by the fastText model as `language` with a score
#     above `min_lang_score`.

model = fasttext.load_model("lid.176.bin")

filename = "CoWeSe.es"
outputfile = "CoWeSe.filtered.es"
language = "es"
min_lang_score = 0.30  # up to 0.90 depending on the language
characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789ÁÉÍÑÓÚÜ"

# Patterns are compiled once instead of being looked up on every line.
_markup_re = re.compile(r'<.*?>|&lt;.*?&gt;|&?(amp|nbsp|quot);|{}')
_spaces_re = re.compile(r'[ ]{2,}')

# fastText returns labels such as "__label__es". Comparing the whole label
# (rather than its last two characters) also works for 3-letter codes.
_expected_label = "__label__" + language

# Explicit UTF-8 so that accented characters are decoded and written the same
# way regardless of the platform's default locale encoding.
with open(filename, "r", encoding="utf-8") as rawfile, \
     open(outputfile, "w", encoding="utf-8") as output:
  for line in rawfile:
    line = _markup_re.sub(' ', line)
    line = _spaces_re.sub(' ', line).strip()

    # Cheap structural checks first, so the language model only runs on
    # lines that can still be kept.
    if not line:
      continue
    token_count = len(line.split(" "))
    if token_count <= 3 or token_count > 70:
      continue
    if line[0] not in characters:
      continue

    labels, scores = model.predict(line.lower())
    if labels and labels[0] == _expected_label and scores[0] > min_lang_score:
      output.write(line + "\n")

print("Done!")
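	# Shell pre-processing: remove duplicates and lines with bad characters, then shuffle.
	# Find the number of CPUs/cores to pass to --parallel with: nproc --all
	# sort -S 50% --parallel=4 dataset.es | uniq -u > dataset.unique.es
	# NOTE: `uniq -u` drops every copy of a repeated line; use `sort -u` to keep one copy of each.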
	# shuf dataset.unique.es > dataset.unique.shuf.es
	# !perl -ne '/�/ or print' dataset.unique.shuf.es > dataset.unique.shuf.cleaner.es


	import re
	import fasttext

	# Filter a monolingual corpus line by line.
	#
	# Each input line is cleaned (markup tags, a few HTML entities and literal
	# "{}" are replaced by a space, runs of spaces are collapsed, surrounding
	# whitespace is stripped) and written to the output file only when it:
	#   * has more than 3 and at most 70 space-separated tokens,
	#   * starts with one of the characters listed in `characters`,
	#   * is identified by the fastText model as `language` with a score
	#     above `min_lang_score`.

	model = fasttext.load_model("lid.176.bin")

	filename = "CoWeSe.es"
	outputfile = "CoWeSe.filtered.es"
	language = "es"
	min_lang_score = 0.30  # up to 0.90 depending on the language
	characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789ÁÉÍÑÓÚÜ"

	# Patterns are compiled once instead of being looked up on every line.
	_markup_re = re.compile(r'<.*?>|&lt;.*?&gt;|&?(amp|nbsp|quot);|{}')
	_spaces_re = re.compile(r'[ ]{2,}')

	# fastText returns labels such as "__label__es". Comparing the whole label
	# (rather than its last two characters) also works for 3-letter codes.
	_expected_label = "__label__" + language

	# Explicit UTF-8 so that accented characters are decoded and written the same
	# way regardless of the platform's default locale encoding.
	with open(filename, "r", encoding="utf-8") as rawfile, \
	     open(outputfile, "w", encoding="utf-8") as output:
	  for line in rawfile:
	    line = _markup_re.sub(' ', line)
	    line = _spaces_re.sub(' ', line).strip()

	    # Cheap structural checks first, so the language model only runs on
	    # lines that can still be kept.
	    if not line:
	      continue
	    token_count = len(line.split(" "))
	    if token_count <= 3 or token_count > 70:
	      continue
	    if line[0] not in characters:
	      continue

	    labels, scores = model.predict(line.lower())
	    if labels and labels[0] == _expected_label and scores[0] > min_lang_score:
	      output.write(line + "\n")

	print("Done!")