Skip to content

Instantly share code, notes, and snippets.

@ymoslem
Last active August 30, 2022 12:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ymoslem/b937ff3d447cb107ead9f05e1cb38326 to your computer and use it in GitHub Desktop.
Save ymoslem/b937ff3d447cb107ead9f05e1cb38326 to your computer and use it in GitHub Desktop.
# Remove duplicates and lines with bad characters, then shuffle (shell commands):
# Find the number of CPUs/cores to pass to --parallel: nproc --all
# sort -S 50% --parallel=4 dataset.es | uniq -u > dataset.unique.es
# NOTE: `uniq -u` drops every line that occurs more than once; use `sort -u` (or plain `uniq`) to keep one copy of each instead.
# shuf dataset.unique.es > dataset.unique.shuf.es
# !perl -ne '/�/ or print' dataset.unique.shuf.es > dataset.unique.shuf.cleaner.es
import re
import fasttext
# Corpus filter: strips markup from every line of `filename`, keeps only the
# lines that look like well-formed sentences in `language`, and writes the
# surviving lines to `outputfile`.

# --- Configuration ---------------------------------------------------------
filename = "CoWeSe.es"
outputfile = "CoWeSe.filtered.es"
language = "es"
min_lang_score = 0.30  # up to 0.90 depending on the language
# A kept line must begin with one of these characters.
characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789ÁÉÍÑÓÚÜ"

# Compiled once instead of on every loop iteration.
# Matches raw tags, HTML-escaped tags, a few (possibly '&'-less) entities,
# and the literal two-character sequence "{}".
_MARKUP_RE = re.compile(r'<.*?>|&lt;.*?&gt;|&?(amp|nbsp|quot);|{}')
_MULTI_SPACE_RE = re.compile(r'[ ]{2,}')


def clean_line(line):
    """Return *line* with markup replaced by spaces and whitespace tidied.

    Every markup match becomes a single space, runs of two or more spaces
    are collapsed to one, and leading/trailing whitespace (including the
    trailing newline) is stripped.
    """
    without_markup = _MARKUP_RE.sub(' ', line)
    return _MULTI_SPACE_RE.sub(' ', without_markup).strip()


def has_valid_shape(line, first_chars):
    """Return True if *line* passes the cheap, model-free checks.

    A line passes when it is non-empty, splits on single spaces into more
    than 3 and at most 70 tokens, and its first character is one of
    *first_chars*.
    """
    if not line:
        return False
    token_count = len(line.split(" "))
    return 3 < token_count <= 70 and line[0] in first_chars


def is_target_language(prediction, lang_code, min_score):
    """Return True if *prediction* names *lang_code* with score > *min_score*.

    *prediction* is a ``(labels, scores)`` pair; only the first label and
    the first score are inspected. The language code is taken as the text
    after the last ``"__"`` in the label (``"__label__es"`` -> ``"es"``) and
    compared for equality, so a two-letter code no longer matches labels
    that merely end with it, and longer codes can match at all.
    """
    labels, scores = prediction[0], prediction[1]
    if len(labels) == 0:
        return False
    predicted_code = labels[0].rpartition("__")[2]
    return predicted_code == lang_code and scores[0] > min_score


def filter_lines(lines, predict, lang_code, min_score, first_chars):
    """Yield the cleaned lines from *lines* that pass every filter.

    *predict* is called with the lower-cased cleaned line and must return a
    ``(labels, scores)`` pair. It is only invoked for lines that already
    passed the cheap shape checks, so rejected lines never reach the model.
    """
    for raw_line in lines:
        line = clean_line(raw_line)
        if not has_valid_shape(line, first_chars):
            continue
        if is_target_language(predict(line.lower()), lang_code, min_score):
            yield line


if __name__ == "__main__":
    model = fasttext.load_model("lid.176.bin")
    # Explicit UTF-8 so accented characters survive regardless of the
    # platform's default encoding.
    with open(filename, "r", encoding="utf-8") as rawfile, \
            open(outputfile, "w", encoding="utf-8") as output:
        for kept_line in filter_lines(rawfile, model.predict, language,
                                      min_lang_score, characters):
            output.write(kept_line + "\n")
    print("Done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment