Skip to content

Instantly share code, notes, and snippets.

@lord-alfred
Last active December 2, 2020 19:07
Show Gist options
  • Save lord-alfred/a8a58d7ffba5ea4dc8645e9a99989f4e to your computer and use it in GitHub Desktop.
Save lord-alfred/a8a58d7ffba5ea4dc8645e9a99989f4e to your computer and use it in GitHub Desktop.
Example of language identification with Facebook fastText
# run:
# python3 predict_titles.py lid.176.bin titles.txt
# or see help:
# python3 predict_titles.py -h
import argparse
import codecs
import os
from collections import defaultdict
from fasttext import load_model # https://github.com/facebookresearch/fastText/tree/master/python
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the title language predictor.

    Returns:
        A parser with the positional ``model`` and ``titles_file`` arguments
        and the optional ``-o/--output`` and ``-t/--threshold`` options.
    """
    parser = argparse.ArgumentParser(
        description="Predict titles from file and move this to lang subdirs"
    )
    parser.add_argument(
        "model",
        help="Model to use",
    )
    parser.add_argument(
        "titles_file",
        help="Titles for prediction"
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Output folder",
        default="predicted",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        help="Max threshold for set lang to unknown",
        # Without an explicit type, a value given on the command line stays a
        # str, and comparing the predicted probability against it raises
        # TypeError. Only the float default used to work.
        type=float,
        default=0.1,
    )
    return parser


def classify_titles(model, titles, threshold: float):
    """Group titles by their predicted language.

    Args:
        model: Object whose ``predict(text)`` returns a ``(labels,
            probabilities)`` pair; index 0 of each is taken as the best
            guess (this is how the loaded fastText model is used here).
        titles: Iterable of raw title lines; surrounding whitespace is
            stripped and blank lines are skipped.
        threshold: Titles whose probability is below this value are filed
            under ``'unknown'`` instead of the predicted language.

    Returns:
        A ``defaultdict(list)`` mapping language code to a list of
        ``(title, probability)`` tuples in input order.
    """
    result = defaultdict(list)
    for raw_title in titles:
        title = raw_title.strip()
        if not title:
            # A blank line carries no text to identify and would only end up
            # as an empty entry in one of the output files.
            continue
        labels, probabilities = model.predict(title)
        probability = probabilities[0]
        if probability < threshold:
            lang = 'unknown'
        else:
            lang = labels[0].replace('__label__', '')
        result[lang].append((title, probability))
    return result


def write_results(result, output_dir: str) -> None:
    """Write one ``<count>_<lang>.txt`` file per language and print a summary.

    Languages are processed from the largest group to the smallest; inside a
    file the titles are ordered from the highest probability to the lowest.

    Args:
        result: Mapping of language code to ``(title, probability)`` tuples.
        output_dir: Existing directory that receives the files.
    """
    by_size = sorted(result.items(), key=lambda item: len(item[1]), reverse=True)
    for lang_code, data in by_size:
        count = len(data)
        ranked = sorted(data, key=lambda item: item[1], reverse=True)
        path = os.path.join(output_dir, f'{count}_{lang_code}.txt')
        # newline='\n' disables platform newline translation so the bytes
        # written are the same on every OS.
        with open(path, 'w', encoding='utf-8', newline='\n') as fh:
            fh.write('\n'.join(title for title, _ in ranked))
        print(f'{lang_code}: {count}')


def main(argv=None) -> None:
    """Run the predictor: parse arguments, classify titles, save the groups.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # load model from arg
    model = load_model(args.model)

    # create output dir (exist_ok avoids the check-then-create race)
    os.makedirs(args.output, exist_ok=True)

    # get prediction for every title
    with open(args.titles_file, encoding='utf-8') as titles:
        result = classify_titles(model, titles, args.threshold)

    # sort and save
    write_results(result, args.output)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment