Last active
August 8, 2020 22:18
-
-
Save nbertagnolli/94b45d93661aa3211ef8c661ed7f5a0f to your computer and use it in GitHub Desktop.
Python snippet that predicts the language of each string in a list.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List | |
import os | |
import requests | |
import fasttext | |
def get_language(
    texts: List[str],
    fasttext_model_path: str = '/tmp/lid.176.bin',
) -> List[str]:
    """Predict the language code for each text in a list.

    Args:
        texts: A list of texts for which we want to predict the language.
        fasttext_model_path: Local path of the fastText ``lid.176.bin``
            language-identification model. If no file exists at this path,
            the model is downloaded there first.

    Returns:
        One language code per input text, in input order. Each code is the
        part of the predicted label that follows the last ``__``
        (e.g. ``__label__en`` -> ``en``).

    Raises:
        requests.HTTPError: If the model download returns an error status.
    """
    # Nothing to predict: skip the (large) model download and load entirely.
    if not texts:
        return []

    # If the model doesn't exist download it
    if not os.path.isfile(fasttext_model_path):
        url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
        # Download to a temporary name and rename once complete, so an
        # interrupted download is never mistaken for a valid model later.
        partial_path = fasttext_model_path + '.part'
        with requests.get(
            url, allow_redirects=True, stream=True, timeout=60
        ) as response:
            # Fail loudly instead of saving an HTML error page as the model.
            response.raise_for_status()
            with open(partial_path, 'wb') as model_file:
                # Stream in chunks; the model is too big to buffer in memory.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    model_file.write(chunk)
        os.replace(partial_path, fasttext_model_path)

    # Load the downloaded model into fasttext
    lang_model = fasttext.load_model(fasttext_model_path)

    # Remove newlines because fasttext doesn't like them
    cleaned_texts = [x.replace("\n", " ") for x in texts]

    # Predict the language label for each text; probabilities are not needed.
    langs, _ = lang_model.predict(cleaned_texts)

    # Keep only the top label per text and strip the "__label__" prefix.
    return [x[0].split("__")[-1] for x in langs]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment