This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip3 install openai
import os
import time

import openai

# Read the API key from the environment rather than hardcoding it —
# hardcoded keys leak via version control. The placeholder fallback keeps
# the original behavior when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your_api_key_here")
openai.api_key = OPENAI_API_KEY
prompt = """French: La semaine dernière, quelqu’un m’a fait part de sa gratitude envers notre travail. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This example uses M2M-100 models converted to the CTranslate2 format. | |
# Download CTranslate2 models: | |
# • M2M-100 418M-parameter model: https://bit.ly/33fM1AO | |
# • M2M-100 1.2B-parameter model: https://bit.ly/3GYiaed | |
import ctranslate2 | |
import sentencepiece as spm | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import streamlit as st
import openai

# Read the OpenAI API key from the environment rather than hardcoding it;
# the placeholder fallback preserves the original behavior when unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your_OpenAI_API_key_here")
openai.api_key = OPENAI_API_KEY

# Set the page layout to wide
st.set_page_config(page_title="Extract Terms", page_icon=None, layout="wide")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sentence METEOR | |
# METEOR mainly works on sentence evaluation rather than corpus evaluation | |
# Run this file from CMD/Terminal | |
# Example Command: python3 sentence-meteor.py test_file_name.txt mt_file_name.txt | |
import sys | |
from nltk.translate.meteor_score import meteor_score | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Subwording the source file only | |
# Command: python3 subword.py <source_model_file> <source_pred_file> | |
# Note: If you did not train the model with start and end tokens remove ['<s>'] and ['</s>'] from line #30 | |
import sys | |
import sentencepiece as spm |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First convert your OpenNMT-py or OpenNMT-tf model to a CTranslate2 model. | |
# pip3 install ctranslate2 | |
# • OpenNMT-py: | |
# ct2-opennmt-py-converter --model_path model.pt --output_dir enja_ctranslate2 --quantization int8 | |
# • OpenNMT-tf: | |
# ct2-opennmt-tf-converter --model_path model --output_dir enja_ctranslate2 --src_vocab source.vocab --tgt_vocab target.vocab --model_type TransformerBase --quantization int8 | |
import ctranslate2 | |
import sentencepiece as spm |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ctranslate2 | |
# Replace with your tokenize function and source tokenization model
def tokenize(input_sentences):
    """Split each input sentence on single spaces into a list of tokens.

    Placeholder tokenizer — swap in a real one (e.g. SentencePiece) for
    production use.

    Args:
        input_sentences: iterable of sentence strings.

    Returns:
        A list of token lists, one per input sentence.
    """
    return [sentence.split(" ") for sentence in input_sentences]
# Replace with your detokenize function and target tokenization model | |
def detokenize(outputs): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove duplicate, lines with bad characters, and shuffle | |
# Find the number of CPUs/cores to add to parallel: nproc --all | |
# sort -S 50% --parallel=4 dataset.es | uniq -u > dataset.unique.es | |
# shuf dataset.unique.es > dataset.unique.shuf.es | |
# !perl -ne '/�/ or print' dataset.unique.shuf.es > dataset.unique.shuf.cleaner.es | |
import re | |
import fasttext |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# pip3 install gdown langdetect fasttext pycld2 py3langid
import gdown
from datetime import datetime

# Download the pre-trained fastText language-identification model
# (lid.176.ftz, compressed 176-language classifier) into the working dir.
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
output = "lid.176.ftz"
gdown.download(url, output, quiet=False)
NewerOlder