import inspect
import logging
import os
from pathlib import Path

import torch
from psutil import cpu_count
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer
from transformers.generation_utils import GenerationMixin
from transformers.modeling_outputs import BaseModelOutputWithPast, Seq2SeqLMOutput
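These imports set up an ONNX export path for T5. As a rough, hedged sketch of what such a script does (the checkpoint name, output file, and axis names below are illustrative assumptions, not taken from the original), the encoder stack can be traced with torch.onnx.export:

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained("t5-small").eval()
model.encoder.config.return_dict = False  # plain tuple outputs trace more cleanly
tokenizer = T5Tokenizer.from_pretrained("t5-small")

inputs = tokenizer("translate English to French: hello", return_tensors="pt")

torch.onnx.export(
    model.encoder,                                    # export only the encoder stack
    (inputs["input_ids"], inputs["attention_mask"]),  # dummy inputs used for tracing
    "t5-small-encoder.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["hidden_states"],
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "attention_mask": {0: "batch", 1: "sequence"},
        "hidden_states": {0: "batch", 1: "sequence"},
    },
    opset_version=12,
)

The decoder (with its cross-attention inputs and past key values) is usually exported as a separate graph; this sketch covers the encoder only.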
import time

import torch
import torch.nn.functional as F
from tqdm import trange
from transformers import AutoTokenizer
from onnxruntime import InferenceSession


class GenerativeT5(torch.nn.Module):
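GenerativeT5 presumably pairs ONNX sessions like these with a tokenizer and a sampling loop; its body is truncated in this preview. A minimal, hedged sketch of driving an exported encoder with onnxruntime (the file name and input names mirror the export sketch above and are assumptions):

from onnxruntime import InferenceSession
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
session = InferenceSession("t5-small-encoder.onnx")

inputs = tokenizer("translate English to French: hello", return_tensors="np")
hidden_states = session.run(
    None,  # None asks for every graph output
    {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]},
)[0]
print(hidden_states.shape)  # (batch, sequence, d_model)

Full generation would feed these hidden states, token by token, into a decoder session and sample from the resulting logits, which is where trange and F.softmax come in.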
import argparse

import evaluate
import numpy as np
import pandas as pd
import transformers
from datasets import load_dataset, Dataset, load_metric, concatenate_datasets, DatasetDict
from huggingface_hub import HfFolder
from transformers import AutoModelForCausalLM, AutoTokenizer
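These imports gather the usual fine-tuning utilities (datasets, evaluate, huggingface_hub, a causal LM and tokenizer). A small hedged sketch of the data-preparation step they typically support; the toy in-memory dataset and the gpt2 checkpoint are illustrative assumptions, not taken from the original script:

from datasets import Dataset
from transformers import AutoTokenizer

# Toy corpus; the original script presumably pulls a real one with load_dataset.
raw = Dataset.from_dict({
    "text": [
        "Question: What is ONNX? Answer: An open format for ML models.",
        "Question: What is T5? Answer: A text-to-text transformer.",
    ],
})

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default

def tokenize(batch):
    out = tokenizer(batch["text"], truncation=True, max_length=128)
    out["labels"] = [ids.copy() for ids in out["input_ids"]]  # causal LM: labels are the inputs
    return out

tokenized = raw.map(tokenize, batched=True, remove_columns=raw.column_names)
print(tokenized.column_names)  # ['input_ids', 'attention_mask', 'labels']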
Activation functions
--------------------
Commonly used: ReLU, Sigmoid (outputs in 0 to 1), Tanh (outputs in -1 to 1)
- Sigmoid functions and their combinations generally work better for classifier output layers
- Sigmoid and tanh functions are sometimes avoided because of the vanishing gradient problem
- ReLU is a good general-purpose activation and is the default choice in most cases these days
- If we run into dead neurons in a network, leaky ReLU is the best choice
- Keep in mind that ReLU should only be used in the hidden layers
- As a rule of thumb, start with ReLU and move to other activation functions only if ReLU does not give optimal results (see the sketch after this list)
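A quick way to see these ranges and shapes is to apply each activation to the same values in PyTorch (the numbers are arbitrary):

import torch
import torch.nn.functional as F

x = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])

print(torch.sigmoid(x))       # squashed into (0, 1); common for classifier outputs
print(torch.tanh(x))          # squashed into (-1, 1); zero-centered
print(F.relu(x))              # negatives clipped to 0; default for hidden layers
print(F.leaky_relu(x, 0.01))  # small negative slope keeps "dead" neurons trainable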