
View leaderboard.py
"""Interaction with and data models for the leaderboard API."""
from __future__ import annotations
import dataclasses
from typing import ClassVar, Iterator, List, Optional, Dict
from urllib.parse import urljoin
import requests
try:
import settings
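The preview stops at the imports. Below is a minimal sketch of how these imports typically combine into a dataclass model plus a requests call, reusing the imports above; the Submission class, LEADERBOARD_URL, and the submissions endpoint are hypothetical, not the gist's actual API.

# Hypothetical sketch -- class name, base URL, and endpoint are illustrative only.
LEADERBOARD_URL = "https://example.com/api/"  # placeholder base URL

@dataclasses.dataclass
class Submission:
    """One leaderboard entry (assumed shape)."""
    name: str
    score: float

def fetch_submissions() -> List[Submission]:
    # urljoin keeps the base path intact when composing the endpoint URL
    resp = requests.get(urljoin(LEADERBOARD_URL, "submissions"))
    resp.raise_for_status()
    return [Submission(**row) for row in resp.json()]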
View testing_fa_en_models.py
from transformers import MT5Config, MT5ForConditionalGeneration, MT5Tokenizer
from transformers.models.t5.modeling_t5 import load_tf_weights_in_t5

model_name = "persiannlp/mt5-base-parsinlu-opus-translation_fa_en"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = model.generate(input_ids, **generator_args)
    # decode step assumed; the preview cuts off right after generate()
    return tokenizer.batch_decode(res, skip_special_tokens=True)
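A quick usage sketch; the Persian input sentence and the generation argument are illustrative, not from the gist.

# Illustrative call; any Persian input works for this fa->en translation model.
print(run_model("سلام دنیا", max_length=64))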
View runtpu.sh
export TPU_NAME=sihao02
export PROJECT=???
export ZONE=???
export BUCKET=gs://sihao-source/models
PRETRAINED_STEPS=1000000
FINETUNE_STEPS=50000
declare -a sizes=("large")
declare -a tasks=("twitter")
View verify-tsv-files.py
import functools
import json

import t5
import tensorflow as tf

ds = tf.data.TextLineDataset("data-path/train.tsv")
ds = ds.map(
    functools.partial(tf.io.decode_csv,
                      record_defaults=["", ""],  # expects two columns
                      field_delim="\t",
                      use_quote_delim=False),
    num_parallel_calls=tf.data.experimental.AUTOTUNE)
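The preview ends at the map call. Below is a minimal sketch of how one might then iterate the parsed dataset to verify every row yields two non-empty fields; the loop and counter are assumptions, not the gist's code.

# Assumed verification loop -- not shown in the preview.
bad_rows = 0
for inputs, targets in ds.as_numpy_iterator():
    # decode_csv yields one scalar string tensor per declared column
    if not inputs or not targets:
        bad_rows += 1
print(f"rows with an empty field: {bad_rows}")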
View finetuning-anlg.sh
declare -a sizes=("11B" )
TASK=anlg
PRETRAINED_STEPS=1000000
FINETUNE_STEPS=20000
export BUCKET=gs://danielk-files/t5-models
export TPU_NAME=...
for SIZE in "${sizes[@]}"; do
  PRETRAINED_DIR="gs://t5-data/pretrained_models/${SIZE}"
  MODEL_DIR="${BUCKET}/${TASK}/${SIZE}"
View pegasus.py
# This file extracts the predictions of several existing summarization systems for the XSUM dataset.
import json
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
dataset = load_dataset('xsum')
total_len = len(dataset['test'])
batch_size = 16
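The generation loop itself is cut off. Below is a minimal sketch of the batched decoding it presumably performs; the google/pegasus-xsum checkpoint and the output file name are assumptions, not the gist's actual choices.

# Assumed continuation: batch the test split through a PEGASUS checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-xsum")

predictions = []
for start in tqdm(range(0, total_len, batch_size)):
    batch = dataset['test'][start:start + batch_size]['document']
    inputs = tokenizer(batch, truncation=True, padding=True, return_tensors="pt")
    summaries = model.generate(**inputs)
    predictions.extend(tokenizer.batch_decode(summaries, skip_special_tokens=True))

with open("pegasus-xsum-predictions.json", "w") as f:  # hypothetical output file
    json.dump(predictions, f)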
View query_es.py
import os
import re

from elasticsearch import Elasticsearch, ElasticsearchException, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

es_client = None

# TODO: make these configurable
max_char_length = 10000
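The preview stops before the client is built. Below is a sketch of the standard AWS-signed Elasticsearch client setup these imports point to; the host, region, and credential sourcing are assumptions.

# Assumed client construction -- host and region defaults are placeholders.
def get_es_client(host: str, region: str = "us-west-2") -> Elasticsearch:
    awsauth = AWS4Auth(
        os.environ["AWS_ACCESS_KEY_ID"],
        os.environ["AWS_SECRET_ACCESS_KEY"],
        region,
        "es",
    )
    return Elasticsearch(
        hosts=[{"host": host, "port": 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )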
View qqp_create_split.py
import random

threshold = 0.50

with open("/Users/danielk/ideaProjects/parsiglue-baselines/data/qqp/QQP-all.tsv") as f:
    all_lines = list(f.readlines())

all_sentence_pairs = []
all_splits = []
for line in all_lines:
    line_split = line.replace("\n", "").split("\t")
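    # (Assumed continuation -- the preview cuts off inside the loop.
    #  Treating `threshold` as a coin-flip probability and the split
    #  labels below are illustrative guesses, not the gist's code.)
    all_sentence_pairs.append(line_split)
    all_splits.append("train" if random.random() < threshold else "test")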
View distillation_11b_small.gin
import t5.models.mesh_transformer
import t5.data.sentencepiece_vocabulary
import mesh_tensorflow.optimize
import mesh_tensorflow.transformer.dataset
import mesh_tensorflow.transformer.learning_rate_schedules
import mesh_tensorflow.transformer.t2t_vocabulary
import mesh_tensorflow.transformer.transformer_layers
import mesh_tensorflow.transformer.utils
# Macros:
# ==============================================================================
View scinceqa-genie-error.txt
2020-09-02T01:48:11.933435968Z 2020-09-02 01:48:11.933205: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2020-09-02T01:48:11.933493537Z 2020-09-02 01:48:11.933251: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2020-09-02T01:48:13.344958874Z 2020-09-02 01:48:13.000344: Computing metric rouge with config: {'metric': 'rouge'}
2020-09-02T01:48:14.505232551Z 2020-09-02 01:48:14.000504: Computing score from result: {'rouge1': AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0)), 'rougeL': AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0))}