Public gists by Lysandre Debut (LysandreJik)
@LysandreJik
LysandreJik / broken-tokenizer.ipynb
Created February 25, 2021 20:36
Broken Tokenizer
(Notebook preview unavailable.)
# Setup for converting a TF ELECTRA pretraining checkpoint to the transformers
# PyTorch implementation. Note: ElectraGenerator and ElectraDiscriminator come
# from a pre-release transformers branch; the released library exposes
# ElectraForMaskedLM and ElectraForPreTraining instead.
import collections
from configure_pretraining import PretrainingConfig
from run_pretraining import PretrainingModel
from pretrain.pretrain_data import get_input_fn, Inputs
import tensorflow as tf
import torch
from model import modeling
from transformers.modeling_electra import ElectraModel, ElectraGenerator, ElectraDiscriminator, load_tf_weights_in_electra
from transformers import BertConfig
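For reference, a minimal conversion sketch against the released transformers API, which exposes ElectraConfig, ElectraForPreTraining, and load_tf_weights_in_electra; all file paths here are hypothetical placeholders, not the original gist's:

from transformers import ElectraConfig, ElectraForPreTraining
from transformers.models.electra.modeling_electra import load_tf_weights_in_electra

config = ElectraConfig.from_json_file("electra_config.json")     # hypothetical path
model = ElectraForPreTraining(config)                            # the discriminator
load_tf_weights_in_electra(model, config, "electra/model.ckpt")  # hypothetical checkpoint
model.save_pretrained("electra-pytorch")                         # hypothetical output dir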
@LysandreJik
LysandreJik / training_gpt2_lmhead_model.py
Created December 16, 2019 22:34
Training GPT-2 LM Head model in Keras
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import tensorflow as tf
model = TFGPT2LMHeadModel.from_pretrained("distilgpt2")
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
text = """
A SQUAT grey building of only thirty-four stories. Over the main entrance the
words, CENTRAL LONDON HATCHERY AND CONDITIONING CENTRE,
and, in a shield, the World State’s motto, COMMUNITY, IDENTITY, STABILITY.
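The preview ends inside the excerpt; a hedged sketch of one way the fine-tuning could proceed, using an explicit GradientTape loop (window size and learning rate are illustrative assumptions, not the original gist's values):

import numpy as np

# Slice the tokenized text into fixed-size next-token-prediction windows.
ids = tokenizer.encode(text)
block = 32
windows = [ids[i : i + block + 1] for i in range(0, len(ids) - block, block)]
inputs = np.array([w[:-1] for w in windows])
labels = np.array([w[1:] for w in windows])

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

for i in range(len(inputs)):
    with tf.GradientTape() as tape:
        logits = model(inputs[i : i + 1])[0]  # (1, block, vocab_size)
        loss = loss_fn(labels[i : i + 1], logits)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))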
@LysandreJik
LysandreJik / save_hub_checkpoint.py
Created November 8, 2019 16:19
Save a HUB model to a checkpoint
import tensorflow as tf
import tensorflow_hub as hub
model_size = 'xlarge'.upper()
version = 2
model = hub.Module("https://tfhub.dev/google/albert_{}/{}".format(model_size.lower(), version), trainable=False)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
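The preview cuts off before the actual save; a minimal completion under the same TF1 API (the output path is a hypothetical placeholder):

with tf.Session() as sess:
    sess.run(init)  # restores the HUB module's pretrained variables
    saver.save(sess, "albert_{}_v{}/model.ckpt".format(model_size.lower(), version))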
@LysandreJik
LysandreJik / compare_albert.py
Created November 8, 2019 16:18
Comparing ALBERT TF1 and HUB
import tensorflow_hub as hub
import tensorflow as tf
import modeling
import os
import numpy as np
import tokenization
# Model size and paths
model_size = 'large'.upper()
version = 2
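The preview stops at the setup; a hedged sketch of the comparison it implies, feeding identical inputs to the HUB module and to google-research's TF1 AlbertModel (the config path, input values, and tolerance are assumptions):

input_ids = tf.constant([[31, 51, 99, 0]])
input_mask = tf.constant([[1, 1, 1, 0]])
segment_ids = tf.constant([[0, 0, 0, 0]])

hub_module = hub.Module("https://tfhub.dev/google/albert_{}/{}".format(model_size.lower(), version), trainable=False)
hub_output = hub_module(
    inputs=dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids),
    signature="tokens", as_dict=True)["sequence_output"]

config = modeling.AlbertConfig.from_json_file("albert_config.json")  # hypothetical path
tf1_model = modeling.AlbertModel(config=config, is_training=False, input_ids=input_ids,
                                 input_mask=input_mask, token_type_ids=segment_ids)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # In practice the TF1 graph would be restored from the checkpoint written by
    # save_hub_checkpoint.py above before comparing.
    hub_vals, tf1_vals = sess.run([hub_output, tf1_model.get_sequence_output()])
    print(np.allclose(hub_vals, tf1_vals, atol=1e-4))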
@LysandreJik
LysandreJik / cuda_10_install.sh
Created October 24, 2019 13:37
Uninstall cuda 10.1 and install cuda 10.0 instead on Ubuntu 18.04
sudo apt remove cuda
# The download link serves the .run installer; name it explicitly so the next step matches.
wget https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux -O cuda_10.0.130_410.48_linux.run
sudo sh cuda_10.0.130_410.48_linux.run
# Make the new toolkit visible to the shell and the dynamic linker.
export PATH=$PATH:/usr/local/cuda/bin
export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
nvcc --version  # should now report release 10.0
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12
}
import tensorflow as tf
import tensorflow_datasets
from transformers import (BertTokenizer, TFBertForSequenceClassification,
                          DistilBertTokenizer, TFDistilBertForSequenceClassification,
                          glue_convert_examples_to_features)

# Either DistilBERT...
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# ...or BERT:
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

data = tensorflow_datasets.load("glue/mrpc")
train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, "mrpc")
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
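A hedged continuation in the style of the era's run_tf_glue example (batching and step counts are assumptions):

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
model.fit(train_dataset, epochs=2, steps_per_epoch=115)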
@LysandreJik
LysandreJik / benchmark.py
Last active October 16, 2019 19:18
Benchmarking transformers
############################################################
# EDITED AFTER FEEDBACK REGARDING THE TENSORFLOW INFERENCE #
############################################################