Thomas Wolf thomwolf

## gpt-2-wikitext-103.py
# Copyright (c) 2019-present, Thomas Wolf.
# All rights reserved. This source code is licensed under the MIT-style license.
""" A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103 """
import os
from collections import namedtuple
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from ignite.engine import Engine, Events

## fast_speech_text_speech.py
""" To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
    git clone https://github.com/myshell-ai/OpenVoice
    cd OpenVoice
    git clone https://huggingface.co/myshell-ai/OpenVoice
    cp -r OpenVoice/* .
    pip install whisper pynput pyaudio
"""

from openai import OpenAI
import time

## gist:6aa3a2689f66ec2c8d28b281bdd01fe2
""" To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
    git clone https://github.com/myshell-ai/OpenVoice
    cd OpenVoice
    git clone https://huggingface.co/myshell-ai/OpenVoice
    cp -r OpenVoice/* .
    pip install whisper pynput pyaudio
"""
from dataclasses import dataclass
from typing import Optional
import random

## loading_wikipedia.py
import os; import psutil; import timeit
from datasets import load_dataset

mem_before = psutil.Process(os.getpid()).memory_info().rss >> 20
wiki = load_dataset("wikipedia", "20200501.en", split='train')
mem_after = psutil.Process(os.getpid()).memory_info().rss >> 20
print(f"RAM memory used: {(mem_after - mem_before)} MB")

s = """batch_size = 1000
for i in range(0, len(wiki), batch_size):

## gradient_accumulation.py
model.zero_grad()                                   # Reset gradients tensors
for i, (inputs, labels) in enumerate(training_set):
    predictions = model(inputs)                     # Forward pass
    loss = loss_function(predictions, labels)       # Compute loss function
    loss = loss / accumulation_steps                # Normalize our loss (if averaged)
    loss.backward()                                 # Backward pass
    if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
        optimizer.step()                            # Now we can do an optimizer step
        model.zero_grad()                           # Reset gradients tensors
        if (i+1) % evaluation_steps == 0:           # Evaluate the model when we...

## top-k-top-p.py
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k >0: keep only top k tokens with highest probability (top-k filtering).
            top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check

## AdamW.py
from torch.optim import Optimizer

class AdamW(Optimizer):
    """
       Implements Adam algorithm with weight decay fix in PyTorch
       Paper: Fixing Weight Decay Regularization in Adam by Ilya Loshchilov, Frank Hutter
       https://arxiv.org/abs/1711.05101
    """
    def __init__(self, params, lr, b1=0.9, b2=0.999, e=1e-8, l2=0,
                 vector_l2=False, max_grad_norm=-1, **kwargs):

## parallel.py
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang, Rutgers University, Email: zhang.hang@rutgers.edu
## Modified by Thomas Wolf, HuggingFace Inc., Email: thomas@huggingface.co
## Copyright (c) 2017-2018
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

"""Encoding Data Parallel"""

## MetaLearner.py
class MetaLearner(nn.Module):
    """ Bare Meta-learner class
        Should be added: intialization, hidden states, more control over everything
    """
    def __init__(self, model):
        super(MetaLearner, self).__init__()
        self.weights = Parameter(torch.Tensor(1, 2))

    def forward(self, forward_model, backward_model):
        """ Forward optimizer with a simple linear neural net

## persona-chat.py
import json
from pytorch_pretrained_bert import cached_path

url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"

# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())
	# Copyright (c) 2019-present, Thomas Wolf.
	# All rights reserved. This source code is licensed under the MIT-style license.
	""" A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103 """
	import os
	from collections import namedtuple
	from tqdm import tqdm
	import torch
	import torch.nn as nn
	from torch.utils.data import DataLoader
	from ignite.engine import Engine, Events
	""" To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
	git clone https://github.com/myshell-ai/OpenVoice
	cd OpenVoice
	git clone https://huggingface.co/myshell-ai/OpenVoice
	cp -r OpenVoice/* .
	pip install whisper pynput pyaudio
	"""

	from openai import OpenAI
	import time
	import os; import psutil; import timeit
	from datasets import load_dataset

	mem_before = psutil.Process(os.getpid()).memory_info().rss >> 20
	wiki = load_dataset("wikipedia", "20200501.en", split='train')
	mem_after = psutil.Process(os.getpid()).memory_info().rss >> 20
	print(f"RAM memory used: {(mem_after - mem_before)} MB")

	s = """batch_size = 1000
	for i in range(0, len(wiki), batch_size):
	model.zero_grad() # Reset gradients tensors
	for i, (inputs, labels) in enumerate(training_set):
	predictions = model(inputs) # Forward pass
	loss = loss_function(predictions, labels) # Compute loss function
	loss = loss / accumulation_steps # Normalize our loss (if averaged)
	loss.backward() # Backward pass
	if (i+1) % accumulation_steps == 0: # Wait for several backward steps
	optimizer.step() # Now we can do an optimizer step
	model.zero_grad() # Reset gradients tensors
	if (i+1) % evaluation_steps == 0: # Evaluate the model when we...
	def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
	""" Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
	Args:
	logits: logits distribution shape (vocabulary size)
	top_k >0: keep only top k tokens with highest probability (top-k filtering).
	top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
	Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
	"""
	assert logits.dim() == 1 # batch size 1 for now - could be updated for more but the code would be less clear
	top_k = min(top_k, logits.size(-1)) # Safety check
	from torch.optim import Optimizer

	class AdamW(Optimizer):
	"""
	Implements Adam algorithm with weight decay fix in PyTorch
	Paper: Fixing Weight Decay Regularization in Adam by Ilya Loshchilov, Frank Hutter
	https://arxiv.org/abs/1711.05101
	"""
	def __init__(self, params, lr, b1=0.9, b2=0.999, e=1e-8, l2=0,
	vector_l2=False, max_grad_norm=-1, **kwargs):
	##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	## Created by: Hang Zhang, Rutgers University, Email: zhang.hang@rutgers.edu
	## Modified by Thomas Wolf, HuggingFace Inc., Email: thomas@huggingface.co
	## Copyright (c) 2017-2018
	##
	## This source code is licensed under the MIT-style license found in the
	## LICENSE file in the root directory of this source tree
	##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	"""Encoding Data Parallel"""
	class MetaLearner(nn.Module):
	""" Bare Meta-learner class
	Should be added: intialization, hidden states, more control over everything
	"""
	def __init__(self, model):
	super(MetaLearner, self).__init__()
	self.weights = Parameter(torch.Tensor(1, 2))

	def forward(self, forward_model, backward_model):
	""" Forward optimizer with a simple linear neural net
	import json
	from pytorch_pretrained_bert import cached_path

	url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"

	# Download and load JSON dataset
	personachat_file = cached_path(url)
	with open(personachat_file, "r", encoding="utf-8") as f:
	dataset = json.loads(f.read())