Jeremy Howard jph00

## discrete-action-backprop-bypass-hlb-cifar10-demo.py
# Sketch-specific note: a battery of roughly 25 runs of this code estimated ~93.11% accuracy in the same number of steps as the baseline network, with ~1.7x runtime overhead (much of which goes to the torch.randn allocations and extra layer calculations).

# Note: The one change we need to make if we're in Colab is to uncomment this below block.
# If we are in an ipython session or a notebook, clear the state to avoid bugs
#"""
try:
  _ = get_ipython().__class__.__name__
  ## we set -f below to avoid prompting the user before clearing the notebook state
  %reset -f
except NameError:

## multipack_sampler_flash_attn.py
"""
Testing flash attn with multipacking which essentially packs sequences using https://github.com/imoneoi/multipack_sampler,
and passes a single sequence of `1 x (bs x seqlen)` to the model to avoid padding.

An alternative is to use block diagonal attention as attention bias, but the following uses flash attention 2 which
is much faster.

Multipacking can be used to speed up both pretraining and finetuning.
"""

## domain_knowledge_generation_gpt4.py
import os
import openai
from jinja2 import Template, meta, Environment
from dotenv import load_dotenv
load_dotenv() # add a .env file with the following
# setup is for azure, change accordingly for normal openai
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_base = os.getenv("OPENAI_API_BASE")

## finetune_alpaca_llama_minimal.py
import argparse
import copy

import torch

import datasets as hfds
import transformers

from tqdm.auto import tqdm
import wandb

## merge_qlora_with_quantized_model.py
"""

The code below combines approaches published by both @eugene-yh and @jinyongyoo on Github.

Thanks for the contributions guys!

"""

import torch
import peft

## finetune_llama_gptq.py
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software

## gist:385014b37f998c7857b15a3ea60b4cae
Config: max_steps: 17500, lora_r: 64  , lr: 0.0002, bf16: False, lora_modules: all , bits: 4   , full_finetune: False, lora_dropout: 0.0 , warmup_steps: 100 , compress_statistics: True, dataset: NaN , gradient_accumulation_steps: NaN , learning_rate: NaN , quant_type: fp4 , adam_beta2: 0.999, update_freq: 6
eval_bert_f1 mean (SE): 64.8716 (21.8331). 95% CI (22.079, 107.664). Sample size: 4
eval_bert_f1 mean (SE): 64.8716 (21.8331). 95% CI (22.079, 107.664). Sample size: 4
eval_rougeL mean (SE): 33.1083 (19.1162). 95% CI (-4.359, 70.576). Sample size: 4
================================================================================
Config: max_steps: 17500, lora_r: 64  , lr: 0.0002, bf16: False, lora_modules: all , bits: 4   , full_finetune: False, lora_dropout: 0.0 , warmup_steps: 100 , compress_statistics: False, dataset: NaN , gradient_accumulation_steps: NaN , learning_rate: NaN , quant_type: fp4 , adam_beta2: 0.999, update_freq: 6
eval_bert_f1 mean (SE): 67.0044 (22.3593). 95% CI (23.180, 110.829).

## gist:a96b0d948a97583f8ab0599fa888c35c
This table contains data from multiple software versions. Some hyperparameter names are "NaN", meaning they did not exist in that software version. The best 7B result is 40.08 MMLU.

================================================================================
Config: learning_rate: 0.005, adam_beta2: 0.999, lora_dropout: 0.0 , max_grad_norm: 1.0 , max_steps: 7320, lr_scheduler_type: <SchedulerType.COSINE: cosine>, weight_decay: 0.0 , base_model: /gscratch/zlab/llama/7B, quant_type: nf4 , gradient_accumulation_steps: 6   , per_device_train_batch_size: 2
acc mean (SE): 0.2290 (nan). 95% CI (nan, nan). Sample size: 1
================================================================================
Config: learning_rate: 0.0002, adam_beta2: 0.999, lora_dropout: 0.0 , max_grad_norm: 0.3 , max_steps: 9750, lr_scheduler_type: <SchedulerType.CONSTANT: constant>, weight_decay: 0.0 , base_model: NaN , quant_type: nf4 , gradient_accumulation_steps: 2   , per_device_train_batch_size: 8
acc mean (SE): 0.2290 (0.0

## finetune_model.py
import torch
from datasets import load_dataset
import argparse
import os
import math
from itertools import chain
from datetime import timedelta
from torch.utils.data import DataLoader
from accelerate import Accelerator
from accelerate.utils import (DummyOptim, DummyScheduler,

## rl-for-llms.md

      
        
          
            
              
              1 file
            
          
          
            
              
              23 forks
            
          
          
            
              
              11 comments
            
          
          
            
              
              539 stars
            
          
        
        
          
              
          
          
            
                yoavg
                / rl-for-llms.md
            
            
              Last active
              July 3, 2024 02:26
            
          
        
      
        
  
      
    Reinforcement Learning for Language Models

Yoav Goldberg, April 2023.
Why RL?

With the release of the ChatGPT model and followup large language models (LLMs), there was a lot of discussion of the importance of "RLHF training", that is, "reinforcement learning from human feedback".
I was puzzled for a while as to why RL (Reinforcement Learning) is better than learning from demonstrations (a.k.a. supervised learning) for training language models. Shouldn't learning from demonstrations (or, in language model terminology, "instruction fine-tuning", i.e. learning to imitate human-written answers) be sufficient? I came up with a theoretical argument that was somewhat convincing. But I came to realize there is an additional argument which not only supports the case for RL training, but also requires it, in particular for models like ChatGPT. This additional argument is spelled out in (the first half of) a talk by John Schulman from OpenAI. This post pretty much
	# Sketch-specific note: a battery of roughly 25 runs of this code estimated ~93.11% accuracy in the same number of steps as the baseline network, with ~1.7x runtime overhead (much of which goes to the torch.randn allocations and extra layer calculations).

	# Note: The one change we need to make if we're in Colab is to uncomment this below block.
	# If we are in an ipython session or a notebook, clear the state to avoid bugs
	#"""
	try:
	_ = get_ipython().__class__.__name__
	## we set -f below to avoid prompting the user before clearing the notebook state
	%reset -f
	except NameError:
	"""
	Testing flash attn with multipacking which essentially packs sequences using https://github.com/imoneoi/multipack_sampler,
	and passes a single sequence of `1 x (bs x seqlen)` to the model to avoid padding.

	An alternative is to use block diagonal attention as attention bias, but the following uses flash attention 2 which
	is much faster.

	Multipacking can be used to speed up both pretraining and finetuning.
	"""
	import os
	import openai
	from jinja2 import Template, meta, Environment
	from dotenv import load_dotenv
	load_dotenv() # add a .env file with the following
	# setup is for azure, change accordingly for normal openai
	openai.api_key = os.getenv("OPENAI_API_KEY")
	openai.api_type = os.getenv("OPENAI_API_TYPE")
	openai.api_version = os.getenv("OPENAI_API_VERSION")
	openai.api_base = os.getenv("OPENAI_API_BASE")
	import argparse
	import copy

	import torch

	import datasets as hfds
	import transformers

	from tqdm.auto import tqdm
	import wandb
	"""

	The code below combines approaches published by both @eugene-yh and @jinyongyoo on Github.

	Thanks for the contributions guys!

	"""

	import torch
	import peft
	# coding=utf-8
	# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	Config: max_steps: 17500, lora_r: 64 , lr: 0.0002, bf16: False, lora_modules: all , bits: 4 , full_finetune: False, lora_dropout: 0.0 , warmup_steps: 100 , compress_statistics: True, dataset: NaN , gradient_accumulation_steps: NaN , learning_rate: NaN , quant_type: fp4 , adam_beta2: 0.999, update_freq: 6
	eval_bert_f1 mean (SE): 64.8716 (21.8331). 95% CI (22.079, 107.664). Sample size: 4
	eval_bert_f1 mean (SE): 64.8716 (21.8331). 95% CI (22.079, 107.664). Sample size: 4
	eval_rougeL mean (SE): 33.1083 (19.1162). 95% CI (-4.359, 70.576). Sample size: 4
	================================================================================
	Config: max_steps: 17500, lora_r: 64 , lr: 0.0002, bf16: False, lora_modules: all , bits: 4 , full_finetune: False, lora_dropout: 0.0 , warmup_steps: 100 , compress_statistics: False, dataset: NaN , gradient_accumulation_steps: NaN , learning_rate: NaN , quant_type: fp4 , adam_beta2: 0.999, update_freq: 6
	eval_bert_f1 mean (SE): 67.0044 (22.3593). 95% CI (23.180, 110.829).
	This table contains data from multiple software versions. Some hyperparameter names are "NaN", meaning they did not exist in that software version. The best 7B result is 40.08 MMLU.

	================================================================================
	Config: learning_rate: 0.005, adam_beta2: 0.999, lora_dropout: 0.0 , max_grad_norm: 1.0 , max_steps: 7320, lr_scheduler_type: <SchedulerType.COSINE: cosine>, weight_decay: 0.0 , base_model: /gscratch/zlab/llama/7B, quant_type: nf4 , gradient_accumulation_steps: 6 , per_device_train_batch_size: 2
	acc mean (SE): 0.2290 (nan). 95% CI (nan, nan). Sample size: 1
	================================================================================
	Config: learning_rate: 0.0002, adam_beta2: 0.999, lora_dropout: 0.0 , max_grad_norm: 0.3 , max_steps: 9750, lr_scheduler_type: <SchedulerType.CONSTANT: constant>, weight_decay: 0.0 , base_model: NaN , quant_type: nf4 , gradient_accumulation_steps: 2 , per_device_train_batch_size: 8
	acc mean (SE): 0.2290 (0.0
	import torch
	from datasets import load_dataset
	import argparse
	import os
	import math
	from itertools import chain
	from datetime import timedelta
	from torch.utils.data import DataLoader
	from accelerate import Accelerator
	from accelerate.utils import (DummyOptim, DummyScheduler,