Kerem Turgutlu (KeremTurgutlu)

:octocat:
Having Fun
View GitHub Profile
KeremTurgutlu / tinygemm_vs_bitblas.py
Last active July 11, 2024 20:40
HQQ Tinygemm vs BitBlas Benchmark
import torch
import numpy as np
import torchao
import bitblas
from hqq.core.quantize import HQQLinear, BaseQuantizeConfig, Quantizer, HQQBackend
from hqq.backends.torchao import HQQLinearTorchWeightOnlynt4, patch_hqq_to_aoint4
from bitblas.cache import global_operator_cache, get_database_path
# optional int4 unpacking helpers, left disabled in the gist:
# from unpack_int4.ops import unpack_int4_packed
# unpack_cuda_compiled = torch.compile(torchao.ops.unpack_int4_to_int, mode="default", fullgraph=True)
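The preview stops at the imports; the core of such a benchmark is a timing loop over each backend's linear layer. A minimal sketch with CUDA events (the callable under test would be whichever HQQ/tinygemm or BitBlas module the gist builds; the helper name and shapes here are illustrative):

def benchmark_ms(fn, x, warmup=10, iters=100):
    # warm up so compilation and cache setup don't pollute the measurement
    for _ in range(warmup):
        fn(x)
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn(x)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # mean ms per call

# e.g.: benchmark_ms(layer, torch.randn(1, 4096, device="cuda", dtype=torch.float16))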
KeremTurgutlu / test_triton_mm.ipynb
Last active May 24, 2024 13:51
test_triton_mm.ipynb
(Notebook preview unavailable.)
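The notebook itself doesn't render here, but its title points to a Triton matmul test; a minimal sketch of the usual pattern, benchmarking against torch.matmul with triton.testing.do_bench (shapes and dtype are illustrative):

import torch
import triton.testing

a = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
b = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
ms = triton.testing.do_bench(lambda: torch.matmul(a, b))  # time in ms (aggregation depends on Triton version)
print(f"torch.matmul: {ms:.3f} ms")
# a custom Triton kernel would be timed the same way, then checked with
# torch.testing.assert_close against the torch.matmul reference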
KeremTurgutlu / exp.ipynb
Last active January 10, 2024 14:29
QLORA Memory Experiments
(Notebook preview unavailable.)
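This notebook doesn't render either, but the title names QLoRA memory experiments; peak GPU memory in such experiments is typically read from torch's allocator counters. A minimal sketch (the training step is whatever model you are profiling):

import torch

torch.cuda.reset_peak_memory_stats()
# ... run one forward/backward step of the (Q)LoRA model here ...
peak_gib = torch.cuda.max_memory_allocated() / 1024**3
print(f"peak allocated: {peak_gib:.2f} GiB")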
KeremTurgutlu / gpt_eval_templates.py
Created October 28, 2023 05:03
GPT-Eval Templates
gpt_eval_template_coherence = """
You will be given a title: [TITLE] and a description: [DESC], written in Turkish from the information in a real estate listing.
Your task is to rate the title and description on one metric.
Please make sure you read and understand these instructions carefully. Please keep this
document open while reviewing, and refer to it as needed.
Evaluation Criteria:
Coherence (1-5) - the collective quality of all sentences. We align this dimension with
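The [TITLE] and [DESC] placeholders are substituted before the prompt is sent to the scoring model; a minimal sketch (the helper name and sample values are hypothetical):

def fill_template(template, title, desc):
    # plain placeholder substitution; title/desc come from the listing
    return template.replace("[TITLE]", title).replace("[DESC]", desc)

prompt = fill_template(gpt_eval_template_coherence, "Deniz manzarali 3+1 daire", "Genis balkonlu, merkezi konumda...")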
KeremTurgutlu / multipack_sampler_flash_attn.py
Last active October 7, 2023 04:44
Multipack Sampler x Flash Attention
"""
Testing flash attn with multipacking which essentially packs sequences using https://github.com/imoneoi/multipack_sampler,
and passes a single sequence of `1 x (bs x seqlen)` to the model to avoid padding.
An alternative is to use block diagonal attention as attention bias, but the following uses flash attention 2 which
is much faster.
Multipacking can be used to speed up both pretraining and finetuning.
"""
TsvHttpData-1.0
https://files.pushshift.io/reddit/comments/RC_2005-12.zst
KeremTurgutlu / ema_swa.py
Last active July 26, 2022 03:10
EMA and SWA callbacks for different model averaging techniques
from fastai.vision.all import *

__all__ = ["EMA", "SWA"]

class EMA(Callback):
    "Exponential moving average of model weights; see https://fastai.github.io/timmdocs/training_modelEMA"
    order,run_valid = 5,False  # run late in the callback order, skip during validation
    def __init__(self, decay=0.9999):
        super().__init__()
        self.decay = decay
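The preview cuts off before the update itself; the standard EMA step such a callback applies after every batch looks like this (a self-contained sketch with stand-in names, not the gist's exact code):

import copy
import torch
import torch.nn as nn

model = nn.Linear(4, 4)           # stand-in for the trained model
ema_model = copy.deepcopy(model)  # shadow copy holding the averaged weights
decay = 0.9999

with torch.no_grad():
    for ema_p, p in zip(ema_model.parameters(), model.parameters()):
        # shadow <- decay * shadow + (1 - decay) * live, applied after each batch
        ema_p.mul_(decay).add_(p, alpha=1 - decay)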
# Manual fastai-style LR schedule for an optimizer driven outside a Learner (used with SAM).
from fastai.vision.all import *
from torch.cuda.amp import autocast, GradScaler
from torch.cuda.amp.grad_scaler import _refresh_per_optimizer_state
from sam import SAM

class FastaiSched:
    def __init__(self, optimizer, max_lr):
        self.optimizer = optimizer
        # 10% linear warmup to max_lr, then cosine anneal back down
        self.lr_sched = combine_scheds([0.1,0.9], [SchedLin(1e-8,max_lr), SchedCos(max_lr,1e-8)])
        self.update(0)
    def update(self, pct):
        # (sketch: the preview truncates before this method) apply the scheduled lr at progress pct in [0,1]
        for g in self.optimizer.param_groups:
            g["lr"] = self.lr_sched(pct)
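Driving the schedule from a plain training loop might look like this (a sketch; `optimizer` is the SAM-wrapped optimizer from above and `total_steps` is your step budget):

sched = FastaiSched(optimizer, max_lr=1e-3)
for step in range(total_steps):
    sched.update(step / total_steps)  # fraction of training elapsed
    # forward/backward and SAM's first/second step go here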
KeremTurgutlu / zero_training.py
Created March 11, 2021 01:53
ZeRO optimizer example
import torch
import wandb
from fastai.callback.wandb import WandbCallback
from fastai.distributed import *
from zero_optimizer import ZeroRedundancyOptimizer

torch.backends.cudnn.benchmark = True  # let cuDNN pick the fastest conv algorithms

@patch
def after_batch(self: WandbCallback):
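PyTorch has since shipped this optimizer as torch.distributed.optim.ZeroRedundancyOptimizer; a minimal sketch of wrapping AdamW with it (assumes torch.distributed is already initialized; the model is a stand-in):

import torch
import torch.nn as nn
from torch.distributed.optim import ZeroRedundancyOptimizer

model = nn.Linear(8, 8).cuda()  # stand-in for the DDP-wrapped model
opt = ZeroRedundancyOptimizer(
    model.parameters(),
    optimizer_class=torch.optim.AdamW,  # each rank keeps only its shard of optimizer state
    lr=5e-5,
)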
KeremTurgutlu / distributed_wandb.py
Created March 2, 2021 22:54
Fastai WANDB Callback with DDP
from fastcore.script import *  # provides call_parse and Param

@call_parse
def main(
    size: Param("Image resolution", int)=224,
    bs: Param("Batch size", int)=128,
    epochs: Param("Number of epochs for training", int)=1,
    lr: Param("Learning rate for training", float)=5e-5):
    WANDB = True
    # start wandb
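Under DDP every process runs this script, so wandb should only be initialized on the main rank; a common pattern (a sketch; the project name is illustrative):

import os
import wandb

if WANDB and os.environ.get("RANK", "0") == "0":  # main process only under DDP
    wandb.init(project="fastai-ddp")              # illustrative project name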