Skip to content

Instantly share code, notes, and snippets.

View ParagEkbote's full-sized avatar
📈

Parag Ekbote ParagEkbote

📈
View GitHub Profile
@ParagEkbote
ParagEkbote / compare_tokenizer_perf.py
Last active November 3, 2025 15:34
FineWiki Multilingual Tokenizer Analysis
#!/usr/bin/env python3
"""
Tokenizer Performance Comparison Script
A comprehensive tool for evaluating tokenizer efficiency across multiple languages.
Supports HuggingFace datasets, local files, and built-in samples.
Author: https://github.com/ParagEkbote
"""
from typing import Tuple, List, Dict, Optional, Callable, Literal
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from pruna import smash, SmashConfig
# Define per-model Smash + torch.compile configs
models = {
"HuggingFaceTB/SmolLM2-360M": {
"bits": 4, "group_size": 64,
"compiler": "torch_compile",
"torch_compile_mode": "max-autotune",
import os
import torch
import argparse
from datetime import datetime
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from pruna import smash, SmashConfig
from pruna.data.pruna_datamodule import PrunaDataModule
from pruna.evaluation.evaluation_agent import EvaluationAgent
from pruna.evaluation.task import Task