Eval grammar corrector via WWB

Here are the script and requirements.txt for evaluating language models with WWB (who_what_benchmark). This particular example considers models tuned for grammatical error correction.

The script samples grammatically incorrect sentences from jhu-clsp/jfleg (see grammar_prompts.csv). It then runs a reference FP32 model (e.g. "pszemraj/bart-base-grammar-synthesis") to produce reference outputs: the sentences with grammar errors corrected. Finally, the target model under evaluation is scored against the outputs obtained from the reference model. In this example, the target model is an NF4-quantized version of the same FP32 model.
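For reference, each jfleg validation record pairs a (possibly ungrammatical) source sentence with human-written corrections. A minimal inspection sketch, assuming the 'sentence' field that the script below relies on and a 'corrections' field as described on the dataset card:

from datasets import load_dataset

dataset = load_dataset("jhu-clsp/jfleg", split="validation")
sample = dataset[0]
print(sample["sentence"])     # original, possibly ungrammatical sentence
print(sample["corrections"])  # list of human-written corrections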

python -m venv env
source env/bin/activate
python -m pip install -U pip
python -m pip install -r requirements.txt
python eval_grammar_corrector.py
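Note that bitsandbytes 4-bit (NF4) quantization generally requires a CUDA-capable GPU. A quick pre-flight check before running the script (a sketch, not part of the gist):

import torch

# bitsandbytes NF4 kernels run on CUDA; fail early if no GPU is visible.
assert torch.cuda.is_available(), "NF4 quantization via bitsandbytes needs a CUDA GPU"
print(torch.cuda.get_device_name(0))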

Expected output (a similarity close to 1.0 means the NF4 model's outputs closely match the FP32 reference):

   similarity    FDT   SDT  FDT norm  SDT norm
0    0.973024  16.56  0.64  0.846626  0.033268
eval_grammar_corrector.py:

from pathlib import Path

import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from whowhatbench import Evaluator

PROMPTS_FILENAME = 'grammar_prompts.csv'
REF_CSV_FILENAME = 'reference.csv'
DATASET = 'jhu-clsp/jfleg'
MODEL_ID = "pszemraj/bart-base-grammar-synthesis"
NUM_PROMPTS = 25

# Sample NUM_PROMPTS sentences from the jfleg validation split and cache them
# as evaluation prompts.
if not Path(PROMPTS_FILENAME).exists():
    dataset = load_dataset(DATASET, split='validation')
    short_dataset = dataset.shuffle(seed=42).select(range(NUM_PROMPTS))
    list_prompts = [data['sentence'] for data in short_dataset]
    pd.DataFrame.from_dict({'questions': list_prompts}).to_csv(PROMPTS_FILENAME)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Reference FP32 model: its corrected sentences serve as the ground truth.
ref_config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
ref_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_ID,
    config=ref_config,
    trust_remote_code=True,
)

# Reuse cached reference outputs if present; otherwise generate and dump them.
if Path(REF_CSV_FILENAME).exists():
    evaluator = Evaluator(tokenizer=tokenizer, gt_data=REF_CSV_FILENAME, test_data=PROMPTS_FILENAME, crop_question=False)
else:
    evaluator = Evaluator(base_model=ref_model, tokenizer=tokenizer, test_data=PROMPTS_FILENAME, crop_question=False)
    evaluator.dump_gt(REF_CSV_FILENAME)

# Target model: the same checkpoint quantized to NF4 via bitsandbytes.
cmp_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_ID,
    config=ref_config,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
    ),
)

# Score the quantized model against the reference outputs.
all_metrics_per_question, all_metrics = evaluator.score(cmp_model)
print(all_metrics)
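To dig into which prompts degrade most under NF4, the per-question results can be inspected as well. A sketch, assuming (as in WWB) that score() returns pandas DataFrames and that the per-question frame carries the same 'similarity' column as the aggregate table above:

# Persist per-prompt metrics and show the prompts with the lowest similarity
# to the FP32 reference, i.e. where quantization hurts the most.
all_metrics_per_question.to_csv("per_question_metrics.csv", index=False)
print(all_metrics_per_question.sort_values("similarity").head(5))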
requirements.txt:

transformers
datasets
pandas
bitsandbytes
accelerate
git+https://github.com/openvinotoolkit/openvino.genai.git#subdirectory=llm_bench/python/who_what_benchmark