{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"machine_shape": "hm"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# 패키지 다운로드"
],
"metadata": {
"id": "I1aM0iADTje0"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FuXIFTFapAMI"
},
"outputs": [],
"source": [
"!pip install -q -U bitsandbytes\n",
"!pip install -q -U git+https://github.com/huggingface/transformers.git \n",
"!pip install -q -U git+https://github.com/huggingface/peft.git\n",
"!pip install -q -U git+https://github.com/huggingface/accelerate.git\n",
"!pip install -q datasets"
]
},
{
"cell_type": "markdown",
"source": [
"# 데이터 셋 가져오기"
],
"metadata": {
"id": "_9fHSqQHTqkU"
}
},
{
"cell_type": "code",
"source": [
"from datasets import load_dataset\n",
"\n",
"data = load_dataset(\"paragonnov/coway_faq\")"
],
"metadata": {
"id": "jm4FzCvfeYcK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"data"
],
"metadata": {
"id": "2KUhV7x3e6Db"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# 학습을 위한 *text* 데이터 매핑"
],
"metadata": {
"id": "W-WnBJsbTvG5"
}
},
{
"cell_type": "code",
"source": [
"data = data.map(\n",
" lambda x: {'text': f\"### 질문: {x['instruction']}\\n\\n### 답변: {x['output']}<|endoftext|>\" }\n",
")"
],
"metadata": {
"id": "0FbgsI9sezTJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"data"
],
"metadata": {
"id": "w7-HuF1p6EQT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# 모델 조각 불러오기"
],
"metadata": {
"id": "dreUUUrdT2-1"
}
},
{
"cell_type": "code",
"source": [
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
"\n",
"model_id = \"beomi/polyglot-ko-12.8b-safetensors\" # safetensors 컨버팅된 레포\n",
"bnb_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_compute_dtype=torch.bfloat16\n",
")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={\"\":0})"
],
"metadata": {
"id": "E0Nl5mWL0k2T"
},
"execution_count": null,
"outputs": []
},
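{
"cell_type": "markdown",
"source": [
"Optionally, `get_memory_footprint()` can be used to see roughly how much GPU memory the 4-bit quantized weights occupy; it reports the size of the loaded parameters in bytes."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional: report how much memory the 4-bit quantized weights occupy.\n",
"print(f\"Model memory footprint: {model.get_memory_footprint() / 1024**3:.2f} GiB\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},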
{
"cell_type": "code",
"source": [
"data = data.map(lambda samples: tokenizer(samples[\"text\"]), batched=True)"
],
"metadata": {
"id": "C4TDUgDbhyhK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"data['train'][0]['text']"
],
"metadata": {
"id": "h61VdWpSAJEp"
},
"execution_count": null,
"outputs": []
},
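{
"cell_type": "markdown",
"source": [
"As a quick sanity check, the tokenized sequence lengths can be inspected to confirm that the examples fit comfortably in the model's context window; `input_ids` is the column added by the `map` call above."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Inspect tokenized sequence lengths ('input_ids' was added by the map call above).\n",
"lengths = [len(ids) for ids in data[\"train\"][\"input_ids\"]]\n",
"print(f\"examples: {len(lengths)}, max tokens: {max(lengths)}, mean tokens: {sum(lengths) / len(lengths):.1f}\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},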
{
"cell_type": "markdown",
"source": [
"# PEFT 로 LB 학습"
],
"metadata": {
"id": "EXROYJDNT9DJ"
}
},
{
"cell_type": "code",
"source": [
"from peft import prepare_model_for_kbit_training\n",
"\n",
"model.gradient_checkpointing_enable()\n",
"model = prepare_model_for_kbit_training(model)"
],
"metadata": {
"id": "a9EUEDAl0ss3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def print_trainable_parameters(model):\n",
" \"\"\"\n",
" Prints the number of trainable parameters in the model.\n",
" \"\"\"\n",
" trainable_params = 0\n",
" all_param = 0\n",
" for _, param in model.named_parameters():\n",
" all_param += param.numel()\n",
" if param.requires_grad:\n",
" trainable_params += param.numel()\n",
" print(\n",
" f\"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}\"\n",
" )"
],
"metadata": {
"id": "gkIcwsSU01EB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from peft import LoraConfig, get_peft_model\n",
"\n",
"config = LoraConfig(\n",
" r=8, \n",
" lora_alpha=32, \n",
" target_modules=[\"query_key_value\"], \n",
" lora_dropout=0.05, \n",
" bias=\"none\", \n",
" task_type=\"CAUSAL_LM\"\n",
")\n",
"\n",
"model = get_peft_model(model, config)\n",
"print_trainable_parameters(model)"
],
"metadata": {
"id": "Ybeyl20n3dYH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# 데이터 학습하기"
],
"metadata": {
"id": "0ddGcswMUD9C"
}
},
{
"cell_type": "code",
"source": [
"import transformers\n",
"\n",
"# needed for gpt-neo-x tokenizer\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"trainer = transformers.Trainer(\n",
" model=model,\n",
" train_dataset=data[\"train\"],\n",
" args=transformers.TrainingArguments(\n",
" per_device_train_batch_size=2,\n",
" gradient_accumulation_steps=1,\n",
" max_steps=50, ## 초소량만 학습: 50 step만 학습. 약 4분정도 걸립니다.\n",
" learning_rate=1e-4,\n",
" fp16=True,\n",
" logging_steps=10,\n",
" output_dir=\"outputs\",\n",
" optim=\"paged_adamw_8bit\"\n",
" ),\n",
" data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
")\n",
"model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n",
"trainer.train()"
],
"metadata": {
"id": "jq0nX33BmfaC"
},
"execution_count": null,
"outputs": []
},
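{
"cell_type": "markdown",
"source": [
"The trained LoRA adapter can also be saved locally; `save_pretrained` on a PEFT model writes only the small adapter weights, not the full base model. The directory name below is just an example."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Save only the LoRA adapter weights (a few MB); \"outputs/lora-adapter\" is an example path.\n",
"model.save_pretrained(\"outputs/lora-adapter\")\n",
"tokenizer.save_pretrained(\"outputs/lora-adapter\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},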
{
"cell_type": "markdown",
"source": [
"# 모델 평가"
],
"metadata": {
"id": "LiM298jQUI5c"
}
},
{
"cell_type": "code",
"source": [
"model.eval()\n",
"model.config.use_cache = True # silence the warnings. Please re-enable for inference!"
],
"metadata": {
"id": "a-jauOEv9XVe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def gen(x):\n",
" gened = model.generate(\n",
" **tokenizer(\n",
" f\"### 질문: {x}\\n\\n### 답변:\", \n",
" return_tensors='pt', \n",
" return_token_type_ids=False\n",
" ), \n",
" max_new_tokens=256,\n",
" early_stopping=True,\n",
" do_sample=True,\n",
" eos_token_id=2,\n",
" )\n",
" print(tokenizer.decode(gened[0]))"
],
"metadata": {
"id": "oDp9W-Gmp5Mb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# 추론하기"
],
"metadata": {
"id": "08KlSlCoUN0X"
}
},
{
"cell_type": "code",
"source": [
"gen('카카오 간편로그인이 되지 않아요')"
],
"metadata": {
"id": "iIbK1GaipZd9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# HuggingFaced에 모델 업로드"
],
"metadata": {
"id": "JriDpidBa3nG"
}
},
{
"cell_type": "code",
"source": [
"!huggingface-cli login"
],
"metadata": {
"id": "_LCQuL2na2ul"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model.push_to_hub('<ID>/<MODEL_NAME>')"
],
"metadata": {
"id": "TWc3sigabGiH"
},
"execution_count": null,
"outputs": []
}
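,
{
"cell_type": "markdown",
"source": [
"To reuse the uploaded adapter later, it can be loaded back on top of the quantized base model with `PeftModel.from_pretrained`. This is a minimal sketch; replace `<ID>/<MODEL_NAME>` with the repository pushed above."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Minimal sketch: reload the pushed LoRA adapter on top of the quantized base model.\n",
"# Replace <ID>/<MODEL_NAME> with the repository name used in push_to_hub above.\n",
"from peft import PeftModel\n",
"\n",
"base_model = AutoModelForCausalLM.from_pretrained(\n",
"    model_id, quantization_config=bnb_config, device_map={\"\": 0}\n",
")\n",
"model = PeftModel.from_pretrained(base_model, \"<ID>/<MODEL_NAME>\")\n",
"model.eval()"
],
"metadata": {},
"execution_count": null,
"outputs": []
}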
]
}