Last active
January 24, 2024 01:32
-
-
Save qkdxorjs1002/1f1e3f5df0dfb7daa060b5ab7ab33c03 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"gpuType": "T4", | |
"machine_shape": "hm" | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"accelerator": "GPU" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# 패키지 다운로드" | |
], | |
"metadata": { | |
"id": "I1aM0iADTje0" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "FuXIFTFapAMI" | |
}, | |
"outputs": [], | |
"source": [ | |
"!pip install -q -U bitsandbytes\n", | |
"!pip install -q -U git+https://github.com/huggingface/transformers.git \n", | |
"!pip install -q -U git+https://github.com/huggingface/peft.git\n", | |
"!pip install -q -U git+https://github.com/huggingface/accelerate.git\n", | |
"!pip install -q datasets" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# 데이터 셋 가져오기" | |
], | |
"metadata": { | |
"id": "_9fHSqQHTqkU" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from datasets import load_dataset\n", | |
"\n", | |
"data = load_dataset(\"paragonnov/coway_faq\")" | |
], | |
"metadata": { | |
"id": "jm4FzCvfeYcK" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"data" | |
], | |
"metadata": { | |
"id": "2KUhV7x3e6Db" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# 학습을 위한 *text* 데이터 매핑" | |
], | |
"metadata": { | |
"id": "W-WnBJsbTvG5" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"data = data.map(\n", | |
" lambda x: {'text': f\"### 질문: {x['instruction']}\\n\\n### 답변: {x['output']}<|endoftext|>\" }\n", | |
")" | |
], | |
"metadata": { | |
"id": "0FbgsI9sezTJ" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"data" | |
], | |
"metadata": { | |
"id": "w7-HuF1p6EQT" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# 모델 조각 불러오기" | |
], | |
"metadata": { | |
"id": "dreUUUrdT2-1" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import torch\n", | |
"from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n", | |
"\n", | |
"model_id = \"beomi/polyglot-ko-12.8b-safetensors\" # safetensors 컨버팅된 레포\n", | |
"bnb_config = BitsAndBytesConfig(\n", | |
" load_in_4bit=True,\n", | |
" bnb_4bit_use_double_quant=True,\n", | |
" bnb_4bit_quant_type=\"nf4\",\n", | |
" bnb_4bit_compute_dtype=torch.bfloat16\n", | |
")\n", | |
"\n", | |
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n", | |
"model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={\"\":0})" | |
], | |
"metadata": { | |
"id": "E0Nl5mWL0k2T" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"data = data.map(lambda samples: tokenizer(samples[\"text\"]), batched=True)" | |
], | |
"metadata": { | |
"id": "C4TDUgDbhyhK" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"data['train'][0]['text']" | |
], | |
"metadata": { | |
"id": "h61VdWpSAJEp" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# PEFT로 LoRA 학습" | |
], | |
"metadata": { | |
"id": "EXROYJDNT9DJ" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from peft import prepare_model_for_kbit_training\n", | |
"\n", | |
"model.gradient_checkpointing_enable()\n", | |
"model = prepare_model_for_kbit_training(model)" | |
], | |
"metadata": { | |
"id": "a9EUEDAl0ss3" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def print_trainable_parameters(model):\n", | |
"    \"\"\"Report how many of the model's parameters have `requires_grad` set.\n", | |
"\n", | |
"    Walks `model.named_parameters()`, totals the element counts overall and\n", | |
"    for the entries whose `requires_grad` is true, then prints both totals\n", | |
"    together with the trainable percentage. Returns None.\n", | |
"    \"\"\"\n", | |
"    # Materialise once so the parameter iterator is consumed a single time.\n", | |
"    tensors = [tensor for _, tensor in model.named_parameters()]\n", | |
"    total = sum(tensor.numel() for tensor in tensors)\n", | |
"    trainable = sum(tensor.numel() for tensor in tensors if tensor.requires_grad)\n", | |
"    share = 100 * trainable / total\n", | |
"    print(f\"trainable params: {trainable} || all params: {total} || trainable%: {share}\")" | |
], | |
"metadata": { | |
"id": "gkIcwsSU01EB" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from peft import LoraConfig, get_peft_model\n", | |
"\n", | |
"config = LoraConfig(\n", | |
" r=8, \n", | |
" lora_alpha=32, \n", | |
" target_modules=[\"query_key_value\"], \n", | |
" lora_dropout=0.05, \n", | |
" bias=\"none\", \n", | |
" task_type=\"CAUSAL_LM\"\n", | |
")\n", | |
"\n", | |
"model = get_peft_model(model, config)\n", | |
"print_trainable_parameters(model)" | |
], | |
"metadata": { | |
"id": "Ybeyl20n3dYH" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# 데이터 학습하기" | |
], | |
"metadata": { | |
"id": "0ddGcswMUD9C" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import transformers\n", | |
"\n", | |
"# needed for gpt-neo-x tokenizer\n", | |
"tokenizer.pad_token = tokenizer.eos_token\n", | |
"\n", | |
"trainer = transformers.Trainer(\n", | |
" model=model,\n", | |
" train_dataset=data[\"train\"],\n", | |
" args=transformers.TrainingArguments(\n", | |
" per_device_train_batch_size=2,\n", | |
" gradient_accumulation_steps=1,\n", | |
" max_steps=50, ## 초소량만 학습: 50 step만 학습. 약 4분정도 걸립니다.\n", | |
" learning_rate=1e-4,\n", | |
" fp16=True,\n", | |
" logging_steps=10,\n", | |
" output_dir=\"outputs\",\n", | |
" optim=\"paged_adamw_8bit\"\n", | |
" ),\n", | |
" data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", | |
")\n", | |
"model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n", | |
"trainer.train()" | |
], | |
"metadata": { | |
"id": "jq0nX33BmfaC" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# 모델 평가" | |
], | |
"metadata": { | |
"id": "LiM298jQUI5c" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"model.eval()\n", | |
"model.config.use_cache = True # training is finished, so turn the cache back on for inference" | |
], | |
"metadata": { | |
"id": "a-jauOEv9XVe" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def gen(x):\n", | |
"    \"\"\"Sample an answer for question `x` from the fine-tuned model and print it.\n", | |
"\n", | |
"    The prompt uses the same `### 질문: ... ### 답변:` template as the training text.\n", | |
"    Prints the decoded sequence (the prompt is included in it); returns None.\n", | |
"    \"\"\"\n", | |
"    inputs = tokenizer(\n", | |
"        f\"### 질문: {x}\\n\\n### 답변:\",\n", | |
"        return_tensors='pt',\n", | |
"        return_token_type_ids=False,\n", | |
"    ).to(model.device)  # the tokenizer returns CPU tensors; generate() needs them on the model's device\n", | |
"    gened = model.generate(\n", | |
"        **inputs,\n", | |
"        max_new_tokens=256,\n", | |
"        do_sample=True,  # `early_stopping` was dropped: it only affects beam search\n", | |
"        # Take the EOS id from the tokenizer instead of hard-coding 2.\n", | |
"        eos_token_id=tokenizer.eos_token_id,\n", | |
"        pad_token_id=tokenizer.eos_token_id,\n", | |
"    )\n", | |
"    print(tokenizer.decode(gened[0]))" | |
], | |
"metadata": { | |
"id": "oDp9W-Gmp5Mb" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# 추론하기" | |
], | |
"metadata": { | |
"id": "08KlSlCoUN0X" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"gen('카카오 간편로그인이 되지 않아요')" | |
], | |
"metadata": { | |
"id": "iIbK1GaipZd9" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Hugging Face에 모델 업로드" | |
], | |
"metadata": { | |
"id": "JriDpidBa3nG" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!huggingface-cli login" | |
], | |
"metadata": { | |
"id": "_LCQuL2na2ul" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"model.push_to_hub('<ID>/<MODEL_NAME>')" | |
], | |
"metadata": { | |
"id": "TWc3sigabGiH" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment