Skip to content

Instantly share code, notes, and snippets.

View luistung's full-sized avatar
🎯
Focusing

luistung

🎯
Focusing
View GitHub Profile
@luistung
luistung / llm_ft.py
Last active April 28, 2024 09:53
Fine-tune LLM example
from transformers import AutoTokenizer
from datasets import Dataset
import torch
import pandas as pd
import numpy as np
# Load the pretrained tokenizer for the "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token # Set the padding token (reuses the end-of-sequence token)
qa_pairs = [
@luistung
luistung / continue_pretrain.py
Created April 28, 2024 09:45
Continued pre-training example using Hugging Face
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
# Choose the model; this can be replaced with any model supported by transformers, e.g. "bert-base-uncased", "gpt2", etc.
# NOTE(review): the model is loaded through AutoModelForCausalLM below — confirm a replacement checkpoint is suitable for causal LM.
model_name = "gpt2"
# All computation is placed on the CPU.
device = torch.device("cpu")
# Load the model and the tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Keybase proof

I hereby claim:

  • I am luistung on github.
  • I am luistung (https://keybase.io/luistung) on keybase.
  • I have a public key ASCcsP70NQH1pe_YHkb_VWNteyojKMnNa4gSEpNRxAF3_Qo

To claim this, I am signing this object:

@luistung
luistung / CMakeLists.txt
Created June 11, 2021 02:58
pytorch to c++
# Build script for a single executable linked against LibTorch.
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(custom_ops)
# Locate the Torch package; configuration stops if it cannot be found.
find_package(Torch REQUIRED)
# Build the executable "example-app" from example-app.cpp and link it against the Torch libraries.
add_executable(example-app example-app.cpp)
target_link_libraries(example-app "${TORCH_LIBRARIES}")
# Compile the target as C++14.
# NOTE(review): recent LibTorch releases require C++17 — confirm against the installed version.
set_property(TARGET example-app PROPERTY CXX_STANDARD 14)
@luistung
luistung / tokenization.cpp
Last active May 9, 2024 17:06
c++ version of bert tokenize
/* c++ version of tokenization for bert
Copyright (C) 2019 luistung
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@luistung
luistung / tokenization.cpp
Created October 11, 2019 12:02
c++ version of bert tokenize
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include <utf8proc.h>
//https://unicode.org/reports/tr15/#Norm_Forms
//https://ssl.icu-project.org/apiref/icu4c/uchar_8h.html
@luistung
luistung / .vimrc
Last active April 24, 2019 03:31
vim config
" Enable syntax highlighting and select the desert color scheme.
syntax on
colorscheme desert
" Show line numbers.
set number
" Use C-style automatic indentation.
set cindent
" A tab displays as 4 columns, each indent step is 4 columns, and typed tabs are expanded to spaces.
set tabstop=4
set shiftwidth=4
set expandtab
" Use UTF-8 for the terminal, for file-encoding detection, and as Vim's internal encoding.
set termencoding=utf-8
set fileencodings=utf-8
set encoding=utf-8
@luistung
luistung / httpserver.py
Created July 29, 2014 08:54
python简单http服务器
from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
class TestHTTPHandler(BaseHTTPRequestHandler):
    """Minimal request handler that answers every GET with ``hello world``."""

    def do_GET(self):
        """Reply to a GET request with a fixed 200 response.

        Writes the status line, a ``Welcome`` header, a ``Content-Length``
        header and the body to ``self.wfile``.
        """
        body = b'hello world'
        # The base class formats the status line from ``protocol_version``;
        # the previous spelling ("protocal_version") only created an unused
        # attribute, so the intended HTTP/1.1 status line was never sent.
        self.protocol_version = 'HTTP/1.1'
        self.send_response(200)
        self.send_header("Welcome", "Contect")
        # Tell the client how long the body is so it does not have to wait
        # for the connection to close to find the end of the response.
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        # wfile is a binary stream, so the payload is written as bytes.
        self.wfile.write(body)
# Create the server bound to all interfaces on port 7778, dispatching requests to TestHTTPHandler.
http_server = HTTPServer(('0.0.0.0', 7778), TestHTTPHandler)
@luistung
luistung / SConstruct
Last active August 29, 2015 14:03
scons示例
# Construction environment: GNU C++0x dialect, all warnings enabled, optimization off.
env = Environment(CPPFLAGS=["-std=gnu++0x", "-Wall", "-O0"])
# Build the shared library "foo" from f1.cpp.
env.SharedLibrary('foo', ['f1.cpp'])
# Build a program from test.cpp, linking boost_regex, boost_locale and foo; libraries are searched for in the current directory.
env.Program("test.cpp", LIBS=["boost_regex", "boost_locale", "foo"], LIBPATH='.')
@luistung
luistung / merge.py
Created May 27, 2014 04:57
多路归并
import os
import sys
import heapq
# Open every entry of the directory named by the first command-line argument.
# NOTE(review): no close() for these handles is visible in this excerpt — confirm they are released later.
fileList = [open(sys.argv[1] + '/' + f) for f in os.listdir(sys.argv[1])]
def lineIterator(f):
for line in f:
line = line.rstrip()
tokens = line.split('\t')