@moyix
Created August 30, 2022 00:54
Demo of extending a rotary position embedding model to a longer context than it was trained on
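Background: rotary position embeddings (RoPE) encode position by rotating the query/key vectors with sin/cos tables that are a fixed, closed-form function of the position index, rather than a learned per-position table. That is what makes this trick possible: positions past the trained length (2048 for CodeGen) are still well-defined, so the context window can be enlarged at load time. A minimal sketch of the table computation (illustrative only, not part of the gist; the base of 10000 and rotary_dim of 64 are the usual RoPE defaults, not necessarily this model's values):

import torch

def rotary_tables(num_pos, rotary_dim, base=10000):
    # Rotation frequencies fall off geometrically across the rotary dims.
    inv_freq = 1.0 / (base ** (torch.arange(0, rotary_dim, 2).float() / rotary_dim))
    # Outer product: one rotation angle per (position, frequency) pair.
    angles = torch.einsum('i,j->ij', torch.arange(num_pos).float(), inv_freq)
    return torch.sin(angles), torch.cos(angles)

# The same closed form works at 4096 positions as at the trained 2048.
sin4k, cos4k = rotary_tables(4096, rotary_dim=64)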
(sfcodegen) moyix@isabella:~$ python load_codegen_with_longer_context.py
vocab_file vocab.json
merges_file merges.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json
Partial prompt from /usr/include/stdlib.h:
[...] restrict __nptr,
char **__restrict __endptr)
__THROW __nonnull ((1));
#[Generated text start]endif
#if __HAVE_FLOAT128 && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128 strtold (const char *__restrict __nptr,
char **__restrict __endptr)
__THROW __nonnull ((1));
#endif
#if __HAVE_FLOAT128X && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128 strtoldx (const char *__restrict __nptr,
char **__restrict __endptr)
__THROW __nonnull ((1));
#endif
#if __HAVE_FLOAT128 && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128 strtodx (const char *__restrict __nptr,
char **__restrict __endptr)
__THROW __nonnull ((1));
#endif
#if __HAVE_FLOAT128X && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128x strtodx (const char *__restrict __nptr,
char **__restrict __endptr)
__THROW __nonnull ((1));
#endif
#if __HAVE_FLOAT128X && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128x strtodx (const char *__restrict __nptr,
char **__restrict __endptr)
__THROW __nonnull ((1));
#endif
#if __HAVE_CONFIG_H
#include <config.h>
#endif
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
[... generation then degenerates: dozens more copies of "#include <sys/types.h>", many of them truncated, followed by roughly 500 lines that are almost entirely a bare "#" with occasional stray token fragments ("# Copyrighted", "#include", "#if_free_free_p", ...) ...]
load_codegen_with_longer_context.py:

#!/usr/bin/env python
import sys

import torch
from transformers import CodeGenConfig, CodeGenForCausalLM, AutoTokenizer
from transformers.utils.hub import cached_file

COLOR_BOLD = '\033[1m'
COLOR_RESET = '\033[0m'

# New context length; the model was trained with a 2048-token context.
NEW_SIZE = 4096

cg_config = CodeGenConfig.from_pretrained('Salesforce/codegen-350M-multi')
cg_config.n_ctx = NEW_SIZE
cg_config.n_positions = NEW_SIZE

weights_file = cached_file('Salesforce/codegen-350M-multi', 'pytorch_model.bin')
state_dict = torch.load(weights_file)
# Expand the causal mask to the new size. Unclear why this is in the saved
# model weights, because it's a constant...
for k in list(state_dict.keys()):
    if k.endswith('causal_mask'):
        # This is copied from CodeGen's __init__() method.
        state_dict[k] = torch.tril(torch.ones((NEW_SIZE, NEW_SIZE), dtype=torch.uint8)).view(
            1, 1, NEW_SIZE, NEW_SIZE
        )

model = CodeGenForCausalLM.from_pretrained(None, config=cg_config, state_dict=state_dict, torch_dtype='auto')
model.to('cuda')
model.eval()

# Try to generate something: prompt with the first 2048 tokens of stdlib.h,
# then ask for 2048 more, which only fits in the enlarged context.
filename = '/usr/include/stdlib.h'
text = open(filename).read()
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codegen-350M-multi')
# Keep the tokenizer from truncating (or warning about) long inputs.
tokenizer.model_max_length = sys.maxsize
enc = tokenizer.encode(text)
enc = enc[:2048]
enc = torch.tensor(enc, dtype=torch.long).unsqueeze(0)
enc = enc.to('cuda')

with torch.no_grad():
    out = model.generate(
        input_ids=enc,
        do_sample=True,
        temperature=0.2,
        top_p=1.0,
        max_new_tokens=2048,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

print(f"Generated {len(out[0])-2048} tokens.")
print(f"Partial prompt from {filename}:\n")
print(f"{COLOR_BOLD}[...]{COLOR_RESET} ", end='')
print(tokenizer.decode(out[0][2048-32:2048].cpu().tolist()), end='')
print(f"{COLOR_BOLD}[Generated text start]{COLOR_RESET}", end='')
print(tokenizer.decode(out[0][2048:].cpu().tolist()))
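A note on the causal-mask surgery above: the mask is a constant buffer that CodeGen's __init__() already rebuilds at the new size from the enlarged config. An untested alternative sketch (assumption: from_pretrained only warns about missing buffer keys and keeps the freshly initialized value) would be to drop those keys instead of resizing them:

# Assumption: missing buffer keys are tolerated with a warning, leaving
# the 4096x4096 mask that __init__() built from the modified config.
for k in list(state_dict.keys()):
    if k.endswith('causal_mask'):
        del state_dict[k]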