moyix/00_output.txt

## 00_output.txt
(sfcodegen) moyix@isabella:~$ python load_codegen_with_longer_context.py
vocab_file vocab.json
merges_file merges.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json
Partial prompt from /usr/include/stdlib.h:

[...] restrict __nptr,
			      char **__restrict __endptr)
     __THROW __nonnull ((1));
#[Generated text start]endif

#if __HAVE_FLOAT128 && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128 strtold (const char *__restrict __nptr,
			  char **__restrict __endptr)
     __THROW __nonnull ((1));
#endif

#if __HAVE_FLOAT128X && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128 strtoldx (const char *__restrict __nptr,
			    char **__restrict __endptr)
     __THROW __nonnull ((1));
#endif

#if __HAVE_FLOAT128 && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128 strtodx (const char *__restrict __nptr,
			   char **__restrict __endptr)
     __THROW __nonnull ((1));
#endif

#if __HAVE_FLOAT128X && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128x strtodx (const char *__restrict __nptr,
			     char **__restrict __endptr)
     __THROW __nonnull ((1));
#endif

#if __HAVE_FLOAT128X && __GLIBC_USE (IEC_60559_TYPES_EXT)
extern _Float128x strtodx (const char *__restrict __nptr,
			       char **__restrict __endptr)
     __THROW __nonnull ((1));
#endif

#if __HAVE_CONFIG_H
#include <config.h>
#endif

#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h>
#include <sys/types.h
#include <sys/types.h
#endif
#include <sys/types.h
#include <sys/types.h>
#endif
#include <sys/types.h>
#include <sys/types.h
#include <sys/sys
#include <sys/types.h>
#include <sys/sys
#endif
#include <sys
#include <sys
#
#
#
#include
#endif
#
#
#
#
#
#include
#
#
#
#
#
#
#
#
#
#
#
# Ifl
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#  H
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
   */
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#c
#

#l_C_c_
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
# Copyrighted
#
#
#
#
#
#
#
#c
#
#
#
#
#
#
#
#
#
#
#
#
#
#

#

#
#
#
#
#
#
#

#  /*


#
#


#


#
#


#
#
#_
#
#
#)


#
#


#
#_p_
#
#
#c_


#
#
#
#  _
#
#
#

#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#adm_

#
#
#
#
#_
#
#
#
#

#
#
#
#
#
#
#include
#include
#_
#
#
#edl_D_t
#include
#include
#include
#include
#include
#include
#include
#include_H_Hd_DRSF_H_H_H
#end_
#
#include_c_
#
#include
#include
#crun_
#
#C_
#
#r_c
#
#include
#
#
#
#lib_
#
#
#
#
#
#
#
#include _
#r_t
#
#include
#include
#
#include
#cubd_start_m_D_un_start_un_D_start_H_m_L4_m_
#
#
#
#

_r_in_
#c_
#
#ifn_H_L_h_
#c_L_#
#r_C_
#c_
#include
#c_
#
#c_L_
#
#r1
#r_
#r_#
#include
#
#
#
#c_
#
#
#
#if_
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#pr_
#if_free_free_p
#if_C_
#
#c_c_c_
#
#pr_C_INTRACL_L_
#
#
#
#r_H_P_r_r_H_
#
#if_in_H_legal_
#
#if_
#
#
#
#
#
#
#
#
#r_
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#include
#
#include
#
#
#include
#

#
#legal
#
#


#
#
#
#pr_legal_locp
#
#
#
#
#
#
#
#
#
#
#
#pr_
#_
#p_
#
#
#
#


#
#
#
#
#
#_
#_
#_
#_)
#_
#
#_
#_
#
#
#
#_
#_except_H_
#
#
#
#
#
#
#


#
#

#_
#
#
#
#
#
#_
#_
#
#
#
#
#
#
#
#
#_
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#_
#
#
#
#

#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#_
#
#
#
#
#
#
#
#
#
#
#
#
#
#


## load_codegen_with_longer_context.py
#!/usr/bin/env python

import sys
import torch
from transformers import CodeGenConfig, CodeGenForCausalLM, AutoTokenizer
from transformers.utils.hub import cached_file

# ANSI escape sequences used to highlight the markers printed around the output.
COLOR_BOLD = '\033[1m'
COLOR_RESET = '\033[0m'

# Context length written into the model config and used to size the causal mask.
NEW_SIZE = 4096
# The prompt is truncated to at most this many tokens.
PROMPT_TOKENS = 2048
# Cap on newly generated tokens (PROMPT_TOKENS + MAX_NEW_TOKENS == NEW_SIZE).
MAX_NEW_TOKENS = 2048
# Number of trailing prompt tokens echoed before the generated text.
CONTEXT_TOKENS = 32

# Start from the published config and override both context-length fields.
cg_config = CodeGenConfig.from_pretrained('Salesforce/codegen-350M-multi')
cg_config.n_ctx = NEW_SIZE
cg_config.n_positions = NEW_SIZE

weights_file = cached_file('Salesforce/codegen-350M-multi', 'pytorch_model.bin')
state_dict = torch.load(weights_file)

# Expand the causal mask to the new size. Unclear why this is in the saved
# model weights, because it's a constant...
for k in list(state_dict.keys()):
    if k.endswith('causal_mask'):
        # This is copied from CodeGen's __init__() method.
        state_dict[k] = torch.tril(torch.ones((NEW_SIZE, NEW_SIZE), dtype=torch.uint8)).view(
            1, 1, NEW_SIZE, NEW_SIZE
        )

# Passing None as the model name makes from_pretrained use the patched
# config and state dict supplied here.
model = CodeGenForCausalLM.from_pretrained(None, config=cg_config, state_dict=state_dict, torch_dtype='auto')
model.to('cuda')
model.eval()

# Try to generate something
filename = '/usr/include/stdlib.h'
# Use a context manager so the file handle is closed deterministically.
with open(filename) as f:
    text = f.read()
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codegen-350M-multi')
# NOTE(review): presumably lifts the tokenizer's own length limit so the whole
# file can be encoded before truncation -- confirm.
tokenizer.model_max_length = sys.maxsize
enc = tokenizer.encode(text)
enc = enc[:PROMPT_TOKENS]
# The file may tokenize to fewer than PROMPT_TOKENS tokens, so measure the
# real prompt length instead of assuming the truncation limit was reached.
prompt_len = len(enc)
enc = torch.tensor(enc, dtype=torch.long).unsqueeze(0)
enc = enc.to('cuda')
with torch.no_grad():
    out = model.generate(
        input_ids=enc,
        do_sample=True,
        temperature=0.2,
        top_p=1.0,
        max_new_tokens=MAX_NEW_TOKENS,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# out[0] is the prompt followed by the generated tokens; split at prompt_len.
# Clamp the context start so a very short prompt cannot yield a negative index.
context_start = max(prompt_len - CONTEXT_TOKENS, 0)
print(f"Generated {len(out[0]) - prompt_len} tokens.")
print(f"Partial prompt from {filename}:\n")
print(f"{COLOR_BOLD}[...]{COLOR_RESET} ", end='')
print(tokenizer.decode(out[0][context_start:prompt_len].cpu().tolist()), end='')
print(f"{COLOR_BOLD}[Generated text start]{COLOR_RESET}", end='')
print(tokenizer.decode(out[0][prompt_len:].cpu().tolist()))
	(sfcodegen) moyix@isabella:~$ python load_codegen_with_longer_context.py
	vocab_file vocab.json
	merges_file merges.txt
	tokenizer_file tokenizer.json
	added_tokens_file added_tokens.json
	special_tokens_map_file special_tokens_map.json
	tokenizer_config_file tokenizer_config.json
	Partial prompt from /usr/include/stdlib.h:

	[...] restrict __nptr,
	char **__restrict __endptr)
	__THROW __nonnull ((1));
	#[Generated text start]endif

	#if __HAVE_FLOAT128 && __GLIBC_USE (IEC_60559_TYPES_EXT)
	extern _Float128 strtold (const char *__restrict __nptr,
	char **__restrict __endptr)
	__THROW __nonnull ((1));
	#endif

	#if __HAVE_FLOAT128X && __GLIBC_USE (IEC_60559_TYPES_EXT)
	extern _Float128 strtoldx (const char *__restrict __nptr,
	char **__restrict __endptr)
	__THROW __nonnull ((1));
	#endif

	#if __HAVE_FLOAT128 && __GLIBC_USE (IEC_60559_TYPES_EXT)
	extern _Float128 strtodx (const char *__restrict __nptr,
	char **__restrict __endptr)
	__THROW __nonnull ((1));
	#endif

	#if __HAVE_FLOAT128X && __GLIBC_USE (IEC_60559_TYPES_EXT)
	extern _Float128x strtodx (const char *__restrict __nptr,
	char **__restrict __endptr)
	__THROW __nonnull ((1));
	#endif

	#if __HAVE_FLOAT128X && __GLIBC_USE (IEC_60559_TYPES_EXT)
	extern _Float128x strtodx (const char *__restrict __nptr,
	char **__restrict __endptr)
	__THROW __nonnull ((1));
	#endif

	#if __HAVE_CONFIG_H
	#include <config.h>
	#endif

	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h>
	#include <sys/types.h
	#include <sys/types.h
	#endif
	#include <sys/types.h
	#include <sys/types.h>
	#endif
	#include <sys/types.h>
	#include <sys/types.h
	#include <sys/sys
	#include <sys/types.h>
	#include <sys/sys
	#endif
	#include <sys
	#include <sys
	#
	#
	#
	#include
	#endif
	#
	#
	#
	#
	#
	#include
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	# Ifl
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	# H
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	*/
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#c
	#

	#l_C_c_
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	# Copyrighted
	#
	#
	#
	#
	#
	#
	#
	#c
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#

	#

	#
	#
	#
	#
	#
	#
	#

	# /*








	#
	#


	#










	#
	#








	#
	#
	#_
	#
	#
	#)






	#
	#



	#
	#_p_
	#
	#
	#c_



	#
	#
	#
	# _
	#
	#
	#

	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#adm_

	#
	#
	#
	#
	#_
	#
	#
	#
	#

	#
	#
	#
	#
	#
	#
	#include
	#include
	#_
	#
	#
	#edl_D_t
	#include
	#include
	#include
	#include
	#include
	#include
	#include
	#include_H_Hd_DRSF_H_H_H
	#end_
	#
	#include_c_
	#
	#include
	#include
	#crun_
	#
	#C_
	#
	#r_c
	#
	#include
	#
	#
	#
	#lib_
	#
	#
	#
	#
	#
	#
	#
	#include _
	#r_t
	#
	#include
	#include
	#
	#include
	#cubd_start_m_D_un_start_un_D_start_H_m_L4_m_
	#
	#
	#
	#

	_r_in_
	#c_
	#
	#ifn_H_L_h_
	#c_L_#
	#r_C_
	#c_
	#include
	#c_
	#
	#c_L_
	#
	#r1
	#r_
	#r_#
	#include
	#
	#
	#
	#c_
	#
	#
	#
	#if_
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#pr_
	#if_free_free_p
	#if_C_
	#
	#c_c_c_
	#
	#pr_C_INTRACL_L_
	#
	#
	#
	#r_H_P_r_r_H_
	#
	#if_in_H_legal_
	#
	#if_
	#
	#
	#
	#
	#
	#
	#
	#
	#r_
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#include
	#
	#include
	#
	#
	#include
	#

	#
	#legal
	#
	#



	#
	#
	#
	#pr_legal_locp
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#pr_
	#_
	#p_
	#
	#
	#
	#



	#
	#
	#
	#
	#
	#_
	#_
	#_
	#_)
	#_
	#
	#_
	#_
	#
	#
	#
	#_
	#_except_H_
	#
	#
	#
	#
	#
	#
	#





	#
	#

	#_
	#
	#
	#
	#
	#
	#_
	#_
	#
	#
	#
	#
	#
	#
	#
	#
	#_
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#_
	#
	#
	#
	#

	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#_
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#
	#!/usr/bin/env python

	import sys
	import torch
	from transformers import CodeGenConfig, CodeGenForCausalLM, AutoTokenizer
	from transformers.utils.hub import cached_file

	# ANSI escape sequences used to highlight the markers printed around the output.
	COLOR_BOLD = '\033[1m'
	COLOR_RESET = '\033[0m'

	# Context length written into the model config and used to size the causal mask.
	NEW_SIZE = 4096
	# The prompt is truncated to at most this many tokens.
	PROMPT_TOKENS = 2048
	# Cap on newly generated tokens (PROMPT_TOKENS + MAX_NEW_TOKENS == NEW_SIZE).
	MAX_NEW_TOKENS = 2048
	# Number of trailing prompt tokens echoed before the generated text.
	CONTEXT_TOKENS = 32

	# Start from the published config and override both context-length fields.
	cg_config = CodeGenConfig.from_pretrained('Salesforce/codegen-350M-multi')
	cg_config.n_ctx = NEW_SIZE
	cg_config.n_positions = NEW_SIZE

	weights_file = cached_file('Salesforce/codegen-350M-multi', 'pytorch_model.bin')
	state_dict = torch.load(weights_file)

	# Expand the causal mask to the new size. Unclear why this is in the saved
	# model weights, because it's a constant...
	for k in list(state_dict.keys()):
	    if k.endswith('causal_mask'):
	        # This is copied from CodeGen's __init__() method.
	        state_dict[k] = torch.tril(torch.ones((NEW_SIZE, NEW_SIZE), dtype=torch.uint8)).view(
	            1, 1, NEW_SIZE, NEW_SIZE
	        )

	# Passing None as the model name makes from_pretrained use the patched
	# config and state dict supplied here.
	model = CodeGenForCausalLM.from_pretrained(None, config=cg_config, state_dict=state_dict, torch_dtype='auto')
	model.to('cuda')
	model.eval()

	# Try to generate something
	filename = '/usr/include/stdlib.h'
	# Use a context manager so the file handle is closed deterministically.
	with open(filename) as f:
	    text = f.read()
	tokenizer = AutoTokenizer.from_pretrained('Salesforce/codegen-350M-multi')
	# NOTE(review): presumably lifts the tokenizer's own length limit so the whole
	# file can be encoded before truncation -- confirm.
	tokenizer.model_max_length = sys.maxsize
	enc = tokenizer.encode(text)
	enc = enc[:PROMPT_TOKENS]
	# The file may tokenize to fewer than PROMPT_TOKENS tokens, so measure the
	# real prompt length instead of assuming the truncation limit was reached.
	prompt_len = len(enc)
	enc = torch.tensor(enc, dtype=torch.long).unsqueeze(0)
	enc = enc.to('cuda')
	with torch.no_grad():
	    out = model.generate(
	        input_ids=enc,
	        do_sample=True,
	        temperature=0.2,
	        top_p=1.0,
	        max_new_tokens=MAX_NEW_TOKENS,
	        num_return_sequences=1,
	        pad_token_id=tokenizer.eos_token_id,
	        eos_token_id=tokenizer.eos_token_id,
	    )

	# out[0] is the prompt followed by the generated tokens; split at prompt_len.
	# Clamp the context start so a very short prompt cannot yield a negative index.
	context_start = max(prompt_len - CONTEXT_TOKENS, 0)
	print(f"Generated {len(out[0]) - prompt_len} tokens.")
	print(f"Partial prompt from {filename}:\n")
	print(f"{COLOR_BOLD}[...]{COLOR_RESET} ", end='')
	print(tokenizer.decode(out[0][context_start:prompt_len].cpu().tolist()), end='')
	print(f"{COLOR_BOLD}[Generated text start]{COLOR_RESET}", end='')
	print(tokenizer.decode(out[0][prompt_len:].cpu().tolist()))