
Brendan Dolan-Gavitt moyix

$ objdump -s -j .init_array ./jaxlib/xla_extension.so | sed -e '1,/Contents/ d' | cut -c 10-44 | xxd -r -p | od -A none -w8 -t x8 --endian=little | addr2line -a -f -e ./jaxlib/xla_extension.so | paste -sd ' \n' | c++filt
0x000000000084c5e0 __cpu_indicator_init /dt9-src/libgcc/config/i386/cpuinfo.c:434
0x000000000084ca20 frame_dummy crtstuff.c:?
0x000000000079c440 _GLOBAL__sub_I_xla.cc xla.cc:?
0x000000000079c540 _GLOBAL__sub_I_dlpack.cc dlpack.cc:?
0x000000000079c5f0 _GLOBAL__sub_I_mlir.cc mlir.cc:?
0x000000000079c620 _GLOBAL__sub_I_ops.cc ops.cc:?
0x000000000079c650 _GLOBAL__sub_I_approx_topk.cc approx_topk.cc:?
0x000000000079c680 _GLOBAL__sub_I_approx_topk_shape.cc approx_topk_shape.cc:?
0x000000000079c6b0 _GLOBAL__sub_I_lu_decomposition.cc lu_decomposition.cc:?
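The `xxd`/`od` stage of the pipeline above just reinterprets the section's raw bytes as little-endian 64-bit pointers; a minimal Python sketch of that decoding step (a little-endian 64-bit binary is assumed):

```python
import struct

def decode_init_array(raw: bytes):
    """Split raw .init_array contents into little-endian 64-bit
    constructor addresses (one unsigned quadword each)."""
    return [struct.unpack_from("<Q", raw, off)[0]
            for off in range(0, len(raw), 8)]

# The first two pointers from the dump above, as raw section bytes
raw = bytes.fromhex("e0c5840000000000" "20ca840000000000")
addrs = decode_init_array(raw)
# → [0x84c5e0, 0x84ca20]
```

Each resulting address can then be fed to `addr2line`, exactly as the shell pipeline does.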
moyix / check_for_ffast_math.py
Last active October 29, 2022 11:42
Hacky script to check for the set_fast_math constructor in an executable/shared library using objdump
#!/usr/bin/env python
import subprocess
import re
import sys
def get_init_array(filename):
    # Call objdump -s -j .init_array <filename> to get the contents of the .init_array section
    try:
        objdump_output = subprocess.check_output(['objdump', '-s', '-j', '.init_array', filename], stderr=subprocess.STDOUT)
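The remainder of the script is truncated here; one plausible sketch of the next step, parsing the hex words back out of `objdump -s` output (the gist's actual parsing may differ):

```python
def parse_objdump_section(text):
    """Collect raw bytes from `objdump -s` output, whose data lines
    look like '  <addr> <up to 4 hex words>  <ascii>'."""
    data = bytearray()
    for line in text.splitlines():
        parts = line.split()
        for tok in parts[1:5]:  # skip the address, stop before the ASCII column
            if len(tok) % 2 == 0 and all(c in "0123456789abcdef" for c in tok):
                data += bytes.fromhex(tok)
            else:
                break
    return bytes(data)

sample = " 84c5e0 e0c58400 00000000 20ca8400 00000000  ........ .......\n"
section = parse_objdump_section(sample)
```

The recovered bytes can then be scanned for the address of the `set_fast_math` constructor that `-ffast-math` injects.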
moyix / 00_output.txt
Created August 30, 2022 00:54
Demo of extending a rotary position embedding model to a longer context than it was trained on
(sfcodegen) moyix@isabella:~$ python load_codegen_with_longer_context.py
vocab_file vocab.json
merges_file merges.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json
Partial prompt from /usr/include/stdlib.h:
[...] restrict __nptr,
#!/usr/bin/env python
import torch
from transformers import CodeGenConfig, CodeGenForCausalLM, CodeGenTokenizer
from transformers.utils.hub import cached_file
NEW_SIZE = 4096
cg_config = CodeGenConfig.from_pretrained('Salesforce/codegen-350M-mono')
cg_config.n_ctx = NEW_SIZE
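Rotary embeddings make this kind of extension possible because the rotation angles are computed from the position on the fly rather than looked up in a learned table; a small sketch of the standard RoPE frequencies (formula from the RoFormer paper, not taken from this gist):

```python
def rotary_angles(position, dim, base=10000.0):
    """theta_i = position * base**(-2*i/dim) for each rotated pair;
    well-defined at any position, including past the trained context."""
    return [position * base ** (-2.0 * i / dim) for i in range(dim // 2)]

# Angles are just as computable at position 4095 (beyond a 2048-token
# context) as at position 100 (inside it).
inside = rotary_angles(100, 64)
extended = rotary_angles(4095, 64)
```

Whether the model generalizes well to those larger angles is a separate empirical question, which is what the demo above explores.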
moyix / copilot_proxy.py
Created July 29, 2022 03:58
Proxy between Copilot and a Triton inference server running FasterTransformer
#!/usr/bin/env python
import time
import random
import string
from flask import Flask, request
import numpy as np
import tritonclient.grpc as client_util
import json
from tritonclient.utils import np_to_triton_dtype
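The heart of such a proxy is translating Copilot's OpenAI-style completion request into Triton tensor inputs; a stdlib-only sketch of that mapping (field and tensor names here are illustrative assumptions, not the gist's actual schema):

```python
import json

def copilot_to_triton(body):
    """Map an OpenAI-style completion request (JSON bytes) to a
    Triton-style input description. Tensor names are hypothetical."""
    req = json.loads(body)
    return {
        "inputs": [
            {"name": "input_text", "datatype": "BYTES", "shape": [1, 1],
             "data": [req.get("prompt", "")]},
            {"name": "request_output_len", "datatype": "INT32", "shape": [1, 1],
             "data": [req.get("max_tokens", 16)]},
        ]
    }

payload = copilot_to_triton(
    json.dumps({"prompt": "def add(a, b):", "max_tokens": 32}).encode())
```

The real proxy uses `tritonclient.grpc` and numpy tensors rather than a JSON dict, but the request/response reshaping is the same idea.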
moyix / huggingface_gptj_convert.py
Created July 25, 2022 16:11
Convert HuggingFace GPT-J model to FasterTransformers
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
# Modified by Brendan Dolan-Gavitt, 2022
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
moyix / CodeGen_GPTJ_Conversion.md
Last active January 5, 2024 12:50
How to convert the SalesForce CodeGen models to GPT-J

Using Linear Algebra to Convert a Large Code Model

Background

The SalesForce CodeGen models are a family of large language models trained on a large corpus of natural language data and then fine-tuned on specialized datasets of code. Models of size 350M, 2B, 6B, and 16B parameters are provided in three flavors:

  • nl, the base model trained on The Pile, a large natural language dataset compiled by EleutherAI
  • multi, which is fine-tuned from the nl model on a dataset of code in multiple languages, scraped from GitHub, and
  • mono, which is fine-tuned from the multi model on Python code only.
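The "linear algebra" in the conversion largely amounts to re-ordering the rows of CodeGen's fused QKV projection into the layout GPT-J expects; here is a toy demonstration that permuting a weight matrix's rows commutes with the matrix-vector product (the real script's layout is more involved; these matrices are purely illustrative):

```python
def permute_rows(rows, perm):
    """Reordering rows is multiplication by a permutation matrix P."""
    return [rows[i] for i in perm]

def matvec(m, v):
    return [sum(a * b for a, b in zip(row, v)) for row in m]

# (P @ W) @ x == P @ (W @ x): permuting the fused weight's rows is the
# same as permuting the stacked q/k/v outputs.
W = [[1, 0], [0, 1], [2, 3]]   # toy fused projection
x = [4, 5]                     # toy hidden state
perm = [2, 0, 1]               # toy reordering of output rows
lhs = matvec(permute_rows(W, perm), x)
rhs = permute_rows(matvec(W, x), perm)
# lhs == rhs == [23, 4, 5]
```

Because the two sides are equal, the permuted weights produce bit-identical attention outputs under the new layout, which is why the conversion is lossless.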
moyix / codegen_gptj_convert.py
Created July 22, 2022 19:33
Convert a SalesForce CodeGen model's weights to plain GPT-J
#!/usr/bin/env python
import argparse
import torch
from transformers import GPTJForCausalLM, GPTJConfig
# Note: these need the git version of Transformers as of 7/22/2022
from transformers import CodeGenTokenizer, CodeGenForCausalLM
from transformers import CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST
parser = argparse.ArgumentParser('Convert SalesForce CodeGen model to GPT-J')
moyix / shadowstack.c
Last active March 23, 2022 17:43
Example of using a constructor to set up the shadow callstack and assign gs appropriately
#include <stdio.h>
#include <asm/prctl.h>
#include <sys/prctl.h>
#include <stdlib.h>
// Compile with: clang-8 -fsanitize=shadow-call-stack shadowstack.c -o shadowstack
int arch_prctl(int code, unsigned long *addr);
void __attribute__ ((constructor)) __attribute__((no_sanitize("shadow-call-stack"))) setupgs()
moyix / tiffcrop.c
Created March 22, 2022 19:58
Buggy code from libtiff
static int readSeparateTilesIntoBuffer (TIFF* in, uint8 *obuf,
                                        uint32 imagelength, uint32 imagewidth,
                                        uint32 tw, uint32 tl,
                                        uint16 spp, uint16 bps)
{
    int i, status = 1, sample;
    int shift_width, bytes_per_pixel;
    uint16 bytes_per_sample;
    uint32 row, col;    /* Current row and col of image */
    uint32 nrow, ncol;  /* Number of rows and cols in current tile */