@anon767
Last active May 26, 2023 17:30
Classify a function and its lines with LineVul

How to use?

  • Clone LineVul
  • Download the model as explained in their repo
  • Place classify.py in the LineVul/linevul folder
  • Run it, for example, with:
    python3 classify.py \
        --input test.c --model_name=12heads_linevul_model.bin \
        --model_type=roberta \
        --tokenizer_name=microsoft/codebert-base \
        --model_name_or_path=microsoft/codebert-base \
        --block_size 512
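
The script prints the function-level softmax probability together with a per-line attention score (test() below prints and returns prob and all_lines_score). Below is a minimal, illustrative post-processing sketch, assuming all_lines_score is a flat list of per-line scores in source order; the helper rank_lines and its parameters are not part of the gist.

# Illustrative helper (assumption: all_lines_score holds one score per source line, in order).
def rank_lines(source_path, all_lines_score, top_k=5):
    src_lines = open(source_path).read().splitlines()
    # pair each line number with its score and sort by score, highest first
    scored = sorted(enumerate(all_lines_score, start=1),
                    key=lambda t: float(t[1]), reverse=True)
    for line_no, score in scored[:top_k]:
        text = src_lines[line_no - 1] if line_no <= len(src_lines) else ""
        print(f"line {line_no:>4}  score={float(score):.4f}  {text}")

# Usage (after calling test()):
#   prob, line_scores = test(args, model, tokenizer, "test.c")
#   rank_lines("test.c", line_scores)
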
classify.py:

from __future__ import absolute_import, division, print_function
import argparse
import os
import torch
from tokenizers import Tokenizer
from transformers import (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
from linevul_main import convert_examples_to_features, set_seed, \
    get_all_lines_score, clean_special_token_values, get_word_att_scores
from linevul_model import Model

os.environ["TRANSFORMERS_OFFLINE"] = "1"

def test(args, model, tokenizer, inp):
    """
    Return the softmax probability for the function and the per-line attention scores.
    """
    model.to(args.device)
    # The label value doesn't matter for inference; convert_examples_to_features just needs one.
    label = torch.zeros(args.block_size, dtype=torch.long).to(args.device)
    parsed_input = convert_examples_to_features(open(inp).read(), label, tokenizer, args)
    parsed_input_ids = torch.tensor(parsed_input.input_ids,
                                    dtype=torch.long).unsqueeze(0)
    parsed_input_ids = parsed_input_ids.to(args.device)
    all_tokens = tokenizer.convert_ids_to_tokens(parsed_input.input_ids)
    all_tokens = [token.replace("Ġ", "") for token in all_tokens]
    all_tokens = [token.replace("ĉ", "Ċ") for token in all_tokens]
    with torch.no_grad():
        prob, attentions = model(input_ids=parsed_input_ids, output_attentions=True)
    attentions = attentions[0][0]
    attention = None
    # go into the layer
    for i in range(len(attentions)):
        layer_attention = attentions[i]
        # sum the attention each token pays to all other tokens
        layer_attention = sum(layer_attention)
        if attention is None:
            attention = layer_attention
        else:
            attention += layer_attention
    # clean att score for <s> and </s>
    attention = clean_special_token_values(attention, padding=True)
    # attention should be a 1D tensor of sequence length, one attention value per token
    word_att_scores = get_word_att_scores(all_tokens=all_tokens, att_scores=attention)
    all_lines_score, flaw_line_indices = get_all_lines_score(word_att_scores, [])
    print(prob, all_lines_score)
    return prob, all_lines_score

def main():
    """
    For example:
    python3 classify.py \
        --input test.c --model_name=12heads_linevul_model.bin \
        --model_type=roberta \
        --tokenizer_name=microsoft/codebert-base \
        --model_name_or_path=microsoft/codebert-base \
        --block_size 512
    """
    parser = argparse.ArgumentParser()
    ## parameters
    parser.add_argument("--input", default=None, type=str, required=False,
                        help="The input C/C++ file.")
    parser.add_argument("--model_type", default="bert", type=str,
                        help="The model architecture to be fine-tuned.")
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization. "
                             "The training dataset will be truncated in blocks of this size for training. "
                             "Defaults to the model max input length for single sentence inputs (taking into account special tokens).")
    parser.add_argument("--model_name", default="model.bin", type=str,
                        help="Saved model name.")
    parser.add_argument("--model_name_or_path", default=None, type=str,
                        help="The model checkpoint for weights initialization.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path.")
    parser.add_argument("--use_non_pretrained_model", action='store_true', default=False,
                        help="Whether to use a non-pretrained model.")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path.")
    parser.add_argument("--code_length", default=256, type=int,
                        help="Optional code input sequence length after tokenization.")
    parser.add_argument('--seed', type=int, default=42,
                        help="Random seed for initialization.")
    # number of attention heads
    parser.add_argument('--num_attention_heads', type=int, default=12,
                        help="Number of attention heads used in CodeBERT.")
    # word-level tokenizer
    parser.add_argument("--use_word_level_tokenizer", default=False, action='store_true',
                        help="Whether to use a word-level tokenizer.")
    # BPE non-pretrained tokenizer
    parser.add_argument("--use_non_pretrained_tokenizer", default=False, action='store_true',
                        help="Whether to use a non-pretrained BPE tokenizer.")
    args = parser.parse_args()
    # Setup CUDA, GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    config = RobertaConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
    config.num_labels = 1
    config.num_attention_heads = args.num_attention_heads
    if args.use_word_level_tokenizer:
        print('using wordlevel tokenizer!')
        tokenizer = Tokenizer.from_file('./word_level_tokenizer/wordlevel.json')
    elif args.use_non_pretrained_tokenizer:
        tokenizer = RobertaTokenizer(vocab_file="bpe_tokenizer/bpe_tokenizer-vocab.json",
                                     merges_file="bpe_tokenizer/bpe_tokenizer-merges.txt")
    else:
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
    if args.use_non_pretrained_model:
        model = RobertaForSequenceClassification(config=config)
    else:
        model = RobertaForSequenceClassification.from_pretrained(args.model_name_or_path, config=config,
                                                                 ignore_mismatched_sizes=True)
    model = Model(model, config, tokenizer, args)
    checkpoint = f'saved_models/checkpoint-best-f1/{args.model_name}'
    model.load_state_dict(torch.load(checkpoint, map_location=args.device))
    test(args, model, tokenizer, args.input)
    return 0


if __name__ == "__main__":
    main()
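
If you prefer to call test() from your own Python code rather than through the CLI, here is a minimal sketch mirroring the setup in main() above; the argument values are simply the ones from the example command and are assumptions, not requirements.

# Sketch: programmatic use of classify.test(), mirroring main() above.
import torch
from argparse import Namespace
from transformers import RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification
from linevul_model import Model
from classify import test

args = Namespace(input="test.c", model_name="12heads_linevul_model.bin",
                 model_name_or_path="microsoft/codebert-base",
                 tokenizer_name="microsoft/codebert-base",
                 block_size=512, num_attention_heads=12, n_gpu=0,
                 device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
config = RobertaConfig.from_pretrained(args.model_name_or_path)
config.num_labels = 1
config.num_attention_heads = args.num_attention_heads
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
base = RobertaForSequenceClassification.from_pretrained(args.model_name_or_path, config=config,
                                                        ignore_mismatched_sizes=True)
model = Model(base, config, tokenizer, args)
model.load_state_dict(torch.load(f"saved_models/checkpoint-best-f1/{args.model_name}",
                                 map_location=args.device))
prob, line_scores = test(args, model, tokenizer, args.input)
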
test.c (the example input used above):

#include <stdio.h>

int main() {
    printf("yo");
    return 0;
}