from __future__ import absolute_import, division, print_function

import argparse
import os

import torch
from tokenizers import Tokenizer
from transformers import (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

from linevul_main import convert_examples_to_features, set_seed, \
    get_all_lines_score, clean_special_token_values, get_word_att_scores
from linevul_model import Model

# Only use locally cached models/tokenizers; never contact the Hugging Face Hub.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
def test(args, model, tokenizer, inp):
    """
    Return the softmax probability for the function and the line scores.
    """
    model.to(args.device)

    label = torch.zeros(args.block_size, dtype=torch.long).to(args.device)  # dummy label, ignored at inference
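    # convert_examples_to_features (from linevul_main) is expected to tokenize the whole
    # function and pad/truncate it to exactly block_size token ids, so the model always
    # sees a fixed-length sequence.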
    with open(inp) as f:
        parsed_input = convert_examples_to_features(f.read(), label, tokenizer, args)
    parsed_input_ids = torch.tensor(parsed_input.input_ids,
                                    dtype=torch.long).unsqueeze(0)
    parsed_input_ids = parsed_input_ids.to(args.device)

    all_tokens = tokenizer.convert_ids_to_tokens(parsed_input.input_ids)
    all_tokens = [token.replace("Ġ", "") for token in all_tokens]   # strip the byte-level BPE space marker
    all_tokens = [token.replace("ĉ", "Ċ") for token in all_tokens]  # map the tab marker to the newline marker used for line splitting
    with torch.no_grad():
        prob, attentions = model(input_ids=parsed_input_ids, output_attentions=True)

    attentions = attentions[0][0]
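
    # model(..., output_attentions=True) returns one attention tensor per encoder layer, each
    # of shape (batch, num_heads, seq_len, seq_len); attentions[0][0] above keeps the first
    # layer's heads for the single function in the batch. The loop below sums, head by head,
    # the attention each token receives from every other token, giving one score per token.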
    attention = None
    # iterate over the attention heads of the first layer
    for i in range(len(attentions)):
        layer_attention = attentions[i]
        # sum the attention each token receives from all other tokens
        layer_attention = sum(layer_attention)
        if attention is None:
            attention = layer_attention
        else:
            attention += layer_attention
    # clean att scores for <s> and </s>
    attention = clean_special_token_values(attention, padding=True)
    # attention is now a 1D tensor of length seq_len, one attention value per token
    word_att_scores = get_word_att_scores(all_tokens=all_tokens, att_scores=attention)
    all_lines_score, flaw_line_indices = get_all_lines_score(word_att_scores, [])  # no known flaw lines at inference
    print(prob, all_lines_score)
    return prob, all_lines_score


def main():
    """
    For example:
    python3 classify.py \
        --input test.c --model_name=12heads_linevul_model.bin \
        --model_type=roberta \
        --tokenizer_name=microsoft/codebert-base \
        --model_name_or_path=microsoft/codebert-base \
        --block_size 512
    """
    parser = argparse.ArgumentParser()
    ## parameters
    parser.add_argument("--input", default=None, type=str, required=True,
                        help="The input C/C++ file.")
    parser.add_argument("--model_type", default="bert", type=str,
                        help="The model architecture to be fine-tuned.")
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization. "
                             "The training dataset will be truncated in blocks of this size for training. "
                             "Defaults to the model max input length for single sentence inputs (take into account special tokens).")
    parser.add_argument("--model_name", default="model.bin", type=str,
                        help="Saved model name.")
    parser.add_argument("--model_name_or_path", default=None, type=str,
                        help="The model checkpoint for weights initialization.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--use_non_pretrained_model", action='store_true', default=False,
                        help="Whether to use a non-pretrained model.")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument("--code_length", default=256, type=int,
                        help="Optional code input sequence length after tokenization.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    # number of attention heads
    parser.add_argument('--num_attention_heads', type=int, default=12,
                        help="number of attention heads used in CodeBERT")

    # word-level tokenizer
    parser.add_argument("--use_word_level_tokenizer", default=False, action='store_true',
                        help="Whether to use a word-level tokenizer.")
    # BPE non-pretrained tokenizer
    parser.add_argument("--use_non_pretrained_tokenizer", default=False, action='store_true',
                        help="Whether to use a non-pretrained BPE tokenizer.")
    args = parser.parse_args()
    # Setup CUDA, GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    config = RobertaConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
    config.num_labels = 1
    config.num_attention_heads = args.num_attention_heads
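
    # Three tokenizer options: a word-level tokenizer loaded from a local JSON file, a locally
    # trained BPE tokenizer (vocab + merges files), or, by default, the pretrained tokenizer
    # named by --tokenizer_name (microsoft/codebert-base in the example above).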
    if args.use_word_level_tokenizer:
        print('using wordlevel tokenizer!')
        tokenizer = Tokenizer.from_file('./word_level_tokenizer/wordlevel.json')
    elif args.use_non_pretrained_tokenizer:
        tokenizer = RobertaTokenizer(vocab_file="bpe_tokenizer/bpe_tokenizer-vocab.json",
                                     merges_file="bpe_tokenizer/bpe_tokenizer-merges.txt")
    else:
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
    if args.use_non_pretrained_model:
        model = RobertaForSequenceClassification(config=config)
    else:
        model = RobertaForSequenceClassification.from_pretrained(args.model_name_or_path, config=config,
                                                                 ignore_mismatched_sizes=True)
    model = Model(model, config, tokenizer, args)
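
    # The fine-tuned weights are expected under saved_models/checkpoint-best-f1/ relative to
    # the working directory; map_location lets a GPU-trained checkpoint load on a CPU-only
    # machine as well.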
    checkpoint = f'saved_models/checkpoint-best-f1/{args.model_name}'
    model.load_state_dict(torch.load(checkpoint, map_location=args.device))
    test(args, model, tokenizer, args.input)

    return 0


if __name__ == "__main__":
    main()