#!/bin/bash
# Download the MRQA v2 dev sets into the output directory given as the first argument.
set -e
OUTPUT=$1
mkdir -p "$OUTPUT"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/dev/SQuAD.jsonl.gz -O "$OUTPUT/SQuAD.jsonl.gz"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/dev/NewsQA.jsonl.gz -O "$OUTPUT/NewsQA.jsonl.gz"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/dev/TriviaQA-web.jsonl.gz -O "$OUTPUT/TriviaQA.jsonl.gz"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/dev/HotpotQA.jsonl.gz -O "$OUTPUT/HotpotQA.jsonl.gz"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/dev/NaturalQuestionsShort.jsonl.gz -O "$OUTPUT/NaturalQuestions.jsonl.gz"
#!/bin/bash
# Download the MRQA v2 training sets into the output directory given as the first argument.
set -e
OUTPUT=$1
mkdir -p "$OUTPUT"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/train/SQuAD.jsonl.gz -O "$OUTPUT/SQuAD.jsonl.gz"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/train/NewsQA.jsonl.gz -O "$OUTPUT/NewsQA.jsonl.gz"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/train/TriviaQA-web.jsonl.gz -O "$OUTPUT/TriviaQA.jsonl.gz"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/train/HotpotQA.jsonl.gz -O "$OUTPUT/HotpotQA.jsonl.gz"
wget https://s3.us-east-2.amazonaws.com/mrqa/release/v2/train/NaturalQuestionsShort.jsonl.gz -O "$OUTPUT/NaturalQuestions.jsonl.gz"
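The downloaded files are gzipped JSON Lines: the first record is a header describing the dataset, and every later record holds a context plus its "qas" list, which is exactly what read_examples below consumes. A quick sanity-check sketch (the path is hypothetical, assuming the dev script above wrote into ./dev):

import gzip
import json

# Hypothetical path: wherever the dev download script saved SQuAD.jsonl.gz.
path = "./dev/SQuAD.jsonl.gz"

with gzip.open(path, "rt", encoding="utf-8") as f:
    header = json.loads(next(f))   # first line is a header record, not an example
    record = json.loads(next(f))   # each later line has a "context" and a "qas" list
    print(header)
    print(record["context"][:200])
    print(record["qas"][0]["question"])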
import gzip

import json_lines

# `whitespace_tokenize` and `SquadExample` are assumed to be defined elsewhere,
# following the standard BERT SQuAD preprocessing utilities.


def read_examples(input_file, debug=False):
    # Read the gzipped MRQA jsonl file record by record.
    unproc_data = []
    with gzip.open(input_file, 'rt', encoding='utf-8') as f:  # open gzip file in text mode
        for item in json_lines.reader(f):
            unproc_data.append(item)

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    # Delete the header record (the first line only describes the dataset).
    unproc_data = unproc_data[1:]
    if debug:
        unproc_data = unproc_data[:100]

    ###################### Make Examples ######################
    examples = []
    skip_tags = ['<Table>', '<Tr>', '<Td>', '<Ol>', '<Ul>', '<Li>']
    for item in unproc_data:
        # In the case of the NaturalQuestions dataset, contexts containing HTML tags are excluded.
        context = item["context"]
        skip_flag = False
        for tag in skip_tags:
            if tag in context:
                skip_flag = True
                break
        if skip_flag:
            continue

        # 1. Get the context and split it into whitespace-delimited tokens,
        #    keeping a map from character offset to token index.
        paragraph_text = context.replace("[TLE]", "[SEP]")
        paragraph_text = paragraph_text.replace("[PAR]", "[SEP]")
        paragraph_text = paragraph_text.replace("[DOC]", "[SEP]")
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        # 2. Build one example per question-answer pair.
        for qa in item['qas']:
            qas_id = qa['qid']
            question_text = qa['question']
            # Only take the first detected answer.
            answer = qa['detected_answers'][0]
            orig_answer_text = answer['text']
            answer_offset = answer['char_spans'][0][0]
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            try:
                end_position = char_to_word_offset[answer_offset + answer_length - 1]
            except IndexError:
                print("invalid answer span. Exclude this example")
                continue

            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))
            if actual_text.find(cleaned_answer_text) == -1:
                continue

            example = SquadExample(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position)
            examples.append(example)
    return examples
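A minimal usage sketch (the path is hypothetical, and it assumes SquadExample stores its constructor arguments as attributes, as in the standard BERT SQuAD example class):

if __name__ == "__main__":
    # Hypothetical path: wherever the train download script saved SQuAD.jsonl.gz.
    examples = read_examples("./train/SQuAD.jsonl.gz", debug=True)
    print("number of examples:", len(examples))
    ex = examples[0]
    print(ex.question_text)
    # Reconstruct the answer span from the token positions computed above.
    print(" ".join(ex.doc_tokens[ex.start_position:ex.end_position + 1]))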