Skip to content

Instantly share code, notes, and snippets.

Last active May 23, 2024 12:55
Show Gist options
  • Save W4ngatang/60c2bdb54d156a41194446737ce03e2e to your computer and use it in GitHub Desktop.
Save W4ngatang/60c2bdb54d156a41194446737ce03e2e to your computer and use it in GitHub Desktop.
Script for downloading data of the GLUE benchmark (
''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from ( and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
TASK2PATH = {"CoLA":'',
def download_and_extract(task, data_dir):
print("Downloading and extracting %s..." % task)
if task == "MNLI":
print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
data_file = "" % task
urllib.request.urlretrieve(TASK2PATH[task], data_file)
with zipfile.ZipFile(data_file) as zip_ref:
def format_mrpc(data_dir, path_to_data):
print("Processing MRPC...")
mrpc_dir = os.path.join(data_dir, "MRPC")
if not os.path.isdir(mrpc_dir):
if path_to_data:
mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file)
URLLIB.urlretrieve(MRPC_TEST, mrpc_test_file)
except urllib.error.HTTPError:
print("Error downloading MRPC")
assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
with, encoding='utf-8') as data_fh, \, "test.tsv"), 'w', encoding='utf-8') as test_fh:
header = data_fh.readline()
test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
for idx, row in enumerate(data_fh):
label, id1, id2, s1, s2 = row.strip().split('\t')
test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
URLLIB.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
except KeyError or urllib.error.HTTPError:
print("\tError downloading standard development IDs for MRPC. You will need to manually split your data.")
dev_ids = []
with, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
for row in ids_fh:
with, encoding='utf-8') as data_fh, \, "train.tsv"), 'w', encoding='utf-8') as train_fh, \, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
header = data_fh.readline()
for row in data_fh:
label, id1, id2, s1, s2 = row.strip().split('\t')
if [id1, id2] in dev_ids:
dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
def download_diagnostic(data_dir):
print("Downloading and extracting diagnostic...")
if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
os.mkdir(os.path.join(data_dir, "diagnostic"))
data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
def get_tasks(task_names):
task_names = task_names.split(',')
if "all" in task_names:
tasks = TASKS
tasks = []
for task_name in task_names:
assert task_name in TASKS, "Task %s not found!" % task_name
return tasks
def main(arguments):
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
type=str, default='all')
parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt',
type=str, default='')
args = parser.parse_args(arguments)
if not os.path.isdir(args.data_dir):
tasks = get_tasks(args.tasks)
for task in tasks:
if task == 'MRPC':
format_mrpc(args.data_dir, args.path_to_mrpc)
elif task == 'diagnostic':
download_and_extract(task, args.data_dir)
if __name__ == '__main__':
Copy link

syusyuu commented May 23, 2024

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

I managed to get the MRPC donwload completed by adding the following three lines:

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'' inside the TASK2PATH dict (creds to @laouer)

tanks a lot!!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment