Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script for downloading data of the GLUE benchmark (gluebenchmark.com)
''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
# Names of every GLUE task this script knows how to fetch.
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"]

# Download URL for each task. Most entries are zip archives; "diagnostic" is a
# single TSV file.
TASK2PATH = {
    "CoLA": 'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
    "SST": 'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
    "QQP": 'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',
    "STS": 'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
    "MNLI": 'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
    "QNLI": 'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
    "RTE": 'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
    "WNLI": 'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
    "diagnostic": 'https://dl.fbaipublicfiles.com/glue/data/AX.tsv',
    # For MRPC this is NOT the corpus but the list of sentence-ID pairs that
    # make up the development set; format_mrpc() saves it as dev_ids.tsv and
    # uses it to split the training file. Community-hosted copy.
    "MRPC": 'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv',
}

# Raw MRPC train/test files, fetched when no local copy is supplied.
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
    ''' Download the zip archive registered for `task` and unpack it.

    Args:
        task: key into TASK2PATH whose value is the URL of a zip archive.
        data_dir: directory the archive contents are extracted into.

    Raises:
        KeyError: if `task` has no entry in TASK2PATH.
        Any error raised by the download or by zipfile is propagated.
    '''
    print("Downloading and extracting %s..." % task)
    if task == "MNLI":
        print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
    # Download to a private temporary file rather than "<task>.zip" in the
    # current directory, so a user's own file of that name is never
    # overwritten and then deleted.
    fd, data_file = tempfile.mkstemp(prefix="%s-" % task, suffix=".zip")
    os.close(fd)  # urlretrieve reopens the path itself
    try:
        urllib.request.urlretrieve(TASK2PATH[task], data_file)
        with zipfile.ZipFile(data_file) as zip_ref:
            zip_ref.extractall(data_dir)
    finally:
        # Remove the archive even when the download or extraction fails.
        os.remove(data_file)
    print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
    ''' Build the MRPC test/train/dev TSV files under <data_dir>/MRPC.

    Args:
        data_dir: existing directory; an "MRPC" subdirectory is created in it.
        path_to_data: directory that already holds msr_paraphrase_train.txt
            and msr_paraphrase_test.txt. If empty, both files are downloaded
            from MRPC_TRAIN / MRPC_TEST into the MRPC subdirectory.

    The function writes test.tsv first. It then downloads the list of
    development-set ID pairs from TASK2PATH["MRPC"] and uses it to split the
    training file into train.tsv and dev.tsv. If a download fails with an
    HTTP error (or no dev-ID URL is registered), a message is printed and the
    function returns early without raising.

    Raises:
        AssertionError: if the train or test source file does not exist.
    '''
    print("Processing MRPC...")
    mrpc_dir = os.path.join(data_dir, "MRPC")
    if not os.path.isdir(mrpc_dir):
        os.mkdir(mrpc_dir)

    if path_to_data:
        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
    else:
        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
        try:
            urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
            urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
        except urllib.error.HTTPError:
            print("Error downloading MRPC")
            return
    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file

    # Test split: drop the label column and prepend a running index.
    with open(mrpc_test_file, encoding='utf-8') as data_fh, \
            open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
        data_fh.readline()  # skip the source header; a new one is written below
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            _, id1, id2, s1, s2 = row.strip().split('\t')
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))

    dev_ids_file = os.path.join(mrpc_dir, "dev_ids.tsv")
    try:
        urllib.request.urlretrieve(TASK2PATH["MRPC"], dev_ids_file)
    except (KeyError, urllib.error.HTTPError):
        # A tuple is required here: `except A or B` only ever catches A.
        print("\tError downloading standard development IDs for MRPC. You will need to manually split your data.")
        return

    # Each line of the ID file is one tab-separated ID pair; a set of tuples
    # gives constant-time membership tests while splitting.
    with open(dev_ids_file, encoding='utf-8') as ids_fh:
        dev_ids = {tuple(row.strip().split('\t')) for row in ids_fh}

    with open(mrpc_train_file, encoding='utf-8') as data_fh, \
            open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
            open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split('\t')
            # Rows whose ID pair is listed go to dev, all others stay in train.
            out_fh = dev_fh if (id1, id2) in dev_ids else train_fh
            out_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
    print("\tCompleted!")
def download_diagnostic(data_dir):
    ''' Download the diagnostic TSV to <data_dir>/diagnostic/diagnostic.tsv.

    Args:
        data_dir: directory that will contain the "diagnostic" subdirectory;
            it is created (with any missing parents) if it does not exist.

    The file is fetched from TASK2PATH["diagnostic"]; download errors are
    propagated to the caller.
    '''
    print("Downloading and extracting diagnostic...")
    diagnostic_dir = os.path.join(data_dir, "diagnostic")
    # makedirs also copes with a missing data_dir, unlike a bare os.mkdir.
    os.makedirs(diagnostic_dir, exist_ok=True)
    data_file = os.path.join(diagnostic_dir, "diagnostic.tsv")
    urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
    print("\tCompleted!")
def get_tasks(task_names):
    ''' Turn a comma-separated string of task names into a list of tasks.

    Args:
        task_names: e.g. "CoLA,SST" or "all". Whitespace around each name is
            ignored, so "CoLA, SST" is accepted too.

    Returns:
        A new list: every entry of TASKS if "all" is among the names,
        otherwise the requested names in the order given.

    Raises:
        AssertionError: if a name is not in TASKS.
    '''
    names = [name.strip() for name in task_names.split(',')]
    if "all" in names:
        # Return a copy so callers cannot mutate the module-level list.
        return list(TASKS)
    for name in names:
        if name not in TASKS:
            # Raised explicitly (not via `assert`) so the check still runs
            # under `python -O`, while the exception type stays the same.
            raise AssertionError("Task %s not found!" % name)
    return names
def main(arguments):
    ''' Parse command-line arguments and download each requested task.

    Args:
        arguments: list of command-line argument strings, without the
            program name (e.g. sys.argv[1:]).

    The output directory is created if needed. MRPC and the diagnostic set
    have dedicated handlers; every other task is downloaded as a zip archive
    and extracted.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
                        type=str, default='all')
    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
                        type=str, default='')
    args = parser.parse_args(arguments)

    # makedirs so that a nested --data_dir (e.g. "a/b/glue") also works.
    os.makedirs(args.data_dir, exist_ok=True)

    for task in get_tasks(args.tasks):
        if task == 'MRPC':
            format_mrpc(args.data_dir, args.path_to_mrpc)
        elif task == 'diagnostic':
            download_diagnostic(args.data_dir)
        else:
            download_and_extract(task, args.data_dir)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
@JelovXCMS
Copy link

JelovXCMS commented Jan 29, 2021

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

@stevenwjy
Copy link

stevenwjy commented Feb 20, 2021

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

I managed to get the MRPC download completed by making the following three changes:

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)

@wangruinju
Copy link

wangruinju commented Mar 5, 2021

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

I managed to get the MRPC download completed by making the following three changes:

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)

It works for me. Thanks a lot!

@takaakiki
Copy link

takaakiki commented Mar 6, 2021

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

I managed to get the MRPC download completed by making the following three changes:

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)

Thanks!@stevenwjy

@shirl3yll
Copy link

shirl3yll commented Mar 11, 2021

Traceback (most recent call last):
File "E:/BERT/YYBert-master/download_glue_data.py", line 150, in
sys.exit(main(sys.argv[1:]))
File "E:/BERT/YYBert-master/download_glue_data.py", line 142, in main
format_mrpc(args.data_dir, args.path_to_mrpc)
File "E:/BERT/YYBert-master/download_glue_data.py", line 65, in format_mrpc
URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file)
AttributeError: 'NoneType' object has no attribute 'urlretrieve'

How to fix it....

@awshrishi
Copy link

awshrishi commented Mar 26, 2021

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

I managed to get the MRPC download completed by making the following three changes:

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)

Worked for me! Thanks. Why isn't the URLLIB declared? @W4ngatang

@iliasprc
Copy link

iliasprc commented Mar 31, 2021

Dear all, I get the following error when attempting to download the data: Downloading and extracting CoLA...
Traceback (most recent call last):
File "/mnt/784C5F3A4C5EF1FC/PROJECTS/MSC/reformer-pytorch/dev/download_glue_data.py", line 141, in
sys.exit(main(sys.argv[1:]))
File "/mnt/784C5F3A4C5EF1FC/PROJECTS/MSC/reformer-pytorch/dev/download_glue_data.py", line 137, in main
download_and_extract(task, args.data_dir)
File "/mnt/784C5F3A4C5EF1FC/PROJECTS/MSC/reformer-pytorch/dev/download_glue_data.py", line 47, in download_and_extract
urllib.request.urlretrieve(TASK2PATH[task], data_file)
File "/usr/lib/python3.6/urllib/request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/usr/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.6/urllib/request.py", line 570, in error
return self._call_chain(*args)
File "/usr/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/usr/lib/python3.6/urllib/request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

@Gforky
Copy link

Gforky commented Apr 1, 2021

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

I managed to get the MRPC download completed by making the following three changes:

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)

It works for me. Thanks a lot!

Works for me, thanks a lot.

@iEdric
Copy link

iEdric commented May 13, 2021

import io
URLLIB = urllib.request
'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)

@vlasenkoalexey
Copy link

vlasenkoalexey commented May 19, 2021

Here is fixed script in case anyone else needs it: https://gist.github.com/vlasenkoalexey/fef1601580f269eca73bf26a198595f3

@longerHost
Copy link

longerHost commented May 24, 2021

Here is fixed script in case anyone else needs it: https://gist.github.com/vlasenkoalexey/fef1601580f269eca73bf26a198595f3

Cool, this script works perfectly. Thanks.

@hrdxwandg
Copy link

hrdxwandg commented May 29, 2021

Here is fixed script in case anyone else needs it: https://gist.github.com/vlasenkoalexey/fef1601580f269eca73bf26a198595f3

thx

@wacharlin
Copy link

wacharlin commented Jun 6, 2021

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

I managed to get the MRPC download completed by making the following three changes:

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)

fine!

@RunxinXu
Copy link

RunxinXu commented Jun 11, 2021

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

I managed to get the MRPC download completed by making the following three changes:

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)

Thanks!

@pkuwj
Copy link

pkuwj commented Jul 9, 2021

@ztygreat
Copy link

ztygreat commented Jul 30, 2021

Got a NameError 'URLLIB' is not defined here:
Traceback (most recent call last): File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module> sys.exit(main(sys.argv[1:])) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main format_mrpc(args.data_dir, args.path_to_mrpc) File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) NameError: name 'URLLIB' is not defined

I managed to get the MRPC download completed by making the following three changes:

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)

Thanks!@stevenwjy

Thanks!

@raffaem
Copy link

raffaem commented Sep 14, 2021

"QQP":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
"STS":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip'
The urls of the two datasets are reversed.

Yes, you are right. Thanks!

@raffaem
Copy link

raffaem commented Sep 14, 2021

I'm getting the following warning:

Downloading and extracting MNLI...
	Note (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.

@raffaem
Copy link

raffaem commented Sep 14, 2021

Here is the script with all the fixes so far

@MeenalShah13
Copy link

MeenalShah13 commented Oct 11, 2021

I am getting the following error when trying to download MRPC files for BERT classifier:

Processing MRPC...
	Error downloading standard development IDs for MRPC. You will need to manually split your data.
***** Task data directory: glue_data/MRPC *****
msr_paraphrase_test.txt  msr_paraphrase_train.txt  test.tsv
***** Model output directory: gs://capstone-testing/bert-tfhub/models/MRPC *****

What is the best method to resolve this?

@brian-zZZ
Copy link

brian-zZZ commented May 21, 2022

  • import io
  • URLLIB = urllib.request
  • 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH

Thanks a lot! It works for me, too

@zoe9698
Copy link

zoe9698 commented Jul 18, 2022

I'm getting the following warning:

Downloading and extracting MNLI...
	Note (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.

+1

@astro-jon
Copy link

astro-jon commented Jul 22, 2022

I'm getting the following warning:

Downloading and extracting MNLI...
	Note (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.

+1

You should download the SNLI dataset manually, by https://nlp.stanford.edu/projects/snli/snli_1.0.zip

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment