Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script for downloading data of the GLUE benchmark (gluebenchmark.com)
''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
# Names accepted by --tasks (besides "all"); every name is also a key of TASK2PATH.
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
# Download URL for each task. Most entries point at a zip archive; the "MRPC"
# entry points at a TSV of dev-set ID pairs (mrpc_dev_ids.tsv) and the
# "diagnostic" entry at a single TSV file (AX.tsv).
TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
"MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
"QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
"STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
"MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
"SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
"QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601',
"RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
"WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
"diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
# Raw MRPC train/test text files; used by format_mrpc() when no local copy is supplied.
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
    """Download the zip archive for ``task`` and unpack it into ``data_dir``.

    The archive is fetched into a private temporary directory rather than the
    current working directory, so an unrelated ``<task>.zip`` that happens to
    live there is never overwritten or deleted, and the downloaded archive is
    removed even when the download or the extraction fails.

    Args:
        task: Key of ``TASK2PATH`` whose URL serves a zip archive.
        data_dir: Directory the archive contents are extracted into.

    Raises:
        KeyError: If ``task`` has no entry in ``TASK2PATH``.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.

    Errors raised by ``urllib.request.urlretrieve`` propagate unchanged.
    """
    print("Downloading and extracting %s..." % task)
    # The context manager deletes the directory (and the archive in it) on
    # both the success and the failure path.
    with tempfile.TemporaryDirectory() as tmp_dir:
        data_file = os.path.join(tmp_dir, "%s.zip" % task)
        urllib.request.urlretrieve(TASK2PATH[task], data_file)
        with zipfile.ZipFile(data_file) as zip_ref:
            zip_ref.extractall(data_dir)
    print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
    """Build the MRPC train/dev/test TSV files under ``<data_dir>/MRPC``.

    The raw ``msr_paraphrase_train.txt`` / ``msr_paraphrase_test.txt`` files
    are read from ``path_to_data`` when it is non-empty; otherwise they are
    downloaded from ``MRPC_TRAIN`` / ``MRPC_TEST`` into the MRPC directory.
    The list of dev-set ID pairs is always downloaded from
    ``TASK2PATH["MRPC"]`` to ``dev_ids.tsv``.

    Outputs (all UTF-8):
        train.tsv: rows of the raw train file whose (id1, id2) pair is not
            listed in ``dev_ids.tsv``, preceded by the raw header line.
        dev.tsv: rows of the raw train file whose (id1, id2) pair is listed
            in ``dev_ids.tsv``, preceded by the raw header line.
        test.tsv: rows of the raw test file with the label column replaced
            by a zero-based running index.

    Args:
        data_dir: Root output directory; ``MRPC`` is created inside it.
        path_to_data: Directory holding the raw MRPC text files, or an empty
            value to download them.

    Raises:
        AssertionError: If a raw train/test file is missing.
        ValueError: If a data row does not have exactly five tab-separated
            fields.
    """
    print("Processing MRPC...")
    mrpc_dir = os.path.join(data_dir, "MRPC")
    # makedirs also creates data_dir itself when the caller has not done so.
    os.makedirs(mrpc_dir, exist_ok=True)
    if path_to_data:
        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
    else:
        print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
        urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
        urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file

    dev_ids_file = os.path.join(mrpc_dir, "dev_ids.tsv")
    urllib.request.urlretrieve(TASK2PATH["MRPC"], dev_ids_file)
    # A set of tuples gives O(1) membership tests in the split loop below,
    # instead of scanning a list of lists once per training row.
    with open(dev_ids_file, encoding="utf8") as ids_fh:
        dev_ids = {tuple(row.strip().split('\t')) for row in ids_fh}

    with open(mrpc_train_file, encoding="utf8") as data_fh, \
            open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
            open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
        # Both splits share the raw file's header line.
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split('\t')
            out_fh = dev_fh if (id1, id2) in dev_ids else train_fh
            out_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))

    with open(mrpc_test_file, encoding="utf8") as data_fh, \
            open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
        data_fh.readline()  # Skip the raw header; a label-free one is written instead.
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            # The first field (label) is dropped from the test split.
            _, id1, id2, s1, s2 = row.strip().split('\t')
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
    print("\tCompleted!")
def download_diagnostic(data_dir):
    """Download the diagnostic TSV to ``<data_dir>/diagnostic/diagnostic.tsv``.

    The ``diagnostic`` sub-directory is created when it is missing; the file
    is fetched from ``TASK2PATH["diagnostic"]``. Returns ``None``.
    """
    print("Downloading and extracting diagnostic...")
    diagnostic_dir = os.path.join(data_dir, "diagnostic")
    if not os.path.isdir(diagnostic_dir):
        os.mkdir(diagnostic_dir)
    destination = os.path.join(diagnostic_dir, "diagnostic.tsv")
    urllib.request.urlretrieve(TASK2PATH["diagnostic"], destination)
    print("\tCompleted!")
def get_tasks(task_names):
    """Turn a comma-separated task string into a list of task names.

    Whitespace around each name is ignored, so ``"CoLA, SST"`` is accepted.
    If any entry is ``"all"``, every task in ``TASKS`` is returned (as a new
    list, so callers cannot mutate the module-level constant).

    Args:
        task_names: Comma-separated task names, or ``"all"``.

    Returns:
        The requested task names in the order given, or a copy of ``TASKS``.

    Raises:
        AssertionError: If a name is not in ``TASKS``. Raised explicitly
            rather than via ``assert`` so the check still runs under
            ``python -O``; the exception type is unchanged.
    """
    requested = [name.strip() for name in task_names.split(',')]
    if "all" in requested:
        return list(TASKS)
    for task_name in requested:
        if task_name not in TASKS:
            raise AssertionError("Task %s not found!" % task_name)
    return requested
def main(arguments):
    """Parse command-line ``arguments`` and download the requested GLUE tasks.

    Args:
        arguments: Argument strings without the program name
            (e.g. ``sys.argv[1:]``). Recognised options are ``--data_dir``,
            ``--tasks`` and ``--path_to_mrpc``.

    Returns:
        None, which ``sys.exit`` treats as a successful exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
                        type=str, default='all')
    # The help text previously named "msr_paraphrase_text.txt"; the file that
    # format_mrpc() actually reads is msr_paraphrase_test.txt.
    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
                        type=str, default='')
    args = parser.parse_args(arguments)

    # Validate the task list first so a typo does not leave an empty
    # directory behind.
    tasks = get_tasks(args.tasks)
    # makedirs handles nested paths such as "data/glue" and an existing dir.
    os.makedirs(args.data_dir, exist_ok=True)

    for task in tasks:
        if task == 'MRPC':
            format_mrpc(args.data_dir, args.path_to_mrpc)
        elif task == 'diagnostic':
            download_diagnostic(args.data_dir)
        else:
            download_and_extract(task, args.data_dir)
if __name__ == "__main__":
    # Run only when executed as a script: pass the CLI arguments (without the
    # program name) to main() and use its return value as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
@W4ngatang

This comment has been minimized.

Copy link
Owner Author

commented May 8, 2018

Run with

python download_glue_data.py --data_dir glue_data --tasks all
@netik1020

This comment has been minimized.

Copy link

commented Nov 3, 2018

Hi,

I got encoding errors in function format_mrpc().
Adding the parameter ' encoding="utf8" ' to all the calls to open a file resolves the issue.
Add the parameter to the file open calls at lines 68,72,73,74,85 and 86.
Thank You,
Netik

@pajoma

This comment has been minimized.

Copy link

commented Nov 7, 2018

Had the same issue as @netik1020, the UTF-8 method parameter worked for me.

I used the tensorflow docker image (based on ubuntu xenial).

@zkl99999

This comment has been minimized.

Copy link

commented Nov 9, 2018

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

lantern fanqiang, solve above

@Edward-Aidi

This comment has been minimized.

Copy link

commented Nov 9, 2018

      Hi,

I got encoding errors in function format_mrpc().
Adding the parameter ' encoding="utf8" ' to all the calls to open a file resolves the issue.
Add the parameter to the file open calls at lines 68,72,73,74,85 and 86.
Thank You,
Netik

Kudos Netik!

@tongmeihan1995

This comment has been minimized.

@dragonev

This comment has been minimized.

Copy link

commented Nov 14, 2018

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

lantern fanqiang, solve above

@ zkl99999 have you resolved this issue?

@calliwen

This comment has been minimized.

Copy link

commented Nov 22, 2018

I clicked the data-download link and got this error message:
{
"error": {
"code": 400,
"message": "Invalid HTTP method/URL pair."
}
}
It may be authorization-related, so I downloaded the data manually from https://gluebenchmark.com/tasks
and processed it by following the instructions in the script description.

But I don't find the MRPC dataset's dev_id.tsv. So I can't run the Bert-base model with MRPC do_eval=true.

@111xumengze

This comment has been minimized.

Copy link

commented Nov 29, 2018

UnicodeDecodeError: 'gbk' codec can't decode byte 0x99 in position 5357: illegal multibyte sequence...

@111xumengze

This comment has been minimized.

Copy link

commented Nov 29, 2018

UnicodeDecodeError: 'gbk' codec can't decode byte 0x99 in position 5357: illegal multibyte sequence...

open(file_path) -> open(file_path, encoding='UTF-8')

@bjayakumar

This comment has been minimized.

Copy link

commented Nov 30, 2018

You should have mentioned that this script is for Python 3.x.

@Elfsong

This comment has been minimized.

Copy link

commented Dec 19, 2018

Add encoding = "utf-8" to open function.

@willduan

This comment has been minimized.

Copy link

commented Jan 4, 2019

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

lantern fanqiang, solve above

if you have shadowsocks, and the proxy port is 1080, you can just add the codes below:

import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile

proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'})
# construct a new opener using your proxy settings
opener = urllib.request.build_opener(proxy)
# install the opener at the module level
urllib.request.install_opener(opener)

It will work fine~

@laibamehnaz

This comment has been minimized.

Copy link

commented Jan 16, 2019

Hi,

I got encoding errors in function format_mrpc().
Adding the parameter ' encoding="utf8" ' to all the calls to open a file resolves the issue.
Add the parameter to the file open calls at lines 68,72,73,74,85 and 86.
Thank You,
Netik

Thanks for the help!!

@Lyrichu

This comment has been minimized.

Copy link

commented Jan 29, 2019

If you get the error "no module named request", replacing every urllib.request with urllib will fix it.

@W4ngatang

This comment has been minimized.

Copy link
Owner Author

commented Jan 30, 2019

Thanks to everyone for your interest and especially those who helped discover bugs.
I've updated the script to incorporate the feedback and to download our updated version of QNLI.
Important difference: SentEval is no longer hosting MRPC, so you'll need to download and extract MRPC from the source.
This script has only been tested on python 3.

@haoransh

This comment has been minimized.

Copy link

commented Feb 11, 2019

@W4ngatang I recommend you to use https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt and https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt for MRPC in this script. These two files are tokenized and the same as before. They can be downloaded by wget easily.

@W4ngatang

This comment has been minimized.

Copy link
Owner Author

commented Feb 12, 2019

Updated, thanks!

@chongtwo

This comment has been minimized.

Copy link

commented Mar 25, 2019

I clicked the data-download link, and get the error message is:
{
"error": {
"code": 400,
"message": "Invalid HTTP method/URL pair."
}
}
It may be the authorized relatived. So I turned to download the data manually by the link: https://gluebenchmark.com/tasks ;
And I processed above data by the instruction in the script description.

But I don't find the MRPC dataset's dev_id.tsv. So I can't run the Bert-base model with MRPC do_eval=true.

@calliwen use the "--path_to_mrpc" to specify your local downloaded MRPC files' directory and the script will generate dev_id.tsv automatically

@Star-in-Sky

This comment has been minimized.

Copy link

commented Apr 9, 2019

在国内完全用不了啊````纠结

@parthsdoshi

This comment has been minimized.

Copy link

commented Apr 21, 2019

Seems like the Firebase hosting has gone over the limit -- the script won't work till the person hosting the firebase instance either pays more or the author of the script changes the URLs.

@a-maci

This comment has been minimized.

Copy link

commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@kongya

This comment has been minimized.

Copy link

commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@IDIDIR

This comment has been minimized.

Copy link

commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@sedelnik

This comment has been minimized.

Copy link

commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@seaslee

This comment has been minimized.

Copy link

commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@helenalee1994

This comment has been minimized.

Copy link

commented Apr 24, 2019

It works for me today.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.