Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script for downloading data of the GLUE benchmark (gluebenchmark.com)
''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
# Every task name accepted by --tasks, in the order they are processed for "all".
TASKS = [
    "CoLA",
    "SST",
    "MRPC",
    "QQP",
    "STS",
    "MNLI",
    "SNLI",
    "QNLI",
    "RTE",
    "WNLI",
    "diagnostic",
]

# Download URL for each task. Most entries are zip archives handled by
# download_and_extract(). Two are not archives:
#   * "MRPC" is the TSV of dev-set id pairs that format_mrpc() saves as
#     dev_ids.tsv.
#   * "diagnostic" is a bare TSV that download_diagnostic() saves as
#     diagnostic.tsv.
TASK2PATH = {
    "CoLA": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
    "SST": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
    "MRPC": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
    "QQP": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
    "STS": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
    "MNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
    "SNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
    "QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601',
    "RTE": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
    "WNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
    "diagnostic": 'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D',
}

# Raw MRPC train/test files, fetched by format_mrpc() when no local copy is given.
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
    """Download the zip archive for ``task`` and unpack it into ``data_dir``.

    The archive URL is looked up in ``TASK2PATH``. The zip is saved as
    ``<task>.zip`` in the current working directory, extracted, and then
    deleted again.

    Args:
        task: Key into ``TASK2PATH`` naming the archive to fetch.
        data_dir: Directory the archive contents are extracted into.

    Raises:
        KeyError: If ``task`` has no entry in ``TASK2PATH``.
    """
    print("Downloading and extracting %s..." % task)
    data_file = "%s.zip" % task
    url = TASK2PATH[task]
    try:
        urllib.request.urlretrieve(url, data_file)
        with zipfile.ZipFile(data_file) as zip_ref:
            zip_ref.extractall(data_dir)
    finally:
        # Clean up the temporary archive even if the download or extraction
        # failed, so a broken run does not leave a partial zip behind.
        if os.path.exists(data_file):
            os.remove(data_file)
    print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
    """Build the MRPC train/dev/test TSV files under ``<data_dir>/MRPC``.

    The raw ``msr_paraphrase_train.txt`` and ``msr_paraphrase_test.txt`` files
    are read from ``path_to_data`` when it is non-empty; otherwise they are
    downloaded from ``MRPC_TRAIN`` / ``MRPC_TEST`` into the MRPC directory.
    The list of dev-set id pairs is always downloaded from
    ``TASK2PATH["MRPC"]`` and saved as ``dev_ids.tsv``.

    Output files written to ``<data_dir>/MRPC``:
        * ``train.tsv`` / ``dev.tsv``: the raw training rows, split by whether
          the row's ``(id1, id2)`` pair is listed in ``dev_ids.tsv``. Both
          files start with the raw file's header line.
        * ``test.tsv``: the raw test rows with the label column replaced by a
          zero-based row index.

    Args:
        data_dir: Directory under which the ``MRPC`` folder is created.
        path_to_data: Directory holding locally extracted MRPC files, or an
            empty value to download them.

    Raises:
        AssertionError: If the train or test file is missing after the
            optional download step.
        ValueError: If a data row does not have exactly five tab-separated
            fields.
    """
    print("Processing MRPC...")
    mrpc_dir = os.path.join(data_dir, "MRPC")
    os.makedirs(mrpc_dir, exist_ok=True)

    if path_to_data:
        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
    else:
        print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
        urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
        urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file

    dev_ids_file = os.path.join(mrpc_dir, "dev_ids.tsv")
    urllib.request.urlretrieve(TASK2PATH["MRPC"], dev_ids_file)

    # A set of tuples gives constant-time membership checks in the split loop
    # below, instead of scanning a list of lists for every training row.
    with open(dev_ids_file, encoding="utf8") as ids_fh:
        dev_ids = {tuple(row.strip().split('\t')) for row in ids_fh}

    with open(mrpc_train_file, encoding="utf8") as data_fh, \
            open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
            open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split('\t')
            out_fh = dev_fh if (id1, id2) in dev_ids else train_fh
            out_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))

    with open(mrpc_test_file, encoding="utf8") as data_fh, \
            open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
        # Skip the raw header; the test file gets its own header with an
        # index column in place of the label.
        data_fh.readline()
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            _label, id1, id2, s1, s2 = row.strip().split('\t')
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
    print("\tCompleted!")
def download_diagnostic(data_dir):
    """Download the diagnostic TSV to ``<data_dir>/diagnostic/diagnostic.tsv``.

    The file is fetched from ``TASK2PATH["diagnostic"]``; the ``diagnostic``
    directory is created if it does not exist yet.

    Args:
        data_dir: Directory under which the ``diagnostic`` folder is created.
    """
    print("Downloading and extracting diagnostic...")
    diagnostic_dir = os.path.join(data_dir, "diagnostic")
    os.makedirs(diagnostic_dir, exist_ok=True)
    data_file = os.path.join(diagnostic_dir, "diagnostic.tsv")
    urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
    print("\tCompleted!")
def get_tasks(task_names):
    """Turn a comma-separated task string into a list of task names.

    Whitespace around each name is ignored, so ``"CoLA, SST"`` is accepted.

    Args:
        task_names: Comma-separated task names; each must be a member of
            ``TASKS``, or the special value ``"all"``.

    Returns:
        A new list with every entry of ``TASKS`` when ``"all"`` is among the
        names, otherwise the requested names in the order given.

    Raises:
        AssertionError: If a requested name is not in ``TASKS``.
    """
    names = [name.strip() for name in task_names.split(',')]
    if "all" in names:
        # Return a copy so callers cannot mutate the module-level TASKS list.
        return list(TASKS)
    tasks = []
    for task_name in names:
        assert task_name in TASKS, "Task %s not found!" % task_name
        tasks.append(task_name)
    return tasks
def main(arguments):
    """Parse command-line arguments and download the requested tasks.

    Args:
        arguments: Argument strings to parse, without the program name
            (e.g. ``sys.argv[1:]``).

    Dispatch per task: ``MRPC`` goes to ``format_mrpc``, ``diagnostic`` to
    ``download_diagnostic``, and every other task to ``download_and_extract``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
                        type=str, default='all')
    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
                        type=str, default='')
    args = parser.parse_args(arguments)

    # makedirs also handles a nested --data_dir whose parents do not exist yet.
    os.makedirs(args.data_dir, exist_ok=True)
    tasks = get_tasks(args.tasks)

    for task in tasks:
        if task == 'MRPC':
            format_mrpc(args.data_dir, args.path_to_mrpc)
        elif task == 'diagnostic':
            download_diagnostic(args.data_dir)
        else:
            download_and_extract(task, args.data_dir)
# Script entry point: run main() on the command-line arguments and use its
# return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
@W4ngatang

This comment has been minimized.

Copy link
Owner Author

W4ngatang commented May 8, 2018

Run with

python download_glue_data.py --data_dir glue_data --tasks all
@netik1020

This comment has been minimized.

Copy link

netik1020 commented Nov 3, 2018

Hi,

I got encoding errors in function format_mrpc().
Adding the parameter ' encoding="utf8" ' to all the calls to open a file resolves the issue.
Add the parameter to the file open calls at lines 68,72,73,74,85 and 86.
Thank You,
Netik

@pajoma

This comment has been minimized.

Copy link

pajoma commented Nov 7, 2018

Had the same issue as @netik1020, the UTF-8 method parameter worked for me.

I used the tensorflow docker image (based on ubuntu xenial).

@zkl99999

This comment has been minimized.

Copy link

zkl99999 commented Nov 9, 2018

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

lantern fanqiang, solve above

@Edward-Aidi

This comment has been minimized.

Copy link

Edward-Aidi commented Nov 9, 2018

      Hi,

I got encoding errors in function format_mrpc().
Adding the parameter ' encoding="utf8" ' to all the calls to open a file resolves the issue.
Add the parameter to the file open calls at lines 68,72,73,74,85 and 86.
Thank You,
Netik

Kudos Netik!

@tongmeihan1995

This comment has been minimized.

@dragonev

This comment has been minimized.

Copy link

dragonev commented Nov 14, 2018

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

lantern fanqiang, solve above

@ zkl99999 have you resolved this issue?

@calliwen

This comment has been minimized.

Copy link

calliwen commented Nov 22, 2018

I clicked the data-download link, and get the error message is:
{
"error": {
"code": 400,
"message": "Invalid HTTP method/URL pair."
}
}
It may be authorization-related, so I downloaded the data manually from https://gluebenchmark.com/tasks ;
And I processed above data by the instruction in the script description.

But I don't find the MRPC dataset's dev_id.tsv. So I can't run the Bert-base model with MRPC do_eval=true.

@111xumengze

This comment has been minimized.

Copy link

111xumengze commented Nov 29, 2018

UnicodeDecodeError: 'gbk' codec can't decode byte 0x99 in position 5357: illegal multibyte sequence...

@111xumengze

This comment has been minimized.

Copy link

111xumengze commented Nov 29, 2018

UnicodeDecodeError: 'gbk' codec can't decode byte 0x99 in position 5357: illegal multibyte sequence...

open(file_path) -> open(file_path, encoding='UTF-8')

@bjayakumar

This comment has been minimized.

Copy link

bjayakumar commented Nov 30, 2018

you should have written this is the script for py3.x

@Elfsong

This comment has been minimized.

Copy link

Elfsong commented Dec 19, 2018

Add encoding = "utf-8" to open function.

@willduan

This comment has been minimized.

Copy link

willduan commented Jan 4, 2019

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

lantern fanqiang, solve above

if you have shadowsocks, and the proxy port is 1080, you can just add the codes below:

import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile

proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'})
# construct a new opener using your proxy settings
opener = urllib.request.build_opener(proxy)
# install the opener at module level
urllib.request.install_opener(opener)

It will work fine~

@laibamehnaz

This comment has been minimized.

Copy link

laibamehnaz commented Jan 16, 2019

Hi,

I got encoding errors in function format_mrpc().
Adding the parameter ' encoding="utf8" ' to all the calls to open a file resolves the issue.
Add the parameter to the file open calls at lines 68,72,73,74,85 and 86.
Thank You,
Netik

Thanks for the help!!

@Lyrichu

This comment has been minimized.

Copy link

Lyrichu commented Jan 29, 2019

If you get the error "no module named request", replacing every urllib.request with urllib will fix it.

@W4ngatang

This comment has been minimized.

Copy link
Owner Author

W4ngatang commented Jan 30, 2019

Thanks to everyone for your interest and especially those who helped discover bugs.
I've updated the script to incorporate the feedback and to download our updated version of QNLI.
Important difference: SentEval is no longer hosting MRPC, so you'll need to download and extract MRPC from the source.
This script has only been tested on python 3.

@haoransh

This comment has been minimized.

Copy link

haoransh commented Feb 11, 2019

@W4ngatang I recommend you to use https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt and https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt for MRPC in this script. These two files are tokenized and the same as before. They can be downloaded by wget easily.

@W4ngatang

This comment has been minimized.

Copy link
Owner Author

W4ngatang commented Feb 12, 2019

Updated, thanks!

@chongtwo

This comment has been minimized.

Copy link

chongtwo commented Mar 25, 2019

I clicked the data-download link, and get the error message is:
{
"error": {
"code": 400,
"message": "Invalid HTTP method/URL pair."
}
}
It may be the authorized relatived. So I turned to download the data manually by the link: https://gluebenchmark.com/tasks ;
And I processed above data by the instruction in the script description.

But I don't find the MRPC dataset's dev_id.tsv. So I can't run the Bert-base model with MRPC do_eval=true.

@calliwen use the "--path_to_mrpc" to specify your local downloaded MRPC files' directory and the script will generate dev_id.tsv automatically

@Star-in-Sky

This comment has been minimized.

Copy link

Star-in-Sky commented Apr 9, 2019

在国内完全用不了啊````纠结

@parthsdoshi

This comment has been minimized.

Copy link

parthsdoshi commented Apr 21, 2019

Seems like the Firebase hosting has gone over the limit -- the script won't work till the person hosting the firebase instance either pays more or the author of the script changes the URLs.

@a-maci

This comment has been minimized.

Copy link

a-maci commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@kongya

This comment has been minimized.

Copy link

kongya commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@IDIDIR

This comment has been minimized.

Copy link

IDIDIR commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@sedelnik

This comment has been minimized.

Copy link

sedelnik commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@seaslee

This comment has been minimized.

Copy link

seaslee commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@helenalee1994

This comment has been minimized.

Copy link

helenalee1994 commented Apr 24, 2019

It works for me today.

@lotfipip

This comment has been minimized.

Copy link

lotfipip commented Jun 20, 2019

Please help me with an IndentationError at: sys.exit(main(sys.argv[1:]))
IndentationError: expected an indented block

@jack23curtin

This comment has been minimized.

Copy link

jack23curtin commented Jun 24, 2019

For me, I had to take out: encoding = "utf-8". I kept getting the error: TypeError: 'encoding' is an invalid keyword argument for this function.

Seem to work fine after that.

@seven-minutes

This comment has been minimized.

Copy link

seven-minutes commented Jun 27, 2019

urllib.error.URLError:
how to fix this problem???

@W4ngatang

This comment has been minimized.

Copy link
Owner Author

W4ngatang commented Jul 3, 2019

Hi sorry for the delayed response. This script is deprecated and you should refer instead to the script here in Jiant. Also, note that SuperGLUE has been released and is probably more interesting to try to improve.

@RamonYeung

This comment has been minimized.

Copy link

RamonYeung commented Aug 5, 2019

Work out of the box! Thanks.

@TankAbraham

This comment has been minimized.

Copy link

TankAbraham commented Sep 9, 2019

在国内完全用不了啊````纠结

是的呀,现在有解决方案了吗?
Yes, it is! Is there any solution for the network connection problems in China?

@awaemmanuel

This comment has been minimized.

Copy link

awaemmanuel commented Sep 22, 2019

I can confirm it all works today for me. Thank you :)

@WorldWarII

This comment has been minimized.

Copy link

WorldWarII commented Nov 11, 2019

在国内完全用不了啊````纠结

是的呀,现在有解决方案了吗?
Yes, it is! Is there any solution for the network connection problems in China?

+1,在国内因为网络原因用不了,报URLError,痛苦

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.