Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script for downloading data of the GLUE benchmark (gluebenchmark.com)
''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
# Every task name accepted by --tasks; requesting "all" selects this whole list.
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
# Download URL for each task name in TASKS.
# Most entries are zip archives consumed by download_and_extract(). Two are not:
#   - "MRPC" points at the dev-split id list (mrpc_dev_ids.tsv) that format_mrpc()
#     uses to carve a dev set out of the MRPC training file;
#   - "diagnostic" points at a single TSV (AX.tsv) fetched by download_diagnostic().
TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
"MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
"QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
"STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
"MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
"SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
"QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601',
"RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
"WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
"diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
# Raw MRPC train/test files; format_mrpc() downloads these when no local
# copy is supplied via --path_to_mrpc.
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
    """Download the zip archive for ``task`` and unpack it into ``data_dir``.

    The archive is saved as ``<task>.zip`` in the current working directory
    and removed again once extraction finishes, including when the download
    or the extraction fails part-way.

    Args:
        task: Key into TASK2PATH naming the archive to fetch.
        data_dir: Directory the archive contents are extracted into.

    Raises:
        KeyError: If ``task`` has no entry in TASK2PATH.
        urllib.error.URLError: If the download fails.
        zipfile.BadZipFile: If the downloaded file is not a valid zip.
    """
    print("Downloading and extracting %s..." % task)
    data_file = "%s.zip" % task
    try:
        urllib.request.urlretrieve(TASK2PATH[task], data_file)
        with zipfile.ZipFile(data_file) as zip_ref:
            zip_ref.extractall(data_dir)
    finally:
        # Never leave a partial or corrupt archive behind; the file may not
        # exist at all if the connection failed before anything was written.
        if os.path.isfile(data_file):
            os.remove(data_file)
    print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
    """Build train/dev/test TSV files for MRPC under ``<data_dir>/MRPC``.

    The raw MRPC train file is split into ``train.tsv`` and ``dev.tsv`` using
    the id pairs listed in the downloaded ``dev_ids.tsv``; the raw test file is
    rewritten as ``test.tsv`` with the label column replaced by a row index.

    Args:
        data_dir: Directory that receives the ``MRPC`` sub-directory.
        path_to_data: Directory already containing ``msr_paraphrase_train.txt``
            and ``msr_paraphrase_test.txt``. When empty, both files are
            downloaded from MRPC_TRAIN / MRPC_TEST into the MRPC directory.

    Raises:
        AssertionError: If either raw MRPC file cannot be found.
        ValueError: If a data row does not have exactly five tab-separated
            fields.
    """
    print("Processing MRPC...")
    mrpc_dir = os.path.join(data_dir, "MRPC")
    # makedirs so that a data_dir which does not exist yet is created as well.
    os.makedirs(mrpc_dir, exist_ok=True)

    if path_to_data:
        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
    else:
        print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
        urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
        urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file

    dev_ids_file = os.path.join(mrpc_dir, "dev_ids.tsv")
    urllib.request.urlretrieve(TASK2PATH["MRPC"], dev_ids_file)

    # A set of tuples gives constant-time membership checks for every train
    # row, instead of scanning a list of lists once per row.
    with open(dev_ids_file, encoding="utf8") as ids_fh:
        dev_ids = {tuple(row.strip().split('\t')) for row in ids_fh}

    with open(mrpc_train_file, encoding="utf8") as data_fh, \
            open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
            open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
        # Both splits keep the header line of the raw file.
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split('\t')
            out_fh = dev_fh if (id1, id2) in dev_ids else train_fh
            out_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))

    with open(mrpc_test_file, encoding="utf8") as data_fh, \
            open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
        # Skip the raw header; the test split gets its own header in which an
        # index column takes the place of the label.
        data_fh.readline()
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            _label, id1, id2, s1, s2 = row.strip().split('\t')
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
    print("\tCompleted!")
def download_diagnostic(data_dir):
    """Fetch the diagnostic TSV into ``<data_dir>/diagnostic/diagnostic.tsv``.

    The ``diagnostic`` sub-directory is created if it is not there yet.

    Args:
        data_dir: Existing directory that receives the ``diagnostic`` folder.
    """
    print("Downloading and extracting diagnostic...")
    diagnostic_dir = os.path.join(data_dir, "diagnostic")
    if not os.path.isdir(diagnostic_dir):
        os.mkdir(diagnostic_dir)
    destination = os.path.join(diagnostic_dir, "diagnostic.tsv")
    urllib.request.urlretrieve(TASK2PATH["diagnostic"], destination)
    print("\tCompleted!")
def get_tasks(task_names):
    """Turn a comma-separated task string into a list of task names.

    Whitespace around each name is ignored, so ``"CoLA, SST"`` is accepted.

    Args:
        task_names: Comma-separated task names, or a string containing the
            entry ``all`` to select every task in TASKS.

    Returns:
        A new list of task names, in the order given (or in TASKS order for
        ``all``). The module-level TASKS list is never returned directly, so
        callers may modify the result freely.

    Raises:
        AssertionError: If a name is not one of TASKS.
    """
    requested = [name.strip() for name in task_names.split(',')]
    if "all" in requested:
        return list(TASKS)
    for task_name in requested:
        if task_name not in TASKS:
            # Raised explicitly (not via ``assert``) so the check still runs
            # under ``python -O``; type and message are unchanged.
            raise AssertionError("Task %s not found!" % task_name)
    return requested
def main(arguments):
    """Parse command-line ``arguments`` and download the requested GLUE tasks.

    Args:
        arguments: List of command-line tokens, without the program name.

    Recognised options:
        --data_dir: directory to save data to (default ``glue_data``);
            created, including missing parent directories, if absent.
        --tasks: comma-separated task names, or ``all`` (default).
        --path_to_mrpc: directory holding already-extracted MRPC files; when
            empty the MRPC files are downloaded instead.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
                        type=str, default='all')
    parser.add_argument('--path_to_mrpc',
                        help='path to directory containing extracted MRPC data, '
                             'msr_paraphrase_train.txt and msr_paraphrase_test.txt',
                        type=str, default='')
    args = parser.parse_args(arguments)

    # makedirs handles nested paths such as "out/glue" that os.mkdir rejects.
    os.makedirs(args.data_dir, exist_ok=True)

    for task in get_tasks(args.tasks):
        if task == 'MRPC':
            # MRPC is assembled from raw files rather than shipped as a zip.
            format_mrpc(args.data_dir, args.path_to_mrpc)
        elif task == 'diagnostic':
            # The diagnostic set is a single TSV, not an archive.
            download_diagnostic(args.data_dir)
        else:
            download_and_extract(task, args.data_dir)
# Script entry point: forward the CLI tokens (minus the program name) to main()
# and use its return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
@W4ngatang

This comment has been minimized.

Copy link
Owner Author

@W4ngatang W4ngatang commented May 8, 2018

Run with

python download_glue_data.py --data_dir glue_data --tasks all
@netik1020

This comment has been minimized.

Copy link

@netik1020 netik1020 commented Nov 3, 2018

Hi,

I got encoding errors in function format_mrpc().
Adding the parameter ' encoding="utf8" ' to all the calls to open a file resolves the issue.
Add the parameter to the file open calls at lines 68,72,73,74,85 and 86.
Thank You,
Netik

@pajoma

This comment has been minimized.

Copy link

@pajoma pajoma commented Nov 7, 2018

Had the same issue as @netik1020, the UTF-8 method parameter worked for me.

I used the tensorflow docker image (based on ubuntu xenial).

@zkailinzhang

This comment has been minimized.

Copy link

@zkailinzhang zkailinzhang commented Nov 9, 2018

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

lantern fanqiang, solve above

@Edward-Aidi

This comment has been minimized.

Copy link

@Edward-Aidi Edward-Aidi commented Nov 9, 2018

      Hi,

I got encoding errors in function format_mrpc().
Adding the parameter ' encoding="utf8" ' to all the calls to open a file resolves the issue.
Add the parameter to the file open calls at lines 68,72,73,74,85 and 86.
Thank You,
Netik

Kudos Netik!

@tongmeihan1995

This comment has been minimized.

@dragonev

This comment has been minimized.

Copy link

@dragonev dragonev commented Nov 14, 2018

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

lantern fanqiang, solve above

@ zkl99999 have you resolved this issue?

@calliwen

This comment has been minimized.

Copy link

@calliwen calliwen commented Nov 22, 2018

I clicked the data-download link, and get the error message is:
{
"error": {
"code": 400,
"message": "Invalid HTTP method/URL pair."
}
}
It may be authorization-related, so I downloaded the data manually from https://gluebenchmark.com/tasks instead,
and processed it following the instructions in the script description.

But I don't find the MRPC dataset's dev_id.tsv. So I can't run the Bert-base model with MRPC do_eval=true.

@111xumengze

This comment has been minimized.

Copy link

@111xumengze 111xumengze commented Nov 29, 2018

UnicodeDecodeError: 'gbk' codec can't decode byte 0x99 in position 5357: illegal multibyte sequence...

@111xumengze

This comment has been minimized.

Copy link

@111xumengze 111xumengze commented Nov 29, 2018

UnicodeDecodeError: 'gbk' codec can't decode byte 0x99 in position 5357: illegal multibyte sequence...

open(file_path) -> open(file_path, encoding='UTF-8')

@bjayakumar

This comment has been minimized.

Copy link

@bjayakumar bjayakumar commented Nov 30, 2018

you should have written this is the script for py3.x

@Elfsong

This comment has been minimized.

Copy link

@Elfsong Elfsong commented Dec 19, 2018

Add encoding = "utf-8" to open function.

@willduan

This comment has been minimized.

Copy link

@willduan willduan commented Jan 4, 2019

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

lantern fanqiang, solve above

if you have shadowsocks, and the proxy port is 1080, you can just add the codes below:

import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile

proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'})
# construct a new opener using your proxy settings
opener = urllib.request.build_opener(proxy)
# install the opener at the module level
urllib.request.install_opener(opener)

It will work fine~

@laibamehnaz

This comment has been minimized.

Copy link

@laibamehnaz laibamehnaz commented Jan 16, 2019

Hi,

I got encoding errors in function format_mrpc().
Adding the parameter ' encoding="utf8" ' to all the calls to open a file resolves the issue.
Add the parameter to the file open calls at lines 68,72,73,74,85 and 86.
Thank You,
Netik

Thanks for the help!!

@Lyrichu

This comment has been minimized.

Copy link

@Lyrichu Lyrichu commented Jan 29, 2019

If you get the error "no module named request", replacing every urllib.request with urllib will fix it.

@W4ngatang

This comment has been minimized.

Copy link
Owner Author

@W4ngatang W4ngatang commented Jan 30, 2019

Thanks to everyone for your interest and especially those who helped discover bugs.
I've updated the script to incorporate the feedback and to download our updated version of QNLI.
Important difference: SentEval is no longer hosting MRPC, so you'll need to download and extract MRPC from the source.
This script has only been tested on python 3.

@haoransh

This comment has been minimized.

Copy link

@haoransh haoransh commented Feb 11, 2019

@W4ngatang I recommend you to use https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt and https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt for MRPC in this script. These two files are tokenized and the same as before. They can be downloaded by wget easily.

@W4ngatang

This comment has been minimized.

Copy link
Owner Author

@W4ngatang W4ngatang commented Feb 12, 2019

Updated, thanks!

@chongtwo

This comment has been minimized.

Copy link

@chongtwo chongtwo commented Mar 25, 2019

I clicked the data-download link, and get the error message is:
{
"error": {
"code": 400,
"message": "Invalid HTTP method/URL pair."
}
}
It may be the authorized relatived. So I turned to download the data manually by the link: https://gluebenchmark.com/tasks ;
And I processed above data by the instruction in the script description.

But I don't find the MRPC dataset's dev_id.tsv. So I can't run the Bert-base model with MRPC do_eval=true.

@calliwen use the "--path_to_mrpc" to specify your local downloaded MRPC files' directory and the script will generate dev_id.tsv automatically

@Star-in-Sky

This comment has been minimized.

Copy link

@Star-in-Sky Star-in-Sky commented Apr 9, 2019

在国内完全用不了啊````纠结

@parthsdoshi

This comment has been minimized.

Copy link

@parthsdoshi parthsdoshi commented Apr 21, 2019

Seems like the Firebase hosting has gone over the limit -- the script won't work till the person hosting the firebase instance either pays more or the author of the script changes the URLs.

@a-maci

This comment has been minimized.

Copy link

@a-maci a-maci commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@kongya

This comment has been minimized.

Copy link

@kongya kongya commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@IDIDIR

This comment has been minimized.

Copy link

@IDIDIR IDIDIR commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@sedelnik

This comment has been minimized.

Copy link

@sedelnik sedelnik commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@seaslee

This comment has been minimized.

Copy link

@seaslee seaslee commented Apr 22, 2019

Any resolution to the ^^above problem? The firebase hosting limit has exceeded.

@leehelenah

This comment has been minimized.

Copy link

@leehelenah leehelenah commented Apr 24, 2019

It works for me today.

@lotfipip

This comment has been minimized.

Copy link

@lotfipip lotfipip commented Jun 20, 2019

Please help me with an IndentationError at: sys.exit(main(sys.argv[1:]))
IndentationError: expected an indented block

@jack23curtin

This comment has been minimized.

Copy link

@jack23curtin jack23curtin commented Jun 24, 2019

For me, I had to take out: encoding = "utf-8". I kept getting the error: TypeError: 'encoding' is an invalid keyword argument for this function.

Seem to work fine after that.

@seven-minutes

This comment has been minimized.

Copy link

@seven-minutes seven-minutes commented Jun 27, 2019

urllib.error.URLError:
how to fix this problem???

@W4ngatang

This comment has been minimized.

Copy link
Owner Author

@W4ngatang W4ngatang commented Jul 3, 2019

Hi sorry for the delayed response. This script is deprecated and you should refer instead to the script here in Jiant. Also, note that SuperGLUE has been released and is probably more interesting to try to improve.

@RamonYeung

This comment has been minimized.

Copy link

@RamonYeung RamonYeung commented Aug 5, 2019

Work out of the box! Thanks.

@TankAbraham

This comment has been minimized.

Copy link

@TankAbraham TankAbraham commented Sep 9, 2019

在国内完全用不了啊````纠结

是的呀,现在有解决方案了吗?
Yes, it is! Is there any solution for the network connection problems in China?

@awaemmanuel

This comment has been minimized.

Copy link

@awaemmanuel awaemmanuel commented Sep 22, 2019

I can confirm it all works today for me. Thank you :)

@WorldWarII

This comment has been minimized.

Copy link

@WorldWarII WorldWarII commented Nov 11, 2019

在国内完全用不了啊````纠结

是的呀,现在有解决方案了吗?
Yes, it is! Is there any solution for the network connection problems in China?

+1,在国内因为网络原因用不了,报URLError,痛苦

@moonfansLTH

This comment has been minimized.

Copy link

@moonfansLTH moonfansLTH commented Dec 3, 2019

IOError: [Errno socket error] [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:727)

@atif93

This comment has been minimized.

Copy link

@atif93 atif93 commented Dec 3, 2019

IOError: [Errno socket error] [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:727)

Try changing https to http in all the URLs

@xixiaoyao

This comment has been minimized.

Copy link

@xixiaoyao xixiaoyao commented Dec 16, 2019

[test@yo-gpu-127-55-3-03 glue]$ python download.py --data_dir glue_data --tasks all
Downloading and extracting CoLA...
Traceback (most recent call last):
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/urllib/request.py", line 1318, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/http/client.py", line 1254, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/http/client.py", line 1300, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/http/client.py", line 1249, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/http/client.py", line 1036, in _send_output
self.send(msg)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/http/client.py", line 974, in send
self.connect()
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/http/client.py", line 1415, in connect
server_hostname=server_hostname)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/ssl.py", line 407, in wrap_socket
_context=self, _session=session)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/ssl.py", line 817, in init
self.do_handshake()
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/ssl.py", line 1077, in do_handshake
self._sslobj.do_handshake()
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/ssl.py", line 689, in do_handshake
self._sslobj.do_handshake()
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "download.py", line 139, in
sys.exit(main(sys.argv[1:]))
File "download.py", line 135, in main
download_and_extract(task, args.data_dir)
File "download.py", line 45, in download_and_extract
urllib.request.urlretrieve(TASK2PATH[task], data_file)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/urllib/request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/urllib/request.py", line 526, in open
response = self._open(req, data)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/urllib/request.py", line 544, in _open
'_open', req)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/urllib/request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/home/paddle/miniconda3/envs/solar/lib/python3.6/urllib/request.py", line 1320, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>

@Chapingaccount

This comment has been minimized.

Copy link

@Chapingaccount Chapingaccount commented Feb 24, 2020

ConnectionResetError: [Errno 104] Connection reset by peer
urllib.error.URLError: <urlopen error [Errno 104] Connection reset by peer>
lantern fanqiang, solve above

if you have shadowsocks, and the proxy port is 1080, you can just add the codes below:

import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile

proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'})
# construct a new opener using your proxy settings
opener = urllib.request.build_opener(proxy)
# install the openen on the module-level
urllib.request.install_opener(opener)

It will works fine~

hello! I have shadowsocks, and the proxy port is 1080, i tried what you said but still didn't work.

urllib.error.URLError: <urlopen error [Errno 61] Connection refused>

import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile

proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'})

# construct a new opener using your proxy settings

opener = urllib.request.build_opener(proxy)

# install the opener at the module level

urllib.request.install_opener(opener)

TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]

Can you tell me the reason? thanks:)

@ghwn

This comment has been minimized.

Copy link

@ghwn ghwn commented Apr 27, 2020

Thank you!

@vlall

This comment has been minimized.

Copy link

@vlall vlall commented Jul 15, 2020

IOError: [Errno socket error] [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:727)

Try changing https to http in all the URLs

Having the same issue. Changing it to http just gives me a 403 error urllib.error.HTTPError: HTTP Error 403: Forbidden

@vlall

This comment has been minimized.

Copy link

@vlall vlall commented Jul 15, 2020

IOError: [Errno socket error] [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:727)

Try changing https to http in all the URLs

Having the same issue. Changing it to http just gives me a 403 error urllib.error.HTTPError: HTTP Error 403: Forbidden

For any Mac Users experiencing this issue, see: https://stackoverflow.com/questions/50236117/scraping-ssl-certificate-verify-failed-error-for-http-en-wikipedia-org

@timpal0l

This comment has been minimized.

Copy link

@timpal0l timpal0l commented Aug 26, 2020

Where can I find the labels for the test.tsv for the STS-B Task? Thanks...

@W4ngatang

This comment has been minimized.

Copy link
Owner Author

@W4ngatang W4ngatang commented Aug 28, 2020

@offbye

This comment has been minimized.

Copy link

@offbye offbye commented Sep 2, 2020

If you are behind an HTTP proxy like me, just add the following code to solve it!
Enjoy!!

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:3128', 'https': '127.0.0.1:3128'})
# construct a new opener using your proxy settings
opener = urllib.request.build_opener(proxy)
# install the openen on the module-level
urllib.request.install_opener(opener)

@ruiqi-zhong

This comment has been minimized.

Copy link

@ruiqi-zhong ruiqi-zhong commented Oct 17, 2020

Does anyone know how to load the QQP training file? I ran this command:

df = pd.read_csv('glue_data/QQP/train.tsv', sep='\t', encoding = 'utf8')

but it gives me errors

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/rz/.local/lib/python3.5/site-packages/pandas/io/parsers.py", line 685, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/home/rz/.local/lib/python3.5/site-packages/pandas/io/parsers.py", line 463, in _read
    data = parser.read(nrows)
  File "/home/rz/.local/lib/python3.5/site-packages/pandas/io/parsers.py", line 1154, in read
    ret = self._engine.read(nrows)
  File "/home/rz/.local/lib/python3.5/site-packages/pandas/io/parsers.py", line 2059, in read
    data = self._reader.read(nrows)
  File "pandas/_libs/parsers.pyx", line 881, in pandas._libs.parsers.TextReader.read
  File "pandas/_libs/parsers.pyx", line 896, in pandas._libs.parsers.TextReader._read_low_memory
  File "pandas/_libs/parsers.pyx", line 950, in pandas._libs.parsers.TextReader._read_rows
  File "pandas/_libs/parsers.pyx", line 937, in pandas._libs.parsers.TextReader._tokenize_rows
  File "pandas/_libs/parsers.pyx", line 2132, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: Expected 6 fields in line 83032, saw 7

Thanks if anyone can help me with this!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.