Last active
July 18, 2019 13:55
-
-
Save ben0it8/e0436befad3923f4911718796242c4f2 to your computer and use it in GitHub Desktop.
read imdb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
import tarfile | |
from tqdm import tqdm | |
# Root folder for all datasets, resolved against the current working directory.
DATA_DIR = os.path.abspath("data")

# Folder inside DATA_DIR that holds the IMDB subset.
IMDB_DIR = os.path.join(DATA_DIR, "imdb5k")

# Archive location of imdb5k (a randomly picked subset of the original IMDB dataset).
IMDB_URL = (
    "https://github.com/ben0it8/transformer-finetuning"
    "/raw/master/imdb5k.tar.gz"
)
def download_url(url: str, dest: str, overwrite: bool = True, show_progress=True,
                 chunk_size=1024*1024, timeout=4, retries=5) -> str:
    """Download `url` into directory `dest` unless the file exists and not `overwrite`.

    The file is stored as ``os.path.join(dest, os.path.basename(url))``.

    Args:
        url: Address of the file to fetch.
        dest: Directory the file is written to (created if missing).
        overwrite: When False and the target file already exists, skip the
            download and return the existing path.
        show_progress: Show a tqdm progress bar; silently disabled when the
            server does not send a usable ``Content-Length`` header.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds passed to ``requests``.
        retries: Maximum connection retries, applied to http and https.

    Returns:
        Path of the downloaded (or already existing) file.

    Raises:
        SystemExit: With status 1 when the connection fails.
        requests.exceptions.HTTPError: When the server answers with an
            error status.
    """
    dest_dir = dest
    dest = os.path.join(dest_dir, os.path.basename(url))
    if os.path.exists(dest) and not overwrite:
        print(f"File {dest} already exists!")
        return dest

    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    try:
        with requests.Session() as s:
            # Mount the retrying adapter for both schemes; mounting it on
            # 'http://' only leaves https URLs without retries.
            adapter = requests.adapters.HTTPAdapter(max_retries=retries)
            s.mount('http://', adapter)
            s.mount('https://', adapter)
            with s.get(url, stream=True, timeout=timeout) as u:
                # Do not store an error page as if it were the payload.
                u.raise_for_status()
                try:
                    file_size = int(u.headers["Content-Length"])
                except (KeyError, ValueError):
                    show_progress = False
                print(f"Downloading {url}")
                pbar = tqdm(total=file_size, leave=False) if show_progress else None
                try:
                    with open(dest, 'wb') as f:
                        for chunk in u.iter_content(chunk_size=chunk_size):
                            f.write(chunk)
                            # tqdm.update expects the increment, not the running total.
                            if pbar is not None:
                                pbar.update(len(chunk))
                except BaseException:
                    # Never leave a truncated file behind: a later call with
                    # overwrite=False would mistake it for a complete download.
                    try:
                        os.remove(dest)
                    except OSError:
                        pass
                    raise
                finally:
                    if pbar is not None:
                        pbar.close()
    except requests.exceptions.ConnectionError:
        print(f"Download failed after {retries} retries.")
        import sys
        sys.exit(1)
    return dest
def untar(file_path, dest: str):
    """Untar `file_path` to `dest`, refusing members that would land outside `dest`.

    Args:
        file_path: Path of the tar archive (any compression `tarfile.open`
            detects automatically).
        dest: Directory the archive is extracted into.

    Raises:
        tarfile.TarError: When the archive cannot be read or contains a
            member whose path escapes `dest` (e.g. ``../x`` or an absolute
            path on interpreters without extraction filters).
    """
    print(f"Untar {os.path.basename(file_path)} to {dest}")
    dest = str(dest)
    with tarfile.open(file_path) as tf:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: let the stdlib reject
            # path traversal, unsafe links and special files.
            tf.extractall(path=dest, filter="data")
            return
        # Older interpreters: validate every member before extracting anything.
        root = os.path.realpath(dest)
        for member in tf.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if os.path.commonpath([root, target]) != root:
                raise tarfile.TarError(
                    f"Refusing to extract {member.name!r} outside of {dest}")
        tf.extractall(path=dest)
# Fetch the imdb5k archive into /tmp, always re-downloading it.
file_path = download_url(url=IMDB_URL, dest='/tmp', overwrite=True)

# Unpack the fetched archive below DATA_DIR.
untar(file_path=file_path, dest=DATA_DIR)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment