Skip to content

Instantly share code, notes, and snippets.

@ben0it8
Last active July 18, 2019 13:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ben0it8/e0436befad3923f4911718796242c4f2 to your computer and use it in GitHub Desktop.
Save ben0it8/e0436befad3923f4911718796242c4f2 to your computer and use it in GitHub Desktop.
read imdb
import os
import requests
import tarfile
from tqdm import tqdm
# Absolute path of the local data directory (resolved against the current
# working directory at import time).
DATA_DIR = os.path.abspath('./data')

# Location of the "imdb5k" folder inside DATA_DIR.
IMDB_DIR = os.path.join(DATA_DIR, "imdb5k")

# Download address of the imdb5k archive (a randomly picked subset of the
# original IMDB dataset).
IMDB_URL = "https://github.com/ben0it8/transformer-finetuning/raw/master/imdb5k.tar.gz"
def download_url(url: str, dest: str, overwrite: bool = True, show_progress=True,
                 chunk_size=1024*1024, timeout=4, retries=5) -> str:
    """Download `url` into directory `dest` unless the file exists and not `overwrite`.

    The file is stored as ``<dest>/<basename of url>``.

    Args:
        url: Address of the file to fetch.
        dest: Directory the file is written into (it is not created here).
        overwrite: When False, an already present file is kept and no request
            is made.
        show_progress: Show a tqdm progress bar. It is switched off when the
            response carries no usable ``Content-Length`` header.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout passed to the HTTP request.
        retries: ``max_retries`` of the HTTP adapters mounted on the session.

    Returns:
        Path of the downloaded (or already existing) file.

    Raises:
        requests.exceptions.HTTPError: The server answered with an error status.
        SystemExit: The connection broke while the body was being streamed.
    """
    dest = os.path.join(dest, os.path.basename(url))
    if os.path.exists(dest) and not overwrite:
        print(f"File {dest} already exists!")
        return dest

    with requests.Session() as session:
        # Mount a retrying adapter for both schemes; with only 'http://'
        # mounted, https URLs would never use the requested retry count.
        session.mount('http://', requests.adapters.HTTPAdapter(max_retries=retries))
        session.mount('https://', requests.adapters.HTTPAdapter(max_retries=retries))

        response = session.get(url, stream=True, timeout=timeout)
        # Fail loudly instead of saving an HTTP error page as the payload.
        response.raise_for_status()

        try:
            file_size = int(response.headers["Content-Length"])
        except (KeyError, ValueError):
            # Without a known size there is nothing meaningful to display.
            file_size = None
            show_progress = False

        print(f"Downloading {url}")
        try:
            with open(dest, 'wb') as f:
                with tqdm(total=file_size, leave=False,
                          disable=not show_progress) as pbar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                        # tqdm.update() takes the increment, not a running total.
                        pbar.update(len(chunk))
        except requests.exceptions.ConnectionError as err:
            print(f"Download failed after {retries} retries.")
            # Equivalent to sys.exit(1). No `finally: return` follows, so the
            # exit (and any other error) is no longer silently discarded.
            raise SystemExit(1) from err

    return dest
def untar(file_path, dest: str):
    """Untar `file_path` to `dest` without letting members escape `dest`.

    Args:
        file_path: Path of the tar archive (any compression `tarfile.open`
            detects on its own).
        dest: Directory the archive content is extracted into.

    Raises:
        tarfile.TarError: The archive is unreadable, or one of its members
            would be written outside of `dest`.
    """
    print(f"Untar {os.path.basename(file_path)} to {dest}")
    dest = str(dest)
    with tarfile.open(file_path) as tf:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: the "data" filter rejects
            # members that would land outside `dest`.
            tf.extractall(path=dest, filter="data")
            return

        # Older interpreters: at least refuse member names that resolve to a
        # location outside of `dest` before extracting anything.
        root = os.path.realpath(dest)
        for member in tf.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if os.path.commonpath([root, target]) != root:
                raise tarfile.TarError(
                    f"Refusing to extract {member.name!r} outside of {dest}")
        tf.extractall(path=dest)
# Fetch the IMDB archive into /tmp, replacing any copy that is already there.
file_path = download_url(url=IMDB_URL, dest='/tmp', overwrite=True)

# Unpack the fetched archive into DATA_DIR.
untar(file_path=file_path, dest=DATA_DIR)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment