# ben0it8/download_imdb.py

## download_imdb.py
import os
import requests
import tarfile
from tqdm import tqdm

# Download URL of the imdb5k archive (a randomly picked subset of the original IMDB dataset).
IMDB_URL = "https://github.com/ben0it8/transformer-finetuning/raw/master/imdb5k.tar.gz"

# Root data directory, resolved against the current working directory.
DATA_DIR = os.path.abspath(os.path.join(os.curdir, 'data'))

# Path of the imdb5k dataset directory inside DATA_DIR.
IMDB_DIR = os.path.join(DATA_DIR, "imdb5k")

def download_url(url:str, dest:str, overwrite:bool=True, show_progress=True,
                 chunk_size=1024*1024, timeout=4, retries=5)->str:
    """Download `url` into directory `dest` unless the file exists and not `overwrite`.

    The target file name is the last path component of `url`.

    Args:
        url: Address of the file to fetch.
        dest: Directory in which the file is stored (it must already exist).
        overwrite: Re-download even when the target file is already present.
        show_progress: Show a tqdm progress bar; disabled automatically when the
            response carries no usable `Content-Length` header.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds handed to `requests`.
        retries: `max_retries` value of the HTTP adapter.

    Returns:
        Path of the downloaded (or already existing) file.

    Raises:
        requests.exceptions.HTTPError: The server answered with an error status.
        SystemExit: The connection broke while the body was being streamed.
    """
    dest = os.path.join(dest, os.path.basename(url))
    if os.path.exists(dest) and not overwrite:
        print(f"File {dest} already exists!")
        return dest

    with requests.Session() as s:
        # Mount the retrying adapter for both schemes; mounting it for
        # 'http://' only leaves https URLs without any retries.
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        s.mount('http://', adapter)
        s.mount('https://', adapter)
        with s.get(url, stream=True, timeout=timeout) as u:
            # Fail early instead of saving an HTTP error page as the download.
            u.raise_for_status()
            try:
                file_size = int(u.headers["Content-Length"])
            except (KeyError, ValueError):
                # Without a known size there is nothing to show progress against.
                show_progress = False
            print(f"Downloading {url}")
            pbar = tqdm(total=file_size, leave=False) if show_progress else None
            try:
                with open(dest, 'wb') as f:
                    for chunk in u.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                        # tqdm.update expects the increment, not the running total.
                        if pbar is not None:
                            pbar.update(len(chunk))
            except requests.exceptions.ConnectionError:
                print(f"Download failed after {retries} retries.")
                # Do not leave a truncated file behind: a later call with
                # overwrite=False would mistake it for a complete download.
                if os.path.exists(dest):
                    os.remove(dest)
                # Raised outside any `finally: return`, so the exit is no
                # longer silently swallowed.
                raise SystemExit(1)
            finally:
                if pbar is not None:
                    pbar.close()
    return dest

def untar(file_path, dest:str):
    """Untar `file_path` to `dest`.

    Args:
        file_path: Path of the tar archive; compression is detected by `tarfile`.
        dest: Directory that receives the extracted members.

    Raises:
        tarfile.TarError: The archive cannot be read, or (when extraction
            filters are available) a member is rejected by the 'data' filter.
    """
    print(f"Untar {os.path.basename(file_path)} to {dest}")
    with tarfile.open(file_path) as tf:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would be
            # written outside `dest` (absolute paths, '..', unsafe links).
            tf.extractall(path=str(dest), filter="data")
        else:
            # Older interpreters have no extraction filters.
            tf.extractall(path=str(dest))

# Fetch the imdb5k archive into /tmp, replacing any earlier download.
file_path = download_url(url=IMDB_URL, dest='/tmp', overwrite=True)

# Unpack the downloaded archive into DATA_DIR.
untar(file_path=file_path, dest=DATA_DIR)
	import os
	import requests
	import tarfile
	from tqdm import tqdm

	# Download URL of the imdb5k archive (a randomly picked subset of the original IMDB dataset).
	IMDB_URL = "https://github.com/ben0it8/transformer-finetuning/raw/master/imdb5k.tar.gz"

	# Root data directory, resolved against the current working directory.
	DATA_DIR = os.path.abspath(os.path.join(os.curdir, 'data'))

	# Path of the imdb5k dataset directory inside DATA_DIR.
	IMDB_DIR = os.path.join(DATA_DIR, "imdb5k")

	def download_url(url:str, dest:str, overwrite:bool=True, show_progress=True,
	                 chunk_size=1024*1024, timeout=4, retries=5)->str:
	    """Download `url` into directory `dest` unless the file exists and not `overwrite`.

	    The target file name is the last path component of `url`.

	    Args:
	        url: Address of the file to fetch.
	        dest: Directory in which the file is stored (it must already exist).
	        overwrite: Re-download even when the target file is already present.
	        show_progress: Show a tqdm progress bar; disabled automatically when the
	            response carries no usable `Content-Length` header.
	        chunk_size: Number of bytes requested per streamed chunk.
	        timeout: Timeout in seconds handed to `requests`.
	        retries: `max_retries` value of the HTTP adapter.

	    Returns:
	        Path of the downloaded (or already existing) file.

	    Raises:
	        requests.exceptions.HTTPError: The server answered with an error status.
	        SystemExit: The connection broke while the body was being streamed.
	    """
	    dest = os.path.join(dest, os.path.basename(url))
	    if os.path.exists(dest) and not overwrite:
	        print(f"File {dest} already exists!")
	        return dest

	    with requests.Session() as s:
	        # Mount the retrying adapter for both schemes; mounting it for
	        # 'http://' only leaves https URLs without any retries.
	        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
	        s.mount('http://', adapter)
	        s.mount('https://', adapter)
	        with s.get(url, stream=True, timeout=timeout) as u:
	            # Fail early instead of saving an HTTP error page as the download.
	            u.raise_for_status()
	            try:
	                file_size = int(u.headers["Content-Length"])
	            except (KeyError, ValueError):
	                # Without a known size there is nothing to show progress against.
	                show_progress = False
	            print(f"Downloading {url}")
	            pbar = tqdm(total=file_size, leave=False) if show_progress else None
	            try:
	                with open(dest, 'wb') as f:
	                    for chunk in u.iter_content(chunk_size=chunk_size):
	                        f.write(chunk)
	                        # tqdm.update expects the increment, not the running total.
	                        if pbar is not None:
	                            pbar.update(len(chunk))
	            except requests.exceptions.ConnectionError:
	                print(f"Download failed after {retries} retries.")
	                # Do not leave a truncated file behind: a later call with
	                # overwrite=False would mistake it for a complete download.
	                if os.path.exists(dest):
	                    os.remove(dest)
	                # Raised outside any `finally: return`, so the exit is no
	                # longer silently swallowed.
	                raise SystemExit(1)
	            finally:
	                if pbar is not None:
	                    pbar.close()
	    return dest

	def untar(file_path, dest:str):
	    """Untar `file_path` to `dest`.

	    Args:
	        file_path: Path of the tar archive; compression is detected by `tarfile`.
	        dest: Directory that receives the extracted members.

	    Raises:
	        tarfile.TarError: The archive cannot be read, or (when extraction
	            filters are available) a member is rejected by the 'data' filter.
	    """
	    print(f"Untar {os.path.basename(file_path)} to {dest}")
	    with tarfile.open(file_path) as tf:
	        if hasattr(tarfile, "data_filter"):
	            # The archive comes from the network: refuse members that would be
	            # written outside `dest` (absolute paths, '..', unsafe links).
	            tf.extractall(path=str(dest), filter="data")
	        else:
	            # Older interpreters have no extraction filters.
	            tf.extractall(path=str(dest))

	# Fetch the imdb5k archive into /tmp, replacing any earlier download.
	file_path = download_url(url=IMDB_URL, dest='/tmp', overwrite=True)

	# Unpack the downloaded archive into DATA_DIR.
	untar(file_path=file_path, dest=DATA_DIR)