iNLyze/Report_Load_Save.ipynb

## Report_Load_Save.ipynb
## Dependencies

import sys, os
sys.path.append('/modules')
print(sys.path)
%load_ext autoreload
%autoreload 2
%matplotlib inline
from jupyterthemes import jtplot
jtplot.style(theme='chesterish', grid=False, ticks=False, )
import matplotlib
matplotlib.rcParams['image.interpolation'] = None
# Set ENV for PyTorch
os.environ["TORCH_HOME"] = "/modules/.torch"
os.environ["TORCH_MODEL_ZOO"] = "/modules/pretrnd_mdls_pytorch/"

from fastai.conv_learner import *
from fastai.dataset import *
from fastai.models import *
from fastai.transforms import *
from fastai.plots import *

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from scipy.stats import describe

import matplotlib.pylab as plt

## Utility Code

def savename(learn,proj_name=''):
    """Compose, print and return a checkpoint file name for a learner.

    The name is built from the model name, the data size, the scheduler's
    current epoch and the smallest recorded validation loss (rounded to
    three decimals).

    Arguments:
        learn: learner exposing `models.name`, `data.sz`, `sched.epoch` and
            `sched.val_losses`.
        proj_name: currently unused; kept for interface compatibility.
    Returns:
        The composed file name as a string.
    """
    best_val_loss = np.round(np.array(learn.sched.val_losses).min(), 3)
    fname = f'{learn.models.name}-RGB-{learn.data.sz}-epoch_{learn.sched.epoch}-val_loss_{best_val_loss}'
    print(f'File name for saving: {fname}')
    return fname

def parse_csv_labels(fn, skip_header=True, cat_separator = ' '):
    """Parse filenames and label lists from a CSV file.

    The first column of the CSV at :fn: is used as the index (the file
    names); the next column holds the labels of each file, joined by
    :cat_separator:. If the file has a header row, :skip_header: should be
    set to True.

    Arguments:
        fn: Path to a CSV file (anything `pd.read_csv` accepts).
        skip_header: A boolean flag indicating whether the first row is a
            header rather than data.
        cat_separator: the separator between labels in the label column.
    Returns:
        a two-tuple of (
            sorted image filenames,
            a dictionary mapping each filename to its list of labels
        )
    """
    df = pd.read_csv(fn, index_col=0, header=0 if skip_header else None, dtype=str)
    # Split into a new Series instead of writing Python lists back into the
    # string-typed column; the frame itself is left untouched.
    labels = df.iloc[:, 0].str.split(cat_separator)
    return sorted(df.index.values), labels.to_dict()

def csv_source(folder, csv_file, skip_header=True, suffix='', continuous=False, **kwargs):
    """Read labels from a CSV file and turn them into a `dict_source` result.

    Arguments:
        folder: image folder, forwarded to `dict_source`.
        csv_file: CSV file handed to `parse_csv_labels`.
        skip_header: forwarded to `parse_csv_labels`.
        suffix: forwarded to `dict_source`.
        continuous: forwarded to `dict_source`.
        **kwargs: extra keyword arguments for `parse_csv_labels`
            (e.g. `cat_separator`).
    Returns:
        Whatever `dict_source` returns for the parsed names and labels.
    """
    parsed = parse_csv_labels(csv_file, skip_header, **kwargs)
    image_names, label_map = parsed
    return dict_source(folder, image_names, label_map, suffix, continuous)

class ImageClassifierData_sep(ImageClassifierData):
    """ImageClassifierData whose `from_csv` forwards extra keyword arguments
    (such as `cat_separator`) to the CSV label parser."""

    @classmethod
    def from_csv(cls, path, folder, csv_fname, bs=64, tfms=(None,None),
                 val_idxs=None, suffix='', test_name=None, continuous=False,
                 skip_header=True, num_workers=8, **kwargs):
        """Read in images and their labels given as a CSV file.

        Use this when the training labels live in a CSV file rather than in
        sub-directories named after the labels.

        Arguments:
            path: a root path of the data (used for storing trained models, precomputed values, etc)
            folder: a name of the folder in which training images are contained; must be relative.
            csv_fname: a name of the CSV file which contains target labels.
            bs: batch size
            tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`;
                both entries must be non-None.
            val_idxs: index of images to be used for validation. e.g. output of `get_cv_idxs`.
            suffix: suffix to add to image names in CSV file (e.g. '.jpg' when the CSV
                lists names without extension)
            test_name: a name of the folder which contains test images.
            continuous: forwarded unchanged to `csv_source` and `from_names_and_array`.
            skip_header: skip the first row of the CSV file.
            num_workers: number of workers
            **kwargs: extra keyword arguments forwarded to `csv_source`.
        Returns:
            The result of `cls.from_names_and_array`.
        """
        assert tfms[0] is not None and tfms[1] is not None, "please provide transformations for your train and validation sets"
        assert not os.path.isabs(folder), "folder needs to be a relative path"
        source = csv_source(folder, csv_fname, skip_header, suffix, continuous=continuous, **kwargs)
        fnames, y, classes = source
        return cls.from_names_and_array(
            path, fnames, y, classes, val_idxs, test_name,
            num_workers=num_workers, suffix=suffix, tfms=tfms, bs=bs, continuous=continuous)

def evaluate(is_test=False):
    """Run TTA on the module-level `learn`, report accuracy and plot a confusion matrix.

    Relies on the globals `learn` (the learner) and `data` (for `data.classes`).

    Arguments:
        is_test: forwarded to `learn.TTA`.
    Returns:
        a three-tuple of (predicted class indices, targets returned by TTA,
        averaged probabilities).
    """
    tta_log_preds, targets = learn.TTA(is_test=is_test)
    # Exponentiate the log-predictions and average along axis 0
    # (presumably the TTA-augmentation axis).
    mean_probs = np.mean(np.exp(tta_log_preds), 0)
    tta_acc = accuracy_np(mean_probs, targets)
    print(f'TTA accuracy: {tta_acc}')
    predicted = np.argmax(mean_probs, axis=1)
    plot_confusion_matrix(confusion_matrix(targets, predicted), data.classes)
    return predicted, targets, mean_probs


## Path

# Root data directory; also passed as `path` to ImageClassifierData_sep.from_csv.
PATH = Path('./data')
# CSV file with the training labels, located inside PATH.
LABELS = Path(PATH/'training_upload.csv')

## 128x128

# Settings for the first stage: learning rate, image size, batch size and
# `ps` (passed to ConvLearner.pretrained; presumably the dropout probability).
lr=1e-4
sz=128
bs=64
ps=0.3

# Architecture handed to tfms_from_model and ConvLearner.pretrained.
arch = resnet34
#aug_tfms = transforms_top_down
# Augmentations given to tfms_from_model through its `aug_tfms` argument.
aug_tfms = [RandomRotateZoom(20, 2.0, 0.15, ps=[0.4, 0.3, 0.1, 0.2]), RandomLighting(0.1, 0.1), RandomDihedral()]
def get_data(sz, bs=bs):
    """Build the dataset from the 'train_rgb_512' / 'valid_rgb_512' folders.

    Uses the module-level `arch`, `aug_tfms`, `PATH` and `LABELS`. Labels in
    the CSV are split on ','.

    Arguments:
        sz: image size given to `tfms_from_model`.
        bs: batch size; the default is the value of the global `bs` at the
            time this function was defined.
    Returns:
        The object returned by `ImageClassifierData_sep.from_csv`.
    """
    return ImageClassifierData_sep.from_csv(
        PATH, 'train_rgb_512',
        csv_fname=LABELS,
        tfms=tfms_from_model(arch, sz, aug_tfms=aug_tfms),
        suffix='.png', bs=bs, cat_separator=',', test_name='valid_rgb_512')

# Build the 128px dataset and a pretrained learner on top of it.
data = get_data(sz)
learn = ConvLearner.pretrained(arch, data, ps=ps, precompute=True)
# Keep a handle on the training set's denorm function.
denorm = data.trn_ds.denorm

# Three successive fits with the scalar learning rate.
learn.fit(lr, 1, cycle_len=1, cycle_mult=2)

learn.fit(lr, 2, cycle_len=1, cycle_mult=2)

learn.fit(lr, 20)

# Checkpoint the 128px stage and immediately load it back.
learn.save('rep_001')

learn.load('rep_001')

## 256x256

sz = 256

# Swap in the 256px dataset on the existing learner, then freeze before fitting.
learn.set_data(get_data(sz, bs=64))
learn.freeze()

learn.fit(lr, 3, cycle_len=1, cycle_mult=3)

# List of three learning rates (presumably one per layer group), smallest first.
lrs = [lr/9, lr/3, lr]
learn.unfreeze()

learn.fit(lrs, 3, cycle_len=1, cycle_mult=3)

learn.save('rep_002')

## 512x512

sz=512

# Swap in the 512px dataset and freeze again.
learn.set_data(get_data(sz, bs=64))
learn.freeze()

# Note: the list `lrs` (not the scalar `lr`) is used here even though the learner was just frozen.
learn.fit(lrs, 3, cycle_len=1, cycle_mult=3)

learn.unfreeze()

learn.fit(lrs, 1, cycle_len=1, cycle_mult=3)

learn.save('rep_003')

## 1024x1024

# Settings for the 1024px stage. `get_data` is redefined right after this
# section with `bs=bs` as its default, so that default picks up the 24 set here.
sz = 1024
bs = 24
ps = 0.3

arch = resnet34
# Same augmentation list as before except RandomRotateZoom's second argument (6.0 instead of 2.0).
aug_tfms = [RandomRotateZoom(20, 6.0, 0.15, ps=[0.4, 0.3, 0.1, 0.2]), RandomLighting(0.1, 0.1), RandomDihedral()]
def get_data(sz, bs=bs):
    """Build the dataset from the 'train_rgb' / 'valid_rgb' folders.

    Uses the module-level `arch`, `aug_tfms`, `PATH` and `LABELS`. Labels in
    the CSV are split on ','.

    Arguments:
        sz: image size given to `tfms_from_model`.
        bs: batch size; the default is the value of the global `bs` at the
            time this function was defined.
    Returns:
        The object returned by `ImageClassifierData_sep.from_csv`.
    """
    return ImageClassifierData_sep.from_csv(
        PATH, 'train_rgb',
        csv_fname=LABELS,
        tfms=tfms_from_model(arch, sz, aug_tfms=aug_tfms),
        suffix='.png', bs=bs, cat_separator=',', test_name='valid_rgb')

# Build the 1024px dataset with the redefined get_data.
data = get_data(sz)
denorm = data.trn_ds.denorm

# A new learner is created here; no earlier checkpoint is loaded in this section.
learn = ConvLearner.pretrained(arch, data, ps=ps, precompute=True)


learn.fit(lr, 1)

learn.unfreeze()

# NOTE(review): the divisors are 0.9 and 0.3 here, whereas the 256px stage used
# [lr/9, lr/3, lr]; as written the first two rates are larger than `lr`.
# Possibly a typo - confirm before relying on these values.
lrs = [lr/0.9, lr/0.3, lr]

learn.fit(lrs, 1)

learn.save('rep_004')

## Now restart the notebook, go to the 1024x1024 section, re-initialize the learner and try to load the weights here:

learn.load('rep_004')
	## Dependencies

	import sys, os
	sys.path.append('/modules')
	print(sys.path)
	%load_ext autoreload
	%autoreload 2
	%matplotlib inline
	from jupyterthemes import jtplot
	jtplot.style(theme='chesterish', grid=False, ticks=False, )
	import matplotlib
	matplotlib.rcParams['image.interpolation'] = None
	# Set ENV for PyTorch
	os.environ["TORCH_HOME"] = "/modules/.torch"
	os.environ["TORCH_MODEL_ZOO"] = "/modules/pretrnd_mdls_pytorch/"

	from fastai.conv_learner import *
	from fastai.dataset import *
	from fastai.models import *
	from fastai.transforms import *
	from fastai.plots import *

	from sklearn.preprocessing import LabelEncoder
	from sklearn.metrics import confusion_matrix
	from scipy.stats import describe

	import matplotlib.pylab as plt

	## Utility Code

	def savename(learn,proj_name=''):
	    """Compose, print and return a checkpoint file name for a learner.

	    The name is built from the model name, the data size, the scheduler's
	    current epoch and the smallest recorded validation loss (rounded to
	    three decimals).

	    Arguments:
	        learn: learner exposing `models.name`, `data.sz`, `sched.epoch` and
	            `sched.val_losses`.
	        proj_name: currently unused; kept for interface compatibility.
	    Returns:
	        The composed file name as a string.
	    """
	    out = f'{learn.models.name}-RGB-{learn.data.sz}-epoch_{learn.sched.epoch}-val_loss_{np.round(np.array(learn.sched.val_losses).min(), 3)}'
	    print(f'File name for saving: {out}')
	    return out

	def parse_csv_labels(fn, skip_header=True, cat_separator = ' '):
	    """Parse filenames and label lists from a CSV file.

	    The first column of the CSV at :fn: is used as the index (the file
	    names); the next column holds the labels of each file, joined by
	    :cat_separator:. If the file has a header row, :skip_header: should be
	    set to True.

	    Arguments:
	        fn: Path to a CSV file (anything `pd.read_csv` accepts).
	        skip_header: A boolean flag indicating whether the first row is a
	            header rather than data.
	        cat_separator: the separator between labels in the label column.
	    Returns:
	        a two-tuple of (
	            sorted image filenames,
	            a dictionary mapping each filename to its list of labels
	        )
	    """
	    df = pd.read_csv(fn, index_col=0, header=0 if skip_header else None, dtype=str)
	    # Split into a new Series instead of writing Python lists back into the
	    # string-typed column; the frame itself is left untouched.
	    labels = df.iloc[:, 0].str.split(cat_separator)
	    return sorted(df.index.values), labels.to_dict()

	def csv_source(folder, csv_file, skip_header=True, suffix='', continuous=False, **kwargs):
	    """Read labels from a CSV file and turn them into a `dict_source` result.

	    Arguments:
	        folder: image folder, forwarded to `dict_source`.
	        csv_file: CSV file handed to `parse_csv_labels`.
	        skip_header: forwarded to `parse_csv_labels`.
	        suffix: forwarded to `dict_source`.
	        continuous: forwarded to `dict_source`.
	        **kwargs: extra keyword arguments for `parse_csv_labels`
	            (e.g. `cat_separator`).
	    Returns:
	        Whatever `dict_source` returns for the parsed names and labels.
	    """
	    fnames,csv_labels = parse_csv_labels(csv_file, skip_header, **kwargs)
	    return dict_source(folder, fnames, csv_labels, suffix, continuous)

	class ImageClassifierData_sep(ImageClassifierData):
	    """ImageClassifierData whose `from_csv` forwards extra keyword arguments
	    (such as `cat_separator`) to the CSV label parser."""

	    @classmethod
	    def from_csv(cls, path, folder, csv_fname, bs=64, tfms=(None,None),
	                 val_idxs=None, suffix='', test_name=None, continuous=False,
	                 skip_header=True, num_workers=8, **kwargs):
	        """Read in images and their labels given as a CSV file.

	        Use this when the training labels live in a CSV file rather than in
	        sub-directories named after the labels.

	        Arguments:
	            path: a root path of the data (used for storing trained models, precomputed values, etc)
	            folder: a name of the folder in which training images are contained; must be relative.
	            csv_fname: a name of the CSV file which contains target labels.
	            bs: batch size
	            tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`;
	                both entries must be non-None.
	            val_idxs: index of images to be used for validation. e.g. output of `get_cv_idxs`.
	            suffix: suffix to add to image names in CSV file (e.g. '.jpg' when the CSV
	                lists names without extension)
	            test_name: a name of the folder which contains test images.
	            continuous: forwarded unchanged to `csv_source` and `from_names_and_array`.
	            skip_header: skip the first row of the CSV file.
	            num_workers: number of workers
	            **kwargs: extra keyword arguments forwarded to `csv_source`.
	        Returns:
	            The result of `cls.from_names_and_array`.
	        """
	        assert not (tfms[0] is None or tfms[1] is None), "please provide transformations for your train and validation sets"
	        assert not (os.path.isabs(folder)), "folder needs to be a relative path"
	        fnames,y,classes = csv_source(folder, csv_fname, skip_header, suffix, continuous=continuous, **kwargs)
	        return cls.from_names_and_array(path, fnames, y, classes, val_idxs, test_name,
	                num_workers=num_workers, suffix=suffix, tfms=tfms, bs=bs, continuous=continuous)

	def evaluate(is_test=False):
	    """Run TTA on the module-level `learn`, report accuracy and plot a confusion matrix.

	    Relies on the globals `learn` (the learner) and `data` (for `data.classes`).

	    Arguments:
	        is_test: forwarded to `learn.TTA`.
	    Returns:
	        a three-tuple of (predicted class indices, targets returned by TTA,
	        averaged probabilities).
	    """
	    log_preds,y = learn.TTA(is_test=is_test)
	    # Exponentiate the log-predictions and average along axis 0
	    # (presumably the TTA-augmentation axis).
	    probs = np.mean(np.exp(log_preds),0)
	    print(f'TTA accuracy: {accuracy_np(probs, y)}')
	    preds = np.argmax(probs, axis=1)
	    cm = confusion_matrix(y, preds)
	    plot_confusion_matrix(cm, data.classes)
	    return preds, y, probs


	## Path

	# Root data directory; also passed as `path` to ImageClassifierData_sep.from_csv.
	PATH = Path('./data')
	# CSV file with the training labels, located inside PATH.
	LABELS = Path(PATH/'training_upload.csv')

	## 128x128

	# Settings for the first stage: learning rate, image size, batch size and
	# `ps` (passed to ConvLearner.pretrained; presumably the dropout probability).
	lr=1e-4
	sz=128
	bs=64
	ps=0.3

	# Architecture handed to tfms_from_model and ConvLearner.pretrained.
	arch = resnet34
	#aug_tfms = transforms_top_down
	# Augmentations given to tfms_from_model through its `aug_tfms` argument.
	aug_tfms = [RandomRotateZoom(20, 2.0, 0.15, ps=[0.4, 0.3, 0.1, 0.2]), RandomLighting(0.1, 0.1), RandomDihedral()]
	def get_data(sz, bs=bs):
	    """Build the dataset from the 'train_rgb_512' / 'valid_rgb_512' folders.

	    Uses the module-level `arch`, `aug_tfms`, `PATH` and `LABELS`. Labels in
	    the CSV are split on ','.

	    Arguments:
	        sz: image size given to `tfms_from_model`.
	        bs: batch size; the default is the value of the global `bs` at the
	            time this function was defined.
	    Returns:
	        The object returned by `ImageClassifierData_sep.from_csv`.
	    """
	    tfms = tfms_from_model(arch, sz, aug_tfms=aug_tfms)
	    return ImageClassifierData_sep.from_csv(PATH, 'train_rgb_512', csv_fname=LABELS, tfms=tfms, suffix='.png', bs=bs, cat_separator=',', test_name='valid_rgb_512')

	# Build the 128px dataset and a pretrained learner on top of it.
	data = get_data(sz)
	learn = ConvLearner.pretrained(arch, data, ps=ps, precompute=True)
	# Keep a handle on the training set's denorm function.
	denorm = data.trn_ds.denorm

	# Three successive fits with the scalar learning rate.
	learn.fit(lr, 1, cycle_len=1, cycle_mult=2)

	learn.fit(lr, 2, cycle_len=1, cycle_mult=2)

	learn.fit(lr, 20)

	# Checkpoint the 128px stage and immediately load it back.
	learn.save('rep_001')

	learn.load('rep_001')

	## 256x256

	sz = 256

	# Swap in the 256px dataset on the existing learner, then freeze before fitting.
	learn.set_data(get_data(sz, bs=64))
	learn.freeze()

	learn.fit(lr, 3, cycle_len=1, cycle_mult=3)

	# List of three learning rates (presumably one per layer group), smallest first.
	lrs = [lr/9, lr/3, lr]
	learn.unfreeze()

	learn.fit(lrs, 3, cycle_len=1, cycle_mult=3)

	learn.save('rep_002')

	## 512x512

	sz=512

	# Swap in the 512px dataset and freeze again.
	learn.set_data(get_data(sz, bs=64))
	learn.freeze()

	# Note: the list `lrs` (not the scalar `lr`) is used here even though the learner was just frozen.
	learn.fit(lrs, 3, cycle_len=1, cycle_mult=3)

	learn.unfreeze()

	learn.fit(lrs, 1, cycle_len=1, cycle_mult=3)

	learn.save('rep_003')

	## 1024x1024

	# Settings for the 1024px stage. `get_data` is redefined right after this
	# section with `bs=bs` as its default, so that default picks up the 24 set here.
	sz = 1024
	bs = 24
	ps = 0.3

	arch = resnet34
	# Same augmentation list as before except RandomRotateZoom's second argument (6.0 instead of 2.0).
	aug_tfms = [RandomRotateZoom(20, 6.0, 0.15, ps=[0.4, 0.3, 0.1, 0.2]), RandomLighting(0.1, 0.1), RandomDihedral()]
	def get_data(sz, bs=bs):
	    """Build the dataset from the 'train_rgb' / 'valid_rgb' folders.

	    Uses the module-level `arch`, `aug_tfms`, `PATH` and `LABELS`. Labels in
	    the CSV are split on ','.

	    Arguments:
	        sz: image size given to `tfms_from_model`.
	        bs: batch size; the default is the value of the global `bs` at the
	            time this function was defined.
	    Returns:
	        The object returned by `ImageClassifierData_sep.from_csv`.
	    """
	    tfms = tfms_from_model(arch, sz, aug_tfms=aug_tfms)
	    return ImageClassifierData_sep.from_csv(PATH, 'train_rgb', csv_fname=LABELS, tfms=tfms, suffix='.png', bs=bs, cat_separator=',', test_name='valid_rgb')

	# Build the 1024px dataset with the redefined get_data.
	data = get_data(sz)
	denorm = data.trn_ds.denorm

	# A new learner is created here; no earlier checkpoint is loaded in this section.
	learn = ConvLearner.pretrained(arch, data, ps=ps, precompute=True)



	learn.fit(lr, 1)

	learn.unfreeze()

	# NOTE(review): the divisors are 0.9 and 0.3 here, whereas the 256px stage used
	# [lr/9, lr/3, lr]; as written the first two rates are larger than `lr`.
	# Possibly a typo - confirm before relying on these values.
	lrs = [lr/0.9, lr/0.3, lr]

	learn.fit(lrs, 1)

	learn.save('rep_004')

	## Now restart the notebook, go to the 1024x1024 section, re-initialize the learner and try to load the weights here:

	learn.load('rep_004')