Skip to content

Instantly share code, notes, and snippets.

while 1:
...
df = df.sample(frac=1) # shuffle all rows
...
i, j = 0, batch_size
for _ in range(nbatches):
sub = df.iloc[i:j]
idx = sub.index.values
X2 = bcolz.open(bcolz_dir)[idx]
...
df_mini_batch
13 95555756 dog grey /path/to/imgs/756/55/blah_95555756.png
5 5467756 cat black /path/to/imgs/756/67/blah_5467756.png
1 1161756 cat black /path/to/imgs/756/61/blah_1161756.png
7 31255756 cat grey /path/to/imgs/756/55/blah_31255756.png
...
i, j = 0, batch_size
for _ in range(nbatches):
sub = df.iloc[i:j]
X2 = bc[i:j]
...
# Calculate X and Y appropriately
...
yield [X, X2], Y
i = j
# 'features' is the file path to the bcolz array on disk
bc = bcolz.open(features)[:]
# begin epoch loop
while 1:
...
df = df.sample(frac=1) # shuffle all rows
bc = bc[df.index.values]
...
cd data
rm train validation # remove the symbolic links to the _demo dirs
mkdir -p train_orig/dogs train_orig/cats \
validation_orig/dogs validation_orig/cats
mv train_orig/dog*[12][345]*.jpg validation_orig/dogs
mv train_orig/cat*[12][345]*.jpg validation_orig/cats
mv train_orig/dog*.jpg train_orig/dogs
mv train_orig/cat*.jpg train_orig/cats
cd /path/to/kaggle/data/
ls train/dogs| head
dog.1000.jpg
dog.1001.jpg
dog.1002.jpg
dog.1003.jpg
dog.1004.jpg
dog.1005.jpg
dog.1006.jpg
cd /path/to/demo/data/
# Download train.zip from the Kaggle competition page:
# https://www.kaggle.com/c/dogs-vs-cats/data
ls -l train.zip
-rw-r--r--@ 1 user group 543M Jun 12 10:39 train.zip
unzip -qq train.zip
df['path'] = df.object_id.apply(file_path_from_db_id)
df
object_id bi multi path
index
0 461756 dog white /path/to/imgs/756/61/blah_461756.png
1 1161756 cat black /path/to/imgs/756/61/blah_1161756.png
2 3303651 dog white /path/to/imgs/651/03/blah_3303651.png
3 3367756 dog grey /path/to/imgs/756/67/blah_3367756.png
4 3767756 dog grey /path/to/imgs/756/67/blah_3767756.png
+-----------+ +-----+ +--------------+ +-----+
| existing | ---> | CPU | ---> | mini-batch 1 | ---> | GPU |
| file | +-----+ | mini-batch 2 | +-----+
| server | | mini-batch 3 |
| structure | | ... |
+-----------+ | mini-batch n | ^
+--------------+ |
|
^ ^ ^ +---------------+
| | | | The sexy bit, |
python classifier_from_little_data_script_dfgen.py
Using TensorFlow backend.
9936 train dog
9936 train cat
2564 validation dog
2564 validation cat
Some samples:
df_train:
object_id imgpath target orig label