Last active
April 2, 2020 17:04
-
-
Save dmpetrov/136dd5df9bcf6de90980cec22355437a to your computer and use it in GitHub Desktop.
DVC storage proposal #1487
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### BASIC SCENARIO ### | |
# Create dataset | |
# Assign dataset name `car-images`, version and version comment (not Git) | |
$ tar zxf images.tgz | |
$ du -sh images/ | |
8.1G images | |
$ dvc dataset add images/ car-images 1.0.0 -m "Import car images" | |
Dataset car-images@1.0.0 was added | |
# Commit as usual. All dataset info is stored in the dvc-file. | |
$ git add images.dvc .gitignore | |
$ git commit -m 'Car images dataset' | |
# List of datasets | |
$ dvc dataset list | |
car-images@1.0.0 | |
$ dvc dataset list --details # details? | |
car-images@1.0.0 8.1G 845295 Car images dataset images.dvc | |
# 845295 is number of files | |
### OUTPUT AS A DATASET ### | |
$ dvc run -d processed_cars/ -d config.yaml -d cnn/model.py -o cnn/model.p \ | |
python cnn/model.py -e 70 -p 0.3 | |
$ dvc dataset assign model.p mymodel 0.1.0 -m 'First working CNN' | |
# Open questions: | |
# - do we need a separate file for model as a dataset (like cnn/mymodel.p.dvc)? | |
# - - how cnn/mymodel.p.dvc and cnn/model.p.dvc will be connected? | |
# - should `dvc run` update\patch the output model version if assigned? Is warning enough? | |
$ git add cnn/model.p.dvc .gitignore | |
$ git commit -m 'First model' | |
$ dvc dataset list --details | |
car-images@1.0.0 8.1G 845295 Car images dataset images.dvc | |
mymodel@0.1.0 218M 1 First working CNN cnn/model.p.dvc | |
$ vi cnn/model.py | |
$ dvc repro cnn/model.p.dvc | |
$ dvc dataset version cnn/mymodel.p.dvc minor -m 'Some fixes' | |
mymodel@0.2.0 | |
$ dvc dataset list --details | |
car-images@1.0.0 8.1G 845295 Car images dataset images.dvc | |
mymodel@0.2.0 217M 1 Some fixes cnn/model.p.dvc | |
### MODIFY DATASET ### | |
$ dvc unprotect images/ | |
$ rm -rf images/ | |
$ cp -r ~/Download/images_new_version images | |
# SYNOPSIS: dvc dataset version [<newversion> | major | minor | patch ] [-m]. See `npm version`. | |
$ dvc dataset version car-images minor -m "Labels update 2019-01-18" | |
Dataset car-images@1.0.0 was updated to 1.1.0 | |
$ dvc dataset list --details | |
car-images@1.1.0 8.3G 851904 Labels update 2019-01-18 images.dvc | |
mymodel@0.2.0 217M 1 Some fixes cnn/model.p.dvc | |
# Retrain model | |
$ dvc repro | |
Warning: please update output dataset version 'mymodel'. Old version 0.2.0. | |
$ dvc dataset version mymodel minor -m "Retrained with car-images@1.1.0" | |
Dataset mymodel@0.2.0 was updated to 0.3.0 | |
$ dvc dataset list --details # Note: number of files in car-images was changed | |
car-images@1.1.0 8.3G 851904 Labels update 2019-01-18 images.dvc | |
mymodel@0.3.0 223M 1 Retrained with car-images@1.1.0 cnn/model.p.dvc | |
### INFORMATIONAL OPERATIONS ### | |
# Current workspace only | |
$ dvc dataset list | |
car-images@1.2.0 | |
mymodel@0.3.0 | |
$ dvc dataset list --details | |
car-images@1.2.0 8.4G 861749 Labels update from 2019-04-02 images.dvc | |
mymodel@0.3.0 218M 1 Retrained with car-images@1.1.0 cnn/model.p.dvc | |
# Version history - dvc should find this from git history | |
# Open question: should we address datasets by name or dvc-files or both? | |
$ dvc dataset hist car-images # hist? | |
1.0.0 8.1G 845295 Import car images | |
1.1.0 8.3G 851904 Labels update 2019-03-18 | |
1.2.0 8.4G 861749 Labels update from 2019-04-02 # <-- new version | |
# Open question: how about versions in parallel branches? Ignore them? | |
$ dvc dataset hist car-images --branch | |
master 1.0.0 8.1G 845295 Import car images | |
master 1.1.0 8.3G 851904 Labels update 2019-03-18 | |
master 1.2.0 8.4G 861749 Labels update from 2019-04-02 | |
try_something 1.1.0 8.3G 851904 labels from 2019-03-18 | |
# Show everything | |
$ dvc dataset hist --all --branch | |
master car-images@1.0.0 8.1G 845295 Import car images | |
master car-images@1.1.0 8.3G 851904 Labels update 2019-03-18 | |
master car-images@1.2.0 8.4G 861749 Labels update from 2019-04-02 | |
try_something car-images@1.1.0 8.3G 851904 labels from 2019-03-18 | |
master mymodel@0.1.0 218M 1 First model | |
master mymodel@0.2.0 217M 1 Some fixes | |
master mymodel@0.3.0 223M 1 Retrained with car-images@1.1.0 | |
try_something mymodel@0.2.0 219M 1 retrained | |
try_imagenet mymodel@0.2.0 348M 1 imagenet model | |
try_imagenet imagenet@1.0.0 147G 14745385 Import imagenet | |
### DIFF ### | |
# diff of the current version (1.2.0) with a previous one (1.1.0) | |
$ dvc dataset diff car-images 1.1.0 # patch version can be ignored "1.1" is enough | |
Size: 8.4G --> 8.3G | |
Files: 861749 --> 851904 | |
New files: 140234 | |
Deleted files: 18 | |
Modified files: 6434 | |
$ dvc dataset diff car-images 1.1 --new-files # or --last instead of version | |
im4325532.jpg | |
im3454534.jpg | |
... | |
# specify both versions | |
$ dvc dataset diff car-images 1.0 1.2 --modified-files | |
im4865885.jpg | |
im8234012.jpg | |
... | |
### CHECKOUT DATASET ### | |
$ dvc dataset checkout car-images@1.0 # dvc checkout is part of this | |
car-images@1.0.0 was checked out. | |
Warning: 'images/', 'images.dvc' were modified. | |
$ dvc repro cnn/model.p.dvc | |
### DVC REGISTRY ### | |
# One of the requirements is to extract common datasets into a separate repository | |
# and reuse them from different projects. | |
# Some company might keep all the datasets in a single place\project and just reuse them. | |
# Open questions: | |
# - do we need a default dir for a registry like `~/dvc/registry/` and | |
# `/usr/local/dvc/registry`? Or an environment variable DVC_REGISTRY? | |
# - do we need multiple registries (~/dvc/registry and ~/dvc/imagenet) | |
# and how to define them? | |
$ cd ~/dvc | |
$ git clone https://github.com/iterative/common registry | |
$ cd registry | |
$ ls | |
imagenet.dvc coco.dvc someotherstuff.dvc | |
$ dvc pull imagenet.dvc | |
$ ls | |
imagenet.dvc imagenet/ coco.dvc someotherstuff.dvc | |
$ cd ~/src/myproject | |
$ dvc dataset list --details | |
car-images@1.1.0 8.3G 851904 Labels update from 2019-04-02 images.dvc | |
mymodel@0.3.0 218M 1 Retrained with car-images@1.1.0 cnn/model.p.dvc | |
~/dvc/registry/imagenet@1.0.0 147G 14745385 Import imagenet | |
# Use a repo from registry | |
# By default the `last` version is copied. | |
# If a dataset has many outputs - copy all. | |
$ dvc dataset copy imagenet . | |
Dataset imagenet@1.0.0 was copied | |
Adding 'imagenet/' to '.gitignore'. | |
'imagenet/' is in cache '~/dvc/registry/.dvc/cache'. # <-- not in local cache. | |
Saving information to 'imagenet.dvc'. | |
To track the changes with git run: | |
git add .gitignore imagenet.dvc | |
# A repo might have its own cache (by default). | |
# A dataset dvc-file should point to that cache like "cache_path" and "modul_version": | |
# outs: | |
# - cache: true | |
# md5: ea4dec866e3f4c734e58909ac1b248a3 | |
# path: data/Posts-train.tsv | |
# cache_path: "iterative_datasets/" | |
# modul_version: last | |
# --local-cache can be used to import a dataset into the local cache. | |
$ (cd ~/dvc/registry/ && git pull) | |
$ dvc dataset update imagenet | |
Dataset imagenet@1.0.0 was updated to 1.2.0. | |
Saving information to 'imagenet.dvc'. | |
To track the changes with git run: | |
git add imagenet.dvc | |
# Push model to a registry | |
# Note, data/cache has to be copied (it stays in the same repo by default). | |
$ dvc dataset copy --copy-cache mymodel ~/dvc/registry/ | |
Dataset mymodel@1.0.0 was added to project '~/dvc/registry/' | |
Adding 'model.p' to '.gitignore'. | |
Adding 'model.p' to cache '~/dvc/registry/.dvc/cache'. | |
Saving information to 'model.p.dvc'. | |
To track the changes with git run: | |
git add .gitignore model.p.dvc | |
$ cd ~/dvc/registry/ | |
$ git add .gitignore model.p.dvc | |
$ git commit -m 'My CNN model v1.0' | |
$ git push | |
$ dvc push | |
# A useful command. | |
$ cd ~/src/myproject | |
$ dvc dataset update mymodel ~/dvc/registry/ | |
### USAGE WITH NO GIT ### | |
# It would be great if `dvc pull` can work with no Git. | |
# It is needed for deployment systems where Git might not be available. | |
# We might need a separate command for that. | |
$ wget https://raw.githubusercontent.com/iterative/dvc/r1.1/model.p.dvc | |
$ wget https://raw.githubusercontent.com/iterative/dvc/r1.1/.dvc/config | |
$ dvc pull --config config model.p.dvc | |
$ ls model.p | |
-rw-r--r-- 1 dmitry staff 230M May 2 2017 model.p | |
# A special command to get the same result: | |
$ dvc pull --deploy https://github.com/iterative/common/ model.p.dvc |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment