@flymin · Last active February 13, 2020
[ModelArts] Model Migration for ModelArts #Other

Original code from here.

Reference: Using_Caffe_to_Create_a_MNIST_Dataset_Recognition_Application/codes/train.py
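The diff below follows the usual ModelArts migration pattern: stage the dataset from OBS onto the container's local disk, train against the local copy, then copy the resulting artifacts back to OBS. A minimal sketch of that pattern, using only the MoXing calls that appear in the diff (the bucket paths are hypothetical placeholders):

import os
import moxing as mox

DATA_OBS = 's3://my-bucket/mnist/'   # hypothetical OBS dataset path (what ModelArts passes as --data_url)
OUT_OBS = 's3://my-bucket/output/'   # hypothetical OBS output path (what ModelArts passes as --train_url)

# 1. Stage data: OBS -> local disk inside the training container.
os.makedirs('data_set', exist_ok=True)
mox.file.copy_parallel(src_url=DATA_OBS, dst_url='data_set')

# 2. ... train on the local copy, writing checkpoints to a local dir such as 'results' ...

# 3. Publish results: local disk -> OBS.
if not mox.file.exists(OUT_OBS):
    mox.file.make_dirs(OUT_OBS)
mox.file.copy_parallel(src_url='results', dst_url=OUT_OBS)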

 from __future__ import print_function
 import argparse
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 from torchvision import datasets, transforms
+import os
+import moxing as mox
 
 
 class Net(nn.Module):
     def __init__(self):
         super(Net, self).__init__()
         self.conv1 = nn.Conv2d(1, 20, 5, 1)
         self.conv2 = nn.Conv2d(20, 50, 5, 1)
         self.fc1 = nn.Linear(4*4*50, 500)
         self.fc2 = nn.Linear(500, 10)
 
     def forward(self, x):
         x = F.relu(self.conv1(x))
         x = F.max_pool2d(x, 2, 2)
         x = F.relu(self.conv2(x))
         x = F.max_pool2d(x, 2, 2)
         x = x.view(-1, 4*4*50)
         x = F.relu(self.fc1(x))
         x = self.fc2(x)
         return F.log_softmax(x, dim=1)
     
 def train(args, model, device, train_loader, optimizer, epoch):
     model.train()
     for batch_idx, (data, target) in enumerate(train_loader):
         data, target = data.to(device), target.to(device)
         optimizer.zero_grad()
         output = model(data)
         loss = F.nll_loss(output, target)
         loss.backward()
         optimizer.step()
         if batch_idx % args.log_interval == 0:
             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                 epoch, batch_idx * len(data), len(train_loader.dataset),
                 100. * batch_idx / len(train_loader), loss.item()))
 
 def test(args, model, device, test_loader):
     model.eval()
     test_loss = 0
     correct = 0
     with torch.no_grad():
         for data, target in test_loader:
             data, target = data.to(device), target.to(device)
             output = model(data)
             test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
             pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
             correct += pred.eq(target.view_as(pred)).sum().item()
 
     test_loss /= len(test_loader.dataset)
 
     print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
         test_loss, correct, len(test_loader.dataset),
         100. * correct / len(test_loader.dataset)))
 
 def main():
     # Training settings
     parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
     parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                         help='input batch size for training (default: 64)')
     parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                         help='input batch size for testing (default: 1000)')
     parser.add_argument('--epochs', type=int, default=10, metavar='N',
                         help='number of epochs to train (default: 10)')
     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                         help='learning rate (default: 0.01)')
     parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                         help='SGD momentum (default: 0.5)')
     parser.add_argument('--no-cuda', action='store_true', default=False,
                         help='disables CUDA training')
     parser.add_argument('--seed', type=int, default=1, metavar='S',
                         help='random seed (default: 1)')
     parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                         help='how many batches to wait before logging training status')
     
-    parser.add_argument('--save-model', action='store_true', default=False,
+    parser.add_argument('--save-model', action='store_true', default=True,
                         help='For Saving the current Model')
-    args = parser.parse_args()
+    parser.add_argument('--train_url', type=str, help='dir to save the current Model')  # output location on OBS
+    parser.add_argument('--data_url', type=str, help='dir for dataset')  # dataset location on OBS
+    parser.add_argument('--data_local_path', type=str, default="data_set", help='local dir for dataset')  # local dataset location
+    parser.add_argument('--model_local_path', type=str, default="results", help='local dir for model output')  # local output location
+    args, _ = parser.parse_known_args()  # only parse known args; ignore extras the launcher may pass
     use_cuda = not args.no_cuda and torch.cuda.is_available()
 
+    local_dataset_url = args.data_local_path
+    if not os.path.exists(local_dataset_url):
+        os.makedirs(local_dataset_url)
+    print('local_dataset_url: ' + local_dataset_url)
+
+    if mox.file.exists(args.data_url):
+        # copy data from OBS to local
+        print("data obs url exists")
+        mox.file.copy_parallel(src_url=args.data_url, dst_url=local_dataset_url)  # copy the dataset from the OBS bucket into the container
+        
+        # model save path
+        model_local_output = args.model_local_path
+        if not os.path.exists(model_local_output):
+            os.makedirs(model_local_output)
+        print("model_local_output: " + model_local_output)
+        model_obs_output = args.train_url
+
         torch.manual_seed(args.seed)
 
         device = torch.device("cuda" if use_cuda else "cpu")
 
         kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
         train_loader = torch.utils.data.DataLoader(
-        datasets.MNIST('../data', train=True, download=True,
+            datasets.MNIST(args.data_local_path, train=True, download=False,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.1307,), (0.3081,))
                         ])),
             batch_size=args.batch_size, shuffle=True, **kwargs)
         test_loader = torch.utils.data.DataLoader(
-        datasets.MNIST('../data', train=False, transform=transforms.Compose([
+            datasets.MNIST(args.data_local_path, train=False, transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.1307,), (0.3081,))
                         ])),
             batch_size=args.test_batch_size, shuffle=True, **kwargs)
 
 
         model = Net().to(device)
         optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
 
         for epoch in range(1, args.epochs + 1):
             train(args, model, device, train_loader, optimizer, epoch)
             test(args, model, device, test_loader)
 
         if (args.save_model):
-        torch.save(model.state_dict(),"mnist_cnn.pt")
+            torch.save(model.state_dict(), os.path.join(model_local_output, "mnist_cnn.pt"))
+
+        # copy final model from local to obs
+        model_obs_output = os.path.join(model_obs_output, "final")
+        print("model_obs_output: " + model_obs_output)
+        if not mox.file.exists(model_obs_output):
+            mox.file.make_dirs(model_obs_output)
+        mox.file.copy_parallel(src_url=model_local_output, dst_url=model_obs_output)  # copy training results back to the OBS bucket
+    else:
+        print("data obs url does not exist: " + str(args.data_url))

 if __name__ == '__main__':
     main()
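One change worth calling out is the switch from parse_args() to parse_known_args(). The ModelArts launcher can append flags that the script does not declare, and parse_args() exits with an error on any unrecognized argument, while parse_known_args() returns the extras instead. A standalone illustration (the extra platform flag here is hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data_url', type=str)

# parse_args() would abort with "unrecognized arguments" on the extra flag;
# parse_known_args() returns it in a separate list instead of failing.
args, unknown = parser.parse_known_args(
    ['--data_url', 's3://my-bucket/mnist/', '--some_platform_flag', '1'])
print(args.data_url)  # s3://my-bucket/mnist/
print(unknown)        # ['--some_platform_flag', '1']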