jessfraz/google-tpu.patch Secret

## google-tpu.patch
diff --git a/mingpt/trainer.py b/mingpt/trainer.py
index 0ac491b..e246a10 100644
--- a/mingpt/trainer.py
+++ b/mingpt/trainer.py
@@ -36,6 +36,8 @@ class TrainerConfig:
         for k,v in kwargs.items():
             setattr(self, k, v)

+import torch_xla.core.xla_model as xm
+
 class Trainer:

     def __init__(self, model, train_dataset, test_dataset, config):
@@ -45,10 +47,8 @@ class Trainer:
         self.config = config

-        # take over whatever gpus are on the system
+        # run on the XLA device (e.g. a TPU) provided by torch_xla
-        self.device = 'cpu'
-        if torch.cuda.is_available():
-            self.device = torch.cuda.current_device()
-            self.model = torch.nn.DataParallel(self.model).to(self.device)
+        self.device = xm.xla_device()
+        self.model = torch.nn.DataParallel(self.model).to(self.device)

     def save_checkpoint(self):
         # DataParallel wrappers keep raw model object in .module attribute
	diff --git a/mingpt/trainer.py b/mingpt/trainer.py
	index 0ac491b..e246a10 100644
	--- a/mingpt/trainer.py
	+++ b/mingpt/trainer.py
	@@ -36,6 +36,8 @@ class TrainerConfig:
	for k,v in kwargs.items():
	setattr(self, k, v)

	+import torch_xla.core.xla_model as xm
	+
	class Trainer:

	def __init__(self, model, train_dataset, test_dataset, config):
	@@ -45,10 +47,8 @@ class Trainer:
	self.config = config

	# take over whatever gpus are on the system
	- self.device = 'cpu'
	- if torch.cuda.is_available():
	- self.device = torch.cuda.current_device()
	- self.model = torch.nn.DataParallel(self.model).to(self.device)
	+ self.device = xm.xla_device()
	+ self.model = torch.nn.DataParallel(self.model).to(self.device)

	def save_checkpoint(self):
	# DataParallel wrappers keep raw model object in .module attribute