gist:071da5c68c884b750ca4ec81dc414183
import time
import torch
import torch.nn as nn
from apex.normalization import FusedLayerNorm
torch.backends.cudnn.benchmark = True
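
The preview cuts off here. A minimal sketch of the kind of timing comparison these imports point at, i.e. nn.LayerNorm versus apex's FusedLayerNorm on the same input (the shapes, iteration count, and the time_layernorm helper are illustrative assumptions; it needs a CUDA device with apex installed):

import time
import torch
import torch.nn as nn
from apex.normalization import FusedLayerNorm

torch.backends.cudnn.benchmark = True

def time_layernorm(module, x, iters=100):
    # warm up, then average `iters` timed forward passes with CUDA syncs
    for _ in range(10):
        module(x)
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(iters):
        module(x)
    torch.cuda.synchronize()
    return (time.time() - start) / iters

x = torch.randn(32, 512, 1024, device="cuda")
print("nn.LayerNorm:   ", time_layernorm(nn.LayerNorm(1024).cuda(), x))
print("FusedLayerNorm: ", time_layernorm(FusedLayerNorm(1024).cuda(), x))
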
gist:b5d6b5d2f0a9f3fc4e2a5797d41aa8c7
#!/usr/bin/env python3
import argparse
import time
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
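
The rest of this gist is truncated in the preview. A rough sketch of how a torch_xla multi-process benchmark is usually wired up; the linear model, step count, and nprocs value are illustrative assumptions, and the spawn API has shifted somewhat across torch_xla releases:

import torch
import torch.nn as nn
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(index):
    device = xm.xla_device()              # one TPU core per process
    model = nn.Linear(1024, 1024).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    for step in range(10):
        optimizer.zero_grad()
        loss = model(torch.randn(8, 1024, device=device)).sum()
        loss.backward()
        xm.optimizer_step(optimizer)      # all-reduce gradients, then step
        xm.mark_step()                    # cut and execute the pending XLA graph

if __name__ == "__main__":
    xmp.spawn(_mp_fn, args=(), nprocs=8)  # e.g. 8 cores on a v3-8
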
gist:cdf685b8b3ce20b0221e1842782bce74
text: a b c </s> d e f g </s>
Suppose the model is trained with a context length of 4.
Then the most favorable way to evaluate your model's perplexity is:
batch 1: a b c </s>
        |----------|   <-- count perplexity of this
batch 2: b c </s> d
                 |-|   <-- count perplexity of this
batch 3: c </s> d e
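
A hedged sketch of how this scheme can be implemented; model is assumed to map a (1, n) tensor of token ids to (1, n, vocab) logits, and the very first token of the text goes unscored since it has no left context:

import math
import torch
import torch.nn.functional as F

def sliding_window_ppl(model, tokens, ctx):
    # slide a window of `ctx` tokens one position at a time; score every
    # position in the first window, only the final position afterwards
    nlls = []
    for start in range(len(tokens) - ctx + 1):
        window = torch.tensor(tokens[start:start + ctx]).unsqueeze(0)  # (1, ctx)
        logits = model(window[:, :-1])
        nll = F.cross_entropy(logits.view(-1, logits.size(-1)),
                              window[:, 1:].reshape(-1), reduction="none")
        nlls.extend(nll.tolist() if start == 0 else [nll[-1].item()])
    return math.exp(sum(nlls) / len(nlls))
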
gist:a66ba69601cbd21a5a2218a33b6363f8
import sys
import time
import torch
import torch.nn as nn
import torch.optim as optim
platform = "gpu" if torch.cuda.is_available() else "tpu"
gist:fa49c10039c89b9472e6b0c59590b10b
Metric: CompileTime
  TotalSamples: 2
  Accumulator: 226ms137.620us
  ValueRate: 439ms863.608us / second
  Rate: 3.88139 / second
  Percentiles: 1%=109ms635.741us; 5%=109ms635.741us; 10%=109ms635.741us; 20%=109ms635.741us; 50%=118ms501.879us; 80%=118ms501.879us; 90%=118ms501.879us; 95%=118ms501.879us; 99%=118ms501.879us
Metric: DeviceLockWait
  TotalSamples: 101
  Accumulator: 31s573ms487.494us
  ValueRate: 754ms252.918us / second
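
For reference, a report like the one above is what torch_xla's debug metrics module prints; a minimal way to produce one after forcing some work onto an XLA device:

import torch
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met

device = xm.xla_device()
x = torch.randn(8, 1024, device=device)
print((x @ x.t()).sum().item())   # forces compilation and execution
print(met.metrics_report())       # CompileTime, DeviceLockWait, etc.
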
benchmark.py
import sys
import time
import torch
import torch.nn as nn
import torch.optim as optim
#platform = "tpu"
platform = "gpu"
gist:333fc9c2e1dc8e3ebbec933905ccc905
# maybe change "fairseq-benchmark" below to your own tag
BENCHMARK=4node aws ec2 run-instances \
  --image-id ami-0707a23a1930bb11c \
  --count 4 \
  --instance-type p3dn.24xlarge \
  --key-name fair-$USER \
  --tag-specifications "ResourceType=instance,Tags=[{Key=fair-user,Value=$USER},{Key=fairseq-benchmark,Value=''}]" \
  --placement "GroupName=${PLACEMENT_GROUP_NAME}" \
  --network-interfaces "DeviceIndex=0,InterfaceType=efa,Groups=${SECURITY_GROUP_ID},SubnetId=${SUBNET_ID}" \
  --block-device-mappings file://block_device_mappings.gp2.json
fairseq_benchmark_masked_lm.c10d.200mb.sh
#!/bin/bash
source activate fairseq
# expects DDP_BACKEND, BUCKET_CAP_MB and DATASET_SIZE to be set in the environment
fairseq-train \
  --no-save --disable-validation \
  --task dummy_masked_lm --masked-lm-only \
  --arch bert_large --num-segment 0 \
  --optimizer adam --lr 1e-4 \
  --max-sentences 8 --update-freq 1 \
  --fp16 \
  --ddp-backend $DDP_BACKEND --bucket-cap-mb $BUCKET_CAP_MB \
  --tokens-per-sample 512 --dataset-size $DATASET_SIZE \
  --criterion masked_lm_loss \
  --log-format json --log-interval 10 \
  --max-epoch 1