Bram Vanroy BramVanroy

## run_clm_lora.py
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#

## convert_to_safetensors.py
import importlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from transformers import HfArgumentParser, AutoConfig, AutoTokenizer


@dataclass
class ScriptArguments:

## gpu-error-log.sh
# If there is an error in nvidia-smi, log it to a file in ~/gpu-errors!
# The log file is named after the host and holds the nvidia-smi output
# followed by the output of `nvcc --version`.
nvidia_smi_output=$(nvidia-smi)
# Search the captured output itself (note the `$`): without it the literal
# text "nvidia_smi_output" is searched, which can never contain "ERR".
if printf '%s\n' "$nvidia_smi_output" | grep -q "ERR"; then
    fname=~/gpu-errors/$(hostname)-error.txt
    pdir=$(dirname "$fname")
    mkdir -p "$pdir"
    nvcc_output=$(nvcc --version)
    # Write the variable that was actually assigned above (nvcc_output).
    printf '%s\n%s\n' "$nvidia_smi_output" "$nvcc_output" > "$fname"
fi

## set_seed.py
def set_seed(seed: Optional[int]):
    """Seed the random number generators used in this project.

    Seeds the ``random`` module, NumPy and PyTorch (including all CUDA
    devices), sets cuDNN to deterministic mode with benchmarking disabled,
    and stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: The seed to apply. When ``None``, nothing is changed.
    """
    if seed is None:
        return

    # Python-level state
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch state
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

## run.py
# See https://gist.github.com/BramVanroy/f78530673b1437ed0d6be7c61cdbdd7c
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, HyperOptArguments))

# Assumes that the first .json file is the config file (if any);
# config_file is None when no command-line argument ends in ".json".
config_file = next((arg for arg in sys.argv if arg.endswith(".json")), None)

run_name_specified = False

## vsc-lmod-deepspeed.bashrc

# On a host whose name starts with gpu (e.g. gpu512.dodrio.os), load PyTorch
# with CUDA and pdsh.
# This makes sure that deepspeed/pdsh work in multi node settings
case "$(hostname)" in
    gpu*)
        module load PyTorch/1.12.0-foss-2022a-CUDA-11.7.0
        module load pdsh/2.34-GCCcore-11.3.0
        ;;
esac

# Automatically generates a hostfile for the current job in the current directory,

## get_memory_usage.py
import math
import psutil
from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo


def format_bytes(nbytes: int) -> str:
    if nbytes == 0:
        return "0 B"

    unit = ("B", "kB", "MB", "GB", "TB")

## get_words_of_tokens.py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
text = "It 's a pre-tokenized , silly sentence !"
# The sentence is already tokenized: splitting on whitespace yields the words.
words = text.split()
encoded = tokenizer(words, is_split_into_words=True)

# Print every token next to the word it was derived from.
for token, wordid in zip(encoded.tokens(), encoded.word_ids()):
    # Tokens without a word id are skipped (presumably special tokens such as
    # [CLS]/[SEP] -- confirm against the tokenizer documentation).
    if wordid is None:
        continue
    print(token, words[wordid])

## spacy-disable-tok.py
from typing import List

import spacy
from spacy import Language, Vocab
from spacy.tokens import Doc

def load_nlp(model_name: str = "en_core_web_sm",
             is_tokenized: bool = False,
             exclude: List[str] = None):
    """Load a spaCy model. Disable sentence segmentation and tokenization with is_tokenized.

## remote-serve.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                BramVanroy
                / remote-serve.md
            
            
              Last active
              June 15, 2020 08:37
            
              
                Using web-serving tool from a remote server
              
          
    Oftentimes, you may want to use a web-based tool during programming, e.g. a Jupyter notebook, TensorBoard, Streamlit, and others. It is easy to set these tools up locally, on your own machine, but that computer may not be as powerful as a server that you have available. Here is a small guide to show you how to easily use such a web-based tool remotely. As an example, we will use TensorBoard, allowing us to remotely monitor the live-updated progress of our machine learning system during training. This gist is simply an extension of the following Stack Overflow post. This gist does not cover how to use TensorBoard itself. To get started with that, read through the documentation (works for TensorFlow as well as PyTorch).
If we would start Tensorboard on our own machine, it would create a local server that is accessible through a s
	#!/usr/bin/env python
	# coding=utf-8
	# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	import importlib
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Optional

	from transformers import HfArgumentParser, AutoConfig, AutoTokenizer


	@dataclass
	class ScriptArguments:
	# If there is an error in nvidia-smi, log it to a file in ~/gpu-errors!
	# The log file is named after the host and holds the nvidia-smi output
	# followed by the output of `nvcc --version`.
	nvidia_smi_output=$(nvidia-smi)
	# Pipe the captured output (note the `$`) into grep. An escaped `\|` would
	# be passed to echo as plain text and the condition would always succeed.
	if printf '%s\n' "$nvidia_smi_output" | grep -q "ERR"; then
	    fname=~/gpu-errors/$(hostname)-error.txt
	    pdir=$(dirname "$fname")
	    mkdir -p "$pdir"
	    nvcc_output=$(nvcc --version)
	    # Write the variable that was actually assigned above (nvcc_output).
	    printf '%s\n%s\n' "$nvidia_smi_output" "$nvcc_output" > "$fname"
	fi
	def set_seed(seed: Optional[int]):
	    """Seed the random number generators used in this project.

	    Seeds PyTorch (including all CUDA devices), NumPy and the ``random``
	    module, sets cuDNN to deterministic mode with benchmarking disabled,
	    and stores the seed in the ``PYTHONHASHSEED`` environment variable.

	    Args:
	        seed: The seed to apply. When ``None``, nothing is changed.
	    """
	    if seed is not None:
	        torch.manual_seed(seed)
	        torch.cuda.manual_seed_all(seed)
	        torch.backends.cudnn.deterministic = True
	        torch.backends.cudnn.benchmark = False
	        np.random.seed(seed)
	        random.seed(seed)
	        os.environ["PYTHONHASHSEED"] = str(seed)
	# See https://gist.github.com/BramVanroy/f78530673b1437ed0d6be7c61cdbdd7c
	parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, HyperOptArguments))

	# Assumes that the first .json file is the config file (if any);
	# config_file is None when no command-line argument ends in ".json".
	config_file = next((arg for arg in sys.argv if arg.endswith(".json")), None)

	run_name_specified = False

	# On a host whose name starts with gpu (e.g. gpu512.dodrio.os), load PyTorch
	# with CUDA and pdsh.
	# This makes sure that deepspeed/pdsh work in multi node settings
	case "$(hostname)" in
	    gpu*)
	        module load PyTorch/1.12.0-foss-2022a-CUDA-11.7.0
	        module load pdsh/2.34-GCCcore-11.3.0
	        ;;
	esac

	# Automatically generates a hostfile for the current job in the current directory,
	import math
	import psutil
	from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo


	def format_bytes(nbytes: int) -> str:
	if nbytes == 0:
	return "0 B"

	unit = ("B", "kB", "MB", "GB", "TB")
	from transformers import AutoTokenizer

	tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
	text = "It 's a pre-tokenized , silly sentence !"
	# The sentence is already tokenized: splitting on whitespace yields the words.
	words = text.split()
	encoded = tokenizer(words, is_split_into_words=True)

	# Print every token next to the word it was derived from; tokens whose
	# word id is None are skipped.
	for token, wordid in zip(encoded.tokens(), encoded.word_ids()):
	    if wordid is not None:
	        print(token, words[wordid])
	from typing import List

	import spacy
	from spacy import Language, Vocab
	from spacy.tokens import Doc

	def load_nlp(model_name: str = "en_core_web_sm",
	is_tokenized: bool = False,
	exclude: List[str] = None):
	"""Load a spaCy model. Disable sentence segmentation and tokenization with is_tokenized.