Harsimranjeet Saini (harry-stark)
💭 Songs, Codes, Coffee and Memes all along the way.
  • Remote all life
import time

from pyspark.ml.feature import HashingTF, MinHashLSH, MinHashLSHModel, Tokenizer
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

from spark_session_builder import build_spark_session

# Spin up a Spark session on the standing cluster, sized for the dedup job.
spark = build_spark_session(
    master="spark://cpu128-dy-r6i-32xlarge-3:7077", num_cores=128, mem_gb=999
)

# MinHash LSH parameters: number of hash tables and the similarity-join threshold.
hash_size = 100
threshold = 0.8
start = time.time()
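The snippet ends before the pipeline itself. Below is a minimal sketch of how these pieces are typically wired for MinHash near-duplicate detection; the `docs` DataFrame, its `text` and `id` columns, and the input path are assumptions, not the gist's code.

# Hypothetical input: documents with "id" and "text" columns (path assumed).
docs = spark.read.parquet("documents.parquet")

tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
hashing_tf = HashingTF(inputCol="tokens", outputCol="features")
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=hash_size)

featurized = hashing_tf.transform(tokenizer.transform(docs))
model = mh.fit(featurized)

# Self-join to find pairs within the Jaccard-distance threshold,
# keeping each unordered pair once.
pairs = model.approxSimilarityJoin(featurized, featurized, threshold, distCol="dist")
pairs = pairs.filter(col("datasetA.id") < col("datasetB.id"))
print(f"Found {pairs.count()} candidate pairs in {time.time() - start:.1f}s")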
@harry-stark
harry-stark / codegen_gptj_converter.py
Created January 19, 2023 16:33
```python convertFt.py --output_dir= --n_gpus=8```
import torch
from transformers import GPTJForCausalLM, GPTJConfig
from transformers import CodeGenTokenizer, CodeGenForCausalLM


def cg2gptj(code_model):
    # Load the source CodeGen checkpoint and keep its config for the mapping.
    cg_model = CodeGenForCausalLM.from_pretrained(code_model, torch_dtype="auto")
    cg_config = cg_model.config
    # Create empty GPTJ model
    print('Creating empty GPTJ model')
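The gist is truncated here. A sketch of how the empty GPT-J model would plausibly be built, using the hyperparameter names that CodeGenConfig and GPTJConfig share in transformers (an assumption about the continuation, not the author's actual mapping):

    # Sketch: mirror the CodeGen hyperparameters into a GPT-J config.
    gptj_config = GPTJConfig(
        vocab_size=cg_config.vocab_size,
        n_positions=cg_config.n_positions,
        n_embd=cg_config.n_embd,
        n_layer=cg_config.n_layer,
        n_head=cg_config.n_head,
        rotary_dim=cg_config.rotary_dim,
    )
    gptj_model = GPTJForCausalLM(gptj_config)
    # The remaining (omitted) step would remap the CodeGen state dict into
    # gptj_model, including splitting the fused qkv_proj into q/k/v projections.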
@harry-stark
harry-stark / conda_setup.sh
Created January 3, 2023 14:08
Latest paths script
# Load MPI and the CUDA 11.7 toolchain from the cluster's module system.
module load openmpi cuda/11.7

#CONDA_HOME=/fsx/quentin/miniconda3
CONDA_HOME=/fsx/gpt-neox/conda/envs/neox
#CONDA_HOME=/fsx/gpt-neox/conda/envs/improved-t5

# Point the linker and compiler at the matching cuDNN 8.6 build.
CUDNN_HOME=/fsx/quentin/cudnn-linux-x86_64-8.6.0.163_cuda11-archive
export LD_LIBRARY_PATH=$CUDNN_HOME/lib:$LD_LIBRARY_PATH
export CPATH=$CUDNN_HOME/include:$CPATH
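A quick way to confirm the paths took effect, assuming PyTorch is installed in the selected conda environment (a verification sketch, not part of the gist):

import torch

# Sanity-check that the environment resolved the intended CUDA/cuDNN builds.
print(torch.version.cuda)              # expect an 11.7 build
print(torch.backends.cudnn.version())  # expect 8600 for cuDNN 8.6.0
print(torch.cuda.is_available())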
import pandas as pd
from urllib.parse import urlparse

df = pd.read_csv()  # path elided in the original gist


def url_matches_dataframe(url: str, df: pd.DataFrame) -> bool:
    # Parse the given URL to get the netloc and hostname
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    hostname = parsed_url.hostname
    # Remove "www" from the netloc and hostname
    netloc = netloc.removeprefix("www.")
    hostname = hostname.removeprefix("www.") if hostname else hostname
    # The gist cuts off after the comment above; a minimal completion,
    # assuming the frame has a "url" column of hostnames to match against.
    return bool(df["url"].isin([netloc, hostname]).any())
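Hypothetical usage of the completed helper (values invented for illustration):

frame = pd.DataFrame({"url": ["example.com", "github.com"]})
print(url_matches_dataframe("https://www.example.com/page", frame))  # True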
#!/bin/bash
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB       # Amount of CPU memory
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=8      # Crucial - one task per GPU on each node!
#SBATCH --cpus-per-task=6        # Number of cores per task
#SBATCH --hint=nomultithread     # We get physical cores, not logical
#SBATCH --gres=gpu:8             # Number of GPUs per node
#SBATCH --output=%x_%j.out       # Set this to the dir where you want slurm outs to go
import deepspeed as ds

# Print the installed DeepSpeed version to confirm the environment is sane.
print(ds.__version__)
@harry-stark
harry-stark / main.sh
Last active November 24, 2022 05:15
Batch script for multinode run
#!/bin/bash
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB       # Amount of CPU memory
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8      # Crucial - one task per GPU on each node!
#SBATCH --cpus-per-task=6        # Number of cores per task
#SBATCH --hint=nomultithread     # We get physical cores, not logical
#SBATCH --gres=gpu:8             # Number of GPUs per node
#SBATCH --output=%x_%j.out       # Set this to the dir where you want slurm outs to go
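The header above only reserves resources; a minimal sketch (not from the gist) of how a Python entrypoint launched with srun under this allocation could derive its distributed layout from standard SLURM variables:

import os

# SLURM sets these per launched task; with the header above, srun starts
# one task per GPU, so SLURM_LOCALID doubles as the local GPU index.
world_size = int(os.environ["SLURM_NTASKS"])   # nodes * ntasks-per-node
rank = int(os.environ["SLURM_PROCID"])         # this task's global rank
local_rank = int(os.environ["SLURM_LOCALID"])  # this task's index on its node

print(f"rank {rank}/{world_size}, local GPU {local_rank}")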