Harsimranjeet Saini (harry-stark)
💭 Songs, Codes, Coffee and Memes all along the way.
  • Remote all life
import time

from pyspark.ml.feature import HashingTF, MinHashLSH, MinHashLSHModel, Tokenizer
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

from spark_session_builder import build_spark_session

# Spin up a Spark session on the standing cluster, sized for the dedup job.
spark = build_spark_session(
    master="spark://cpu128-dy-r6i-32xlarge-3:7077", num_cores=128, mem_gb=999
)

# MinHash LSH parameters: number of hash tables and the similarity-join threshold.
hash_size = 100
threshold = 0.8
start = time.time()
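The snippet ends before the pipeline itself. Below is a minimal sketch of how these pieces are typically wired for MinHash near-duplicate detection; the `docs` DataFrame, its `text` and `id` columns, and the input path are assumptions, not the gist's code.

# Hypothetical input: documents with "id" and "text" columns (path assumed).
docs = spark.read.parquet("documents.parquet")

tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
hashing_tf = HashingTF(inputCol="tokens", outputCol="features")
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=hash_size)

featurized = hashing_tf.transform(tokenizer.transform(docs))
model = mh.fit(featurized)

# Self-join to find pairs within the Jaccard-distance threshold,
# keeping each unordered pair once.
pairs = model.approxSimilarityJoin(featurized, featurized, threshold, distCol="dist")
pairs = pairs.filter(col("datasetA.id") < col("datasetB.id"))
print(f"Found {pairs.count()} candidate pairs in {time.time() - start:.1f}s")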
@harry-stark
harry-stark / codegen_gptj_converter.py
Created January 19, 2023 16:33
```python convertFt.py --output_dir= --n_gpus=8```
import torch
from transformers import GPTJForCausalLM, GPTJConfig
from transformers import CodeGenTokenizer, CodeGenForCausalLM


def cg2gptj(code_model):
    # Load the source CodeGen checkpoint and keep its config for the mapping.
    cg_model = CodeGenForCausalLM.from_pretrained(code_model, torch_dtype="auto")
    cg_config = cg_model.config
    # Create empty GPTJ model
    print('Creating empty GPTJ model')
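The gist is truncated here. A sketch of how the empty GPT-J model would plausibly be built, using the hyperparameter names that CodeGenConfig and GPTJConfig share in transformers (an assumption about the continuation, not the author's actual mapping):

    # Sketch: mirror the CodeGen hyperparameters into a GPT-J config.
    gptj_config = GPTJConfig(
        vocab_size=cg_config.vocab_size,
        n_positions=cg_config.n_positions,
        n_embd=cg_config.n_embd,
        n_layer=cg_config.n_layer,
        n_head=cg_config.n_head,
        rotary_dim=cg_config.rotary_dim,
    )
    gptj_model = GPTJForCausalLM(gptj_config)
    # The remaining (omitted) step would remap the CodeGen state dict into
    # gptj_model, including splitting the fused qkv_proj into q/k/v projections.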
@harry-stark
harry-stark / conda_setup.sh
Created January 3, 2023 14:08
Latest paths script
# Load MPI and the CUDA 11.7 toolchain from the cluster's module system.
module load openmpi cuda/11.7

#CONDA_HOME=/fsx/quentin/miniconda3
CONDA_HOME=/fsx/gpt-neox/conda/envs/neox
#CONDA_HOME=/fsx/gpt-neox/conda/envs/improved-t5

# Point the linker and compiler at the matching cuDNN 8.6 build.
CUDNN_HOME=/fsx/quentin/cudnn-linux-x86_64-8.6.0.163_cuda11-archive
export LD_LIBRARY_PATH=$CUDNN_HOME/lib:$LD_LIBRARY_PATH
export CPATH=$CUDNN_HOME/include:$CPATH
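A quick way to confirm the paths took effect, assuming PyTorch is installed in the selected conda environment (a verification sketch, not part of the gist):

import torch

# Sanity-check that the environment resolved the intended CUDA/cuDNN builds.
print(torch.version.cuda)              # expect an 11.7 build
print(torch.backends.cudnn.version())  # expect 8600 for cuDNN 8.6.0
print(torch.cuda.is_available())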
import pandas as pd
from urllib.parse import urlparse

df = pd.read_csv()  # path elided in the original gist


def url_matches_dataframe(url: str, df: pd.DataFrame) -> bool:
    # Parse the given URL to get the netloc and hostname
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    hostname = parsed_url.hostname
    # Remove "www" from the netloc and hostname
    netloc = netloc.removeprefix("www.")
    hostname = hostname.removeprefix("www.") if hostname else hostname
    # The gist cuts off after the comment above; a minimal completion,
    # assuming the frame has a "url" column of hostnames to match against.
    return bool(df["url"].isin([netloc, hostname]).any())
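Hypothetical usage of the completed helper (values invented for illustration):

frame = pd.DataFrame({"url": ["example.com", "github.com"]})
print(url_matches_dataframe("https://www.example.com/page", frame))  # True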
#!/bin/bash
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB       # Amount of CPU memory
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=8      # Crucial - one task per GPU on each node!
#SBATCH --cpus-per-task=6        # Number of cores per task
#SBATCH --hint=nomultithread     # We get physical cores, not logical
#SBATCH --gres=gpu:8             # Number of GPUs per node
#SBATCH --output=%x_%j.out       # Set this to the dir where you want slurm outs to go
import deepspeed as ds

# Print the installed DeepSpeed version to confirm the environment is sane.
print(ds.__version__)
@harry-stark
harry-stark / main.sh
Last active November 24, 2022 05:15
Batch script for multinode run
#!/bin/bash
#SBATCH --job-name="elm"
#SBATCH --partition=gpu
#SBATCH --mem-per-cpu=16GB       # Amount of CPU memory
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8      # Crucial - one task per GPU on each node!
#SBATCH --cpus-per-task=6        # Number of cores per task
#SBATCH --hint=nomultithread     # We get physical cores, not logical
#SBATCH --gres=gpu:8             # Number of GPUs per node
#SBATCH --output=%x_%j.out       # Set this to the dir where you want slurm outs to go
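The header above only reserves resources; a minimal sketch (not from the gist) of how a Python entrypoint launched with srun under this allocation could derive its distributed layout from standard SLURM variables:

import os

# SLURM sets these per launched task; with the header above, srun starts
# one task per GPU, so SLURM_LOCALID doubles as the local GPU index.
world_size = int(os.environ["SLURM_NTASKS"])   # nodes * ntasks-per-node
rank = int(os.environ["SLURM_PROCID"])         # this task's global rank
local_rank = int(os.environ["SLURM_LOCALID"])  # this task's index on its node

print(f"rank {rank}/{world_size}, local GPU {local_rank}")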