Skip to content

Instantly share code, notes, and snippets.

View napsternxg's full-sized avatar
🎯
Focusing

Shubhanshu Mishra napsternxg

🎯
Focusing
View GitHub Profile
@napsternxg
napsternxg / prune_sklearn_model.py
Last active June 14, 2023 21:41
Prune Sklearn TF-IDF Logistic Regression model
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import sparse
from joblib import dump, load
import joblib
import time
@napsternxg
napsternxg / food.com.download.sh
Last active June 10, 2023 13:29
Food.com sitemap
# Download the Food.com sitemap index and every sub-sitemap it lists.
# All files are saved into ./food.com.
mkdir -p food.com        # -p: do not fail when the directory already exists
cd food.com || exit 1    # abort instead of downloading into the wrong directory
wget https://www.food.com/sitemap.xml

# Extract the sub-sitemap URLs once and reuse the list for both passes.
urls=$(grep "<loc>https://www.food.com/sitemap-" sitemap.xml | sed -n 's:.*<loc>\(.*\)</loc>.*:\1:p')

# First pass: list what is about to be fetched.
for url in $urls;
do echo "Download: $url";
done

# Second pass: fetch each sub-sitemap.
for url in $urls;
do wget "$url";
done
@napsternxg
napsternxg / gen_clip_embeddings.py
Created April 20, 2023 12:12
Gen Text Embeddings
from pathlib import Path
import torch
from transformers import CLIPProcessor, CLIPTextModelWithProjection
from accelerate import Accelerator
from datasets import Dataset
import pandas as pd
import numpy as np
@napsternxg
napsternxg / merge_pdfs.py
Last active April 18, 2023 20:46
Merge PDFs
"""
pip install pypdf
"""
from pypdf import PdfWriter
def main(args):
merger = PdfWriter()
file_paths = args.input_files
for pdf in file_paths:
@napsternxg
napsternxg / accelerated_sentence_transformer.diff
Last active November 7, 2023 16:20
accelerate support for sentence_transformer
diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py
index e44e573..ae4dea4 100644
--- a/sentence_transformers/SentenceTransformer.py
+++ b/sentence_transformers/SentenceTransformer.py
@@ -16,6 +16,7 @@ from torch.optim import Optimizer
from torch.utils.data import DataLoader
import torch.multiprocessing as mp
from tqdm.autonotebook import trange
+from tqdm.autonotebook import tqdm
import math
@napsternxg
napsternxg / spacy_transformer.py
Last active April 15, 2023 06:21
Spacy sklearn Transformer - Use spacy embeddings in Sklearn model pipelines
"""Spacy Embedding Transformer for Sklearn pipeline
Install spacy and floret
```bash
pip install spacy floret scikit-learn
```
First download the vectors from:
```bash
@napsternxg
napsternxg / PyTorchBiggraph-Pytorch-pyarrow.diff
Created April 11, 2023 02:41
Improve edgelist processing speed of PyTorchBiggraph-Pytorch using pyarrow parquet reader.
diff --git a/torchbiggraph/converters/importers.py b/torchbiggraph/converters/importers.py
index fa84bc6..765e9fa 100644
--- a/torchbiggraph/converters/importers.py
+++ b/torchbiggraph/converters/importers.py
@@ -28,6 +28,7 @@ from torchbiggraph.graph_storages import (
RELATION_TYPE_STORAGES,
)
from torchbiggraph.types import UNPARTITIONED
+from tqdm import tqdm
@napsternxg
napsternxg / convergence_to_eigenvector.py
Created April 4, 2023 16:06
Repeated matrix multiplication makes the column vectors converge to eigenvectors of the matrix
# ! pip install celluloid
import numpy as np
import matplotlib.pyplot as plt
from celluloid import Camera
def plot_mat(A, evals=None, evecs=None, fig=None, ax=None):
if ax is None:
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
A = A / np.linalg.norm(A, axis=0, keepdims=True)
ax[0].imshow(A)
@napsternxg
napsternxg / parse_tweetnerd_files.py
Created February 3, 2023 04:16
Parse TweetNERD files
from zipfile import ZipFile
from pathlib import Path
import pandas as pd
import numpy as np
import json
JOB_FILES = list(Path(".").glob("**/job_*.json.zip"))
JOB_ID_TO_OUTPUT_PART = {
1873084: 12,