Skip to content

Instantly share code, notes, and snippets.

View yuchenlin's full-sized avatar
:octocat:

(Bill) Yuchen Lin yuchenlin

:octocat:
View GitHub Profile
@yuchenlin
yuchenlin / clean_conceptnet.py
Created March 5, 2020 05:10
Cleaning ConceptNet
```
wget https://s3.amazonaws.com/conceptnet/downloads/2017/edges/conceptnet-assertions-5.5.5.csv.gz
gunzip -k conceptnet-assertions-5.5.5.csv.gz
```
import json
def del_pos(s):
"""
Deletes part-of-speech encoding from an entity string, if present.
@yuchenlin
yuchenlin / batched_roberta_infer.py
Last active May 23, 2020 06:09
Batched version for using RoBERTa to do inference
import torch
import numpy as np
from tqdm import tqdm
from fairseq.models.roberta import RobertaModel
from fairseq.data.data_utils import collate_tokens
from torch.utils.data import DataLoader, SequentialSampler
# Fetch the MNLI-finetuned RoBERTa-large checkpoint through torch.hub, switch
# it to evaluation mode, and move it onto the GPU, all in a single expression
# (both `eval()` and `cuda()` return the module itself).
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli').eval().cuda()
@yuchenlin
yuchenlin / bert_kmeans.py
Created December 20, 2020 23:50
Text Clustering with Sentence BERT
from sentence_transformers import SentenceTransformer # pip install -U sentence-transformers
from sklearn.cluster import KMeans
from collections import defaultdict
# Path of the text file to load; each line of the file becomes one entry.
INPUT_FILE = "/tmp/test_input.txt"

# Read the whole file at once and split it into lines without their trailing
# newline characters. The read must sit inside the `with` body so that it runs
# while the file handle is still open.
with open(INPUT_FILE, "r") as f:
    lines = f.read().splitlines()

# Report how many lines were loaded.
print(len(lines))
@yuchenlin
yuchenlin / mv.sh
Last active April 22, 2021 02:13
`mv` a folder with a progress bar and detailed logs.
#!/bin/bash
# Move a directory with a `pv` progress bar, logging every line that
# `mv -v` prints to /tmp/mv_log.txt.
# Usage: bash mv.sh /path/to/src/ /path/to/target/

# Refuse to run without both paths; an empty path would otherwise make
# `find` and `mv` operate on unintended locations.
if [ "$#" -ne 2 ]; then
    echo "Usage: bash mv.sh /path/to/src/ /path/to/target/" >&2
    exit 1
fi

sourcedir=$1
targetdir=$2

# Count the entries under the source directory; pv uses this as the expected
# number of output lines. `tr` strips the padding some `wc` builds emit.
filecount=$(find "$sourcedir" | wc -l | tr -d ' ')
echo "$filecount" # Print the number of the total files.

# -p: create missing parents and do not complain if the target already exists.
mkdir -p "$targetdir"

# Quote the paths so names containing spaces or glob characters survive.
mv -v "$sourcedir" "$targetdir" | pv -l -s "$filecount" > /tmp/mv_log.txt
@yuchenlin
yuchenlin / encoding.py
Last active September 29, 2021 04:51
Encode examples with a BART model
from argparse import Namespace
from numpy.core.defchararray import index
from semanticdebugger.debug_algs.cl_simple_alg import ContinualFinetuning
from tqdm import tqdm
import torch
from semanticdebugger.models.utils import trim_batch
import json
from semanticdebugger.debug_algs import run_lifelong_finetune
"""
# stanza.download()
# http://nlp.stanford.edu/software/stanza/1.0.0/en/default.zip
Example usage:
CUDA_VISIBLE_DEVICES=1 \
python parsing.py \
--input_corpus_path ./corpora/gkb_best_sent.txt \
--output_json_path ./parses/gkb_best.parses.jsonl \
--prefix gkb_best --num_shards 10000 --shard_id 0
@yuchenlin
yuchenlin / grade.sh
Last active April 8, 2022 20:23
Grading for CSCI 561
#!/bin/bash
# Start a fresh report (">" truncates), then append the next section header.
# The report path is quoted so a path containing whitespace does not cause an
# "ambiguous redirect" error.
echo "HW3 Report" > "$vocareumReportFile"
echo "Programming language..." >> "$vocareumReportFile"
# Collect the names in the current directory that contain "NeuralNetwork";
# the result is matched against the expected submission filename below.
filename=$(ls|grep NeuralNetwork)
if [[ $filename =~ (^|[[:space:]])"NeuralNetwork.py"($|[[:space:]]) ]]; then
cmd="python NeuralNetwork.py train_image.csv train_label.csv test_image.csv"
mnist_cmd="python NeuralNetwork.py grading_train_image.csv grading_train_label.csv grading_test_image.csv"
ta_cmd="python NeuralNetwork.py grading_train_image.csv grading_train_label.csv additional_test_image.csv"
@yuchenlin
yuchenlin / 561-hw3-grading.py
Last active April 8, 2022 20:33
561-hw3-grading.py
import sys
# Parse the two accuracy values passed on the command line as floats.
acc = float(sys.argv[1]) # mnist acc
ta_acc = float(sys.argv[2]) # ta acc
# NOTE(review): t1/t2 presumably bound the mnist accuracy and tt1/tt2 the ta
# accuracy (lower/upper thresholds, in percent) -- the code that consumes them
# is not visible here, so confirm against the scoring logic.
t1=50.00
t2=90.00
tt1=30.0
tt2=60.0
"""
Decompose and memorize by program, only asking LLMs to do low-level computations.
"""
import openai
import re
import math
from tenacity import (
retry,
@yuchenlin
yuchenlin / gpt_sent_prob.py
Last active May 21, 2023 17:12
Compute sentence probability using GPT-2 with huggingface transformers
import torch
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from scipy.special import softmax
def model_init(model_string, cuda):
if model_string.startswith("gpt2"):
tokenizer = GPT2Tokenizer.from_pretrained(model_string)
model = GPT2LMHeadModel.from_pretrained(model_string)