@joshua-taylor
joshua-taylor / blender_example.py
Last active January 7, 2023 11:02
How to get started with data viz and Blender
import bpy
#Add a light:
bpy.ops.object.light_add(type='AREA', align='WORLD', location=(2.8, -8, 14),)
data = [1,5,2,6]
barDist = 1.2
#Add some cubes to create a bar chart from the data:
for row,bar in enumerate(data):
    #Loop body assumed (the snippet is truncated here): one cube per value, spaced along x and scaled in z
    bpy.ops.mesh.primitive_cube_add(size=1, location=(row*barDist, 0, bar/2))
    bpy.context.object.scale[2] = bar
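To actually see the bars, the scene still needs a camera and a render call; a minimal sketch (not in the original gist, locations and rotation are illustrative):

#Assumed addition: point a camera at the bars and render a still image
bpy.ops.object.camera_add(location=(6, -12, 8), rotation=(1.2, 0, 0.4))
bpy.context.scene.camera = bpy.context.object
bpy.context.scene.render.filepath = '//bar_chart.png'  #saved next to the .blend file
bpy.ops.render.render(write_still=True)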

from scipy.optimize import minimize, LinearConstraint, basinhopping
from math import floor
import numpy as np

#Setting up the pricing amounts for each supplier
supplierPrice = [10.5,11,10]
supplierDiscountAmount = [0.1,0.35,0.05]
supplierDiscountThreshold = [100,260,300]
n_suppliers = len(supplierPrice)

#Our minimum order amount (requiredOrder) and the cost function func are not shown here;
#an assumed sketch of both follows this block
constraint = LinearConstraint(np.ones(n_suppliers), lb=requiredOrder, ub=1000)  #total ordered must reach the minimum
bnds = [(0,None) for i in range(n_suppliers)]  #each supplier's quantity is non-negative
minimizer_kwargs = {"method": "SLSQP",
                    "bounds": bnds,
                    "constraints": constraint,
                    }
res = basinhopping(func,            #objective: total cost of the order
                   [200,200,200],   #starting guess for each supplier's quantity
                   stepsize=500,
                   minimizer_kwargs=minimizer_kwargs)  #assumed remaining argument; the call is truncated in the snippet
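The minimum order amount and the objective passed to basinhopping are not shown above; a possible sketch (assumed, with illustrative values) that would sit above the basinhopping call:

requiredOrder = 500  #hypothetical minimum order amount

def func(quantities):
    #Hypothetical total-cost objective: a supplier's unit price drops by its discount
    #once the ordered quantity reaches that supplier's threshold
    total = 0
    for i in range(n_suppliers):
        qty = floor(quantities[i])  #whole units only
        price = supplierPrice[i]
        if qty >= supplierDiscountThreshold[i]:
            price = price * (1 - supplierDiscountAmount[i])
        total += qty * price
    return total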

import time

#Build a query vector by averaging the fastText vectors of the search terms
input = 'flood defences'.lower().split()
query = [ft_model[vec] for vec in input]
query = np.mean(query,axis=0)

t0 = time.time()
ids, distances = index.knnQuery(query, k=10)
t1 = time.time()

print(f'Searched {df.shape[0]} records in {round(t1-t0,4)} seconds \n')
for i,j in zip(ids,distances):
    print(round(j,2), df.text.values[i])  #assumed loop body: distance and matching text for each hit

import nmslib
import numpy as np

# create a matrix from our document vectors
data = np.vstack(weighted_doc_vects)
# initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(data)
index.createIndex({'post': 2}, print_progress=True)
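Once built, the index can also serve whole batches of query vectors in one call; a small sketch (assumed usage, k and num_threads are illustrative):

neighbours = index.knnQueryBatch(data[:5], k=10, num_threads=4)  #query several vectors at once
for ids, distances in neighbours:
    print(ids[:3], distances[:3])  #top-3 neighbour ids and cosine distances per query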

import numpy as np
from tqdm import tqdm

weighted_doc_vects = []
for i,doc in tqdm(enumerate(tok_text)):
    doc_vector = []
    for word in doc:
        vector = ft_model[word]
        #BM25 weight of this word in document i
        weight = ((bm25.idf[word] * ((bm25.k1 + 1.0)*bm25.doc_freqs[i][word]))
                  / (bm25.k1 * (1.0 - bm25.b + bm25.b *(bm25.doc_len[i]/bm25.avgdl)) + bm25.doc_freqs[i][word]))
        weighted_vector = vector * weight
        doc_vector.append(weighted_vector)
    #assumed completion: represent each document by the mean of its BM25-weighted word vectors
    weighted_doc_vects.append(np.mean(doc_vector, axis=0))
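The bm25 object used in that weighting is never created in these snippets; its idf, doc_freqs, doc_len, avgdl, k1 and b attributes match rank_bm25's BM25Okapi, so a plausible setup (an assumption, not shown here) is:

from rank_bm25 import BM25Okapi

bm25 = BM25Okapi(tok_text)  #assumed: fit BM25 statistics on the same tokenised corpus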

from gensim.models.fasttext import FastText

ft_model = FastText(
    sg=1,         # use skip-gram: usually gives better results
    size=100,     # embedding dimension (default); renamed to vector_size in gensim 4
    window=10,    # window size: 10 tokens before and 10 tokens after to get wider context
    min_count=5,  # only consider tokens with at least n occurrences in the corpus
    negative=15,  # negative subsampling: bigger than default to sample negative examples more
    min_n=2,      # min character n-gram
    max_n=5,      # max character n-gram
)
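The snippet ends before the model is trained; the usual gensim continuation would look roughly like this (the epoch count is illustrative):

ft_model.build_vocab(tok_text)             #assumed: vocabulary built from the tokenised corpus
ft_model.train(
    tok_text,
    total_examples=ft_model.corpus_count,  #set by build_vocab
    epochs=6,                              #illustrative value
)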
@joshua-taylor
joshua-taylor / sPacy tokenize.py
Created October 10, 2020 09:45
sPacy tokenize

import spacy
from ftfy import fix_text  # fixes mojibake and odd unicode before tokenising
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")
tok_text = []  # OUTPUT for our tokenised corpus
text = df.text.str.lower().values
text = [fix_text(str(i)) for i in text]
#Tokenising using spaCy (n_threads is ignored from spaCy 2.1 on; spaCy 3 uses n_process instead):
for doc in tqdm(nlp.pipe(text, n_threads=2, disable=["tagger", "parser", "ner"])):
    tok = [t.text for t in doc if (t.is_ascii and not t.is_punct and not t.is_space)]
    tok_text.append(tok)
@joshua-taylor
joshua-taylor / Cluster labels.py
Last active November 24, 2019 13:48
Cluster labels

from sklearn.feature_extraction.text import TfidfVectorizer

#Fit TF-IDF on the full question corpus (df is assumed to have 'questionText' and 'cluster' columns)
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df.questionText.values)

totals = 0
for cluster in df.cluster.value_counts()[0:10].index:
    #Score each of the ten biggest clusters as a single "document" against the fitted vocabulary
    stg = " ".join(df.loc[df.cluster==cluster].questionText.values)
    response = vectorizer.transform([stg])
    count = df.cluster.value_counts().loc[cluster]
    totals += count
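The snippet stops before response is used; a possible continuation (assumed, not from the original) that reads the highest-weighted TF-IDF terms off as a cluster label, shown here for the last cluster scored above and easily moved inside the loop:

import numpy as np

terms = np.array(vectorizer.get_feature_names())  #get_feature_names_out() on newer scikit-learn
top_terms = terms[np.argsort(response.toarray()[0])[::-1][:5]]  #five highest-weighted terms
print(cluster, count, top_terms)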
@joshua-taylor
joshua-taylor / BERT vectors.py
Created November 24, 2019 13:16
BERT vectors and TFIDF
import spacy
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
import IPython
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
torch.set_default_tensor_type("torch.cuda.FloatTensor")
#Load the BERT-based pipeline (an en_trf_* model from spacy-transformers 0.x, which needs spaCy 2.x)
nlp = spacy.load("en_trf_bertbaseuncased_lg")
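Nothing in the snippet uses the loaded model; a minimal usage sketch (assumed, with made-up sentences), relying on the spacy-transformers 0.x behaviour where document vectors and similarity come from the BERT output:

doc1 = nlp("Where is my order?")
doc2 = nlp("I want to cancel the things I bought")
print(doc1.similarity(doc2))  #similarity driven by the BERT-derived doc vectors
print(doc1.vector.shape)      #the document vector itself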