import re
from ftfy import fix_text

def ngrams(string, n=3):
    string = fix_text(string) # fix text encoding issues
    string = string.encode("ascii", errors="ignore").decode() # remove non-ascii chars
    string = string.lower() # make lower case
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string) # remove the list of chars defined above
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    ngrams = zip(*[string[i:] for i in range(n)]) # overlapping character n-grams
    return [''.join(ngram) for ngram in ngrams]
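A hedged usage sketch (the sample names below are made up, not from the gist): the function above can be passed to TfidfVectorizer as a custom analyzer so similarity is computed over character trigrams rather than whole words.

from sklearn.feature_extraction.text import TfidfVectorizer

names = ["Acme Widgets Ltd.", "ACME WIDGETS LIMITED", "Globex Corporation"] # made-up sample data
ngram_vectorizer = TfidfVectorizer(analyzer=ngrams)
tfidf_matrix = ngram_vectorizer.fit_transform(names)
print(tfidf_matrix.shape) # one row per name, one column per character trigram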
import IPython

tkn = tfidf.build_tokenizer()
sent = df.questionText.values[236178].lower()
sent = tkn(sent)
html = ''
for wrd in sent:
    try:
        weight = (tfidf.idf_[tfidf.vocabulary_[wrd]]) * 10
        print(weight / 10)
    except KeyError:
        pass # word not in the TF-IDF vocabulary (e.g. removed as a stop word)
joshua-taylor / BERT vectors.py
Created November 24, 2019 13:16
BERT vectors and TFIDF
import spacy
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
import IPython
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

nlp = spacy.load("en_trf_bertbaseuncased_lg")
joshua-taylor / Cluster labels.py
Last active November 24, 2019 13:48
Cluster labels
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(df.questionText.values)
totals = 0
for cluster in df.cluster.value_counts()[0:10].index:
    stg = " ".join(df.loc[df.cluster==cluster].questionText.values)
    response = vectorizer.transform([stg])
    count = df.cluster.value_counts().loc[cluster]
    totals += count
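One way to turn these per-cluster TF-IDF vectors into labels (a sketch, not necessarily the original gist's continuation): pick the highest-weighted terms for each of the ten largest clusters.

import numpy as np

terms = np.array(vectorizer.get_feature_names_out()) # get_feature_names() on older scikit-learn
for cluster in df.cluster.value_counts()[0:10].index:
    stg = " ".join(df.loc[df.cluster == cluster].questionText.values)
    weights = vectorizer.transform([stg]).toarray().ravel()
    top_terms = terms[weights.argsort()[::-1][:5]] # five highest-TF-IDF terms for this cluster
    print(cluster, ", ".join(top_terms))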
joshua-taylor / sPacy tokenize.py
Created October 10, 2020 09:45
spaCy tokenize
nlp = spacy.load("en_core_web_sm")
tok_text=[] # OUTPUT for our tokenised corpus
text = df.text.str.lower().values
text = [fix_text(str(i)) for i in text]
#Tokenising using SpaCy:
for doc in tqdm(nlp.pipe(text, n_threads=2, disable=["tagger", "parser","ner"])):
tok = [t.text for t in doc if (t.is_ascii and not t.is_punct and not t.is_space)]
tok_text.append(tok)
from gensim.models.fasttext import FastText

ft_model = FastText(
    sg=1, # use skip-gram: usually gives better results
    size=100, # embedding dimension (default; renamed vector_size in gensim 4+)
    window=10, # window size: 10 tokens before and 10 tokens after to get wider context
    min_count=5, # only consider tokens with at least n occurrences in the corpus
    negative=15, # negative subsampling: bigger than default to sample negative examples more
    min_n=2, # min character n-gram
    max_n=5, # max character n-gram
)
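A hedged continuation (gensim 3.x API assumed, matching the size= keyword above; the epoch count is illustrative): build the vocabulary and train on the tokenised corpus before using the subword vectors.

ft_model.build_vocab(tok_text)
ft_model.train(
    tok_text,
    epochs=6, # illustrative value
    total_examples=ft_model.corpus_count,
    total_words=ft_model.corpus_total_words)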
from rank_bm25 import BM25Okapi # the bm25 object below is assumed to be a rank_bm25 index over the corpus
import numpy as np

bm25 = BM25Okapi(tok_text)

weighted_doc_vects = []
for i, doc in tqdm(enumerate(tok_text)):
    doc_vector = []
    for word in doc:
        vector = ft_model[word]
        # BM25 weight for this term in this document
        weight = (bm25.idf[word] * ((bm25.k1 + 1.0) * bm25.doc_freqs[i][word])
                  / (bm25.k1 * (1.0 - bm25.b + bm25.b * (bm25.doc_len[i] / bm25.avgdl)) + bm25.doc_freqs[i][word]))
        weighted_vector = vector * weight
        doc_vector.append(weighted_vector)
    weighted_doc_vects.append(np.mean(doc_vector, axis=0))
import nmslib
import time
# create a matrix from our document vectors
data = np.vstack(weighted_doc_vects)
# initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(data)
index.createIndex({'post': 2}, print_progress=True)
input = 'flood defences'.lower().split()
query = [ft_model[vec] for vec in input]
query = np.mean(query,axis=0)
t0 = time.time()
ids, distances = index.knnQuery(query, k=10)
t1 = time.time()
print(f'Searched {df.shape[0]} records in {round(t1-t0,4)} seconds \n')
for i, j in zip(ids, distances):
    print(round(j, 2), df.text.values[i]) # distance and matched text (df.text assumed, as in the tokenising step)
from scipy.optimize import minimize, LinearConstraint, basinhopping
from math import floor
import numpy as np
#Setting up the pricing amounts for each supplier
supplierPrice = [10.5,11,10]
supplierDiscountAmount = [0.1,0.35,0.05]
supplierDiscountThreshold = [100,260,300]
n_suppliers = len(supplierPrice)
#Our minimum order amount
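The gist is cut off here. A rough sketch of how the pieces above could fit together (the minimum-order value, objective function, and solver setup below are assumptions, not the original code): each supplier's unit price drops by its discount once the ordered quantity passes that supplier's threshold, and that step in the cost function is presumably why a global method like basinhopping is imported alongside minimize.

minimumOrder = 500 # placeholder value; the real amount is truncated from the gist

def total_cost(quantities):
    # unit price falls by the discount once the quantity passes the supplier's threshold
    cost = 0.0
    for qty, price, disc, thresh in zip(quantities, supplierPrice,
                                        supplierDiscountAmount, supplierDiscountThreshold):
        unit_price = price * (1 - disc) if qty >= thresh else price
        cost += qty * unit_price
    return cost

# require the combined order to meet the minimum, keep quantities non-negative
cons = [{'type': 'ineq', 'fun': lambda q: np.sum(q) - minimumOrder}]
x0 = np.full(n_suppliers, minimumOrder / n_suppliers)
result = minimize(total_cost, x0, bounds=[(0, None)] * n_suppliers, constraints=cons)
print(result.x, total_cost(result.x))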