Skip to content

Instantly share code, notes, and snippets.

View davidmezzetti's full-sized avatar

David Mezzetti davidmezzetti

View GitHub Profile
import numpy as np
import tensorflow as tf
def np_cosine_similarity(u, v):
    """Compute pairwise cosine similarity between two sets of row vectors.

    Args:
        u: array of shape (m, d) — m vectors of dimension d
        v: array of shape (k, d) — k vectors of dimension d

    Returns:
        array of shape (m, k) where entry [i, j] is the cosine
        similarity between u[i] and v[j].
    """
    # Insert an axis so u broadcasts against v: (m, 1, d) * (k, d) -> (m, k, d)
    u = np.expand_dims(u, 1)

    # Dot products over the feature axis: (m, k)
    n = np.sum(u * v, axis=2)

    # Outer product of the vector norms: (m, 1) * (k,) -> (m, k)
    # NOTE(review): no guard against zero-norm rows — a zero vector yields a
    # divide-by-zero / NaN here, matching the original behavior.
    d = np.linalg.norm(u, axis=2) * np.linalg.norm(v, axis=1)

    return n / d
import time

# Generate random data
x = np.random.rand(1000, 25)
y = np.random.rand(50, 25)

# Time NumPy: run the cosine similarity 100 times over the same data
start = time.time()
for _ in range(100):
    np_cosine_similarity(x, y)

# Report elapsed time. The original snippet recorded `start` but never used
# it (truncated source) — this line completes the benchmark.
print("NumPy time:", time.time() - start)
# Generate random data
x = np.random.rand(5, 5)
y = np.random.rand(1, 5)

# Display both input matrices
for label, matrix in (("x:", x), ("y:", y)):
    print(label, "\n", matrix)

# Calculate cosine similarity in NumPy
results = np_cosine_similarity(x, y)
import math
import pprint
import random
from statistics import mean, stdev
# Pretty-printer for readable output of nested structures
pp = pprint.PrettyPrinter(indent=1)

# Seed the RNG so every run generates the same "random" data
random.seed(a=500)
import re
import sys
from nltk.stem.porter import PorterStemmer
class Tokenizer(object):
    """Tokenization helpers. Visible portion defines the stop-word list."""

    # Standard stop words used by Lucene/Elasticsearch
    STOP_WORDS = set([
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with",
    ])
>>> import pytz
>>> from pytz import timezone
>>> utc = pytz.utc
>>> eastern = timezone('US/Eastern')
>>> date = datetime.now(utc)
>>> date
datetime.datetime(2020, 2, 6, 21, 22, 49, 384018, tzinfo=<UTC>)
>>> date.astimezone(eastern)
datetime.datetime(2020, 2, 6, 16, 22, 49, 384018, tzinfo=<DstTzInfo 'US/Eastern' EST-1 day, 19:00:00 STD>)
>>> from datetime import datetime, timedelta
>>> date = datetime.now()
>>> date
datetime.datetime(2020, 2, 6, 14, 49, 14, 277747)
>>> date - timedelta(minutes=10)
datetime.datetime(2020, 2, 6, 14, 39, 14, 277747)
>>> date - timedelta(hours=10)
datetime.datetime(2020, 2, 6, 4, 49, 14, 277747)
>>> date - timedelta(days=10)
datetime.datetime(2020, 1, 27, 14, 49, 14, 277747)
from txtai.embeddings import Embeddings

# Create embeddings model, backed by sentence-transformers & transformers
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})

# NOTE(review): the scraped source was missing the closing bracket of this
# list — restored here.
data = ["US tops 5 million confirmed virus cases",
        "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
        "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
        "The National Park Service warns against sacrificing slower friends in a bear attack",
        "Maine man wins $1M from $25 lottery ticket"]

# Create an index for the list of text
embeddings.index([(uid, text, None) for uid, text in enumerate(data)])

print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

# Run an embeddings search for each query
for query in ("feel good story", "climate change", "public health story", "war", "wildlife",
              "asia", "lucky", "dishonest junk"):
    # Extract uid of first result
    # NOTE(review): loop body was truncated in the scraped source; reconstructed
    # from the standard txtai intro example — search returns (id, score) pairs
    # ranked by similarity. Confirm against the original gist.
    uid = embeddings.search(query, 1)[0][0]
    print("%-20s %s" % (query, data[uid]))
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor

# Embeddings model, backed by sentence-transformers & transformers,
# used to select passages relevant to each question
model_config = {"path": "sentence-transformers/nli-mpnet-base-v2"}
embeddings = Embeddings(model_config)

# Extractor pairs the embeddings model with a SQuAD-tuned QA model
extractor = Extractor(embeddings, "distilbert-base-cased-distilled-squad")