Skip to content

Instantly share code, notes, and snippets.

View davidmezzetti's full-sized avatar

David Mezzetti davidmezzetti

View GitHub Profile
import numpy as np
import tensorflow as tf
def np_cosine_similarity(u, v):
    """Compute pairwise cosine similarity between two sets of row vectors.

    Args:
        u: array of shape (m, d) — m vectors of dimension d
        v: array of shape (k, d) — k vectors of dimension d

    Returns:
        array of shape (m, k) where entry [i, j] is the cosine
        similarity between u[i] and v[j].
    """
    # Insert an axis so u broadcasts against v: (m, 1, d) * (k, d) -> (m, k, d)
    u = np.expand_dims(u, 1)

    # Dot products over the feature axis: (m, k)
    n = np.sum(u * v, axis=2)

    # Outer product of the vector norms: (m, 1) * (k,) -> (m, k)
    # NOTE(review): no guard against zero-norm rows — a zero vector yields a
    # divide-by-zero / NaN here, matching the original behavior.
    d = np.linalg.norm(u, axis=2) * np.linalg.norm(v, axis=1)

    return n / d
import time

# Generate random data
x = np.random.rand(1000, 25)
y = np.random.rand(50, 25)

# Time NumPy: run the cosine similarity 100 times over the same data
start = time.time()
for _ in range(100):
    np_cosine_similarity(x, y)

# Report elapsed time. The original snippet recorded `start` but never used
# it (truncated source) — this line completes the benchmark.
print("NumPy time:", time.time() - start)
# Generate random data
x = np.random.rand(5, 5)
y = np.random.rand(1, 5)

# Display both input matrices
for label, matrix in (("x:", x), ("y:", y)):
    print(label, "\n", matrix)

# Calculate cosine similarity in NumPy
results = np_cosine_similarity(x, y)
import math
import pprint
import random
from statistics import mean, stdev
# Pretty-printer for readable output of nested structures
pp = pprint.PrettyPrinter(indent=1)

# Seed the RNG so every run generates the same "random" data
random.seed(a=500)
import re
import sys
from nltk.stem.porter import PorterStemmer
class Tokenizer(object):
    """Tokenization helpers. Visible portion defines the stop-word list."""

    # Standard stop words used by Lucene/Elasticsearch
    STOP_WORDS = set([
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with",
    ])
>>> import pytz
>>> from pytz import timezone
>>> utc = pytz.utc
>>> eastern = timezone('US/Eastern')
>>> date = datetime.now(utc)
>>> date
datetime.datetime(2020, 2, 6, 21, 22, 49, 384018, tzinfo=<UTC>)
>>> date.astimezone(eastern)
datetime.datetime(2020, 2, 6, 16, 22, 49, 384018, tzinfo=<DstTzInfo 'US/Eastern' EST-1 day, 19:00:00 STD>)
>>> from datetime import datetime, timedelta
>>> date = datetime.now()
>>> date
datetime.datetime(2020, 2, 6, 14, 49, 14, 277747)
>>> date - timedelta(minutes=10)
datetime.datetime(2020, 2, 6, 14, 39, 14, 277747)
>>> date - timedelta(hours=10)
datetime.datetime(2020, 2, 6, 4, 49, 14, 277747)
>>> date - timedelta(days=10)
datetime.datetime(2020, 1, 27, 14, 49, 14, 277747)
from txtai.embeddings import Embeddings

# Create embeddings model, backed by sentence-transformers & transformers
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})

# NOTE(review): the scraped source was missing the closing bracket of this
# list — restored here.
data = ["US tops 5 million confirmed virus cases",
        "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
        "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
        "The National Park Service warns against sacrificing slower friends in a bear attack",
        "Maine man wins $1M from $25 lottery ticket"]

# Create an index for the list of text
embeddings.index([(uid, text, None) for uid, text in enumerate(data)])

print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

# Run an embeddings search for each query
for query in ("feel good story", "climate change", "public health story", "war", "wildlife",
              "asia", "lucky", "dishonest junk"):
    # Extract uid of first result
    # NOTE(review): loop body was truncated in the scraped source; reconstructed
    # from the standard txtai intro example — search returns (id, score) pairs
    # ranked by similarity. Confirm against the original gist.
    uid = embeddings.search(query, 1)[0][0]
    print("%-20s %s" % (query, data[uid]))
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor

# Embeddings model, backed by sentence-transformers & transformers,
# used to select passages relevant to each question
model_config = {"path": "sentence-transformers/nli-mpnet-base-v2"}
embeddings = Embeddings(model_config)

# Extractor pairs the embeddings model with a SQuAD-tuned QA model
extractor = Extractor(embeddings, "distilbert-base-cased-distilled-squad")