This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def bpr_loss(positive_predictions, negative_predictions):
    """
    Bayesian Personalised Ranking (BPR) pairwise loss.

    For each pair, the loss is ``1 - sigmoid(positive - negative)``: it is
    close to 0 when the positive score is well above the negative score and
    close to 1 when the ordering is reversed. The per-pair values are
    averaged into a single value.

    Original implementation: https://github.com/maciejkula/spotlight

    Args:
        positive_predictions: Scores predicted for the observed items.
        negative_predictions: Scores predicted for the sampled negative
            items; subtracted element-wise from ``positive_predictions``.

    Returns:
        The mean of the per-pair losses.
    """
    score_gap = positive_predictions - negative_predictions
    per_pair_loss = 1.0 - F.sigmoid(score_gap)
    return per_pair_loss.mean()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def to_dask_array(df): | |
# https://stackoverflow.com/questions/37444943/dask-array-from-dataframe?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa | |
partitions = df.to_delayed() | |
shapes = [part.values.shape for part in partitions] | |
dtypes = partitions[0].dtypes | |
results = compute(dtypes, *shapes) # trigger computation to find shape | |
dtypes, shapes = results[0], results[1:] | |
chunks = [da.from_delayed(part.values, shape, dtypes) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
interactions = da.from_npy_stack('interactions') | |
users = interactions[:,0] | |
items = interactions[:,1] | |
slicer = 10000000 | |
for i in tqdm(range(math.ceil((len(interactions))/slicer))): | |
if i == 0: | |
user_set = set(users[i*slicer: (i+1)*slicer].compute()) | |
else: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from annoy import AnnoyIndex | |
# Build an Annoy approximate-nearest-neighbour index over the item embeddings
# and persist it to disk.

# Vector length handed to AnnoyIndex; every vector added below is expected to
# have this many components.
f = 32
# Name the metric explicitly. 'angular' is the metric Annoy falls back to when
# the argument is omitted, but relying on that implicit default is deprecated
# and emits a FutureWarning.
t = AnnoyIndex(f, 'angular')
# Item ids in the index are the row positions in item_embeddings.
for i in range(len(item_embeddings)):
    t.add_item(i, item_embeddings[i])
t.build(10)  # 10 trees
t.save('github.ann')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def update_prospects(self): | |
self.df_es = self.get_reviews(gt=True) | |
nlp = spacy.load('en') | |
nlp_sent = spacy.load('appreviews/appclass') | |
matcher = Matcher(nlp.vocab) | |
matcher.add("feednoun", None, [{POS: 'NOUN', 'LOWER': 'feed'}]) | |
matcher.add("follow", None, [{'LOWER': 'follow'}, {LEMMA: 'relation'}]) | |
matcher.add("follows", None, [{'LOWER': 'follow'}, {'LOWER': 'relationships'}]) | |
matcher.add("follows", None, [{'LOWER': 'follow'}, {LEMMA: 'relationships'}]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nlp = spacy.blank('en') # create blank Language class | |
print("Created blank 'en' model") | |
# add the text classifier to the pipeline if it doesn't exist | |
# nlp.create_pipe works for built-ins that are registered with spaCy | |
if 'textcat' not in nlp.pipe_names: | |
textcat = nlp.create_pipe('textcat') | |
nlp.add_pipe(textcat, last=True) | |
# otherwise, get it, so we can add labels to it | |
else: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the 'users' stack from disk and hold it in memory as int32.
print('users')
users = da.from_npy_stack('users', mmap_mode=None)
users = users.compute().astype(np.int32)

# Same for the 'items' stack.
print('items')
items = da.from_npy_stack('items', mmap_mode=None)
items = items.compute().astype(np.int32)

# Distinct item ids, the position of every interaction's item within that
# distinct list, and how many times each distinct item occurs.
print('getting unique')
unique_items, item_inverse, item_count = np.unique(
    items, return_inverse=True, return_counts=True
)

# Keep only the items that occur more than 50 times.
print('creating mask')
good_items = unique_items[item_count > 50]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import pyarrow as pa | |
import pyarrow.parquet as pq | |
import requests | |
import datetime | |
import os | |
import gzip | |
from joblib import Parallel, delayed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from distributed import Client, LocalCluster | |
import dask.dataframe as dd | |
import numpy as np | |
# Start a local dask cluster of 32 single-threaded workers, with a memory
# limit of 2e9 bytes and the diagnostics page on port 8787.
cluster = LocalCluster(
    ip='0.0.0.0',
    n_workers=32,
    threads_per_worker=1,
    diagnostics_port=8787,
    memory_limit=2e9,
)
client = Client(cluster)
print(client)

# Open the parquet dataset lazily; len(df) is what forces the row count to be
# computed.
df = dd.read_parquet('parquet/')
print(f'found {len(df)} interactions')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |
def gpu(tensor, gpu=False): | |
if gpu: | |
return tensor.cuda() | |
else: |
OlderNewer