Balazs Horanyi BalazsHoranyi

## bpr_loss.py
def bpr_loss(positive_predictions, negative_predictions):
    """
    Bayesian Personalised Ranking pairwise loss function.

    Original implementation: https://github.com/maciejkula/spotlight

    Computes ``1 - sigmoid(positive - negative)`` element-wise and returns
    the mean over all elements.

    Parameters
    ----------
    positive_predictions : torch.Tensor
        Predicted scores for the positive side of each pair.
    negative_predictions : torch.Tensor
        Predicted scores for the negative side of each pair; must be
        subtractable from ``positive_predictions``.

    Returns
    -------
    torch.Tensor
        Zero-dimensional tensor holding the mean loss.
    """
    score_gap = positive_predictions - negative_predictions

    # ``F.sigmoid`` is a deprecated wrapper around ``Tensor.sigmoid``; calling
    # the tensor method directly gives the same values without the wrapper.
    loss = 1.0 - score_gap.sigmoid()

    return loss.mean()

## dask_to_array.py
def to_dask_array(df):
    #  https://stackoverflow.com/questions/37444943/dask-array-from-dataframe?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
    partitions = df.to_delayed()
    shapes = [part.values.shape for part in partitions]
    dtypes = partitions[0].dtypes

    results = compute(dtypes, *shapes)  # trigger computation to find shape
    dtypes, shapes = results[0], results[1:]

    chunks = [da.from_delayed(part.values, shape, dtypes)

## user_item_interaction_split.py

interactions = da.from_npy_stack('interactions')
users = interactions[:,0]
items = interactions[:,1]
slicer = 10000000

for i in tqdm(range(math.ceil((len(interactions))/slicer))):
    if i == 0:
        user_set = set(users[i*slicer: (i+1)*slicer].compute())
    else:

## ANN_index_build.py
from annoy import AnnoyIndex

# Vector length passed to AnnoyIndex; every added item must have this size.
f = 32
# Name the metric explicitly: Annoy deprecates relying on the implicit
# default, and 'angular' is the value that default resolves to.
t = AnnoyIndex(f, metric='angular')
# Each embedding is stored under its position in item_embeddings.
for i, embedding in enumerate(item_embeddings):
    t.add_item(i, embedding)

t.build(10)  # 10 trees
t.save('github.ann')

## lead_generator.py
def update_prospects(self):

    self.df_es = self.get_reviews(gt=True)
    nlp = spacy.load('en')
    nlp_sent = spacy.load('appreviews/appclass')
    matcher = Matcher(nlp.vocab)
    matcher.add("feednoun", None, [{POS: 'NOUN', 'LOWER': 'feed'}])
    matcher.add("follow", None, [{'LOWER': 'follow'}, {LEMMA: 'relation'}])
    matcher.add("follows", None, [{'LOWER': 'follow'}, {'LOWER': 'relationships'}])
    matcher.add("follows", None, [{'LOWER': 'follow'}, {LEMMA: 'relationships'}])

## spacy_sentiment_classifier.py
nlp = spacy.blank('en')  # create blank Language class
print("Created blank 'en' model")

# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:

## normalize_interactions.py
# Load the 'users' npy stack, compute it, and cast the result to int32.
print('users')
users = (
    da.from_npy_stack('users', mmap_mode=None)
    .compute()
    .astype(np.int32)
)

# Same treatment for the 'items' npy stack.
print('items')
items = (
    da.from_npy_stack('items', mmap_mode=None)
    .compute()
    .astype(np.int32)
)

# np.unique returns (values, inverse, counts) in that order regardless of
# the order in which the return_* flags are passed.
print('getting unique')
unique_items, item_inverse, item_count = np.unique(
    items, return_inverse=True, return_counts=True
)
# Keep only the item ids that occur more than 50 times.
print('creating mask')
good_items = unique_items[item_count > 50]

## GH_Archive.py
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import requests
import datetime
import os
import gzip
from joblib import Parallel, delayed

## strip_user_repo.py
from distributed import Client, LocalCluster
import dask.dataframe as dd
import numpy as np

# Start a local cluster with 32 single-threaded workers, listening on all
# interfaces, with the diagnostics page on port 8787.
cluster = LocalCluster(
    ip='0.0.0.0',
    n_workers=32,
    threads_per_worker=1,
    diagnostics_port=8787,
    memory_limit=2e9,
)
client = Client(cluster)
print(client)

# Read the parquet dataset as a dask dataframe and report its row count.
df = dd.read_parquet('parquet/')
print(f'found {len(df)} interactions')

## NeuMF.py
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def gpu(tensor, gpu=False):
    if gpu:
        return tensor.cuda()
    else:
	def bpr_loss(positive_predictions, negative_predictions):
	"""
	Bayesian Personalised Ranking pairwise loss function. Original Implementation: https://github.com/maciejkula/spotlight

	"""

	loss = (1.0 - F.sigmoid(positive_predictions -
	negative_predictions))

	return loss.mean()
	def to_dask_array(df):
	# https://stackoverflow.com/questions/37444943/dask-array-from-dataframe?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
	partitions = df.to_delayed()
	shapes = [part.values.shape for part in partitions]
	dtypes = partitions[0].dtypes

	results = compute(dtypes, *shapes) # trigger computation to find shape
	dtypes, shapes = results[0], results[1:]

	chunks = [da.from_delayed(part.values, shape, dtypes)

	interactions = da.from_npy_stack('interactions')
	users = interactions[:,0]
	items = interactions[:,1]
	slicer = 10000000

	for i in tqdm(range(math.ceil((len(interactions))/slicer))):
	if i == 0:
	user_set = set(users[islicer: (i+1)slicer].compute())
	else:
	from annoy import AnnoyIndex

	f = 32
	t = AnnoyIndex(f)
	for i in range(len(item_embeddings)):
	t.add_item(i, item_embeddings[i])

	t.build(10) # 10 trees
	t.save('github.ann')
	def update_prospects(self):

	self.df_es = self.get_reviews(gt=True)
	nlp = spacy.load('en')
	nlp_sent = spacy.load('appreviews/appclass')
	matcher = Matcher(nlp.vocab)
	matcher.add("feednoun", None, [{POS: 'NOUN', 'LOWER': 'feed'}])
	matcher.add("follow", None, [{'LOWER': 'follow'}, {LEMMA: 'relation'}])
	matcher.add("follows", None, [{'LOWER': 'follow'}, {'LOWER': 'relationships'}])
	matcher.add("follows", None, [{'LOWER': 'follow'}, {LEMMA: 'relationships'}])
	nlp = spacy.blank('en') # create blank Language class
	print("Created blank 'en' model")

	# add the text classifier to the pipeline if it doesn't exist
	# nlp.create_pipe works for built-ins that are registered with spaCy
	if 'textcat' not in nlp.pipe_names:
	textcat = nlp.create_pipe('textcat')
	nlp.add_pipe(textcat, last=True)
	# otherwise, get it, so we can add labels to it
	else:
	print('users')
	users = da.from_npy_stack('users', mmap_mode=None).compute().astype(np.int32)

	print('items')
	items = da.from_npy_stack('items', mmap_mode=None).compute().astype(np.int32)

	print('getting unique')
	unique_items, item_inverse, item_count = np.unique(items, return_counts=True, return_inverse=True)
	print('creating mask')
	good_items = unique_items[np.where(item_count > 50)[0]]
	import pandas as pd
	import numpy as np
	import pyarrow as pa
	import pyarrow.parquet as pq
	import requests
	import datetime
	import os
	import gzip
	from joblib import Parallel, delayed
	from distributed import Client, LocalCluster
	import dask.dataframe as dd
	import numpy as np

	cluster = LocalCluster(ip='0.0.0.0', n_workers=32, threads_per_worker=1, diagnostics_port=8787, **{'memory_limit': 2e9})
	client = Client(cluster)
	print(client)

	df = dd.read_parquet('parquet/')
	print(f'found {len(df)} interactions')
	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	def gpu(tensor, gpu=False):
	if gpu:
	return tensor.cuda()
	else: