GitHub gists by Schaun Wheeler (schaunwheeler)
schaunwheeler / pyspark_minhash_jaccard.py
Last active June 8, 2023 23:22
Use MinHash to get Jaccard similarity in PySpark
from numpy.random import RandomState
import pyspark.sql.functions as f
from pyspark import StorageLevel

def hashmin_jaccard_spark(
        sdf, node_col, edge_basis_col, suffixes=('A', 'B'),
        n_draws=100, storage_level=None, seed=42, verbose=False):
    """
    Calculate a sparse Jaccard similarity matrix using MinHash.
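The preview stops inside the docstring. As a rough illustration of the MinHash idea itself (a sketch under assumptions, not the gist's PySpark implementation): the Jaccard similarity of two sets equals the probability that a random hash function assigns the same minimum hash to both sets, so averaging that event over many salted hashes estimates the similarity.

# Hedged sketch of the MinHash estimate over plain Python sets.
from numpy.random import RandomState

def minhash_jaccard(set_a, set_b, n_draws=100, seed=42):
    rng = RandomState(seed)
    salts = rng.randint(0, 2**31 - 1, size=n_draws)  # one salt per hash draw
    matches = 0
    for salt in salts:
        min_a = min(hash((int(salt), x)) for x in set_a)
        min_b = min(hash((int(salt), x)) for x in set_b)
        matches += int(min_a == min_b)
    return matches / n_draws

# minhash_jaccard({'a', 'b', 'c'}, {'b', 'c', 'd'}) comes out near 0.5
# (the true Jaccard similarity here is 2/4).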
schaunwheeler / amortize.r
Last active March 13, 2023 16:24
Amortization function
# The MIT License (MIT)
#
# Copyright (c) 2012 Schaun Jacob Wheeler
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
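The preview shows only the MIT license header. As a hedged sketch of what a standard amortization function computes (the gist itself is R, and its interface is not shown), the fixed payment comes from the closed-form annuity formula payment = P * r / (1 - (1 + r) ** -n):

# Hypothetical Python sketch of a standard amortization schedule; the gist's
# R function may differ in interface and detail. Assumes a nonzero rate.
def amortize(principal, annual_rate, n_months):
    r = annual_rate / 12.0  # monthly interest rate
    payment = principal * r / (1.0 - (1.0 + r) ** -n_months)
    balance = principal
    schedule = []
    for month in range(1, n_months + 1):
        interest = balance * r
        principal_paid = payment - interest
        balance -= principal_paid
        schedule.append((month, payment, interest, principal_paid, max(balance, 0.0)))
    return schedule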
schaunwheeler / xlsxToR.r
Last active December 11, 2020 16:41
Import an xlsx file into R by parsing the file's XML structure.
# The MIT License (MIT)
# Copyright (c) 2012 Schaun Jacob Wheeler
# (preview shows the same MIT license header as the gist above)
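The description is the substantive part: an .xlsx file is a zip archive of XML documents, so it can be read without Excel by unzipping it and parsing the XML. A minimal Python illustration of that file structure (the gist does the real work in R):

# Sketch showing the zip-of-XML structure of .xlsx; the gist's R code goes
# much further (sheets, cell types, styles). Assumes the workbook contains
# at least one string cell, so xl/sharedStrings.xml exists.
import zipfile
import xml.etree.ElementTree as ET

SSML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'

def list_shared_strings(xlsx_path):
    # sharedStrings.xml holds the workbook's de-duplicated cell strings.
    with zipfile.ZipFile(xlsx_path) as zf:
        root = ET.fromstring(zf.read('xl/sharedStrings.xml'))
    return [t.text for t in root.iter('{%s}t' % SSML)]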
schaunwheeler / doc_to_spans.py
Last active May 6, 2020 16:39
Example of how to use spaCy to process many texts at once
from spacy import load as spacy_load

# This loads the largest English model, which must be downloaded separately
# from the package installation. Other choices are available.
nlp = spacy_load('en_core_web_lg')

def doc_to_spans(list_of_texts, join_string=' ||| '):
    all_docs = nlp(join_string.join(list_of_texts))
    # Indices of the '|||' separator tokens (assumes the default separator).
    split_inds = [i for i, token in enumerate(all_docs) if token.text == '|||'] + [len(all_docs)]
    starts = [0] + [i + 1 for i in split_inds[:-1]]
    # Completion beyond the preview (a plausible reconstruction, not the
    # gist's exact code): one Span per input text, separators skipped.
    return [all_docs[start:end] for start, end in zip(starts, split_inds)]
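A usage sketch, assuming a list of raw strings; joining them into one document means spaCy's pipeline overhead is paid once rather than per text:

texts = ['The quick brown fox.', 'It jumps over the lazy dog.']
for span in doc_to_spans(texts):
    # Each Span behaves like a mini-Doc: tokens, lemmas, entities, vectors.
    print([token.lemma_ for token in span])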
schaunwheeler / randomforestregressor_predict.scala
Last active April 23, 2019 13:43
An example of using Scala to call the predict function from a Scikit-Learn RandomForestRegressor
import rapture.json.jsonBackends.jawn._
import rapture.json.Json

import scala.annotation.tailrec

case class RandomForestTree(
  treeId: Int,
  undefinedIndex: Int,
  features: Array[Int],
  thresholds: Array[Double],
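The case class mirrors scikit-learn's flat tree arrays, where a node's feature index of -2 marks a leaf. The traversal those arrays support looks like this (a Python sketch of the technique; the Scala gist presumably walks the same structure, e.g. with a @tailrec helper):

# Sketch of evaluating one decision tree stored as parallel arrays, using
# scikit-learn's conventions: features[node] == -2 marks a leaf, and a
# sample goes left when inputs[feature] <= threshold.
def predict_tree(inputs, features, thresholds,
                 children_left, children_right, values, undefined=-2):
    node = 0
    while features[node] != undefined:
        if inputs[features[node]] <= thresholds[node]:
            node = children_left[node]
        else:
            node = children_right[node]
    return values[node]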
schaunwheeler / rfr_example.json
Created April 19, 2019 13:21
Example JSON output for single tree of RandomForestRegressor
{
  "i": 0,
  "tree_undefined": -2,
  "features": [
    3,
    3,
    2,
    3,
    -2,
    -2,
schaunwheeler / pure_python_rfr.py
Created April 13, 2019 12:18
Create a function in pure Python that calculates predictions from a Scikit-Learn RandomForestRegressor
from sklearn.tree import _tree

tree_template = '''
def tree{i}(inputs):
    tree_undefined = {tree_undefined}
    features = {features}
    thresholds = {thresholds}
    children_left = {children_left}
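The template is presumably rendered once per fitted tree by substituting that tree's arrays; a hedged sketch of that code-generation step (render_tree_source is a hypothetical name, and the real template continues past this preview):

# Hypothetical sketch: pull the flat arrays out of one fitted estimator and
# substitute them into the (truncated) template above.
def render_tree_source(estimator, i):
    tree = estimator.tree_
    return tree_template.format(
        i=i,
        tree_undefined=_tree.TREE_UNDEFINED,  # equals -2
        features=tree.feature.tolist(),
        thresholds=tree.threshold.tolist(),
        children_left=tree.children_left.tolist(),
    )

# Each rendered source string can then be exec'd or written to a module,
# giving predictions with no scikit-learn dependency at serving time.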
schaunwheeler / rfr_to_json.py
Created April 13, 2019 12:12
Function to dump a trained Scikit-Learn RandomForestRegressor to JSON
from json import dumps

def rfr_to_json(rfr_object, feature_list, json_filepath=None):
    '''
    Function to convert a scikit-learn RandomForestRegressor object to JSON.
    '''
    output_dict = dict()
    output_dict['name'] = 'rf_regression_pipeline'
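The rfr_example.json gist above hints at the per-tree payload; a hedged sketch of how the body might continue (keys inferred from that example, not copied from the full gist):

# Hypothetical continuation of rfr_to_json, inside the function body.
output_dict['feature_list'] = feature_list
output_dict['trees'] = [
    {
        'i': i,
        'tree_undefined': -2,
        'features': est.tree_.feature.tolist(),
        'thresholds': est.tree_.threshold.tolist(),
        'children_left': est.tree_.children_left.tolist(),
        'children_right': est.tree_.children_right.tolist(),
        'values': est.tree_.value.ravel().tolist(),
    }
    for i, est in enumerate(rfr_object.estimators_)
]
json_string = dumps(output_dict)
if json_filepath is not None:
    with open(json_filepath, 'w') as fh:
        fh.write(json_string)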
schaunwheeler / spacy_pyspark_wordvec_udf.py
Created April 13, 2019 11:44
Example of using spaCy on Spark
import pyspark.sql.types as t
import pyspark.sql.functions as f

def spacy_word2vec_grouped(cat_list, id_col, string_col):
    """
    Example usage:

        vec_sdf = (
            sdf
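The preview ends inside the docstring. A minimal sketch of the general pattern it implies, loading the model lazily once per executor process and exposing vectors through a UDF (an assumption, not the gist's exact approach):

# Hedged sketch: lazy per-process model load plus a word-vector UDF. The
# spaCy model must already be installed on every executor.
_nlp = None

def _get_nlp():
    global _nlp
    if _nlp is None:
        import spacy
        _nlp = spacy.load('en_core_web_lg')
    return _nlp

@f.udf(returnType=t.ArrayType(t.DoubleType()))
def text_to_vec(text):
    doc = _get_nlp()(text)
    return [float(x) for x in doc.vector]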
schaunwheeler / ds_prod_scale1.py
Last active March 12, 2019 19:29
Data science productionization: scale - example 1.py
from pandas import DataFrame
from pyspark.sql import types as t, functions as f
df = DataFrame({'ids': [1, 2, 3], 'words': ['abracadabra', 'hocuspocus', 'shazam']})
sdf = sparkSession.createDataFrame(df)
normalize_word_udf = f.udf(normalize_word, t.StringType())
stops = f.array([f.lit(c) for c in STOPCHARS])
results = sdf.select('ids', normalize_word_udf(f.col('words'), stops).alias('norms'))
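The snippet leans on three names defined elsewhere: sparkSession, normalize_word, and STOPCHARS. Hypothetical stand-ins that make it runnable (these are assumptions, not the series' actual definitions):

# Hypothetical stand-ins for names the snippet assumes; the real
# definitions are not shown in this preview.
from pyspark.sql import SparkSession

sparkSession = SparkSession.builder.appName('ds_prod_scale1').getOrCreate()

STOPCHARS = ['a', 'o']  # made-up list of characters to strip

def normalize_word(word, stops):
    # Lower-case the word and drop any character found in `stops`; Spark
    # passes `stops` in as an array column, which arrives here as a list.
    return ''.join(ch for ch in word.lower() if ch not in stops)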