Schaun Wheeler (schaunwheeler)
schaunwheeler / rfr_example.json
Created April 19, 2019 13:21
Example JSON output for a single tree of a RandomForestRegressor
{
  "i": 0,
  "tree_undefined": -2,
  "features": [
    3,
    3,
    2,
    3,
    -2,
    -2,
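The preview cuts off after the first two arrays, but even those show the encoding: nodes are positions in parallel arrays, and a node whose features entry equals the tree_undefined sentinel is a leaf. A minimal sketch of reading that structure, assuming the file holds a single object shaped like the preview:

import json

# Load one tree's JSON and find its leaf nodes
with open('rfr_example.json') as fh:
    tree = json.load(fh)

leaves = [i for i, f_idx in enumerate(tree['features'])
          if f_idx == tree['tree_undefined']]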
schaunwheeler / pure_python_rfr.py
Created April 13, 2019 12:18
Create a function in pure Python that calculates predictions from a Scikit-Learn RandomForestRegressor
from sklearn.tree import _tree

tree_template = '''
def tree{i}(inputs):
    tree_undefined = {tree_undefined}
    features = {features}
    thresholds = {thresholds}
    children_left = {children_left}
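Once formatted with one tree's arrays, the template emits a plain-Python function with no scikit-learn dependency at prediction time. A sketch of the kind of function it could produce, filled in for a hypothetical three-node tree (the array values are illustrative, and the traversal loop assumes the template body continues past the preview):

def tree0(inputs):
    tree_undefined = -2
    features = [0, -2, -2]
    thresholds = [0.5, -2.0, -2.0]
    children_left = [1, -1, -1]
    children_right = [2, -1, -1]
    values = [1.0, 1.5, 3.5]
    # Walk the parallel arrays from the root until hitting a leaf
    node = 0
    while features[node] != tree_undefined:
        if inputs[features[node]] <= thresholds[node]:
            node = children_left[node]
        else:
            node = children_right[node]
    return values[node]

# tree0([0.4]) follows the left branch and returns 1.5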
schaunwheeler / rfr_to_json.py
Created April 13, 2019 12:12
Function to dump a trained Scikit-Learn RandomForestRegressor to JSON
from json import dumps
def rfr_to_json(rfr_object, feature_list, json_filepath=None):
    '''
    Function to convert a scikit-learn RandomForestRegressor object to JSON.
    '''
    output_dict = dict()
    output_dict['name'] = 'rf_regression_pipeline'
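The preview stops before the per-tree serialization, but scikit-learn exposes everything the JSON in rfr_example.json needs through each estimator's tree_ attribute. A sketch of how that loop could look, with key names chosen to match the JSON preview above (the trees key and the flattening of values are assumptions):

from sklearn.tree import _tree

# Pull each fitted tree's parallel arrays into plain lists
trees = []
for i, est in enumerate(rfr_object.estimators_):
    tree = est.tree_
    trees.append({
        'i': i,
        'tree_undefined': _tree.TREE_UNDEFINED,  # the -2 leaf sentinel
        'features': tree.feature.tolist(),
        'thresholds': tree.threshold.tolist(),
        'children_left': tree.children_left.tolist(),
        'children_right': tree.children_right.tolist(),
        'values': tree.value.ravel().tolist(),
    })
output_dict['trees'] = trees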
schaunwheeler / randomforestregressor_predict.scala
Last active April 23, 2019 13:43
An example of using Scala to call the predict function from a Scikit-Learn RandomForestRegressor
import rapture.json.jsonBackends.jawn._
import rapture.json.Json
import scala.annotation.tailrec
case class RandomForestTree(
  treeId: Int,
  undefinedIndex: Int,
  features: Array[Int],
  thresholds: Array[Double],
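Whichever language walks the trees, the forest-level prediction is just the mean of the per-tree outputs. In Python terms, using the generated tree functions from pure_python_rfr.py (the function names are assumptions):

# A RandomForestRegressor prediction is the average of its trees' outputs
def forest_predict(inputs, tree_funcs):
    return sum(fn(inputs) for fn in tree_funcs) / len(tree_funcs)

# e.g. forest_predict(row, [tree0, tree1, tree2])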
schaunwheeler / spacy_pyspark_wordvec_udf.py
Created April 13, 2019 11:44
Example of using spaCy on Spark
import pyspark.sql.types as t
import pyspark.sql.functions as f
def spacy_word2vec_grouped(cat_list, id_col, string_col):
    """
    Example usage:

        vec_sdf = (
            sdf
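The preview cuts off inside the docstring, but the core pattern is loading spaCy inside a grouped-map pandas UDF so each executor builds its own pipeline instead of shipping one from the driver. A minimal sketch under that assumption, with hypothetical column names (doc_id, text) and the Spark 2.4-era grouped-map API:

import pandas as pd
import pyspark.sql.functions as f
import pyspark.sql.types as t
from spacy import load as spacy_load

schema = t.StructType([
    t.StructField('doc_id', t.StringType()),
    t.StructField('vector', t.ArrayType(t.DoubleType())),
])

@f.pandas_udf(schema, f.PandasUDFType.GROUPED_MAP)
def word2vec_group(pdf):
    # Load the model on the executor, once per batch of grouped rows
    nlp = spacy_load('en_core_web_lg')
    vectors = [nlp(text).vector.tolist() for text in pdf['text']]
    return pd.DataFrame({'doc_id': pdf['doc_id'], 'vector': vectors})

vec_sdf = sdf.groupby('doc_id').apply(word2vec_group)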
schaunwheeler / doc_to_spans.py
Last active May 6, 2020 16:39
Example of how to use spaCy to process many texts at once
from spacy import load as spacy_load
# This loads the largest English model, which must be downloaded
# separately from the package installation. Other choices are available.
nlp = spacy_load('en_core_web_lg')

def doc_to_spans(list_of_texts, join_string=' ||| '):
    # Run the pipeline once over all texts, then split the Doc back apart
    all_docs = nlp(join_string.join(list_of_texts))
    split_inds = [i for i, token in enumerate(all_docs) if token.text == join_string.strip()] + [len(all_docs)]
    # Assumed completion: pair consecutive split points into one Span per text
    starts = [0] + [i + 1 for i in split_inds[:-1]]
    return [all_docs[start:end] for start, end in zip(starts, split_inds)]
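A usage sketch (the texts are hypothetical; Span objects expose the same text and vector accessors as full Docs):

texts = ['The cat sat on the mat.', 'Dogs bark loudly at night.']
for span in doc_to_spans(texts):
    print(span.text, span.vector.shape)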
schaunwheeler / pyspark_minhash_jaccard.py
Last active June 8, 2023 23:22
Use MinHash to get Jaccard Similarity in Pyspark
from numpy.random import RandomState
import pyspark.sql.functions as f
from pyspark import StorageLevel
def hashmin_jaccard_spark(
        sdf, node_col, edge_basis_col, suffixes=('A', 'B'),
        n_draws=100, storage_level=None, seed=42, verbose=False):
    """
    Calculate a sparse Jaccard similarity matrix using MinHash.
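The estimator behind the Spark job fits in a few lines of plain Python: under a random hash, the probability that two sets share the same minimum equals their Jaccard similarity, so the match rate across many draws estimates it. A self-contained sketch, where the salted built-in hash stands in for the Spark-side hash functions:

from numpy.random import RandomState

def minhash_jaccard(set_a, set_b, n_draws=100, seed=42):
    # Share of draws on which the two sets' minimum hashes agree
    rng = RandomState(seed)
    matches = 0
    for _ in range(n_draws):
        salt = rng.randint(0, 2**31 - 1)
        min_a = min(hash((salt, x)) for x in set_a)
        min_b = min(hash((salt, x)) for x in set_b)
        matches += int(min_a == min_b)
    return matches / n_draws

# minhash_jaccard({'a', 'b', 'c'}, {'b', 'c', 'd'}) comes out close to
# the true Jaccard similarity, 2 / 4 = 0.5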
schaunwheeler / ds_prod_scale1.py
Last active March 12, 2019 19:29
Data science productionization: scale - example 1
from pandas import DataFrame
from pyspark.sql import SparkSession, types as t, functions as f

# Assumes an active session; `normalize_word` and `STOPCHARS` come from
# earlier in this series (stand-ins are sketched below)
sparkSession = SparkSession.builder.getOrCreate()

df = DataFrame({'ids': [1, 2, 3], 'words': ['abracadabra', 'hocuspocus', 'shazam']})
sdf = sparkSession.createDataFrame(df)
normalize_word_udf = f.udf(normalize_word, t.StringType())
stops = f.array([f.lit(c) for c in STOPCHARS])
results = sdf.select('ids', normalize_word_udf(f.col('words'), stops).alias('norms'))
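Hypothetical stand-ins for the two names the snippet assumes, so it runs end to end; the real definitions live elsewhere in the series:

STOPCHARS = ['a', 'c']  # hypothetical stop characters

def normalize_word(word, stops):
    # Lowercase the word and drop any stop characters
    return ''.join(ch for ch in word.lower() if ch not in stops)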
schaunwheeler / ds_prod_scale2.py
Last active March 12, 2019 19:28
Data science productionization: scale - example 2
outcome_sdf = (
    sdf
    .select(
        f.create_map(
            f.col('unique_id'),
            f.col('feature_list'),
        ).alias('feature_map'),
    )
    .groupby(
        # A random grouper spreads rows evenly across `nparts` groups
        f.floor(f.rand() * nparts).alias('grouper')
    )
    # Assumed continuation: collect each group's maps for downstream work
    .agg(f.collect_list('feature_map').alias('feature_maps'))
)
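One hypothetical way to pick nparts is to mirror the DataFrame's existing parallelism:

# Match the current partition count so each random group is roughly one task
nparts = sdf.rdd.getNumPartitions()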
schaunwheeler / ds_prod_maintenance1.py
Last active March 8, 2019 17:35
Data science productionization: maintenance - example 1
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# create a file handler
handler = logging.FileHandler('error.log')
handler.setLevel(logging.ERROR)

# create a logging format and attach the handler to the logger
formatter = logging.Formatter('%(asctime)s %(name)s %(levelname)s: %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
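A usage sketch (risky_operation is hypothetical): INFO and above pass through the logger, but only ERROR and above reach error.log through the file handler.

try:
    risky_operation()  # hypothetical failing call
except Exception:
    # logger.exception logs at ERROR level and appends the traceback
    logger.exception('risky_operation failed')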