View rfr_example.json
{
  "i": 0,
  "tree_undefined": -2,
  "features": [
    3,
    3,
    2,
    3,
    -2,
    -2,
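
The preview cuts off mid-array, but the encoding is already clear: each tree is stored as parallel arrays indexed by node id, and the tree_undefined sentinel (-2) marks leaf slots in features. A minimal sketch of reading such a file, assuming the complete export also carries thresholds, children_left, children_right, and values arrays (names inferred from the converter script below, not visible in this preview):

from json import load

# Hypothetical consumer; only 'i', 'tree_undefined', and 'features'
# appear in the preview above, the rest of the layout is assumed.
with open('rfr_example.json') as fh:
    tree = load(fh)

# A node is a leaf when its feature index equals the sentinel (-2).
leaf_nodes = [ix for ix, feat in enumerate(tree['features'])
              if feat == tree['tree_undefined']]
print('tree {} has {} leaves'.format(tree['i'], len(leaf_nodes)))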
View pure_python_rfr.py
from sklearn.tree import _tree

tree_template = '''
def tree{i}(inputs):
    tree_undefined = {tree_undefined}
    features = {features}
    thresholds = {thresholds}
    children_left = {children_left}
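
The template is truncated here, but its role is to stamp out one standalone Python function per fitted tree by interpolating that tree's arrays. A sketch of the rendering loop under that assumption; only the placeholders visible in the preview are filled, and a full template would presumably also need children_right and the leaf values:

# Illustrative rendering loop, not the original script's code.
def render_tree_functions(rfr_object):
    sources = []
    for i, est in enumerate(rfr_object.estimators_):
        tree = est.tree_
        sources.append(tree_template.format(
            i=i,
            tree_undefined=_tree.TREE_UNDEFINED,  # sklearn's -2 sentinel
            features=tree.feature.tolist(),
            thresholds=tree.threshold.tolist(),
            children_left=tree.children_left.tolist(),
        ))
    return '\n'.join(sources)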
View rfr_to_json.py
from json import dumps

def rfr_to_json(rfr_object, feature_list, json_filepath=None):
    '''
    Function to convert a scikit-learn RandomForestRegressor object to JSON.
    '''
    output_dict = dict()
    output_dict['name'] = 'rf_regression_pipeline'
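
The function body continues past the preview, but everything it needs is exposed on each estimator's tree_ attribute. A hedged sketch of what the per-tree extraction likely resembles; the tree_ attributes are real scikit-learn internals, while the key names are chosen to match rfr_example.json and the original's exact payload may differ:

trees = []
for i, est in enumerate(rfr_object.estimators_):
    tree = est.tree_
    trees.append({
        'i': i,
        'tree_undefined': -2,  # sklearn's _tree.TREE_UNDEFINED
        'features': tree.feature.tolist(),
        'thresholds': tree.threshold.tolist(),
        'children_left': tree.children_left.tolist(),
        'children_right': tree.children_right.tolist(),
        'values': tree.value.ravel().tolist(),
    })
output_dict['trees'] = trees
output_dict['feature_list'] = list(feature_list)
json_str = dumps(output_dict)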
View randomforestregressor_predict.scala
import rapture.json.jsonBackends.jawn._
import rapture.json.Json
import scala.annotation.tailrec

case class RandomForestTree(
  treeId: Int,
  undefinedIndex: Int,
  features: Array[Int],
  thresholds: Array[Double],
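
The Scala program rebuilds each tree from those arrays and descends it with a tail-recursive walk (hence the tailrec import). The descent itself is language-agnostic; here is an illustrative Python version of the same array walk, not the Scala implementation:

def predict_tree(features, thresholds, children_left, children_right,
                 values, inputs, undefined_index=-2):
    # Walk the parallel-array tree from the root until hitting a leaf,
    # i.e. a node whose feature index is the undefined sentinel.
    node = 0
    while features[node] != undefined_index:
        if inputs[features[node]] <= thresholds[node]:
            node = children_left[node]
        else:
            node = children_right[node]
    return values[node]

The forest's prediction is then just the mean of this walk's result over all trees.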
View spacy_pyspark_wordvec_udf.py
import pyspark.sql.types as t
import pyspark.sql.functions as f

def spacy_word2vec_grouped(cat_list, id_col, string_col):
    """
    Example usage:
    vec_sdf = (
        sdf
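
The docstring is cut off, but the shape of the approach is visible: vectorize strings group by group so each executor loads spaCy once per batch rather than once per row. A minimal sketch of that pattern, assuming a grouped-map (applyInPandas) formulation; the helper name and exact mechanics are assumptions, not the original code:

import pandas as pd
import pyspark.sql.types as t
import spacy

def make_word2vec_applier(id_col, string_col):
    # Hypothetical helper: returns a schema and a grouped-map function
    # for use with sdf.groupby(...).applyInPandas(fn, schema).
    schema = t.StructType([
        t.StructField(id_col, t.LongType()),
        t.StructField('vector', t.ArrayType(t.DoubleType())),
    ])

    def vectorize(pdf: pd.DataFrame) -> pd.DataFrame:
        # Loaded once per group, not per row; a module-level model
        # would avoid even the per-group reload.
        nlp = spacy.load('en_core_web_lg')
        vectors = [nlp(s).vector.tolist() for s in pdf[string_col]]
        return pd.DataFrame({id_col: pdf[id_col], 'vector': vectors})

    return schema, vectorize

# Usage sketch: schema, fn = make_word2vec_applier('ids', 'words')
#               vec_sdf = sdf.groupby('category').applyInPandas(fn, schema)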
View doc_to_spans.py
from spacy import load as spacy_load

# This loads the largest English model, which must be downloaded
# separately from the package installation. Other choices are available.
nlp = spacy_load('en_core_web_lg')

def doc_to_spans(list_of_texts, join_string=' ||| '):
    all_docs = nlp(join_string.join(list_of_texts))
    split_inds = [i for i, token in enumerate(all_docs) if token.text == '|||'] + [len(all_docs)]
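
The function is truncated after computing the separator positions; the natural continuation slices the single Doc back into one span per input text. A self-contained sketch of that step (a hypothetical helper mirroring what the body presumably does next):

def split_doc_on_separator(doc, sep_text='|||'):
    # Slice a spaCy Doc into per-text spans at separator tokens,
    # excluding the separator tokens themselves.
    split_inds = [i for i, tok in enumerate(doc) if tok.text == sep_text]
    starts = [0] + [i + 1 for i in split_inds]
    ends = split_inds + [len(doc)]
    return [doc[start:end] for start, end in zip(starts, ends)]

Batching texts through nlp once and re-splitting like this is far cheaper than calling nlp per string, which is presumably the point of the ' ||| ' join.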
View pyspark_minhash_jaccard.py
from numpy.random import RandomState
import pyspark.sql.functions as f
from pyspark import StorageLevel

def hashmin_jaccard_spark(
        sdf, node_col, edge_basis_col, suffixes=('A', 'B'),
        n_draws=100, storage_level=None, seed=42, verbose=False):
    """
    Calculate a sparse Jaccard similarity matrix using MinHash.
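
The docstring continues past the preview, but the estimator behind it is standard MinHash: with n_draws independent hash functions, the probability that two sets agree on their minimum hash equals their Jaccard similarity, so the match rate across draws estimates it. An illustrative plain-Python version of that core, not the Spark implementation:

from numpy.random import RandomState

def minhash_signature(items, salts):
    # One minimum per salted hash function; the builtin `hash` is
    # illustrative only, as a stable hash would be needed across
    # Spark workers.
    return [min(hash((salt, item)) for item in items) for salt in salts]

def jaccard_estimate(items_a, items_b, n_draws=100, seed=42):
    salts = RandomState(seed).randint(0, 2**31 - 1, size=n_draws)
    sig_a = minhash_signature(items_a, salts)
    sig_b = minhash_signature(items_b, salts)
    return sum(a == b for a, b in zip(sig_a, sig_b)) / float(n_draws)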
View ds_prod_scale1.py
from pandas import DataFrame
from pyspark.sql import types as t, functions as f

df = DataFrame({'ids': [1, 2, 3], 'words': ['abracadabra', 'hocuspocus', 'shazam']})
sdf = sparkSession.createDataFrame(df)
normalize_word_udf = f.udf(normalize_word, t.StringType())
stops = f.array([f.lit(c) for c in STOPCHARS])
results = sdf.select('ids', normalize_word_udf(f.col('words'), stops).alias('norms'))
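
This snippet leans on a normalize_word function and a STOPCHARS list defined elsewhere, plus an active sparkSession. Plausible stand-ins so the example runs end to end (hypothetical; the originals may differ):

# Hypothetical stand-ins for names the snippet assumes.
STOPCHARS = ['a', 'u']

def normalize_word(word, stops):
    # Lowercase the word and drop any characters present in `stops`,
    # matching the (column, array-column) call signature above.
    return ''.join(ch for ch in word.lower() if ch not in stops)

Note that stops is built as a Spark array column of literals, so the UDF receives it as a plain Python list at call time.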
View ds_prod_scale2.py
outcome_sdf = (
    sdf
    .select(
        f.create_map(
            f.col('unique_id'),
            f.col('feature_list')
        ).alias('feature_map'),
    )
    .groupby(
        f.floor(f.rand() * nparts).alias('grouper')
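
The chain is truncated at the groupby, but the pattern is recognizable: wrap each row in a single-entry map, scatter rows across nparts random buckets, then merge each bucket's maps so a scoring function sees one batch per group. A hedged guess at the continuation, written as the rest of the chain; the aggregation step and the map value types are assumptions:

    )
    .agg(f.collect_list('feature_map').alias('feature_maps'))  # one list of maps per bucket
)

# Merge each bucket's single-entry maps into one dict for batched
# scoring; assumes `from pyspark.sql import types as t` as in the
# previous snippet.
merge_maps = f.udf(
    lambda maps: {k: v for m in maps for k, v in m.items()},
    t.MapType(t.StringType(), t.ArrayType(t.DoubleType())))

batched_sdf = outcome_sdf.select(merge_maps('feature_maps').alias('batch'))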
View ds_prod_maintenance1.py
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# create a file handler
handler = logging.FileHandler('error.log')
handler.setLevel(logging.ERROR)

# create a logging format
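
The preview stops at the format comment; the standard continuation builds a Formatter, attaches it to the handler, and registers the handler on the logger. Presumably something close to:

formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# add the file handler to the logger
logger.addHandler(handler)

# INFO and above flows through the logger, but only ERROR and above
# reaches error.log because of the handler's level.
logger.error('this lands in error.log')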