This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from numpy.random import RandomState | |
import pyspark.sql.functions as f | |
from pyspark import StorageLevel | |
def hashmin_jaccard_spark( | |
sdf, node_col, edge_basis_col, suffixes=('A', 'B'), | |
n_draws=100, storage_level=None, seed=42, verbose=False): | |
""" | |
Calculate a sparse Jaccard similarity matrix using MinHash. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The MIT License (MIT) | |
# | |
# Copyright (c) 2012 Schaun Jacob Wheeler | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The MIT License (MIT) | |
# | |
# Copyright (c) 2012 Schaun Jacob Wheeler | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from spacy import load as spacy_load | |
# This loads the largest English corpus, which must be downloaded | |
# separate from package installation. Other choices are available. | |
nlp = spacy_load('en_core_web_lg') | |
def doc_to_spans(list_of_texts, join_string=' ||| '): | |
all_docs = nlp(' ||| '.join(list_of_texts)) | |
split_inds = [i for i, token in enumerate(all_docs) if token.text == '|||'] + [len(all_docs)] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import rapture.json.jsonBackends.jawn._ | |
import rapture.json.Json | |
import scala.annotation.tailrec | |
case class RandomForestTree( | |
treeId: Int, | |
undefinedIndex: Int, | |
features: Array[Int], | |
thresholds: Array[Double], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"i": 0, | |
"tree_undefined": -2, | |
"features": [ | |
3, | |
3, | |
2, | |
3, | |
-2, | |
-2, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.tree import _tree | |
tree_template = ''' | |
def tree{i}(inputs): | |
tree_undefined = {tree_undefined} | |
features = {features} | |
thresholds = {thresholds} | |
children_left = {children_left} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from json import dumps | |
def rfr_to_json(rfr_object, feature_list, json_filepath=None): | |
''' | |
Function to convert a scikit-learn RandomForestRegressor object to JSON. | |
''' | |
output_dict = dict() | |
output_dict['name'] = 'rf_regression_pipeline' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyspark.sql.types as t | |
import pyspark.sql.functions as f | |
def spacy_word2vec_grouped(cat_list, id_col, string_col): | |
""" | |
Example usage: | |
vec_sdf = ( | |
sdf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import DataFrame | |
from pyspark.sql import types as t, functions as f | |
df = DataFrame({'ids': [1, 2, 3], 'words': ['abracadabra', 'hocuspocus', 'shazam']}) | |
sdf = sparkSession.createDataFrame(df) | |
normalize_word_udf = f.udf(normalize_word, t.StringType()) | |
stops = f.array([f.lit(c) for c in STOPCHARS]) | |
results = sdf.select('ids', normalize_word_udf(f.col('words'), stops).alias('norms')) |
NewerOlder