This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import yaml | |
def parse_config(path=None, data=None, tag='!ENV'): | |
""" | |
Load a yaml configuration file and resolve any environment variables | |
The environment variables must have !ENV before them and be in this format | |
to be parsed: ${VAR_NAME}. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
q = '(select min(id) as min, max(id) as max from table_name where condition) as bounds' | |
user = 'postgres' | |
password = 'secret' | |
db_driver = 'org.postgresql.Driver' | |
host = '127.0.0.1' | |
db_url = f'jdbc:postgresql://{host}:5432/dbname?user={user}&password={password}' | |
partitions = os.cpu_count() * 2 # a good starting point | |
conn_properties = { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from psutil import virtual_memory | |
from pyspark import SparkConf | |
from pyspark.ml.linalg import Vectors, VectorUDT | |
from pyspark.sql import functions as F, SparkSession, types as T, Window | |
def get_spark_session(): | |
""" | |
With an effort to optimize memory and partitions |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession, functions as F, types as T | |
from sklearn.ensemble import IsolationForest | |
from sklearn.preprocessing import StandardScaler | |
np.random.seed(42) | |
conf = SparkConf() | |
spark_session = SparkSession.builder \ | |
.config(conf=conf) \ | |
.appName('test') \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> from pyspark.sql import SparkSession, functions as F
>>> from pyspark import SparkConf
>>> conf = SparkConf()
>>> spark = SparkSession.builder \
...     .config(conf=conf) \
...     .appName('Dataframe with Indexes') \
...     .getOrCreate()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark import SparkConf | |
from pyspark.sql import SparkSession, functions as F | |
from pyspark.ml.feature import VectorAssembler, StandardScaler | |
from pyspark_iforest.ml.iforest import IForest, IForestModel | |
import tempfile | |
conf = SparkConf() | |
conf.set('spark.jars', '/full/path/to/spark-iforest-2.4.0.jar') | |
spark = SparkSession \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if __name__ == '__main__':
    # Local imports so the script only pulls in pyspark when run directly.
    # NOTE: the original imported `functions as F` twice; the duplicate
    # import has been removed.
    from pyspark import SparkConf
    from pyspark.sql import SparkSession, functions as F

    # Build (or reuse) a SparkSession with default configuration.
    conf = SparkConf()
    spark = SparkSession.builder \
        .config(conf=conf) \
        .appName('Dataframe with Indexes') \
        .getOrCreate()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import numpy as np | |
import pyspark | |
from shapley_spark_calculation import \ | |
calculate_shapley_values, select_row | |
from pyspark.ml.classification import RandomForestClassifier, LinearSVC, \ | |
DecisionTreeClassifier | |
from pyspark.ml.evaluation import BinaryClassificationEvaluator | |
from pyspark.ml.feature import VectorAssembler |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import operator | |
import os | |
import time | |
import warnings | |
from pyspark.ml.linalg import Vectors, VectorUDT | |
from pyspark.sql import functions as F, SparkSession, types as T, Window |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
+----+-------------------------------------+--------------+---------------------+
|id  |features                             |prediction    |marginal_contribution|
+----+-------------------------------------+--------------+---------------------+
|1677|[0.349,0.141,0.162,0.162,0.162,0.349]|0.0           |null                 |
|1677|[0.886,0.141,0.162,0.162,0.162,0.349]|0.0           |0.0                  |
|2250|[0.106,0.423,0.777,0.777,0.777,0.886]|0.0           |null                 |
|2250|[0.886,0.423,0.777,0.777,0.777,0.886]|0.0           |0.0                  |
|2453|[0.801,0.423,0.777,0.777,0.87,0.886] |0.0           |null                 |
+----+-------------------------------------+--------------+---------------------+
only showing top 5 rows
NewerOlder