This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Spark session setup for spark-iforest (Isolation Forest on Spark).
# Fixed: removed ` | |` table-extraction artifacts that broke the syntax.
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark_iforest.ml.iforest import IForest, IForestModel
import tempfile

conf = SparkConf()
# jar built from https://github.com/titicaca/spark-iforest — adjust the path
conf.set('spark.jars', '/full/path/to/spark-iforest/target/spark-iforest-2.4.0.jar')
# NOTE(review): the original snippet was truncated after "SparkSession \" —
# the builder chain below is reconstructed from the parallel snippets; confirm
# against the full source.
spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .getOrCreate()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# scikit-learn IsolationForest baseline alongside a local Spark session.
# Fixed: removed ` | |` extraction artifacts; added the missing imports for
# names the snippet uses (`SparkConf`, `np`).
from pyspark import SparkConf  # NOTE(review): added — SparkConf was used but not imported in the visible snippet
from pyspark.sql import SparkSession, functions as F, types as T
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import numpy as np  # NOTE(review): added — np was used but not imported in the visible snippet

# fix the RNG seed so sampled data is reproducible
np.random.seed(42)

conf = SparkConf()
spark_session = SparkSession.builder \
    .config(conf=conf) \
    .appName('test') \
    .getOrCreate()  # NOTE(review): snippet truncated here in extraction — .getOrCreate() reconstructed from the parallel snippet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from databricks import koalas as ks | |
from pyspark.sql import SparkSession, functions as F | |
# define the data - example taken from https://koalas.readthedocs.io/en/latest/getting_started/10min.html
# Fixed: removed ` | |` table-extraction artifacts that broke the syntax.
data = {'a': [1, 2, 3, 4, 5, 6],
        'b': [100, 200, 300, 400, 500, 600],
        'c': ["one", "two", "three", "four", "five", "six"]}
# explicit (non-default) index labels for the frame built from `data`
index = [10, 20, 30, 40, 50, 60]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal local Spark session used by the examples below.
# Fixed: removed ` | |` table-extraction artifacts that broke the syntax.
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F, types as T

conf = SparkConf()
spark_session = SparkSession.builder \
    .config(conf=conf) \
    .appName('test') \
    .getOrCreate()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Spark memory configuration example.
# Fixed: removed ` | |` table-extraction artifacts that broke the syntax.
from pyspark import SparkConf
from pyspark.sql import SparkSession

# depending on your set up:
# if you are running the spark app locally, set the driver memory to something your system can handle
# if you are running on a cluster, then also set the executor memory - if necessary (depends on how your cluster is configured)
conf = SparkConf()
conf.set('spark.executor.memory', '16g')
conf.set('spark.driver.memory', '8g')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fixed: removed ` | |` table-extraction artifacts that broke the syntax.
from pyspark.sql.utils import AnalysisException
from pyspark_unittesting import SparkSQLTestCase


class TestFeatureAToBRatio(SparkSQLTestCase):
    """Tests for FeatureAToBRatio.

    NOTE(review): the class was truncated in extraction — the actual test
    methods (and presumably the import of FeatureAToBRatio) follow in the
    full source.
    """

    def setUp(self):
        super(TestFeatureAToBRatio, self).setUp()
        # fresh feature instance for every test
        self.feature = FeatureAToBRatio()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import functions as F | |
class FeatureAToBRatio(object):
    """Derived feature: the ratio of column ``a`` to column ``b``.

    Fixed: removed ` | |` table-extraction artifacts and closed the docstring
    that the extraction left unterminated, so the class parses.
    """

    # name of the output column produced by calculate()
    feature_name = 'a_to_b_ratio'
    # fallback value — presumably used when the ratio is undefined (e.g. b == 0);
    # TODO confirm against the full source (snippet truncated here)
    default_value = 0.

    def calculate(self, df):
        """
        Given a dataframe that contains columns a and b,
        compute the a-to-b ratio feature.

        NOTE(review): the method body was truncated in the extracted snippet —
        the implementation follows in the full source.
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fixed: removed ` | |` table-extraction artifacts that broke the syntax.
import traceback
from sparktestingbase.sqltestcase import SQLTestCase


class SparkSQLTestCase(SQLTestCase):
    """Base test case that supplies a configured Spark SQL session."""

    def getConf(self):
        # local import keeps pyspark out of module-import time
        from pyspark import SparkConf
        conf = SparkConf()
        # NOTE(review): the snippet was truncated mid-call in extraction; the
        # key/value below is a placeholder reconstruction — confirm against the
        # full source.
        conf.set(
            'spark.sql.shuffle.partitions', '4')
        return conf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def constructor_env_variables(loader, node):
    """
    Extracts the environment variable from the node's value

    :param yaml.Loader loader: the yaml loader
    :param node: the current node in the yaml
    :return: the parsed string that contains the value of the environment
    variable
    """
    # Fixed: removed ` | |` table-extraction artifacts that broke the syntax.
    value = loader.construct_scalar(node)
    # `pattern` is a module-level regex defined outside this snippet —
    # presumably matching ${VAR}-style placeholders; TODO confirm.
    match = pattern.findall(value)
    # NOTE(review): the function was truncated here in extraction — the
    # substitution of the matched variables follows in the full source.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First add a column using the F.monotonically_increasing_id().
# This will add monotonically increasing 64-bit integers like this:
>>> df_final = df_final.withColumn("monotonically_increasing_id", F.monotonically_increasing_id())
+--------+---+-----+-------+-------+----------+---------------------------+
| _1| _2|index|column1|column2|row_number|monotonically_increasing_id|
+--------+---+-----+-------+-------+----------+---------------------------+
| [1, 2]| 0| 0| 1| 2| 1| 0|
|[15, 21]| 1| 1| 15| 21| 2| 1|
+--------+---+-----+-------+-------+----------+---------------------------+