Maria Karanasou (mkaranasou)
mkaranasou / pyspark_ml_isolation_forest.py
Last active November 7, 2019 14:12
How to use the Isolation Forest from https://github.com/titicaca/spark-iforest in Pyspark - Spark ML
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark_iforest.ml.iforest import IForest, IForestModel
import tempfile
conf = SparkConf()
conf.set('spark.jars', '/full/path/to/spark-iforest/target/spark-iforest-2.4.0.jar')
spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .getOrCreate()
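
# A minimal usage sketch, not part of the original gist: it assumes the spark-iforest
# wrapper follows Spark ML conventions (a 'features' vector column in, anomaly score and
# prediction columns out) and that the jar configured above is on the classpath.
# Check the spark-iforest README for the exact parameter and column names.
df = spark.createDataFrame([(1.0, 0.2), (1.1, 0.3), (10.0, 9.5)], ['f1', 'f2'])
assembler = VectorAssembler(inputCols=['f1', 'f2'], outputCol='features')
assembled = assembler.transform(df)

iforest = IForest(contamination=0.1)   # fraction of rows expected to be anomalous
model = iforest.fit(assembled)         # returns an IForestModel
model.transform(assembled).show()      # adds the anomaly score / prediction columns

# assuming the model supports the standard Spark ML writer/reader, it can be persisted:
model_path = tempfile.mkdtemp() + '/iforest.model'
model.save(model_path)
loaded_model = IForestModel.load(model_path)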
mkaranasou / pyspark_scikit_isolation_forest.py
Last active October 27, 2021 08:29
How to use Scikit's Isolation Forest in Pyspark - udf and broadcast variables
import numpy as np

from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F, types as T
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

np.random.seed(42)
conf = SparkConf()
spark_session = SparkSession.builder \
    .config(conf=conf) \
    .appName('test') \
    .getOrCreate()
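
# A sketch of the broadcast-variable / udf pattern the gist describes. The toy training
# data and column names below are illustrative, not the original gist's.
X_train = np.random.randn(100, 2)
clf = IsolationForest(contamination=0.1, random_state=42)
clf.fit(X_train)

# ship the fitted scikit-learn model to the executors once, as a broadcast variable
broadcast_clf = spark_session.sparkContext.broadcast(clf)


@F.udf(returnType=T.DoubleType())
def anomaly_score(f1, f2):
    # each executor reads the fitted model from the broadcast variable
    return float(broadcast_clf.value.decision_function([[f1, f2]])[0])


df = spark_session.createDataFrame([(1.0, 0.5), (10.0, 20.0)], ['f1', 'f2'])
df.withColumn('anomaly_score', anomaly_score(F.col('f1'), F.col('f2'))).show()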
mkaranasou / from_pandas_to_koalas.py
Last active October 28, 2019 10:47
From pandas to Koalas
import pandas as pd
from databricks import koalas as ks
from pyspark.sql import SparkSession, functions as F
# define the data - example taken from https://koalas.readthedocs.io/en/latest/getting_started/10min.html
data = {'a': [1, 2, 3, 4, 5, 6],
        'b': [100, 200, 300, 400, 500, 600],
        'c': ["one", "two", "three", "four", "five", "six"]}
index = [10, 20, 30, 40, 50, 60]
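
# A short sketch of the comparison the gist walks through, based on the Koalas
# 10-minutes guide linked above:
pdf = pd.DataFrame(data=data, index=index)   # plain pandas, lives in driver memory
kdf = ks.DataFrame(data=data, index=index)   # Koalas, backed by a Spark DataFrame

print(pdf.head())
print(kdf.head())   # same pandas-like API, but evaluated by Spark

# an existing pandas DataFrame can also be converted, and the Spark DataFrame extracted:
kdf_from_pdf = ks.from_pandas(pdf)
sdf = kdf_from_pdf.to_spark()
sdf.select(F.col('a'), F.col('b')).show()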
mkaranasou / example_null_column_returned_from_udf.py
Created October 17, 2019 17:22
Example of a udf returning a null column
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F, types as T
conf = SparkConf()
spark_session = SparkSession.builder \
    .config(conf=conf) \
    .appName('test') \
    .getOrCreate()
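
# A sketch of the pitfall this gist demonstrates (column names and values here are
# illustrative): when the value a udf returns does not match its declared returnType,
# Spark does not raise an error - it silently produces a null column.
df = spark_session.createDataFrame([(1,), (2,), (3,)], ['num'])


@F.udf(returnType=T.StringType())
def add_one_wrong_type(x):
    return x + 1   # an int where StringType was declared -> the column comes back null


@F.udf(returnType=T.IntegerType())
def add_one_correct_type(x):
    return x + 1   # matches the declared IntegerType -> the values come through


df.withColumn('wrong', add_one_wrong_type(F.col('num'))) \
  .withColumn('correct', add_one_correct_type(F.col('num'))) \
  .show()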
mkaranasou / pyspark_set_allowed_memory.py
Last active October 17, 2019 17:32
Setting Pyspark's memory consumption
from pyspark import SparkConf
from pyspark.sql import SparkSession
# depending on your setup:
# - if you are running the Spark app locally, set the driver memory to something your system can handle
# - if you are running on a cluster, then also set the executor memory if necessary (this depends on how your cluster is configured)
conf = SparkConf()
conf.set('spark.executor.memory', '16g')
conf.set('spark.driver.memory', '8g')
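
# A minimal sketch of building the session with the configuration above. The values are
# illustrative - size them to your machine or cluster. Note that spark.driver.memory only
# takes effect if it is set before the driver JVM starts, i.e. before the first
# SparkSession/SparkContext is created in the process (or via spark-submit/spark-defaults).
spark = SparkSession.builder \
    .config(conf=conf) \
    .appName('memory-config-example') \
    .getOrCreate()

print(spark.sparkContext.getConf().get('spark.driver.memory'))
print(spark.sparkContext.getConf().get('spark.executor.memory'))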
mkaranasou / test_feature_a_to_b_ratio.py
Last active October 13, 2019 13:07
Example of pyspark unittest test case for feature a to b ratio
from pyspark.sql.utils import AnalysisException
from pyspark_unittesting import SparkSQLTestCase
# FeatureAToBRatio is defined in pyspark_feature_a_to_b_ratio_example.py (see below)
from pyspark_feature_a_to_b_ratio_example import FeatureAToBRatio


class TestFeatureAToBRatio(SparkSQLTestCase):
    def setUp(self):
        super(TestFeatureAToBRatio, self).setUp()
        self.feature = FeatureAToBRatio()
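
    # A sketch of what the test methods could look like. The assertions follow the
    # FeatureAToBRatio example further down; the original gist's tests may differ.
    # self.sqlCtx is assumed to be provided by the spark-testing-base SQLTestCase.
    def test_calculate_adds_the_ratio_column(self):
        df = self.sqlCtx.createDataFrame([(10.0, 2.0), (4.0, 2.0)], ['a', 'b'])
        result = self.feature.calculate(df).collect()
        self.assertAlmostEqual(result[0][self.feature.feature_name], 5.0)
        self.assertAlmostEqual(result[1][self.feature.feature_name], 2.0)

    def test_calculate_fails_when_a_column_is_missing(self):
        df = self.sqlCtx.createDataFrame([(10.0,)], ['a'])   # no column b
        with self.assertRaises(AnalysisException):
            self.feature.calculate(df).collect()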
mkaranasou / pyspark_feature_a_to_b_ratio_example.py
Last active October 13, 2019 11:09
An example feature class
from pyspark.sql import functions as F


class FeatureAToBRatio(object):
    feature_name = 'a_to_b_ratio'
    default_value = 0.

    def calculate(self, df):
        """
        Given a dataframe that contains columns a and b,
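        adds a column with the ratio of a to b.
        :param pyspark.sql.DataFrame df: the dataframe to enrich
        :return: the dataframe with the a_to_b_ratio column added
        """
        # A sketch of one possible body, not necessarily the original gist's exact
        # implementation: fall back to default_value when the ratio is null (e.g. b is 0)
        return df.withColumn(
            self.feature_name,
            F.coalesce(F.col('a') / F.col('b'), F.lit(self.default_value))
        )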
mkaranasou / pyspark_unittesting.py
Last active November 6, 2019 12:51
Unittesting pyspark
import traceback

from sparktestingbase.sqltestcase import SQLTestCase


class SparkSQLTestCase(SQLTestCase):
    def getConf(self):
        from pyspark import SparkConf
        conf = SparkConf()
        conf.set(
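            # a sketch of settings typically applied for fast local test runs -
            # the exact keys the original gist sets here may differ
            'spark.sql.shuffle.partitions', '4')
        conf.set('spark.ui.enabled', 'false')   # no need for the Spark UI in tests
        return conf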
mkaranasou / constructor_env_variables.py
Created October 5, 2019 12:26
The function to be provided as an environment variable resolver
def constructor_env_variables(loader, node):
    """
    Extracts the environment variable from the node's value
    :param yaml.Loader loader: the yaml loader
    :param node: the current node in the yaml
    :return: the parsed string that contains the value of the environment
    variable
    """
    value = loader.construct_scalar(node)
    match = pattern.findall(value)
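    # A sketch of how the resolver can continue and be registered. The `pattern` regex,
    # the `!ENV` tag and the ${VAR} syntax are assumptions based on the usual convention;
    # the original gist may differ in the details.
    if match:
        full_value = value
        for g in match:
            # replace each ${VAR} placeholder with the environment variable's value,
            # leaving the placeholder name in place if the variable is not set
            full_value = full_value.replace('${%s}' % g, os.environ.get(g, g))
        return full_value
    return value


import os
import re
import yaml

pattern = re.compile(r'.*?\$\{(\w+)\}.*?')
loader = yaml.SafeLoader
# any scalar matching the pattern is implicitly tagged !ENV and run through the resolver
loader.add_implicit_resolver('!ENV', pattern, None)
loader.add_constructor('!ENV', constructor_env_variables)

config = yaml.load('data_path: ${HOME}/data', Loader=loader)
print(config)   # e.g. {'data_path': '/home/me/data'}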
mkaranasou / pyspark_index_with_row_num_non_sortable_data.py
Last active March 9, 2021 09:14
Adding indexes to a dataframe with row_num if your data is NOT sortable
# First add a column using F.monotonically_increasing_id().
# This will add monotonically increasing 64-bit integers like this:
>>> df_final = df_final.withColumn("monotonically_increasing_id", F.monotonically_increasing_id())
+--------+---+-----+-------+-------+----------+---------------------------+
|      _1| _2|index|column1|column2|row_number|monotonically_increasing_id|
+--------+---+-----+-------+-------+----------+---------------------------+
|  [1, 2]|  0|    0|      1|      2|         1|                          0|
|[15, 21]|  1|    1|     15|     21|         2|                          1|
+--------+---+-----+-------+-------+----------+---------------------------+
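
# The id above is unique and increasing but not consecutive, so to get a proper 1..N
# index, add a row_number over a window ordered by it - a sketch of the follow-up step.
# Note: a window with no partitioning moves all rows to a single partition, which is
# fine for small or medium dataframes but can become a bottleneck on very large ones.
>>> from pyspark.sql import Window
>>> w = Window.orderBy(F.col("monotonically_increasing_id"))
>>> df_final = df_final.withColumn("row_num", F.row_number().over(w))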