This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
import random | |
import subprocess | |
class Node(object):
    """A minimal container pairing a key with its associated value."""

    def __init__(self, key, value):
        """Store *key* and *value* on the instance.

        :param key: lookup key for this node
        :param value: payload associated with the key
        """
        self.key = key
        self.value = value
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Many thanks to http://stackoverflow.com/users/400617/davidism
# This code is under an "I don't care" license.
# Take it, use it, learn from it, make it better.
# Start this from cmd, a shell, or whatever you prefer.
# Then go to your favourite browser and open localhost:5000/admin
import sys | |
from flask import Flask | |
from flask.ext.sqlalchemy import SQLAlchemy | |
from flask.ext.admin import Admin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def union_uneven(df_base, df_new, default=None): | |
""" | |
Union dfs with different columns | |
:param: pyspark.DataFrame df_base: the dataframe to join to | |
:param: pyspark.DataFrame df_new: the dataframe to be joined | |
:return: the union of the two dataframes, having the missing columns filled with the default value | |
:rtype: pyspark.DataFrame | |
""" | |
base_columns = set(df_base.columns) | |
df_new_columns = set(df_new.columns) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
json_col = 'json_col' | |
# either infer the features schema: | |
schema = self.spark.read.json(df.select(json_col).rdd.map(lambda x: x[0])).schema | |
# parse the features string into a map | |
df = df.withColumn(json_col, (F.from_json(F.col(json_col), schema))) | |
# access the feature columns by name | |
df.select(F.col(json_col)['some_key']).show() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import EXIF from 'exif-js'; | |
// True when the runtime can construct Blob objects directly via `new Blob()`.
// Some older browsers expose a Blob type but throw when it is constructed
// (they only supported the deprecated BlobBuilder API), so we must actually
// try the constructor inside a try/catch rather than just feature-test the name.
const hasBlobConstructor = typeof (Blob) !== 'undefined' && (function checkBlobConstructor() {
    try {
        return Boolean(new Blob());
    } catch (error) {
        return false;
    }
}());
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> from pyspark.sql import Window | |
# the window is necessary here because row_number is a windowing function | |
# that means you can have row_number run over some amount of your data | |
# we'll be currently running it over the sorted by column1 data, row per row - our window will be of size 2 (rows), | |
# the whole dataframe that is. | |
>>> window = Window.orderBy(F.col('column1')) | |
>>> df_final = df_final.withColumn('row_number', F.row_number().over(window))
>>> df_final.select('index', 'row_number', 'column1', 'column2').show() | |
+-----+----------+-------+-------+ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark import SparkConf | |
from pyspark.sql import SparkSession, functions as F | |
conf = SparkConf() | |
# optional but it would be good to set the amount of ram the driver can use to | |
# a reasonable (regarding the size of the file we want to read) amount, so that we don't get an OOM exception | |
conf.set('spark.driver.memory', '6G') | |
spark = SparkSession.builder \ | |
.config(conf=conf) \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark import SparkConf | |
from pyspark.sql import SparkSession, functions as F | |
conf = SparkConf() | |
# optional but it would be good to set the amount of ram the driver can use to | |
# a reasonable (regarding the size of the file we want to read) amount, so that we don't get an OOM exception | |
conf.set('spark.driver.memory', '6G') | |
# create a spark session - nothing can be done without this: | |
spark = SparkSession.builder \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import functions as F | |
class FeatureAToBRatio(object): | |
feature_name = 'a_to_b_ratio' | |
default_value = 0. | |
def calculate(self, df): | |
""" | |
Given a dataframe that contains columns a and b, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.utils import AnalysisException | |
from pyspark_unittesting import SparkSQLTestCase | |
class TestFeatureAToBRatio(SparkSQLTestCase):
    """Unit tests for the FeatureAToBRatio feature calculator."""

    def setUp(self):
        # Let the base class build the shared Spark session / fixtures first,
        # then create a fresh feature instance for each test.
        super(TestFeatureAToBRatio, self).setUp()
        self.feature = FeatureAToBRatio()
OlderNewer