This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
import random | |
import subprocess | |
class Node(object):
    """A minimal container pairing a key with its associated value."""

    def __init__(self, key, value):
        """Store *key* and *value* on the instance.

        :param key: lookup key for this node
        :param value: payload associated with the key
        """
        self.key = key
        self.value = value
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Many thanks to http://stackoverflow.com/users/400617/davidism
# This code is under an "I don't care" license.
# Take it, use it, learn from it, make it better.
# Start this from cmd, a shell, or whatever you prefer.
# Then go to your favourite browser and open localhost:5000/admin
import sys | |
from flask import Flask | |
from flask.ext.sqlalchemy import SQLAlchemy | |
from flask.ext.admin import Admin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def union_uneven(df_base, df_new, default=None): | |
""" | |
Union dfs with different columns | |
:param: pyspark.DataFrame df_base: the dataframe to join to | |
:param: pyspark.DataFrame df_new: the dataframe to be joined | |
:return: the union of the two dataframes, having the missing columns filled with the default value | |
:rtype: pyspark.DataFrame | |
""" | |
base_columns = set(df_base.columns) | |
df_new_columns = set(df_new.columns) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
json_col = 'json_col' | |
# either infer the features schema: | |
schema = self.spark.read.json(df.select(json_col).rdd.map(lambda x: x[0])).schema | |
# parse the features string into a map | |
df = df.withColumn(json_col, (F.from_json(F.col(json_col), schema))) | |
# access the feature columns by name | |
df.select(F.col(json_col)['some_key']).show() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import EXIF from 'exif-js'; | |
// True when the runtime can construct Blob objects directly via `new Blob()`.
// Some older browsers expose a Blob type but throw when it is constructed
// (they only supported the deprecated BlobBuilder API), so we must actually
// try the constructor inside a try/catch rather than just feature-test the name.
const hasBlobConstructor = typeof (Blob) !== 'undefined' && (function checkBlobConstructor() {
    try {
        return Boolean(new Blob());
    } catch (error) {
        return false;
    }
}());
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> from pyspark.sql import Window | |
# the window is necessary here because row_number is a windowing function | |
# that means you can have row_number run over some amount of your data | |
# we'll be currently running it over the sorted by column1 data, row per row - our window will be of size 2 (rows), | |
# the whole dataframe that is. | |
>>> window = Window.orderBy(F.col('column1')) | |
>>> df_final = df_final.withColumn('row_number', F.row_number().over(window))
>>> df_final.select('index', 'row_number', 'column1', 'column2').show() | |
+-----+----------+-------+-------+ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark import SparkConf | |
from pyspark.sql import SparkSession, functions as F | |
conf = SparkConf() | |
# optional but it would be good to set the amount of ram the driver can use to | |
# a reasonable (regarding the size of the file we want to read) amount, so that we don't get an OOM exception | |
conf.set('spark.driver.memory', '6G') | |
spark = SparkSession.builder \ | |
.config(conf=conf) \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark import SparkConf | |
from pyspark.sql import SparkSession, functions as F | |
conf = SparkConf() | |
# optional but it would be good to set the amount of ram the driver can use to | |
# a reasonable (regarding the size of the file we want to read) amount, so that we don't get an OOM exception | |
conf.set('spark.driver.memory', '6G') | |
# create a spark session - nothing can be done without this: | |
spark = SparkSession.builder \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import functions as F | |
class FeatureAToBRatio(object): | |
feature_name = 'a_to_b_ratio' | |
default_value = 0. | |
def calculate(self, df): | |
""" | |
Given a dataframe that contains columns a and b, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.utils import AnalysisException | |
from pyspark_unittesting import SparkSQLTestCase | |
class TestFeatureAToBRatio(SparkSQLTestCase):
    """Unit tests for the FeatureAToBRatio feature calculator."""

    def setUp(self):
        # Let the base class build the shared Spark session / fixtures first,
        # then create a fresh feature instance for each test.
        super(TestFeatureAToBRatio, self).setUp()
        self.feature = FeatureAToBRatio()
OlderNewer