Skip to content

Instantly share code, notes, and snippets.

def main(sc, args):
    """Load the Yelp business log and derive the business RDDs from it.

    Only an excerpt of the job is visible here; each ``# ...`` marker stands
    for steps that are elided in this view.

    Args:
        sc: Context object used to read the input. Only ``textFile`` is
            called on it here (presumably a ``SparkContext`` -- confirm).
        args: Job arguments; not referenced by the visible statements.
    """
    # ...
    # read input logs
    business_rdd = sc.textFile(path_finder('yelp', 'business')).map(parse_line)
    # ...
    # ...
    # apply logic
    eligible_biz_list_rdd = get_eligible_business_list_rdd(business_rdd)
    # NOTE(review): `unique_biz_ids` is not assigned in the visible lines;
    # presumably one of the elided steps above produces it -- confirm.
    biz_id_to_locations_rdd = get_business_id_to_location(unique_biz_ids, business_rdd)
    # ...
def test_business_filters(self, business_rdd):
    """Check that ``business_filters`` keeps exactly the expected business ids.

    Args:
        business_rdd: Collection of business records offering ``filter``,
            ``map`` and ``collect``; each record is indexed with ``['id']``.
            Presumably injected by a pytest fixture -- confirm.
    """
    expected_businesses = [5, 6, 8, 9, 10]
    actual_businesses = (
        business_rdd
        .filter(business_filters)
        .map(lambda business: business['id'])
        .collect()
    )
    # Both sides are sorted so the comparison does not depend on the order
    # in which collect() returns the ids.
    assert sorted(expected_businesses) == sorted(actual_businesses)
def business_rdd(spark_context):
    """Return the business table test data as an RDD of parsed records.

    ``business_table_data`` is distributed with ``parallelize`` (passing ``1``
    as the slice count) and every element is run through ``parse_line``.

    NOTE(review): presumably registered as a pytest fixture; no decorator is
    visible in this excerpt -- confirm.

    Args:
        spark_context: Object providing ``parallelize``.
    """
    return spark_context.parallelize(business_table_data, 1).map(parse_line)
def spark_context():
    """Yield a local SparkContext and stop it once the consumer is finished.

    The context is configured with master ``local[2]`` and the application
    name ``pytest-pyspark-local-testing``.

    NOTE(review): presumably registered as a pytest fixture; no decorator is
    visible in this excerpt -- confirm.

    Yields:
        The ``SparkContext`` built from that configuration.
    """
    conf = SparkConf().setMaster('local[2]').setAppName('pytest-pyspark-local-testing')
    sc = SparkContext(conf=conf)
    try:
        yield sc
    finally:
        # Release the context when the generator is resumed or closed, so it
        # is not left running after the consumer is done with it.
        sc.stop()
asgillmor /
Last active March 6, 2019 11:37
Heavily mocked tests are a code smell
def mock_spark_session():
    """Return a new ``mock.Mock`` named ``'spark_session'``.

    NOTE(review): presumably registered as a pytest fixture; no decorator is
    visible in this excerpt -- confirm.
    """
    return mock.Mock(name='spark_session')
def test_get_user_latest_device_df(mock_spark_session):
    """Set up the mocks for the user-latest-device test.

    Only the set-up is visible in this excerpt: no call to the code under
    test and no assertion appear in these lines.

    Args:
        mock_spark_session: Stand-in session object; not referenced by the
            visible statements.
    """
    # NOTE(review): in the extracted text these two assignments ended with a
    # dangling `= mock_raw_dataframe` / `= mock_user_device_df`, which is a
    # syntax error (assignment to a call). They look like remnants of separate
    # statements whose targets were lost; restore that wiring from the
    # original test if it is needed.
    mock_raw_dataframe = mock.MagicMock(name='fake_dataframe')
    mock_user_device_df = mock.Mock(name='mock_user_device')
    fake_s3_path = 's3://foobar'
View mersenne-twister.js
I've wrapped Makoto Matsumoto and Takuji Nishimura's code in a namespace
so it's better encapsulated. Now you can have multiple random number generators
and they won't stomp all over each other's state.
If you want to use this as a substitute for Math.random(), use the random()
method like so:
var m = new MersenneTwister();