Alex Gillmor asgillmor

## soc.py
def main(sc, args):
   """Driver excerpt: read the Yelp business input and derive RDDs from it.

   The ``# ...`` markers stand for code omitted from this excerpt; names
   such as ``unique_biz_ids`` are presumably bound in those omitted parts.

   Args:
      sc: object providing ``textFile`` (presumably a SparkContext -- confirm).
      args: not used in the visible portion of the function.
   """
   # ...
   # read input logs
   # Every line of the business input is passed through parse_line.
   business_rdd = sc.textFile(path_finder('yelp', 'business')).map(parse_line)
   # ...
   # ...
   # apply logic
   eligible_biz_list_rdd = get_eligible_business_list_rdd(business_rdd)
   # NOTE(review): unique_biz_ids is not defined in the visible lines -- it
   # presumably comes from one of the omitted sections above; confirm.
   biz_id_to_locations_rdd = get_business_id_to_location(unique_biz_ids, business_rdd)
   # ...

## biz_filters_test.py
def test_business_filters(self, business_rdd):
   """Check which business ids remain after applying ``business_filters``."""
   # Run the filter over the fixture RDD and pull the ids back to the driver.
   surviving_ids = (
      business_rdd
      .filter(business_filters)
      .map(lambda record: record['id'])
      .collect()
   )
   wanted_ids = [5, 6, 8, 9, 10]
   # Compare order-insensitively: collect() order is not what is under test.
   assert sorted(wanted_ids) == sorted(surviving_ids)

## biz_fixture.py
@pytest.fixture()
def business_rdd(spark_context):
   """Build an RDD from ``business_table_data`` with ``parse_line`` applied."""
   # A single partition is requested for the in-memory fixture rows.
   raw_rows_rdd = spark_context.parallelize(business_table_data, 1)
   return raw_rows_rdd.map(parse_line)

## spark_context_fixture.py
@pytest.fixture(scope='session')
def spark_context():
   """Provide one SparkContext shared by the whole test session.

   The context is configured with master ``local[2]`` and the app name
   ``pytest-pyspark-local-testing``.

   Yields:
      The SparkContext instance; it is stopped when the session ends.
   """
   conf = SparkConf().setMaster('local[2]').setAppName('pytest-pyspark-local-testing')
   sc = SparkContext(conf=conf)
   try:
      yield sc
   finally:
      # Teardown: stop the context so it does not outlive the test session.
      sc.stop()

## mock_everything_poorly.py
@pytest.fixture
def mock_spark_session():
   """Return a bare ``Mock`` named 'spark_session' to stand in for a session."""
   session_double = mock.Mock(name='spark_session')
   return session_double

def test_get_user_latest_device_df(mock_spark_session):
   # Excerpt: only the mock wiring is shown; the call under test and its
   # assertions are not part of the visible snippet.
   # Stand-in for whatever spark_session.read.json(...) returns.
   mock_raw_dataframe = mock.MagicMock(name='fake_dataframe')
   mock_spark_session.read.json.return_value = mock_raw_dataframe
   # Stand-in for the result of calling .select(...) on the raw dataframe.
   mock_user_device_df = mock.Mock(name='mock_user_device')
   mock_raw_dataframe.select.return_value = mock_user_device_df
   # Placeholder path; not used within the visible lines.
   fake_s3_path = 's3://foobar'

## mersenne-twister.js

/*
  I've wrapped Makoto Matsumoto and Takuji Nishimura's code in a namespace
  so it's better encapsulated. Now you can have multiple random number generators
  and they won't stomp all over each other's state.

  If you want to use this as a substitute for Math.random(), use the random()
  method like so:

  var m = new MersenneTwister();
	def main(sc, args):
		"""Driver excerpt: read the Yelp business input and derive RDDs from it.

		The ``# ...`` markers stand for code omitted from this excerpt; names
		such as ``unique_biz_ids`` are presumably bound in those omitted parts.

		Args:
			sc: object providing ``textFile`` (presumably a SparkContext -- confirm).
			args: not used in the visible portion of the function.
		"""
		# ...
		# read input logs
		business_rdd = sc.textFile(path_finder('yelp', 'business')).map(parse_line)
		# ...
		# ...
		# apply logic
		eligible_biz_list_rdd = get_eligible_business_list_rdd(business_rdd)
		# NOTE(review): unique_biz_ids is not defined in the visible lines --
		# presumably it comes from one of the omitted sections; confirm.
		biz_id_to_locations_rdd = get_business_id_to_location(unique_biz_ids, business_rdd)
		# ...
	def test_business_filters(self, business_rdd):
		"""Check which business ids remain after applying ``business_filters``."""
		expected_businesses = [5, 6, 8, 9, 10]
		# Compare order-insensitively: collect() order is not what is under test.
		assert sorted(expected_businesses) == sorted(
			business_rdd.filter(business_filters).map(lambda l: l['id']).collect())
	@pytest.fixture()
	def business_rdd(spark_context):
		"""Build an RDD from ``business_table_data`` with ``parse_line`` applied."""
		return spark_context.parallelize(business_table_data, 1).map(parse_line)
	@pytest.fixture(scope='session')
	def spark_context():
		"""Provide one SparkContext shared by the whole test session.

		The context is configured with master ``local[2]`` and the app name
		``pytest-pyspark-local-testing``.

		Yields:
			The SparkContext instance; it is stopped when the session ends.
		"""
		conf = SparkConf().setMaster('local[2]').setAppName('pytest-pyspark-local-testing')
		sc = SparkContext(conf=conf)
		try:
			yield sc
		finally:
			# Teardown: stop the context so it does not outlive the test session.
			sc.stop()
	@pytest.fixture
	def mock_spark_session():
		"""Return a bare ``Mock`` named 'spark_session' to stand in for a session."""
		return mock.Mock(name='spark_session')

	def test_get_user_latest_device_df(mock_spark_session):
		# Excerpt: only the mock wiring is shown; the call under test and its
		# assertions are not part of the visible snippet.
		# Stand-in for whatever spark_session.read.json(...) returns.
		mock_raw_dataframe = mock.MagicMock(name='fake_dataframe')
		mock_spark_session.read.json.return_value = mock_raw_dataframe
		# Stand-in for the result of calling .select(...) on the raw dataframe.
		mock_user_device_df = mock.Mock(name='mock_user_device')
		mock_raw_dataframe.select.return_value = mock_user_device_df
		# Placeholder path; not used within the visible lines.
		fake_s3_path = 's3://foobar'

	/*
	I've wrapped Makoto Matsumoto and Takuji Nishimura's code in a namespace
	so it's better encapsulated. Now you can have multiple random number generators
	and they won't stomp all over each other's state.

	If you want to use this as a substitute for Math.random(), use the random()
	method like so:

	var m = new MersenneTwister();