ijan10/data_preparation_spark_sql_query_ctr_model.py Secret

## data_preparation_spark_sql_query_ctr_model.py
model_feature_name = args.model_feature_name
models_max_sample_size = 10000000 # 10M
sql_query = '''SELECT *
                    FROM   (SELECT dt,
                                   request_time,
                                   request_hour,
                                   region,
                                   dma,
                                   city,
                                   form_factor,
                                   brand_name,
                                   advertised_device_os,
                                   advertised_browser,
                                   netspeed,
                                   model_name,
                                   pub_acnt_id,
                                   domain,
                                   ad_unit_id,
                                   placement_type_name,
                                   advertiser_category_id,
                                   io_number,
                                   io_line_item_number,
                                   variant_id,
                                   dim_io_line_item_udms2_x_ad_product_id AS product_id,
                                   clicks                                 AS y_label,
                                   Rand()                                 random,
                                   Count(*)
                                     OVER (
                                       partition BY %s)  AS
                                          total_impr_by_model_feature_name
                            FROM   udms_dl_data_assets.dl_optimization_input o
                                   LEFT OUTER JOIN udms_dl_ingested_data_dim.tbv_dim_io_line_item l
                                                ON o.io_line_item_number =
                                                   l.dim_io_line_item_udms2_x_io_line_item_number
                            WHERE  requests > 0
                                   AND impressions > 0
                                   AND request_time IS NOT NULL
                                   AND programmatic_deal_type NOT IN ( 'Non-Guaranteed' )
                                   AND dt >= "20191007"
                                   AND is_robot = 0)
                    WHERE  random < ( %s / total_impr_by_model_feature_name )''' % (model_feature_name, models_max_sample_size)
"""
The above is a customaize query that create max of 10M sample for each parition of selected feature name (there are some additional configure value that we select droping in this gist)
There are other methods for sample per one feature but we prefer using spark sql.

"""

df_all_data = spark.sql(sql_query)

# Handle Missing Data
df_all_data = df_all_data.withColumn(args.model_type,
                                             when((df_all_data[model_feature_name] == '') | (df_all_data[model_feature_name] == 'N/A'), 'Empty').otherwise(df_all_data[model_feature_name]))

 df_all_data = df_all_data.fillna('-1')
 df_all_data = df_all_data.drop('random').drop(total_impr_by_model_feature_name)
	model_feature_name = args.model_feature_name
	models_max_sample_size = 10000000 # 10M
	sql_query = '''SELECT *
	FROM (SELECT dt,
	request_time,
	request_hour,
	region,
	dma,
	city,
	form_factor,
	brand_name,
	advertised_device_os,
	advertised_browser,
	netspeed,
	model_name,
	pub_acnt_id,
	domain,
	ad_unit_id,
	placement_type_name,
	advertiser_category_id,
	io_number,
	io_line_item_number,
	variant_id,
	dim_io_line_item_udms2_x_ad_product_id AS product_id,
	clicks AS y_label,
	Rand() random,
	Count(*)
	OVER (
	partition BY %s) AS
	total_impr_by_model_feature_name
	FROM udms_dl_data_assets.dl_optimization_input o
	LEFT OUTER JOIN udms_dl_ingested_data_dim.tbv_dim_io_line_item l
	ON o.io_line_item_number =
	l.dim_io_line_item_udms2_x_io_line_item_number
	WHERE requests > 0
	AND impressions > 0
	AND request_time IS NOT NULL
	AND programmatic_deal_type NOT IN ( 'Non-Guaranteed' )
	AND dt >= "20191007"
	AND is_robot = 0)
	WHERE random < ( %s / total_impr_by_model_feature_name )''' % (model_feature_name, models_max_sample_size)
	"""
	The above is a customaize query that create max of 10M sample for each parition of selected feature name (there are some additional configure value that we select droping in this gist)
	There are other methods for sample per one feature but we prefer using spark sql.

	"""

	df_all_data = spark.sql(sql_query)

	# Handle Missing Data
	df_all_data = df_all_data.withColumn(args.model_type,
	when((df_all_data[model_feature_name] == '') \| (df_all_data[model_feature_name] == 'N/A'), 'Empty').otherwise(df_all_data[model_feature_name]))

	df_all_data = df_all_data.fillna('-1')
	df_all_data = df_all_data.drop('random').drop(total_impr_by_model_feature_name)