Last active
March 20, 2023 10:37
-
-
Save ijan10/d7b11d6906e1b87bedd339b0cc04146f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
model_feature_name = args.model_feature_name | |
models_max_sample_size = 10000000 # 10M | |
sql_query = '''SELECT * | |
FROM (SELECT dt, | |
request_time, | |
request_hour, | |
region, | |
dma, | |
city, | |
form_factor, | |
brand_name, | |
advertised_device_os, | |
advertised_browser, | |
netspeed, | |
model_name, | |
pub_acnt_id, | |
domain, | |
ad_unit_id, | |
placement_type_name, | |
advertiser_category_id, | |
io_number, | |
io_line_item_number, | |
variant_id, | |
dim_io_line_item_udms2_x_ad_product_id AS product_id, | |
clicks AS y_label, | |
Rand() random, | |
Count(*) | |
OVER ( | |
partition BY %s) AS | |
total_impr_by_model_feature_name | |
FROM udms_dl_data_assets.dl_optimization_input o | |
LEFT OUTER JOIN udms_dl_ingested_data_dim.tbv_dim_io_line_item l | |
ON o.io_line_item_number = | |
l.dim_io_line_item_udms2_x_io_line_item_number | |
WHERE requests > 0 | |
AND impressions > 0 | |
AND request_time IS NOT NULL | |
AND programmatic_deal_type NOT IN ( 'Non-Guaranteed' ) | |
AND dt >= "20191007" | |
AND is_robot = 0) | |
WHERE random < ( %s / total_impr_by_model_feature_name )''' % (model_feature_name, models_max_sample_size) | |
""" | |
The above is a customaize query that create max of 10M sample for each parition of selected feature name (there are some additional configure value that we select droping in this gist) | |
There are other methods for sample per one feature but we prefer using spark sql. | |
""" | |
df_all_data = spark.sql(sql_query) | |
# Handle Missing Data | |
df_all_data = df_all_data.withColumn(args.model_type, | |
when((df_all_data[model_feature_name] == '') | (df_all_data[model_feature_name] == 'N/A'), 'Empty').otherwise(df_all_data[model_feature_name])) | |
df_all_data = df_all_data.fillna('-1') | |
df_all_data = df_all_data.drop('random').drop(total_impr_by_model_feature_name) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment