Created
August 4, 2015 06:50
-
-
Save steve500002/55cc94077f9b20d51d77 to your computer and use it in GitHub Desktop.
A keen python util so queries return a consistent panda dataframe with / without timeframes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import date, timedelta | |
import datetime | |
import pandas as pd | |
import json | |
import yaml | |
from keen.client import KeenClient | |
def events(): | |
f = open('../config.yml') | |
# use safe_load instead load | |
configMap = yaml.safe_load(f) | |
f.close() | |
client = KeenClient( | |
project_id=configMap['project_id'], | |
read_key=configMap['read_key'] | |
) | |
return client | |
def extract_timeinterval(keen_interval_extract): | |
raw_data = [] | |
for item_row in keen_interval_extract: | |
# Takes in a keen raw output, not a df | |
this_item_row_start = datetime.datetime.strptime(item_row['timeframe']['start'], '%Y-%m-%dT%H:%M:%S.%fZ') | |
if type(item_row['value']) == type(1): | |
item_row['start']=this_item_row_start | |
raw_data.append({'start': this_item_row_start, 'result': item_row['value'] }) | |
else: | |
for this_element in item_row['value']: | |
this_element['start']=this_item_row_start | |
raw_data.append(this_element) | |
return pd.DataFrame(raw_data) | |
def excluded_players(excluded_players_filename='../data/reference/excluded_players.json'): | |
## Define Player Exclusion Filters | |
j_data=open(excluded_players_filename).read() | |
j_data = json.loads(j_data) | |
j_players_list = [j_data['excluded_players'][x]['player_id'] for x in range(0,len(j_data['excluded_players']))] | |
j_players_list = [x.encode('UTF8') for x in j_players_list] | |
exclude_players_filters = [] | |
for player in j_players_list: | |
this_player_exclude = {"property_name": "player_id", "operator": "ne", "property_value": player } | |
exclude_players_filters.append(this_player_exclude) | |
return exclude_players_filters | |
def extract_variable(keen_interval_extract,variable_name,period_start): | |
raw_df=pd.DataFrame(keen_interval_extract) | |
raw_df['start']=raw_df['result'].map(lambda x: period_start) | |
return raw_df | |
def keen_count(table,result_variable_name,period_start,period_finish,input_group_by_variables=[],selection_filters=[],interval=None): | |
group_by_variables = input_group_by_variables | |
group_df_variables = [] | |
for input_group_by_variable in input_group_by_variables: | |
group_df_variables = group_df_variables + [input_group_by_variable] | |
the_excluded_players=excluded_players() | |
filters = selection_filters+the_excluded_players | |
if interval == None: | |
raw = (events().count( | |
table, | |
group_by=group_by_variables, | |
filters=filters, | |
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') }, | |
timezone='utc' | |
)) | |
if len(group_df_variables) == 0: | |
raw_df = pd.DataFrame([{'start': period_start, 'result': raw}]) | |
else: | |
raw_df=pd.DataFrame(raw) | |
raw_df=raw_df.rename(columns={'result' : result_variable_name}) | |
return raw_df | |
else: | |
raw = (events().count( | |
table, | |
group_by=group_by_variables, | |
filters=filters, | |
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') }, | |
interval = interval, | |
timezone='utc')) | |
raw_df = pd.DataFrame(extract_timeinterval(raw)) | |
raw_df=raw_df.rename(columns={'result' : result_variable_name}) | |
if len(group_df_variables) == 0: | |
return raw_df | |
else: | |
data_df = raw_df.groupby(group_df_variables+['start']).sum() | |
return data_df | |
def keen_count_unique(table,target_variable,result_variable_name,period_start,period_finish,input_group_by_variables=[],selection_filters=[],interval=None): | |
group_by_variables = input_group_by_variables | |
group_df_variables = [] | |
for input_group_by_variable in input_group_by_variables: | |
group_df_variables = group_df_variables + [input_group_by_variable] | |
the_excluded_players=excluded_players() | |
filters = selection_filters+the_excluded_players | |
if interval == None: | |
raw = (events().count_unique( | |
table, | |
group_by=group_by_variables, | |
target_property=target_variable, | |
filters=filters, | |
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') }, | |
timezone='utc' | |
)) | |
if len(group_df_variables) == 0: | |
raw_df = pd.DataFrame([{'start': period_start, 'result': raw}]) | |
else: | |
raw_df=pd.DataFrame(raw) | |
raw_df=raw_df.rename(columns={'result' : result_variable_name}) | |
return raw_df | |
else: | |
raw = (events().count_unique( | |
table, | |
target_property=target_variable, | |
group_by=group_by_variables, | |
filters=filters, | |
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') }, | |
interval = interval, | |
timezone='utc')) | |
raw_df = pd.DataFrame(extract_timeinterval(raw)) | |
raw_df=raw_df.rename(columns={'result' : result_variable_name}) | |
if len(group_df_variables) == 0: | |
return raw_df | |
else: | |
data_df = raw_df.groupby(group_df_variables+['start']).sum() | |
return data_df | |
def keen_maximum(table,target_variable,result_variable_name,period_start,period_finish,input_group_by_variables=[],selection_filters=[],interval=None): | |
group_by_variables = input_group_by_variables | |
group_df_variables = [] | |
for input_group_by_variable in input_group_by_variables: | |
group_df_variables = group_df_variables + [input_group_by_variable] | |
the_excluded_players=excluded_players() | |
filters = selection_filters+the_excluded_players | |
if interval == None: | |
raw = (events().maximum( | |
table, | |
group_by=group_by_variables, | |
target_property=target_variable, | |
filters=filters, | |
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') }, | |
timezone='utc' | |
)) | |
if len(group_df_variables) == 0: | |
raw_df = pd.DataFrame([{'start': period_start, 'result': raw}]) | |
else: | |
raw_df=pd.DataFrame(raw) | |
raw_df=raw_df.rename(columns={'result' : result_variable_name}) | |
return raw_df | |
else: | |
raw = (events().maximum( | |
table, | |
target_property=target_variable, | |
group_by=group_by_variables, | |
filters=filters, | |
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') }, | |
interval = interval, | |
timezone='utc')) | |
raw_df = pd.DataFrame(extract_timeinterval(raw)) | |
raw_df=raw_df.rename(columns={'result' : result_variable_name}) | |
if len(group_df_variables) == 0: | |
return raw_df | |
else: | |
data_df = raw_df.groupby(group_df_variables+['start']).sum() | |
return data_df | |
def keen_minimum(table,target_variable,result_variable_name,period_start,period_finish,input_group_by_variables=[],selection_filters=[],interval=None): | |
group_by_variables = input_group_by_variables | |
group_df_variables = [] | |
for input_group_by_variable in input_group_by_variables: | |
group_df_variables = group_df_variables + [input_group_by_variable] | |
the_excluded_players=excluded_players() | |
filters = selection_filters+the_excluded_players | |
if interval == None: | |
raw = (events().minimum( | |
table, | |
group_by=group_by_variables, | |
target_property=target_variable, | |
filters=filters, | |
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') }, | |
timezone='utc' | |
)) | |
if len(group_df_variables) == 0: | |
raw_df = pd.DataFrame([{'start': period_start, 'result': raw}]) | |
else: | |
raw_df=pd.DataFrame(raw) | |
raw_df=raw_df.rename(columns={'result' : result_variable_name}) | |
return raw_df | |
else: | |
raw = (events().minimum( | |
table, | |
target_property=target_variable, | |
group_by=group_by_variables, | |
filters=filters, | |
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') }, | |
interval = interval, | |
timezone='utc')) | |
raw_df = pd.DataFrame(extract_timeinterval(raw)) | |
raw_df=raw_df.rename(columns={'result' : result_variable_name}) | |
if len(group_df_variables) == 0: | |
return raw_df | |
else: | |
data_df = raw_df.groupby(group_df_variables+['start']).sum() | |
return data_df |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment