Skip to content

Instantly share code, notes, and snippets.

@steve500002
Created August 4, 2015 06:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save steve500002/55cc94077f9b20d51d77 to your computer and use it in GitHub Desktop.
Save steve500002/55cc94077f9b20d51d77 to your computer and use it in GitHub Desktop.
A keen python util so queries return a consistent panda dataframe with / without timeframes
from datetime import date, timedelta
import datetime
import pandas as pd
import json
import yaml
from keen.client import KeenClient
def events():
f = open('../config.yml')
# use safe_load instead load
configMap = yaml.safe_load(f)
f.close()
client = KeenClient(
project_id=configMap['project_id'],
read_key=configMap['read_key']
)
return client
def extract_timeinterval(keen_interval_extract):
raw_data = []
for item_row in keen_interval_extract:
# Takes in a keen raw output, not a df
this_item_row_start = datetime.datetime.strptime(item_row['timeframe']['start'], '%Y-%m-%dT%H:%M:%S.%fZ')
if type(item_row['value']) == type(1):
item_row['start']=this_item_row_start
raw_data.append({'start': this_item_row_start, 'result': item_row['value'] })
else:
for this_element in item_row['value']:
this_element['start']=this_item_row_start
raw_data.append(this_element)
return pd.DataFrame(raw_data)
def excluded_players(excluded_players_filename='../data/reference/excluded_players.json'):
## Define Player Exclusion Filters
j_data=open(excluded_players_filename).read()
j_data = json.loads(j_data)
j_players_list = [j_data['excluded_players'][x]['player_id'] for x in range(0,len(j_data['excluded_players']))]
j_players_list = [x.encode('UTF8') for x in j_players_list]
exclude_players_filters = []
for player in j_players_list:
this_player_exclude = {"property_name": "player_id", "operator": "ne", "property_value": player }
exclude_players_filters.append(this_player_exclude)
return exclude_players_filters
def extract_variable(keen_interval_extract,variable_name,period_start):
raw_df=pd.DataFrame(keen_interval_extract)
raw_df['start']=raw_df['result'].map(lambda x: period_start)
return raw_df
def keen_count(table,result_variable_name,period_start,period_finish,input_group_by_variables=[],selection_filters=[],interval=None):
group_by_variables = input_group_by_variables
group_df_variables = []
for input_group_by_variable in input_group_by_variables:
group_df_variables = group_df_variables + [input_group_by_variable]
the_excluded_players=excluded_players()
filters = selection_filters+the_excluded_players
if interval == None:
raw = (events().count(
table,
group_by=group_by_variables,
filters=filters,
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') },
timezone='utc'
))
if len(group_df_variables) == 0:
raw_df = pd.DataFrame([{'start': period_start, 'result': raw}])
else:
raw_df=pd.DataFrame(raw)
raw_df=raw_df.rename(columns={'result' : result_variable_name})
return raw_df
else:
raw = (events().count(
table,
group_by=group_by_variables,
filters=filters,
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') },
interval = interval,
timezone='utc'))
raw_df = pd.DataFrame(extract_timeinterval(raw))
raw_df=raw_df.rename(columns={'result' : result_variable_name})
if len(group_df_variables) == 0:
return raw_df
else:
data_df = raw_df.groupby(group_df_variables+['start']).sum()
return data_df
def keen_count_unique(table,target_variable,result_variable_name,period_start,period_finish,input_group_by_variables=[],selection_filters=[],interval=None):
group_by_variables = input_group_by_variables
group_df_variables = []
for input_group_by_variable in input_group_by_variables:
group_df_variables = group_df_variables + [input_group_by_variable]
the_excluded_players=excluded_players()
filters = selection_filters+the_excluded_players
if interval == None:
raw = (events().count_unique(
table,
group_by=group_by_variables,
target_property=target_variable,
filters=filters,
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') },
timezone='utc'
))
if len(group_df_variables) == 0:
raw_df = pd.DataFrame([{'start': period_start, 'result': raw}])
else:
raw_df=pd.DataFrame(raw)
raw_df=raw_df.rename(columns={'result' : result_variable_name})
return raw_df
else:
raw = (events().count_unique(
table,
target_property=target_variable,
group_by=group_by_variables,
filters=filters,
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') },
interval = interval,
timezone='utc'))
raw_df = pd.DataFrame(extract_timeinterval(raw))
raw_df=raw_df.rename(columns={'result' : result_variable_name})
if len(group_df_variables) == 0:
return raw_df
else:
data_df = raw_df.groupby(group_df_variables+['start']).sum()
return data_df
def keen_maximum(table,target_variable,result_variable_name,period_start,period_finish,input_group_by_variables=[],selection_filters=[],interval=None):
group_by_variables = input_group_by_variables
group_df_variables = []
for input_group_by_variable in input_group_by_variables:
group_df_variables = group_df_variables + [input_group_by_variable]
the_excluded_players=excluded_players()
filters = selection_filters+the_excluded_players
if interval == None:
raw = (events().maximum(
table,
group_by=group_by_variables,
target_property=target_variable,
filters=filters,
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') },
timezone='utc'
))
if len(group_df_variables) == 0:
raw_df = pd.DataFrame([{'start': period_start, 'result': raw}])
else:
raw_df=pd.DataFrame(raw)
raw_df=raw_df.rename(columns={'result' : result_variable_name})
return raw_df
else:
raw = (events().maximum(
table,
target_property=target_variable,
group_by=group_by_variables,
filters=filters,
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') },
interval = interval,
timezone='utc'))
raw_df = pd.DataFrame(extract_timeinterval(raw))
raw_df=raw_df.rename(columns={'result' : result_variable_name})
if len(group_df_variables) == 0:
return raw_df
else:
data_df = raw_df.groupby(group_df_variables+['start']).sum()
return data_df
def keen_minimum(table,target_variable,result_variable_name,period_start,period_finish,input_group_by_variables=[],selection_filters=[],interval=None):
group_by_variables = input_group_by_variables
group_df_variables = []
for input_group_by_variable in input_group_by_variables:
group_df_variables = group_df_variables + [input_group_by_variable]
the_excluded_players=excluded_players()
filters = selection_filters+the_excluded_players
if interval == None:
raw = (events().minimum(
table,
group_by=group_by_variables,
target_property=target_variable,
filters=filters,
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') },
timezone='utc'
))
if len(group_df_variables) == 0:
raw_df = pd.DataFrame([{'start': period_start, 'result': raw}])
else:
raw_df=pd.DataFrame(raw)
raw_df=raw_df.rename(columns={'result' : result_variable_name})
return raw_df
else:
raw = (events().minimum(
table,
target_property=target_variable,
group_by=group_by_variables,
filters=filters,
timeframe={'start': period_start.strftime('%Y-%m-%dT%H:%M:%S.%fZ'), 'end': period_finish.strftime('%Y-%m-%dT%H:%M:%S.%fZ') },
interval = interval,
timezone='utc'))
raw_df = pd.DataFrame(extract_timeinterval(raw))
raw_df=raw_df.rename(columns={'result' : result_variable_name})
if len(group_df_variables) == 0:
return raw_df
else:
data_df = raw_df.groupby(group_df_variables+['start']).sum()
return data_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment