Josh Reini joshreini1

## performance_test_all_data_collections.py
data_collections = tru.get_data_collections()
for dc in data_collections:
    tru.set_data_collection(dc)
    data_splits = tru.get_data_splits()
    for split in data_splits:
        tru.tester.add_performance_test(
            data_split_name = split,
            metric = 'AUC',
            warn_if_less_than = 0.85,
            fail_if_less_than = 0.80

## stability_test_all_splits.py
data_collections = tru.get_data_collections()
for dc in data_collections:
    tru.set_data_collection(dc)
    splits = tru.get_data_splits()
    for split_key in splits:
        split = f'{split_key}'
        tru.tester.add_stability_test(
                comparison_data_split_name = split,
                base_data_split_name = 'train',
                metric = 'DIFFERENCE_OF_MEAN',

## load_data_for_year_hdf5.py
import h5py

def load_data_for_year(year) -> pd.DataFrame:
    if year > 2016:
        filename = f'GFED4.1s_{year}_beta.hdf5'
    else:
        filename = f'GFED4.1s_{year}.hdf5'
    filepath = os.path.join(data_dir, filename)
    data = h5py.File(filepath, 'r')
    df = pd.DataFrame({

## get_feature_for_year_and_month.py
def get_feature_for_year_and_month(data_cache,year, month, suffix) -> pd.DataFrame:
    if year not in data_cache:
        data_cache[year] = load_data_for_year(year)
    data = data_cache[year]
    df = pd.DataFrame({
        f'burned_fraction_{suffix}': data['burned_area/{:02}/burned_fraction'.format(month)],
        f'emissions_DM_{suffix}': data['emissions/{:02}/DM'.format(month)],
        f'emissions_C_{suffix}': data['emissions/{:02}/C'.format(month)],
        f'emissions_small_fire_fraction_{suffix}': data['emissions/{:02}/small_fire_fraction'.format(month)],
        f'biosphere_NPP_{suffix}': data['biosphere/{:02}/NPP'.format(month)],

## map_fires.py
import plotly.express as px
# Register the Mapbox access token with Plotly Express before plotting.
px.set_mapbox_access_token(mapbox_token)
# Scatter the 2016 USA rows on a Mapbox map with the 'year' column dropped:
# markers are placed by lat/lon, coloured by month and sized by burned_area
# (size_max=15 sets the maximum marker size; zoom=10 is the map zoom level).
fig = px.scatter_mapbox(
    df_usa_2016.drop(columns='year'),
    lat="lat",
    lon="lon",
    color="month",
    size="burned_area",
    size_max=15,
    zoom=10,
)
fig.show()

## fire_prediction_training_loop.py
# Fitted models, stored under keys such as 'linear_1year_window'.
models = {}
import time
# One training pass per window size, 1 through 10 inclusive.
for window_size in range(1,11):
    # Wall-clock start of this pass; the code that reads it is not visible in this excerpt.
    time_start = time.time()
    key = f'{window_size}year_window'
    print(f'Training linear model for {key}')
    # The target is binarised on the fly: a row is positive when its y value exceeds 0.01.
    # NOTE(review): 0.01 is presumably a burned-fraction threshold — confirm against the data prep.
    models[f'linear_{key}'] = LogisticRegression(random_state=321, max_iter=1000, solver='saga').fit(data_train_x[key], data_train_y[key]>0.01)

    # Value announced for the "gb" model (presumably the estimator count of a
    # gradient-boosting model — confirm); its training call is not visible in this excerpt.
    n_est = 20
    print(f'Training gb{n_est} model for {key}')

## fire_pred_data_and_models_to_truera.py
# Set the workspace to the 'local' environment, add the 'Fire_Party' project,
# then add one data collection per window size (1 through 10 inclusive).
# NOTE(review): `tru` is presumably a TruEra workspace object — confirm where it is created.
project_name = 'Fire_Party'
tru.set_environment('local')
tru.add_project(project_name, score_type='probits')
# Settings defined up front; the code that consumes them is not visible in this excerpt.
extra_data_columns = ['year']
train_split_name = 'train'
burned_fraction_th = 0.01
for window_size in range(1,11):
    # The data collection is named after the window, e.g. '1year_window'.
    key = f'{window_size}year_window'
    print(key)
    tru.add_data_collection(key)

## target_encoder_latlong.py
class target_encoder(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y = None):
        return self

    def transform(self, X, y = None):
        #target encode lat and long

## target_encoder_grid.py
class target_encoder(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y = None):
        return self

    def transform(self, X, y = None):
        #target encode lat and long

## reframe_price_prediction_task.py
# Select the source collection first so that its split names can be listed.
tru.set_data_collection("data_collection")
splits = tru.get_data_splits()
for split in splits:
    # Re-select the source collection on every iteration, because the loop
    # body switches to "data_collection_v2" further down.
    tru.set_data_collection("data_collection")
    tru.set_data_split(split)
    xs = tru.get_xs()
    ys = tru.get_ys()
    # Switch to the second collection after reading xs/ys from the source one.
    tru.set_data_collection("data_collection_v2")
    # Mean and standard deviation of this split's ys; how they are used is not
    # visible in this excerpt.
    ys_mean = ys.mean()
    ys_std = ys.std()
	data_collections = tru.get_data_collections()
	for dc in data_collections:
	tru.set_data_collection(dc)
	data_splits = tru.get_data_splits()
	for split in data_splits:
	tru.tester.add_performance_test(
	data_split_name = split,
	metric = 'AUC',
	warn_if_less_than = 0.85,
	fail_if_less_than = 0.80
	import h5py

	def load_data_for_year(year) -> pd.DataFrame:
	if year > 2016:
	filename = f'GFED4.1s_{year}_beta.hdf5'
	else:
	filename = f'GFED4.1s_{year}.hdf5'
	filepath = os.path.join(data_dir, filename)
	data = h5py.File(filepath, 'r')
	df = pd.DataFrame({
	def get_feature_for_year_and_month(data_cache,year, month, suffix) -> pd.DataFrame:
	if year not in data_cache:
	data_cache[year] = load_data_for_year(year)
	data = data_cache[year]
	df = pd.DataFrame({
	f'burned_fraction_{suffix}': data['burned_area/{:02}/burned_fraction'.format(month)],
	f'emissions_DM_{suffix}': data['emissions/{:02}/DM'.format(month)],
	f'emissions_C_{suffix}': data['emissions/{:02}/C'.format(month)],
	f'emissions_small_fire_fraction_{suffix}': data['emissions/{:02}/small_fire_fraction'.format(month)],
	f'biosphere_NPP_{suffix}': data['biosphere/{:02}/NPP'.format(month)],
	import plotly.express as px
	px.set_mapbox_access_token(mapbox_token)
	fig = px.scatter_mapbox(df_usa_2016.drop('year',axis=1), color = "month", lat="lat", lon="lon", size="burned_area", size_max=15, zoom=10)
	fig.show()
	models = {}
	import time
	for window_size in range(1,11):
	time_start = time.time()
	key = f'{window_size}year_window'
	print(f'Training linear model for {key}')
	models[f'linear_{key}'] = LogisticRegression(random_state=321, max_iter=1000, solver='saga').fit(data_train_x[key], data_train_y[key]>0.01)

	n_est = 20
	print(f'Training gb{n_est} model for {key}')
	project_name = 'Fire_Party'
	tru.set_environment('local')
	tru.add_project(project_name, score_type='probits')
	extra_data_columns = ['year']
	train_split_name = 'train'
	burned_fraction_th = 0.01
	for window_size in range(1,11):
	key = f'{window_size}year_window'
	print(key)
	tru.add_data_collection(key)
	class target_encoder(BaseEstimator, TransformerMixin):

	def __init__(self):
	pass

	def fit(self, X, y = None):
	return self

	def transform(self, X, y = None):
	#target encode lat and long
	tru.set_data_collection("data_collection")
	splits = tru.get_data_splits()
	for split in splits:
	tru.set_data_collection("data_collection")
	tru.set_data_split(split)
	xs = tru.get_xs()
	ys = tru.get_ys()
	tru.set_data_collection("data_collection_v2")
	ys_mean = ys.mean()
	ys_std = ys.std()