Skip to content

Instantly share code, notes, and snippets.

View joshreini1's full-sized avatar

Josh Reini joshreini1

View GitHub Profile
# Register an AUC performance test for every data split of every data collection.
# `tru` is a client/workspace object defined outside this excerpt.
data_collections = tru.get_data_collections()
for dc in data_collections:
# Make `dc` the active collection (presumably so the split lookup and the
# test registration below apply to it -- confirm against the SDK docs).
tru.set_data_collection(dc)
data_splits = tru.get_data_splits()
for split in data_splits:
# Thresholds on AUC: warn below 0.85, fail below 0.80.
tru.tester.add_performance_test(
data_split_name = split,
metric = 'AUC',
warn_if_less_than = 0.85,
fail_if_less_than = 0.80
# NOTE(review): the excerpt ends here -- the closing parenthesis of the call
# and the original loop indentation are not visible.
# Register a stability test for every data split of every data collection,
# comparing each split against the 'train' split.
# `tru` is a client/workspace object defined outside this excerpt.
data_collections = tru.get_data_collections()
for dc in data_collections:
# Make `dc` the active collection before listing its splits.
tru.set_data_collection(dc)
splits = tru.get_data_splits()
for split_key in splits:
# Formats the split key into a string via an f-string.
split = f'{split_key}'
tru.tester.add_stability_test(
comparison_data_split_name = split,
base_data_split_name = 'train',
metric = 'DIFFERENCE_OF_MEAN',
# NOTE(review): the excerpt ends here -- any remaining arguments (e.g.
# thresholds) and the closing parenthesis are not visible.
import h5py
# Open the GFED4.1s HDF5 file for `year` and start building a DataFrame from it.
# Years after 2016 use the '_beta' file name; 2016 and earlier use the plain name.
def load_data_for_year(year) -> pd.DataFrame:
if year > 2016:
filename = f'GFED4.1s_{year}_beta.hdf5'
else:
filename = f'GFED4.1s_{year}.hdf5'
# `data_dir` is defined outside this excerpt.
filepath = os.path.join(data_dir, filename)
# Opened read-only. NOTE(review): no close() or `with` block is visible in
# this excerpt -- confirm the handle is released in the part that is cut off.
data = h5py.File(filepath, 'r')
df = pd.DataFrame({
# NOTE(review): the excerpt ends inside this DataFrame literal; its columns
# and the function's return statement are not visible.
# Build a DataFrame of per-month fire features for one year, loading the
# year's data through `data_cache` so each year is loaded at most once.
# Every column name is tagged with `suffix`.
def get_feature_for_year_and_month(data_cache,year, month, suffix) -> pd.DataFrame:
# Populate the cache on first use of this year (mutates the caller's dict).
if year not in data_cache:
data_cache[year] = load_data_for_year(year)
data = data_cache[year]
# '{:02}'.format(month) zero-pads the month to two digits in each lookup key.
df = pd.DataFrame({
f'burned_fraction_{suffix}': data['burned_area/{:02}/burned_fraction'.format(month)],
f'emissions_DM_{suffix}': data['emissions/{:02}/DM'.format(month)],
f'emissions_C_{suffix}': data['emissions/{:02}/C'.format(month)],
f'emissions_small_fire_fraction_{suffix}': data['emissions/{:02}/small_fire_fraction'.format(month)],
f'biosphere_NPP_{suffix}': data['biosphere/{:02}/NPP'.format(month)],
# NOTE(review): the excerpt ends inside this DataFrame literal; further
# columns, the closing braces and the return statement are not visible.
import plotly.express as px
# Draw the rows of df_usa_2016 on a Mapbox scatter map: one marker per row,
# coloured by "month" and sized by "burned_area". The 'year' column is
# dropped before plotting. `mapbox_token` and `df_usa_2016` are defined
# outside this excerpt.
px.set_mapbox_access_token(mapbox_token)
fig = px.scatter_mapbox(
    df_usa_2016.drop('year', axis=1),
    lat="lat",
    lon="lon",
    color="month",
    size="burned_area",
    size_max=15,
    zoom=10,
)
fig.show()
# Train one model set per look-back window (1..10 years) and collect the
# fitted estimators in `models`, keyed by '<kind>_<N>year_window'.
# `LogisticRegression`, `data_train_x` and `data_train_y` are defined outside
# this excerpt. The loop body's indentation was lost in the original and is
# restored here; every statement is otherwise unchanged.
models = {}
import time
for window_size in range(1, 11):
    # Start time for this window; its consumer is not visible in this excerpt.
    time_start = time.time()
    key = f'{window_size}year_window'
    print(f'Training linear model for {key}')
    # The target is binarised: the label is True where the value exceeds 0.01.
    models[f'linear_{key}'] = LogisticRegression(
        random_state=321, max_iter=1000, solver='saga'
    ).fit(data_train_x[key], data_train_y[key] > 0.01)
    n_est = 20
    print(f'Training gb{n_est} model for {key}')
    # NOTE(review): the excerpt ends here -- the gradient-boosting fit that
    # this message announces is not visible.
# Create the 'Fire_Party' project in the local environment and add one data
# collection per look-back window (1..10 years). `tru` is defined outside
# this excerpt. The loop body's indentation was lost in the original and is
# restored here; every statement is otherwise unchanged.
project_name = 'Fire_Party'
tru.set_environment('local')
tru.add_project(project_name, score_type='probits')

# Settings defined here but not used within this excerpt.
extra_data_columns = ['year']
train_split_name = 'train'
burned_fraction_th = 0.01

for window_size in range(1, 11):
    key = f'{window_size}year_window'
    print(key)
    tru.add_data_collection(key)
# Transformer class built on BaseEstimator and TransformerMixin (presumably
# scikit-learn's -- the import is not visible in this excerpt).
class target_encoder(BaseEstimator, TransformerMixin):
# No constructor parameters and no state is set up.
def __init__(self):
pass
# fit() does nothing with X or y; it only returns self.
def fit(self, X, y = None):
return self
def transform(self, X, y = None):
# Target-encode lat and long.
# NOTE(review): the excerpt ends here -- the body of transform() is not visible.
# Transformer class built on BaseEstimator and TransformerMixin (presumably
# scikit-learn's -- the import is not visible in this excerpt).
class target_encoder(BaseEstimator, TransformerMixin):
# No constructor parameters and no state is set up.
def __init__(self):
pass
# fit() does nothing with X or y; it only returns self.
def fit(self, X, y = None):
return self
def transform(self, X, y = None):
# Target-encode lat and long.
# NOTE(review): the excerpt ends here -- the body of transform() is not visible.
# For each split of "data_collection": read its features and labels, switch
# to "data_collection_v2", and compute the label mean and standard deviation.
# `tru` is defined outside this excerpt. The loop body's indentation was lost
# in the original and is restored here; every statement is otherwise unchanged.
tru.set_data_collection("data_collection")
splits = tru.get_data_splits()
for split in splits:
    # Re-select the source collection on every pass: the previous iteration
    # left "data_collection_v2" active (see the switch below).
    tru.set_data_collection("data_collection")
    tru.set_data_split(split)
    xs = tru.get_xs()
    ys = tru.get_ys()
    tru.set_data_collection("data_collection_v2")
    ys_mean = ys.mean()
    ys_std = ys.std()
    # NOTE(review): the excerpt ends here -- how xs, ys_mean and ys_std are
    # used (presumably to write a transformed split into v2) is not visible.