Bruno Gonzalez bruno-uy

## date_frequency_groupby.py
import pandas as pd

# Build a sample pandas.Series of 1000 timestamps spaced 18 hours apart.
# The lowercase "h" alias is used because the uppercase "H" spelling is
# deprecated in recent pandas releases.
s = pd.Series(pd.date_range("2021", freq="18h", periods=1000))
# Count how many timestamps fall into each calendar month
# (sort=False skips ordering the result by count).
# If you want another period, check the following link:
# https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases
freq_count = s.dt.to_period("M").value_counts(sort=False)

## evenly_distributed_date_intervals.py
import pandas as pd

# The function shows `str` for dates, but could be datetime.date objects as well
def n_date_intervals(start_date: str, end_date: str, intervals_count: int) -> pd.Series:
    """Return ``intervals_count`` evenly spaced timestamps between two dates.

    The values come from ``pd.date_range`` called with ``periods``, so the
    spacing is linear between ``start_date`` and ``end_date``.
    """
    evenly_spaced = pd.date_range(start=start_date, end=end_date, periods=intervals_count)
    return pd.Series(evenly_spaced)

# Change these variables as needed
# NOTE(review): presumably meant as sample arguments for n_date_intervals;
# the call itself is not shown in this snippet.
start_date = "2018-01-01"
end_date = "2021-03-24"
intervals_count = 5

## columns_exists_redshift_table.sql
-- Returns one row per listed column that exists in the given table.
-- All the columns you're checking exist when the number of returned rows
-- equals the number of names in the IN list.
-- Replace the <...> placeholders with your schema, table and column names.
SELECT *
FROM information_schema.columns
WHERE table_schema = '<table_schema>'
	AND table_name = '<table_name>'
	AND column_name IN ('<column1>', '<column2>');

## use_and_update_partial_function.py
from functools import partial

def print_n_dict_elem(n, dictionary, elem):
    """Print ``n * dictionary[elem]``."""
    value = dictionary[elem]
    print(n * value)

# Bind n and dictionary up front; the resulting callable only needs elem
# and prints it 3 times
print_3_dict_elem = partial(
    print_n_dict_elem,
    n=3,
    dictionary={"a": "A", "b": "B"},
)
print_3_dict_elem(elem="a")

# Update dictionary passed as an argument

## iterable_n_size_chunks_filter_none.py
# Remember to install more_itertools first
# pip install more-itertools
from more_itertools import grouper

def do_something_with_iterable(iterable):
    """Placeholder consumer: accepts an iterable and does nothing with it."""
    return None

n = 3
# grouper yields tuples of n items; with its default settings the last tuple
# is padded with None when the input length is not a multiple of n
chunks = grouper('a'*10, n)
for c in chunks:
    # Drop the None padding before handing the chunk over.
    # NOTE(review): the original loop body is missing from the snippet; this
    # body is inferred from the snippet name ("..._chunks_filter_none").
    do_something_with_iterable([item for item in c if item is not None])

## multiindex_df_to_nested_dict.py
import pandas as pd

# Index the frame by (A, C) while keeping both as regular columns too
df = pd.DataFrame(
    {"A": [0, 1, 0], "B": ["a", "b", "c"], "C": [1, 2, 3]}
).set_index(["A", "C"], drop=False)
# Outer keys: values of the first index level; inner dicts: remaining index -> row
result = {
    level: df.xs(level).to_dict(orient="index")
    for level in df.index.levels[0]
}
print(result[0])

## utc_now_round_up_without_tz.py
import pandas as pd

# Timezone-aware "now" in UTC (Timestamp.utcnow() is deprecated in recent pandas)
utc_now_pd = pd.Timestamp.now(tz="UTC")
# ceil(freq="D") is what rounds up: it always moves forward to the next midnight.
# round(freq="D") goes to the *nearest* midnight, i.e. it rounds down before noon.
# Use replace(tzinfo=None) to remove timezone information
utc_now_ceil = utc_now_pd.ceil(freq="D").to_pydatetime().replace(tzinfo=None)
# Convert to ISO format
utc_now_str = utc_now_ceil.strftime("%Y-%m-%dT%H:%M:%S")

## read_all_csv_gz_current_folder.py
import os

import pandas as pd

# os.listdir() without arguments lists the current working directory;
# sorted() makes the row order of the combined frame deterministic.
# NOTE: pd.concat raises ValueError when no ".gz" file is found.
df = pd.concat(
    [pd.read_csv(f, compression="gzip") for f in sorted(os.listdir()) if f.endswith(".gz")],
    ignore_index=True,
)

## df_to_dict_with_none.py
import pandas as pd
import numpy as np

# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
df = pd.DataFrame({"A": [1, 2, 3], "B": [1.2, np.nan, 3.4]})
# Swap NaN for None before exporting, so missing values show up as None
# in the list of per-row dictionaries
result = (
    df
    .replace([np.nan], [None], regex=False)
    .to_dict(orient="records")
)

## add_schema_to_search_path.sql
-- Problem: you don't see all the schemas when querying PG_TABLE_DEF
-- Solution:
-- 1. First check whether the schema you're trying to query is on the search path
show search_path;
-- 2. Add the missing one(s) to the search path (e.g. the result was only public, and data_warehouse and matching are missing)
set search_path to '$user', public, data_warehouse, matching;  -- Keep the literal '$user' whatever your actual user name is
	import pandas as pd

	# Build a sample pandas.Series of 1000 timestamps spaced 18 hours apart.
	# The lowercase "h" alias is used because the uppercase "H" spelling is
	# deprecated in recent pandas releases.
	s = pd.Series(pd.date_range("2021", freq="18h", periods=1000))
	# Count how many timestamps fall into each calendar month
	# (sort=False skips ordering the result by count).
	# If you want another period, check the following link:
	# https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases
	freq_count = s.dt.to_period("M").value_counts(sort=False)
	import pandas as pd

	# The function shows `str` for dates, but could be datetime.date objects as well
	def n_date_intervals(start_date: str, end_date: str, intervals_count: int) -> pd.Series:
		"""Return ``intervals_count`` evenly spaced timestamps between two dates.

		The values come from ``pd.date_range`` called with ``periods``.
		"""
		# The body must be indented under the def, otherwise Python rejects the file
		return pd.Series(pd.date_range(start_date, end_date, periods=intervals_count))

	# Change these variables as needed
	# NOTE(review): presumably meant as sample arguments for n_date_intervals;
	# the call itself is not shown in this snippet.
	start_date = "2018-01-01"
	end_date = "2021-03-24"
	intervals_count = 5
	-- Returns one row per listed column that exists in the given table.
	-- All the columns you're checking exist when the number of returned rows
	-- equals the number of names in the IN list.
	-- Replace the <...> placeholders with your schema, table and column names.
	SELECT *
	FROM information_schema.columns
	WHERE table_schema = '<table_schema>'
	AND table_name = '<table_name>'
	AND column_name IN ('<column1>', '<column2>');
	from functools import partial

	def print_n_dict_elem(n, dictionary, elem):
		"""Print ``n * dictionary[elem]``."""
		# The body must be indented under the def, otherwise Python rejects the file
		print(n * dictionary[elem])

	# Bind n and dictionary up front; the resulting callable only needs elem
	# and prints it 3 times
	print_3_dict_elem = partial(
		print_n_dict_elem,
		n=3,
		dictionary={"a": "A", "b": "B"},
	)
	print_3_dict_elem(elem="a")

	# Update dictionary passed as an argument
	# Remember to install more_itertools first
	# pip install more-itertools
	from more_itertools import grouper

	def do_something_with_iterable(iterable):
		"""Placeholder consumer: accepts an iterable and does nothing with it."""
		# The body must be indented under the def, otherwise Python rejects the file
		pass

	n = 3
	# grouper yields tuples of n items; with its default settings the last tuple
	# is padded with None when the input length is not a multiple of n
	chunks = grouper('a'*10, n)
	for c in chunks:
		# Drop the None padding before handing the chunk over.
		# NOTE(review): the original loop body is missing from the snippet; this
		# body is inferred from what the surrounding comments and helper suggest.
		do_something_with_iterable([item for item in c if item is not None])
	import pandas as pd

	# Index the frame by (A, C) while keeping both as regular columns too
	df = pd.DataFrame(
		{"A": [0, 1, 0], "B": ["a", "b", "c"], "C": [1, 2, 3]}
	).set_index(["A", "C"], drop=False)
	# Outer keys: values of the first index level; inner dicts: remaining index -> row
	result = {
		level: df.xs(level).to_dict(orient="index")
		for level in df.index.levels[0]
	}
	print(result[0])
	import pandas as pd

	# Timezone-aware "now" in UTC (Timestamp.utcnow() is deprecated in recent pandas)
	utc_now_pd = pd.Timestamp.now(tz="UTC")
	# ceil(freq="D") is what rounds up: it always moves forward to the next midnight.
	# round(freq="D") goes to the *nearest* midnight, i.e. it rounds down before noon.
	# Use replace(tzinfo=None) to remove timezone information
	utc_now_ceil = utc_now_pd.ceil(freq="D").to_pydatetime().replace(tzinfo=None)
	# Convert to ISO format
	utc_now_str = utc_now_ceil.strftime("%Y-%m-%dT%H:%M:%S")
	import os

	import pandas as pd

	# os.listdir() without arguments lists the current working directory;
	# sorted() makes the row order of the combined frame deterministic.
	# NOTE: pd.concat raises ValueError when no ".gz" file is found.
	df = pd.concat(
		[pd.read_csv(f, compression="gzip") for f in sorted(os.listdir()) if f.endswith(".gz")],
		ignore_index=True,
	)
	import pandas as pd
	import numpy as np

	# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
	df = pd.DataFrame({"A": [1, 2, 3], "B": [1.2, np.nan, 3.4]})
	# Swap NaN for None before exporting, so missing values show up as None
	# in the list of per-row dictionaries
	result = (
		df
		.replace([np.nan], [None], regex=False)
		.to_dict(orient="records")
	)
	-- Problem: you don't see all the schemas when querying PG_TABLE_DEF
	-- Solution:
	-- 1. First check whether the schema you're trying to query is on the search path
	show search_path;
	-- 2. Add the missing one(s) to the search path (e.g. the result was only public, and data_warehouse and matching are missing)
	set search_path to '$user', public, data_warehouse, matching; -- Keep the literal '$user' whatever your actual user name is