bexgboost BexTuychiev
PS1='\[\033]0;WSL2 Bash\W\007\]' # set window title
PS1="$PS1"'\n' # new line
PS1="$PS1"'\[\033[36m\]' # change to green
PS1="$PS1"'bash@bexgboost ' # user@host<space>
PS1="$PS1"'\[\033[31m\]' # change to brownish yellow
PS1="$PS1"'\W' # current working directory
source /usr/lib/git-core/git-sh-prompt # provides the __git_ps1 helper used below
# bold cyan user, bold red working directory, yellow git branch, then reset colors
export PS1="${debian_chroot:+($debian_chroot)}\[\033[01;36m\]\u:\[\033[01;31m\]\w\[\033[33m\]\$(__git_ps1)\[\033[00m\]"
| Feature | Jupyter Notebooks | Databricks Notebooks |
| --- | --- | --- |
| Platform | Open-source; runs locally or on cloud platforms | Exclusive to the Databricks platform |
| Collaboration and Sharing | Limited collaboration features; manual sharing | Built-in collaboration; real-time concurrent editing |
| Execution | Relies on local or external servers | Executes on Databricks clusters |
| Integration with Big Data | Can be integrated with Spark but requires additional configuration | Native integration with Apache Spark, optimized for big data |
| Built-in Features | External tools/extensions for version control, collaboration, and visualization | Integrated with Databricks-specific features like Delta Lake; built-in support for collaboration and analytics tools |
| Cost and Scaling | Local installations are often free; cloud-based solutions may have costs | Paid service; costs depend on usage and scale seamlessly with Databricks clusters |
| Ease of Use | Familiar and widely used in the data science community | |
import pandas as pd
import seaborn as sns
# Load the dataset from Seaborn (already a pandas DataFrame)
diamonds = sns.load_dataset("diamonds")
# Create a Pandas DataFrame
df = pd.DataFrame(diamonds)
# Save the DataFrame directly as a Parquet file (requires pyarrow or fastparquet)
df.to_parquet("diamonds.parquet")
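To confirm the file was written correctly, it can be read straight back; this round-trip check is only an illustration and is not part of the original gist.

# Read the Parquet file back and verify the round trip
df_restored = pd.read_parquet("diamonds.parquet")
print(df_restored.shape)
print(df_restored.head())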
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split
# Load the dataset
X, y = ... # load your own
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define the objective function for Optuna
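The preview cuts off at the objective definition. Below is a minimal sketch of what the objective and the study calls could look like, assuming a regression task with xgb.XGBRegressor; the hyperparameter search space is illustrative, not the original's.

def objective(trial):
    # Illustrative search space (assumed, not from the gist)
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
    }
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)
    return mean_squared_error(y_test, model.predict(X_test))

# Minimize the validation error over 50 trials
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
print(study.best_params)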
# Find the median math score
median_score = marks['math score'].median()
# Extract its probability from the PMF
median_prob = math_pmf[median_score]
# Find the 25th percentile
percentile_25th = marks['math score'].describe()['25%']
# Extract its probability (assumes the percentile value is an exact score in the PMF index)
percentile_prob = math_pmf[percentile_25th]
# Recreate the plot with annotations
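The annotated plot referenced by the last comment is not in the preview. One way it might look, assuming math_pmf is a pandas Series of probabilities indexed by score, is sketched below; the plotting code is illustrative, not the original's.

import matplotlib.pyplot as plt

# Bar plot of the PMF with the median and 25th percentile highlighted
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(math_pmf.index, math_pmf.values)
ax.annotate(f"Median: {median_score}", xy=(median_score, median_prob),
            xytext=(median_score, median_prob * 1.3), arrowprops={"arrowstyle": "->"})
ax.annotate(f"25th percentile: {percentile_25th}", xy=(percentile_25th, percentile_prob),
            xytext=(percentile_25th, percentile_prob * 1.6), arrowprops={"arrowstyle": "->"})
ax.set_xlabel("Math score")
ax.set_ylabel("Probability")
plt.show()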
import numpy as np
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                # Downcast ints that fit into int8; wider types follow the same elif pattern
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
            else:
                # Downcast floats to float32 when the column's value range allows it
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    if verbose:
        print(f"Memory usage: {start_mem:.2f} MB -> {df.memory_usage().sum() / 1024 ** 2:.2f} MB")
    return df
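A quick usage example; the demo DataFrame below is hypothetical and not part of the gist.

import numpy as np
import pandas as pd

# Small integers stored as int64 and floats stored as float64 on purpose
demo = pd.DataFrame({"a": np.random.randint(0, 100, size=1_000).astype("int64"),
                     "b": np.random.rand(1_000)})
demo = reduce_memory_usage(demo)
print(demo.dtypes)  # "a" is downcast to int8, "b" to float32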
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer

# crazy_distributions is a DataFrame of heavily skewed features defined earlier
qt = QuantileTransformer().fit(crazy_distributions)
crazy_feature_names = ["f18", "f31", "f61"]
crazy_distributions = pd.DataFrame(qt.transform(crazy_distributions), columns=crazy_feature_names)
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
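The preview stops at the subplot creation. A plausible continuation, sketched here rather than taken from the gist, is to histogram each transformed feature on its own axis:

# Illustrative only: check how each feature looks after the quantile transform
for ax, col in zip(axes, crazy_feature_names):
    ax.hist(crazy_distributions[col], bins=50)
    ax.set_title(f"{col} after QuantileTransformer")
plt.show()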
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

def reduce_memory(X: pd.DataFrame, y=None):
    """Simple function to reduce memory usage by casting numeric columns to float32."""
    num_cols = X.select_dtypes(include=np.number).columns
    for col in num_cols:
        X[col] = X[col].astype("float32")
    return X
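make_pipeline and FunctionTransformer are imported but unused in the preview; a sketch of how reduce_memory would typically be wired into a pipeline follows, with StandardScaler as a purely illustrative second step.

from sklearn.preprocessing import StandardScaler

# Wrap the plain function so it can act as a stateless pipeline step
pipeline = make_pipeline(
    FunctionTransformer(reduce_memory),
    StandardScaler(),
)
# X_transformed = pipeline.fit_transform(X)  # X: any DataFrame with numeric columns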