bexgboost BexTuychiev
PS1='\[\033]0;WSL2 Bash\W\007\]' # set window title
PS1="$PS1"'\n' # new line
PS1="$PS1"'\[\033[36m\]' # change to green
PS1="$PS1"'bash@bexgboost ' # user@host<space>
PS1="$PS1"'\[\033[31m\]' # change to brownish yellow
PS1="$PS1"'\W' # current working directory
source /usr/lib/git-core/git-sh-prompt # provides the __git_ps1 helper used below
# bold cyan user, bold red working directory, yellow git branch, then reset colors
export PS1="${debian_chroot:+($debian_chroot)}\[\033[01;36m\]\u:\[\033[01;31m\]\w\[\033[33m\]\$(__git_ps1)\[\033[00m\]"
| Feature | Jupyter Notebooks | Databricks Notebooks |
| --- | --- | --- |
| Platform | Open-source; runs locally or on cloud platforms | Exclusive to the Databricks platform |
| Collaboration and Sharing | Limited collaboration features; manual sharing | Built-in collaboration; real-time concurrent editing |
| Execution | Relies on local or external servers | Executes on Databricks clusters |
| Integration with Big Data | Can be integrated with Spark but requires additional configuration | Native integration with Apache Spark, optimized for big data |
| Built-in Features | External tools/extensions for version control, collaboration, and visualization | Integrated with Databricks-specific features like Delta Lake; built-in support for collaboration and analytics tools |
| Cost and Scaling | Local installations are often free; cloud-based solutions may have costs | Paid service; costs depend on usage and scale seamlessly with Databricks clusters |
| Ease of Use | Familiar and widely used in the data science community | |
import pandas as pd
import seaborn as sns
# Load the dataset from Seaborn (already a pandas DataFrame)
diamonds = sns.load_dataset("diamonds")
# Create a Pandas DataFrame
df = pd.DataFrame(diamonds)
# Save the DataFrame directly as a Parquet file (requires pyarrow or fastparquet)
df.to_parquet("diamonds.parquet")
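To confirm the file was written correctly, it can be read straight back; this round-trip check is only an illustration and is not part of the original gist.

# Read the Parquet file back and verify the round trip
df_restored = pd.read_parquet("diamonds.parquet")
print(df_restored.shape)
print(df_restored.head())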
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split
# Load the dataset
X, y = ... # load your own
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define the objective function for Optuna
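The preview cuts off at the objective definition. Below is a minimal sketch of what the objective and the study calls could look like, assuming a regression task with xgb.XGBRegressor; the hyperparameter search space is illustrative, not the original's.

def objective(trial):
    # Illustrative search space (assumed, not from the gist)
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
    }
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)
    return mean_squared_error(y_test, model.predict(X_test))

# Minimize the validation error over 50 trials
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
print(study.best_params)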
# Find the median math score
median_score = marks['math score'].median()
# Extract its probability from the PMF
median_prob = math_pmf[median_score]
# Find the 25th percentile
percentile_25th = marks['math score'].describe()['25%']
# Extract its probability (assumes the percentile value is an exact score in the PMF index)
percentile_prob = math_pmf[percentile_25th]
# Recreate the plot with annotations
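The annotated plot referenced by the last comment is not in the preview. One way it might look, assuming math_pmf is a pandas Series of probabilities indexed by score, is sketched below; the plotting code is illustrative, not the original's.

import matplotlib.pyplot as plt

# Bar plot of the PMF with the median and 25th percentile highlighted
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(math_pmf.index, math_pmf.values)
ax.annotate(f"Median: {median_score}", xy=(median_score, median_prob),
            xytext=(median_score, median_prob * 1.3), arrowprops={"arrowstyle": "->"})
ax.annotate(f"25th percentile: {percentile_25th}", xy=(percentile_25th, percentile_prob),
            xytext=(percentile_25th, percentile_prob * 1.6), arrowprops={"arrowstyle": "->"})
ax.set_xlabel("Math score")
ax.set_ylabel("Probability")
plt.show()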
import numpy as np
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                # Downcast ints that fit into int8; wider types follow the same elif pattern
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
            else:
                # Downcast floats to float32 when the column's value range allows it
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    if verbose:
        print(f"Memory usage: {start_mem:.2f} MB -> {df.memory_usage().sum() / 1024 ** 2:.2f} MB")
    return df
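A quick usage example; the demo DataFrame below is hypothetical and not part of the gist.

import numpy as np
import pandas as pd

# Small integers stored as int64 and floats stored as float64 on purpose
demo = pd.DataFrame({"a": np.random.randint(0, 100, size=1_000).astype("int64"),
                     "b": np.random.rand(1_000)})
demo = reduce_memory_usage(demo)
print(demo.dtypes)  # "a" is downcast to int8, "b" to float32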
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer

# crazy_distributions is a DataFrame of heavily skewed features defined earlier
qt = QuantileTransformer().fit(crazy_distributions)
crazy_feature_names = ["f18", "f31", "f61"]
crazy_distributions = pd.DataFrame(qt.transform(crazy_distributions), columns=crazy_feature_names)
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
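The preview stops at the subplot creation. A plausible continuation, sketched here rather than taken from the gist, is to histogram each transformed feature on its own axis:

# Illustrative only: check how each feature looks after the quantile transform
for ax, col in zip(axes, crazy_feature_names):
    ax.hist(crazy_distributions[col], bins=50)
    ax.set_title(f"{col} after QuantileTransformer")
plt.show()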
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

def reduce_memory(X: pd.DataFrame, y=None):
    """Simple function to reduce memory usage by casting numeric columns to float32."""
    num_cols = X.select_dtypes(include=np.number).columns
    for col in num_cols:
        X[col] = X[col].astype("float32")
    return X
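make_pipeline and FunctionTransformer are imported but unused in the preview; a sketch of how reduce_memory would typically be wired into a pipeline follows, with StandardScaler as a purely illustrative second step.

from sklearn.preprocessing import StandardScaler

# Wrap the plain function so it can act as a stateless pipeline step
pipeline = make_pipeline(
    FunctionTransformer(reduce_memory),
    StandardScaler(),
)
# X_transformed = pipeline.fit_transform(X)  # X: any DataFrame with numeric columns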