Skip to content

Instantly share code, notes, and snippets.

View renardeinside's full-sized avatar
🦊
Make the elephant dance!

Ivan Trusov renardeinside

🦊
Make the elephant dance!
View GitHub Profile
# Azure Pipelines CI trigger.
# Builds fire on every branch push (batched while a build is running),
# and on tags matching a version pattern (v*.*) or the literal "prod" tag.
# NOTE(review): the scraped copy had lost all YAML indentation, which makes
# the mapping invalid — nesting restored below per the Azure Pipelines schema.
trigger:
  batch: true
  branches:
    include:
      - '*'
  tags:
    include:
      - v*.*
      - prod
@renardeinside
renardeinside / vertical-append-test.py
Created June 18, 2022 20:39
vertical-append-test.py
import logging
from pyspark.sql import SparkSession
from databricks_python_packaging.utils.common import vertical_append
def test_append(spark: SparkSession):
logging.info("Testing the vertical append function")
frames = [
spark.range(100).toDF("id"),
@renardeinside
renardeinside / vertical-append-example.py
Created June 18, 2022 20:37
vertical-append-example.py
from typing import List
from pyspark.sql import DataFrame
from functools import reduce
def vertical_append(frames: List[DataFrame]) -> DataFrame:
"""
Vertically assembles (appends) a list of Spark Data Frames.
All frames are expected to have same columns.
@renardeinside
renardeinside / log4j-filtered.properties
Created June 13, 2022 10:40
pyspark-log4j-filtered
# define the root category and GatewayServer properties
# For PySpark applications it's better to keep these properties in sync for log consistency
log4j.rootCategory=INFO, console
log4j.logger.org.apache.spark.api.python.PythonGatewayServer=INFO
# configure the format and output for the console appender
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=[spark][%d{yyyy-MM-dd HH:mm:ss}][%p][%c][%m]%n
# --- NOTE(review): the lines below repeat the keys above at DEBUG level.
# These look like previews of separate variants of the same file concatenated
# together; if kept as ONE properties file, later duplicate keys override
# earlier ones, so the effective root level would come from the last block —
# confirm these were meant to be distinct files.
# define the root category and GatewayServer properties
# For PySpark applications it's better to keep these properties in sync for log consistency
log4j.rootCategory=DEBUG, console
log4j.logger.org.apache.spark.api.python.PythonGatewayServer=DEBUG
# configure the format and output for the console appender
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=[spark][%d{yyyy-MM-dd HH:mm:ss}][%p][%c][%m]%n
# --- NOTE(review): third variant, again redefining the same console appender.
# It also configures a "publicFile" appender pattern without ever declaring
# log4j.appender.publicFile itself or attaching it to a category — presumably
# the declaration lives in a part of the file not shown here; verify.
log4j.rootCategory=DEBUG, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=[spark][%d{yyyy-MM-dd HH:mm:ss}][%p][%c][%m]%n
log4j.appender.publicFile.layout.ConversionPattern=[spark][%p][%d{yy/MM/dd HH:mm:ss}][%c][%m]%n
@renardeinside
renardeinside / pyspark-logging-init.py
Last active October 22, 2022 16:54
pyspark-logging-init
from pyspark.sql import SparkSession
from typing import Optional
class LoggerProvider:
def get_logger(self, spark: SparkSession, custom_prefix: Optional[str] = ""):
log4j_logger = spark._jvm.org.apache.log4j # noqa
return log4j_logger.LogManager.getLogger(custom_prefix + self.__full_name__())
def __full_name__(self):
klass = self.__class__
class EndpointManager:
# here I've omitted some connection management code
# consider self._conn is a ready-to-use connection object
def get_table_infos(self) -> List[TableInfo]:
with self._conn.cursor() as c:
table_names = (
c.tables(catalog_name=self._catalog, schema_name=self._schema)
.fetchall_arrow()
.to_pandas()["TABLE_NAME"]
)
@renardeinside
renardeinside / databricks-streamlit-demo-p4.py
Created July 3, 2021 19:47
databricks-streamlit-demo-p4.py
def _spinner_component(text: str) -> str:
component = f"""
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" rel="stylesheet"/>
<div class="d-flex flex-column align-items-center justify-content-center">
<div class="row">
<div class="spinner-border m-1 text-success" role="status">
<span class="sr-only">Loading...</span>
</div>
</div>
<div class="row">
@renardeinside
renardeinside / databricks-streamlit-demo-p3.py
Created July 3, 2021 19:33
databricks-streamlit-demo-p3
# Streamlit page layout: a narrow filter column (1) beside a wide chart column (4).
# NOTE(review): the scraped copy had lost Python indentation, leaving the
# statements after each `with` at top level — invalid syntax. Restored the
# obvious nesting: the header/date-input/counter plot belong to the filter
# column, the minute plot to the wide column.
filter_box, minute_dynamic_box = st.beta_columns([1, 4])

with filter_box:
    write_aligned_header("Please choose the date")
    # Label left empty — the aligned header above acts as the visible label.
    # Default date is 2016-06-30.
    chosen_date = st.date_input("", dt.date(2016, 6, 30))
    plotter.add_counter_plot(chosen_date)

with minute_dynamic_box:
    # chosen_date is defined by the first column's date input above.
    plotter.add_minute_plot(chosen_date)