@technillogue · Created February 1, 2023 00:09
import asyncio
import contextvars
import json
import logging
import os
import sys
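
# asyncio logs "Task was destroyed but it is pending!" (and the accompanying
# "task: <Task ...>" reprs) at teardown, mostly thanks to aiohttp; this filter
# drops both from every handler it's attached to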
def FuckAiohttp(record: logging.LogRecord) -> bool:
    str_msg = str(getattr(record, "msg", ""))
    if "was destroyed but it is pending" in str_msg:
        return False
    if str_msg.startswith("task:") and str_msg.endswith(">"):
        return False
    return True


prompt_context: contextvars.ContextVar[int] = contextvars.ContextVar("prompt_context")
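# usage sketch (nothing in this file actually sets it): tag every log record
# emitted while handling a given prompt
#     token = prompt_context.set(42)
#     try:
#         ...  # any logging here gets "prompt": 42 in its JSON output
#     finally:
#         prompt_context.reset(token)
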
# adapted from https://stackoverflow.com/questions/50144628/python-logging-into-file-as-a-dictionary-or-json
class JsonFormatter(logging.Formatter):
    """
    Formatter that outputs JSON strings after parsing the LogRecord.
    """

    def format(self, record: logging.LogRecord) -> str:
        record.message = record.getMessage()
        # these are all the normal LogRecord attributes except "levelname",
        # "module", "funcName", and "lineno"; we exclude them so whatever is
        # left over (the extras) ends up in the JSON output
        # https://github.com/python/cpython/blob/main/Lib/logging/__init__.py#L1653
        exclude = [
            "name",
            "msg",
            "args",
            "levelno",
            "pathname",
            "filename",
            "exc_info",
            "exc_text",
            "stack_info",
            "created",
            "msecs",
            "relativeCreated",
            "thread",
            "threadName",
            "processName",
            "process",
        ]
        message_dict = {k: v for k, v in record.__dict__.items() if k not in exclude}
        prompt = prompt_context.get(None)
        if prompt:
            message_dict["prompt"] = prompt
        if record.exc_info:
            # cache the traceback text to avoid converting it multiple times
            # (it's constant anyway)
            if not record.exc_text:
                record.exc_text = self.formatException(record.exc_info)
        if record.exc_text:
            message_dict["exc_info"] = record.exc_text
        if record.stack_info:
            message_dict["stack_info"] = self.formatStack(record.stack_info)
        return json.dumps(message_dict, default=str)

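# rough sketch of the output for a call like
#     logging.info("a log message with extra information", extra={"attribute": "42"})
# (field order and exact values are illustrative):
#     {"name": "root", "levelname": "INFO", "module": "...", "funcName": "...",
#      "lineno": 0, "message": "a log message with extra information", "attribute": "42"}
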
USE_VECTOR = os.getenv("HONEYCOMB_API_KEY") and os.path.exists("./vector")
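
# rough data flow when USE_VECTOR is set:
#
#   print() and child process stdout/stderr --dup2--> tee_write ==pipe==> tee
#   tee --> original_stdout (for humans) and --> /dev/fd/{vector_write}
#   vector_write ==pipe==> vector_read --> ./vector --> honeycomb
#   console_handler (pretty logs) --> original_stderr directly, bypassing tee
#   vector_handler (JSON logs) --> vector_write directly, bypassing tee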
class Vector:
    # all the stuff in this class declaration runs once, at class-definition
    # time, and is available from self
    logger = logging.getLogger()
    logger.setLevel("DEBUG")
    prompt_context = prompt_context
    if USE_VECTOR:
        print("using vector")
        original_stdout = os.dup(sys.stdout.fileno())
        original_stderr = os.dup(sys.stderr.fileno())
        # explicit pipe we'll use for tee later
        tee_read, tee_write = os.pipe()
        # explicit pipe for vector -- we'll be passing this to tee
        vector_read, vector_write = os.pipe()
        # adapted from https://stackoverflow.com/a/651718
        # cause tee's stdin to get a copy of our stdout/stderr (as well as that
        # of any child processes we spawn)
        # the pipe will buffer everything we write until tee starts reading
        # note that logging calls will appear before prints
        # note that any fatal errors between this and tee starting will be lost forever!
        os.dup2(tee_write, sys.stdout.fileno())
        os.dup2(tee_write, sys.stderr.fileno())
        # set up logging
        # write structured logs only to vector, not to the original stderr
        vector_file = os.fdopen(vector_write, mode="w")
        vector_handler = logging.StreamHandler(vector_file)
        vector_handler.addFilter(FuckAiohttp)
        vector_handler.setLevel("DEBUG")
        vector_handler.setFormatter(JsonFormatter())
        logger.addHandler(vector_handler)
        # we want to write formatted logs only to the original stderr, not vector
        # normally this would be sys.stderr,
        # but we need to open the duplicated fd as a file
        stderr_file = os.fdopen(original_stderr, mode="w")
        console_handler: logging.StreamHandler = logging.StreamHandler(stderr_file)
    else:
        console_handler = logging.StreamHandler()
    fmt = logging.Formatter("{levelname} {module}:{lineno}: {message}", style="{")
    console_handler.setLevel(
        ((os.getenv("LOGLEVEL") or os.getenv("LOG_LEVEL")) or "DEBUG").upper()
    )
    console_handler.setFormatter(fmt)
    console_handler.addFilter(FuckAiohttp)
    logger.addHandler(console_handler)
    # if i hear about epoll selector one more time i'mna end it
    logging.getLogger("asyncio").setLevel("INFO")
    logging.info("starting")
    async def init_vector(self) -> None:
        if not USE_VECTOR:
            return
        self.tee = await asyncio.create_subprocess_shell(
            # write to vector's fd and stdout
            f"tee /dev/fd/{self.vector_write}",
            # if we set stdin to just PIPE, it would be a StreamWriter and not
            # have a real fd, so we use the explicit pipe we opened earlier
            stdin=self.tee_read,
            stdout=self.original_stdout,
            stderr=self.original_stderr,
            # tee should have access to the vector fd
            pass_fds=[self.vector_write],
        )
        self.vector = await asyncio.create_subprocess_shell(
            # "cat - > /tmp/fake_vector",
            "./vector --quiet -c vector.toml",
            stdin=self.vector_read,
            env={"HONEYCOMB_API_KEY": os.environ["HONEYCOMB_API_KEY"]},
        )
        logging.info("started vector")

    # this seems to make things exit early without logging
    # anyway, init (PID 1) waits for orphaned processes to exit, so it's fine
    # async def cleanup(self) -> None:
    #     self.vector.terminate()
    #     self.tee.terminate()
    #     await self.vector.communicate()
    #     await self.tee.communicate()
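
# vector.toml isn't included in this gist; a minimal sketch of what it might
# look like, assuming vector's stdin source, remap transform, and honeycomb
# sink (names and the dataset are made up -- check the options against vector's docs):
#
#     [sources.stdin]
#     type = "stdin"
#
#     [transforms.parse]
#     type = "remap"
#     inputs = ["stdin"]
#     source = '''
#     parsed, err = parse_json(.message)
#     if err == null { . = object!(parsed) }
#     '''
#
#     [sinks.honeycomb]
#     type = "honeycomb"
#     inputs = ["parse"]
#     api_key = "${HONEYCOMB_API_KEY}"
#     dataset = "my-dataset"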
def sync_start_vector() -> None:
    asyncio.run(Vector().init_vector())
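
# sync_start_vector is for callers that aren't already inside an event loop:
# asyncio.run spins one up just long enough to spawn tee and vector, which
# keep running as ordinary child processes afterwards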
async def main() -> None:
    vector = Vector()
    await vector.init_vector()
    # this goes to tee; vector sees it's not JSON and leaves it unchanged
    print("example print")
    # subprocesses inherit the same file descriptors
    await (await asyncio.create_subprocess_shell("date")).wait()
    # logging is special: pretty-printed to the original stderr, but structured JSON for vector
    logging.info("a log message with extra information", extra={"attribute": "42"})
if __name__ == "__main__":
    asyncio.run(main())
    print("message after asyncio")
    # open("/tmp/test", "a").write("we ran\n")
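
# to exercise the vector path, put a vector binary and a vector.toml next to
# this script and run it with HONEYCOMB_API_KEY set; without those, it falls
# back to plain stderr logging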