I ran some experiments with varying recordsizes, file sizes, and compression settings. The test files were CSVs representing a simple schema:
full_name,external_id,last_modified
'Past, Gabrielle',40605,'2006-07-09 23:17:20'
'Vachil, Corry',44277,'1996-09-05 05:12:44'
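Files of this shape can be produced with a few lines of bash. The sketch below is only illustrative, not the generator actually used here: the name pools, ID range, and timestamp window are placeholders chosen to resemble the sample rows, the output filename is a stand-in, and it assumes GNU date.

```bash
#!/usr/bin/env bash
# Illustrative generator sketch -- name pools, ID range, timestamp window,
# and filename are placeholders, not the original tooling.
set -euo pipefail

rows=${1:-75}                  # number of data rows to emit
out=${2:-sample_small.csv}     # placeholder filename

first=(Gabrielle Corry Alex Morgan Riley)
last=(Past Vachil Smith Jones Lee)

{
  echo 'full_name,external_id,last_modified'
  for ((i = 0; i < rows; i++)); do
    f=${first[RANDOM % ${#first[@]}]}
    l=${last[RANDOM % ${#last[@]}]}
    id=$((RANDOM % 90000 + 10000))
    # random epoch between roughly 1995 and 2008, formatted with GNU date
    epoch=$(( (RANDOM * 32768 + RANDOM) % 400000000 + 800000000 ))
    printf "'%s, %s',%d,'%s'\n" "$l" "$f" "$id" \
      "$(date -d "@$epoch" '+%Y-%m-%d %H:%M:%S')"
  done
} > "$out"
```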
The files were all generated on an ext4 filesystem. There were three sets of five files, containing 75, 100,000, and 1,000,000 rows respectively, resulting in the following sizes:
❯ find . -name '*small*.csv' -exec du -bc {} + | \
awk 'END {printf "%s %.2f %s\n", "Average file size:", ($1 / (NR-1) / 1024), "KiB"}'