Mike Lin (mlin): public gists
@mlin
mlin / Dockerfile
Last active December 20, 2021 09:42
miniwdl udocker-in-docker PoC
# Base image with Python 3 and pip for miniwdl
FROM ubuntu:20.04
RUN apt-get -qq update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    wget curl python3-pip python-is-python3
RUN pip3 install --system miniwdl==1.4.2
# Unpack the udocker release tarball under /usr/local; udocker runs containers
# without a daemon or root privileges, so it works inside an unprivileged container
ENV UDOCKER_VERSION=1.3.1
WORKDIR /usr/local
RUN wget -nv https://github.com/indigo-dc/udocker/releases/download/v${UDOCKER_VERSION}/udocker-${UDOCKER_VERSION}.tar.gz \
    && tar zxf udocker-${UDOCKER_VERSION}.tar.gz \
    && rm udocker-${UDOCKER_VERSION}.tar.gz
@mlin
mlin / README.md
Last active December 1, 2023 13:56
static.wiki database compression

Context: static.wiki and Show HN post

We downloaded static.wiki's 40.3 GiB SQLite database of English Wikipedia and created a compressed version of it with sqlite_zstd_vfs, our read/write Zstandard compression layer for SQLite3. The compressed version is 10.4 GiB (26%), and the VFS supports HTTP random access in the spirit of the original (although we don't yet have a WebAssembly build; it's a library for CLI & desktop apps for now). You can try it out on Linux or macOS x86-64:

pip3 install genomicsqlite
genomicsqlite https://f000.backblazeb2.com/file/mlin-public/static.wiki/en.zstd.db \
    "select text from wiki_articles where title = 'SQLite'"
@mlin
mlin / sqlite_seekscan_regression.py
Created June 1, 2021 06:06
SQLite OP_SeekScan regression
#!/usr/bin/env python3
# run this script with LD_LIBRARY_PATH pointed at different SQLite3 library versions
import os
import random
import time
import sqlite3
N = 100000
random.seed(42)
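The gist preview is truncated here. A sketch of how such a timing harness might continue (an illustrative workload, not the gist's actual benchmark): populate a table with a two-column primary key, then time IN-driven index probes of the kind the OP_SeekScan opcode (added in SQLite 3.35) accelerates.

# hypothetical continuation: time IN + equality probes against a composite index
con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (a INTEGER, b INTEGER, v INTEGER, PRIMARY KEY (a, b))")
with con:
    con.executemany(
        "INSERT INTO t VALUES (?, ?, ?)",
        ((i % 100, i, i) for i in range(N)),
    )
t0 = time.time()
for _ in range(N):
    b = random.randrange(N)
    con.execute("SELECT v FROM t WHERE a IN (0, 1, 2, 3) AND b = ?", (b,)).fetchall()
print(f"SQLite {sqlite3.sqlite_version}: {time.time() - t0:.3f}s")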
@mlin
mlin / htsget-openapi-docs.html
Created May 12, 2021 05:06
htsget-openapi-docs.html
(This file has been truncated; the preview shows only its beginning.)
<!DOCTYPE html>
<html>
<head>
<meta charset="utf8" />
<title>htsget</title>
<!-- needed for adaptive design -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
body {
@mlin
mlin / paste_wdl_imports.py
Last active March 28, 2020 03:50
paste_wdl_imports.py
#!/usr/bin/env python3
"""
Generate a standalone WDL document from a given workflow using imported tasks. Requires: miniwdl
python3 paste_wdl_imports.py [-o STANDALONE.wdl] WORKFLOW.wdl
For each "call imported_namespace.task_name [as alias]" in the workflow, appends the task's source
code with the task name changed to "imported_namespace__task_name", and rewrites the call to refer
to this new name (keeping the original alias). Also blanks out the import statements.
"""
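A sketch of the traversal the docstring describes, assuming miniwdl's WDL.load API; the function below is a hypothetical reduction that only enumerates the renamed tasks, not the gist's full source-rewriting logic.

# sketch: enumerate imported tasks under the namespace__task naming scheme
import WDL

def flattened_task_names(workflow_path: str) -> list:
    doc = WDL.load(workflow_path)
    names = []
    for imp in doc.imports:  # each import carries its namespace and parsed document
        for task in imp.doc.tasks:
            names.append(f"{imp.namespace}__{task.name}")
    return names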
#!/bin/bash
# Running inside a docker container, periodically read the container's CPU/memory usage counters
# and log them to standard error. Fields:
#
# cpu_pct average user %CPU usage over the most recent period
# mem_MiB container's current RSS (excludes file cache), in mebibytes (= 2**20 bytes)
# cpu_total_s container's user CPU time consumption since this script started, in seconds
# elapsed_s wall time elapsed since this script started, in seconds
#
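The preview ends before the sampling loop. A sketch of how it might continue, assuming cgroup v1 counter paths (the period and paths are assumptions, not the gist's actual code):

# hypothetical continuation; assumes cgroup v1 mounted at /sys/fs/cgroup
period=10
t_start=$(date +%s)
user_hz=$(getconf CLK_TCK)
user_start=$(awk '/^user /{print $2}' /sys/fs/cgroup/cpuacct/cpuacct.stat)
user_prev=$user_start
while true; do
    sleep "$period"
    user_now=$(awk '/^user /{print $2}' /sys/fs/cgroup/cpuacct/cpuacct.stat)
    rss_bytes=$(awk '/^rss /{print $2}' /sys/fs/cgroup/memory/memory.stat)
    now=$(date +%s)
    echo "cpu_pct=$(( (user_now - user_prev) * 100 / (user_hz * period) ))" \
         "mem_MiB=$(( rss_bytes / 1048576 ))" \
         "cpu_total_s=$(( (user_now - user_start) / user_hz ))" \
         "elapsed_s=$(( now - t_start ))" >&2
    user_prev=$user_now
done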
@mlin
mlin / split_vcf_for_spark.wdl
Last active November 26, 2019 01:06
Split large VCF for parallel load into Spark
version 1.0
task split_vcf_for_spark {
# Quickly split a large .vcf.gz file into a specified number of compressed partitions.
#
# Motivation: calling SparkContext.textFile on a single large .vcf.gz can be painfully slow,
# because gzip isn't splittable, so it's decompressed and parsed in ~1 thread. Use this task
# to split it up first (with a faster multithreaded pipeline); then point textFile at a glob
# of the partitions so Spark loads them in parallel (see the sketch below).
#
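The Spark side of this pattern, as a sketch (assumes pyspark; the paths are illustrative): because each .vcf.gz partition is a separate file, textFile assigns each one its own input task.

# sketch: parallel load of the split partitions into Spark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("load_split_vcf").getOrCreate()
lines = spark.sparkContext.textFile("hdfs:///data/vcf_parts/*.vcf.gz")  # one task per partition
records = lines.filter(lambda line: not line.startswith("#"))  # skip VCF header lines
print(records.count())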
@mlin
mlin / swarmsub.py
Created August 21, 2019 00:01
swarmsub: run command in a docker image using swarm scheduler
#!/usr/bin/env python3
import sys
import time
import docker
import multiprocessing
from argparse import ArgumentParser, REMAINDER
def swarmsub(image, command=None, cpu=1, mounts=None):
client = docker.from_env()
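The preview stops at the client handle. One way the submission might continue, assuming the docker Python SDK's Swarm services API (the polling loop and resource figures are illustrative, not the gist's actual code):

    # hypothetical continuation: run the command as a one-shot Swarm service
    service = client.services.create(
        image,
        command=command,
        mounts=mounts or [],  # e.g. ["/host/path:/container/path:ro"]
        resources=docker.types.Resources(cpu_reservation=int(cpu * 1e9)),  # Swarm counts nano-CPUs
        restart_policy=docker.types.RestartPolicy(condition="none"),
    )
    # poll until the service's single task reaches a terminal state
    while True:
        tasks = service.tasks()
        if tasks and tasks[0]["Status"]["State"] in ("complete", "failed"):
            return tasks[0]["Status"]["State"]
        time.sleep(1)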