Heng Li lh3

## 00_label.txt
===> Where do you most often perform bioinformatics analysis <===

1	Personal computer (laptop/desktop)
2	Lab server
3	Departmental server
4	University server/cluster
5	Cloud
6	Other

===> Type of problems <===

## 00_On_Matrix_Multiplication.md

      
              3 files
            
          
              0 forks
            
          
              1 comment
            
          
              2 stars
            
          
                lh3
                / 00_On_Matrix_Multiplication.md
            
            
              Last active
              September 22, 2016 09:27
            
          
    Writing a program for efficient matrix multiplication is tricky. A naive implementation usually looks like this (I will use a square matrix for simplicity):
for (i = 0; i < n; ++i)
  for (j = 0; j < n; ++i)
    for (k = 0, m[i][j] = 0.; k < n; ++i)
      m[i][j] += a[i][k] * b[k][j];
The problem is the inner loop a[i][k] * b[k][j] where b[k][j] and b[k+1][j] are not adjacent in memory. This leads to frequent cache misses for large matrices. The better implementation is to transpose matrix b. The implementation will look like:
for (i = 0; i &lt; n; ++i) // transpose

  
## sum-decoy.js
var max_leftover = 0.1, max_diff = 0.05, min_mapq = 20;

var file = arguments.length == 0? new File() : new File(arguments[0]);
var buf = new Bytes();

var re = /(\d+)([MIDSH])/g;
var decoy = {};
while (file.readline(buf) >= 0) {
	var m, line = buf.toString();
	if ((m = /^@SQ.*\tSN:(\S+).*\tLN:(\d+)/.exec(line)) != null) {

## mastermind.c
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <time.h>

const char *mm_colors = "RGBWOPYSE";
const char *mm_numbers = "123456789";

long *mm_enumerate(int m, int n, int rep, long *_z) // m colors, n columns

## GeSQL.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                lh3
                / GeSQL.md
            
            
              Created
              June 28, 2015 05:39
            
          
    Querying Genotypes with Standard SQL

This document demonstrates that we can use three conceptual SQL tables for
variant annotations, sample phenotypes and genotypes, respectively, to achieve
flexible query of genotype data. The tables are conceptual in that they are
primarily used for constructing queries but not for storing real data as they
are not efficient enough for large-scale data.
We will use an example to show the structure of the three tables and how to
query data in standard SQL. You can find all the SQL statements in

  
## vqsr-b37.mak
# VQSR for single human sample mapped to b37. Usage:
#
#   make -f vqsr-b37.mak prefix=my-vcf-prefix
#
# For this to work, the input VCF must be named as "my-vcf-prefix.vcf" and
# there must be "GenomeAnalysisTK.jar" and "b37" directory from the GATK
# resource bundle in the working directory.
#
# References:
#

## 00_cmd.sh
#
# generate 01compositional.bed.gz
#

# low-complexity by mDUST
mdust hs37d5.fa -c -w7 -v28 \
  | hs37d5.mdust-w7-v28.txt \
  | cut -f1,3,4 \
  | gawk -vOFS="\t" '{--$2;print}' \
  | bgzip > 01compositional/hs37d5.mdust-w7-v28.bed.gz

## occ1.c
/* To compile this program, you need rld0.{h,c} from ropebwt2 or fermi2:
 *
 *   gcc -g -O2 -Wall -o occ1 rld0.c occ1.c
 */

#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include "rld0.h"

## ksfile.c
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

#define KO_BUFSIZE 0x40000

typedef struct {
	FILE *fp_in, *fp_out;

## test_bz.c
#include <bzlib.h>
#include <zlib.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct zfile_s {
	int is_bz;
	gzFile fpz;
	FILE *fp; // for bzip2
	BZFILE *fpbz;
	===> Where do you most often perform bioinformatics analysis <===

	1 Personal computer (laptop/desktop)
	2 Lab server
	3 Departmental server
	4 University server/cluster
	5 Cloud
	6 Other

	===> Type of problems <===
	var max_leftover = 0.1, max_diff = 0.05, min_mapq = 20;

	var file = arguments.length == 0? new File() : new File(arguments[0]);
	var buf = new Bytes();

	var re = /(\d+)([MIDSH])/g;
	var decoy = {};
	while (file.readline(buf) >= 0) {
	var m, line = buf.toString();
	if ((m = /^@SQ.\tSN:(\S+).\tLN:(\d+)/.exec(line)) != null) {
	#include <stdlib.h>
	#include <unistd.h>
	#include <string.h>
	#include <stdio.h>
	#include <time.h>

	const char *mm_colors = "RGBWOPYSE";
	const char *mm_numbers = "123456789";

	long mm_enumerate(int m, int n, int rep, long _z) // m colors, n columns
	# VQSR for single human sample mapped to b37. Usage:
	#
	# make -f vqsr-b37.mak prefix=my-vcf-prefix
	#
	# For this to work, the input VCF must be named as "my-vcf-prefix.vcf" and
	# there must be "GenomeAnalysisTK.jar" and "b37" directory from the GATK
	# resource bundle in the working directory.
	#
	# References:
	#
	#
	# generate 01compositional.bed.gz
	#

	# low-complexity by mDUST
	mdust hs37d5.fa -c -w7 -v28 \
	\| hs37d5.mdust-w7-v28.txt \
	\| cut -f1,3,4 \
	\| gawk -vOFS="\t" '{--$2;print}' \
	\| bgzip > 01compositional/hs37d5.mdust-w7-v28.bed.gz
	/* To compile this program, you need rld0.{h,c} from ropebwt2 or fermi2:
	*
	* gcc -g -O2 -Wall -o occ1 rld0.c occ1.c
	*/

	#include <string.h>
	#include <stdint.h>
	#include <stdio.h>
	#include "rld0.h"
	#include <bzlib.h>
	#include <zlib.h>
	#include <stdio.h>
	#include <stdlib.h>

	typedef struct zfile_s {
	int is_bz;
	gzFile fpz;
	FILE *fp; // for bzip2
	BZFILE *fpbz;