lh3/duprate.tex

## duprate.tex
\documentclass[pdftex,10pt]{article}

\usepackage{amsmath}
\usepackage{amssymb}

\title{PCR and mapping duplicate rate in NGS}
\author{Heng Li}

\begin{document}

\maketitle

\section{Amplicon duplicates}

Let $N$ be the number of distinct segments (or seeds) before the
amplification and $M$ be the total number of amplicons in the
library. For seed $i$ ($i=1,\ldots,N$), let $k_i$ be the number of
amplicons in the library and $k_i$ is drawn from Poinsson distribution
${\rm Po}(\lambda)$. When $N$ is sufficiently large, we have:
\[
M=\sum_{i=1}^Nk_i=N\sum_{k=0}^{\infty}kp_k=N\lambda
\]
where $p_k=e^{-\lambda}\lambda^k/{k!}$.

At the sequencing step, we sample $m$ amplicons from the library. On the
condition that:
\begin{equation}
  m\ll M
\end{equation}
we can regard this procedure as sampling with replacement. For seed $i$,
let:
\begin{equation*}
  X_i=\left\{\begin{array}{ll}
      1 & \mbox{seed $i$ has been sampled at least once} \\
      0 & \mbox{otherwise}
    \end{array}
  \right.
\end{equation*}
and then:
\begin{equation*}
  {\rm E} X_i= \Pr\{X_i=1\}=1-\Big(1-\frac{k_i}{M}\Big)^m\simeq 1-e^{-k_i m/M}
\end{equation*}
Let:
\[Z=\sum_{i=1}^NX_i\]
be the number of seeds sampled from the
library. The fraction of duplicates $d$ is:
\begin{eqnarray*}
  d&=&1-\frac{{\rm E}(Z)}{m}\\
  &\simeq&1-\frac{N}{m}\sum_{k=0}^{\infty}\big(1-e^{-km/M}\big)p_k\\
  &=&1-\frac{N}{m}+\frac{N e^{-\lambda}}{m}\sum_k \frac{1}{k!}\big(\lambda e^{-m/M}\big)^k\\
  &\simeq& 1-\frac{N}{m}\Big[1-e^{-\lambda}\cdot e^{\lambda(1-m/M)}\Big]
\end{eqnarray*}
i.e.
\begin{equation}
  d \simeq 1 - \frac{N}{m}\Big(1-e^{-m/N}\Big)
\end{equation}
independent of of $\lambda$. This derivation also stands if $k_i$ is drawn from
a uniform or a delta distribution. In addition, when $m/N$ is sufficiently
small:
\begin{equation}\label{equ:d2}
  d\approx \frac{m}{2N}
\end{equation}

This deduction assumes that i) $k_i\ll M$ which should almost always
stand; ii) $m\ll M$ which should largely stand because otherwise the
fraction of duplicates will far more than half given $\lambda\sim 1000$
and iii) $k_i$ is drawn from a Poisson, a uniform or a delta distribution.

The basic message is that to reduce PCR duplicates, we should either
increase the original pool of distinct molecules before amplification or
reduce the number of reads sequenced from the library. Reducing PCR
cycles plays little role. Nonetheless, PCR frequently introduces compositional
biases and amplification errors. Reducing PCR cycles or not applying PCR at all
is still preferred.

\section{Mapping duplicates}

For simplicity, we assume a read is as short as a single base pair. For
$m$ read pairs, define an indicator function:
\begin{equation*}
  Y_{ij}=\left\{\begin{array}{ll}
      1 & \mbox{if at least one read pair is mapped to $(i,j)$} \\
      0 & \mbox{otherwise}
    \end{array}\right.
\end{equation*}
Let $\{p_k\}$ be the distribution of insert size. Then:
\begin{equation*}
  {\rm E}Y_{ij}=\Pr\{Y_{ij}=1\}=1-\Big[1-\frac{p_{j-i}}{L-(j-i)}\Big]^m\simeq 1-e^{-p_{j-i}\cdot m/[L-(j-i)]}
\end{equation*}
where $L$ is the length of the reference. The fraction of random
coincidence is:
\begin{eqnarray*}
  d'&=&1-\frac{1}{m}\sum_{i=1}^L\sum_{j=i}^L{\rm E}Y_{ij}\\
  &\simeq&1-\frac{1}{m}\sum_{i=1}^L\sum_{j=i}^L\big(1-e^{-p_{j-i}\cdot m/(L-(j-i))}\big)\\
  &=&1-\frac{1}{m}\sum_{k=0}^{L-1}(L-k)\big[1-e^{-p_k m/(L-k)}\big]
\end{eqnarray*}
On the condition that $L$ is sufficient large and:
\begin{equation}
  m\ll L
\end{equation}
\begin{equation}\label{equ:dd}
  d'\simeq\frac{m}{2}\sum_{k=0}^{L-1}\frac{p_k^2}{L-k}
\end{equation}

We can calculate/approximate Equation~\ref{equ:dd} for two types of
distributions. Firstly, if $p_k$ is evenly distributed between
$[k_0,k_0+k_1]$, $d'\simeq\frac{m}{2k_1L}$. Secondly, assume $k$ is
drawn from $N(\mu,\sigma)$ with $\sigma\gg 1$:
\begin{equation*}
  p_k=\frac{1}{\sqrt{2\pi}\sigma}\int_k^{k+1}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\,dx
  \simeq \frac{1}{\sqrt{2\pi}\sigma}e^{-\frac{(k-\mu)^2}{2\sigma^2}}
\end{equation*}
If $p_0\ll 1$, $\mu\ll L$ and $L\gg 1$:
\begin{eqnarray*}
  d'&\simeq&\frac{m}{4\pi\sigma^2}\int_0^1\frac{1}{1-x}\cdot e^{-\frac{(Lx-\mu)^2}{\sigma^2}}\,dx\\
  &\simeq&\frac{m}{4\pi\sigma^2}\int_{-\infty}^{\infty}e^{-\frac{(x-\mu/L)^2}{(\sigma/L)^2}}\,dx\\
  &=&\frac{m}{4\pi\sigma^2}\cdot\frac{\sqrt{2\pi}\cdot \sqrt{2}\sigma}{L}\\
  &=&\frac{m}{2\sqrt{\pi}\sigma L}
\end{eqnarray*}

\end{document}
	\documentclass[pdftex,10pt]{article}

	\usepackage{amsmath}
	\usepackage{amssymb}

	\title{PCR and mapping duplicate rate in NGS}
	\author{Heng Li}

	\begin{document}

	\maketitle

	\section{Amplicon duplicates}

	Let $N$ be the number of distinct segments (or seeds) before the
	amplification and $M$ be the total number of amplicons in the
	library. For seed $i$ ($i=1,\ldots,N$), let $k_i$ be the number of
	amplicons in the library and $k_i$ is drawn from Poinsson distribution
	${\rm Po}(\lambda)$. When $N$ is sufficiently large, we have:
	\[
	M=\sum_{i=1}^Nk_i=N\sum_{k=0}^{\infty}kp_k=N\lambda
	\]
	where $p_k=e^{-\lambda}\lambda^k/{k!}$.

	At the sequencing step, we sample $m$ amplicons from the library. On the
	condition that:
	\begin{equation}
	m\ll M
	\end{equation}
	we can regard this procedure as sampling with replacement. For seed $i$,
	let:
	\begin{equation*}
	X_i=\left\{\begin{array}{ll}
	1 & \mbox{seed $i$ has been sampled at least once} \\
	0 & \mbox{otherwise}
	\end{array}
	\right.
	\end{equation*}
	and then:
	\begin{equation*}
	{\rm E} X_i= \Pr\{X_i=1\}=1-\Big(1-\frac{k_i}{M}\Big)^m\simeq 1-e^{-k_i m/M}
	\end{equation*}
	Let:
	\[Z=\sum_{i=1}^NX_i\]
	be the number of seeds sampled from the
	library. The fraction of duplicates $d$ is:
	\begin{eqnarray*}
	d&=&1-\frac{{\rm E}(Z)}{m}\\
	&\simeq&1-\frac{N}{m}\sum_{k=0}^{\infty}\big(1-e^{-km/M}\big)p_k\\
	&=&1-\frac{N}{m}+\frac{N e^{-\lambda}}{m}\sum_k \frac{1}{k!}\big(\lambda e^{-m/M}\big)^k\\
	&\simeq& 1-\frac{N}{m}\Big[1-e^{-\lambda}\cdot e^{\lambda(1-m/M)}\Big]
	\end{eqnarray*}
	i.e.
	\begin{equation}
	d \simeq 1 - \frac{N}{m}\Big(1-e^{-m/N}\Big)
	\end{equation}
	independent of of $\lambda$. This derivation also stands if $k_i$ is drawn from
	a uniform or a delta distribution. In addition, when $m/N$ is sufficiently
	small:
	\begin{equation}\label{equ:d2}
	d\approx \frac{m}{2N}
	\end{equation}

	This deduction assumes that i) $k_i\ll M$ which should almost always
	stand; ii) $m\ll M$ which should largely stand because otherwise the
	fraction of duplicates will far more than half given $\lambda\sim 1000$
	and iii) $k_i$ is drawn from a Poisson, a uniform or a delta distribution.

	The basic message is that to reduce PCR duplicates, we should either
	increase the original pool of distinct molecules before amplification or
	reduce the number of reads sequenced from the library. Reducing PCR
	cycles plays little role. Nonetheless, PCR frequently introduces compositional
	biases and amplification errors. Reducing PCR cycles or not applying PCR at all
	is still preferred.

	\section{Mapping duplicates}

	For simplicity, we assume a read is as short as a single base pair. For
	$m$ read pairs, define an indicator function:
	\begin{equation*}
	Y_{ij}=\left\{\begin{array}{ll}
	1 & \mbox{if at least one read pair is mapped to $(i,j)$} \\
	0 & \mbox{otherwise}
	\end{array}\right.
	\end{equation*}
	Let $\{p_k\}$ be the distribution of insert size. Then:
	\begin{equation*}
	{\rm E}Y_{ij}=\Pr\{Y_{ij}=1\}=1-\Big[1-\frac{p_{j-i}}{L-(j-i)}\Big]^m\simeq 1-e^{-p_{j-i}\cdot m/[L-(j-i)]}
	\end{equation*}
	where $L$ is the length of the reference. The fraction of random
	coincidence is:
	\begin{eqnarray*}
	d'&=&1-\frac{1}{m}\sum_{i=1}^L\sum_{j=i}^L{\rm E}Y_{ij}\\
	&\simeq&1-\frac{1}{m}\sum_{i=1}^L\sum_{j=i}^L\big(1-e^{-p_{j-i}\cdot m/(L-(j-i))}\big)\\
	&=&1-\frac{1}{m}\sum_{k=0}^{L-1}(L-k)\big[1-e^{-p_k m/(L-k)}\big]
	\end{eqnarray*}
	On the condition that $L$ is sufficient large and:
	\begin{equation}
	m\ll L
	\end{equation}
	\begin{equation}\label{equ:dd}
	d'\simeq\frac{m}{2}\sum_{k=0}^{L-1}\frac{p_k^2}{L-k}
	\end{equation}

	We can calculate/approximate Equation~\ref{equ:dd} for two types of
	distributions. Firstly, if $p_k$ is evenly distributed between
	$[k_0,k_0+k_1]$, $d'\simeq\frac{m}{2k_1L}$. Secondly, assume $k$ is
	drawn from $N(\mu,\sigma)$ with $\sigma\gg 1$:
	\begin{equation*}
	p_k=\frac{1}{\sqrt{2\pi}\sigma}\int_k^{k+1}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\,dx
	\simeq \frac{1}{\sqrt{2\pi}\sigma}e^{-\frac{(k-\mu)^2}{2\sigma^2}}
	\end{equation*}
	If $p_0\ll 1$, $\mu\ll L$ and $L\gg 1$:
	\begin{eqnarray*}
	d'&\simeq&\frac{m}{4\pi\sigma^2}\int_0^1\frac{1}{1-x}\cdot e^{-\frac{(Lx-\mu)^2}{\sigma^2}}\,dx\\
	&\simeq&\frac{m}{4\pi\sigma^2}\int_{-\infty}^{\infty}e^{-\frac{(x-\mu/L)^2}{(\sigma/L)^2}}\,dx\\
	&=&\frac{m}{4\pi\sigma^2}\cdot\frac{\sqrt{2\pi}\cdot \sqrt{2}\sigma}{L}\\
	&=&\frac{m}{2\sqrt{\pi}\sigma L}
	\end{eqnarray*}

	\end{document}