goedel-gang/bytes_hs.hs

## bytes_hs.hs
import Data.List
import qualified Data.ByteString.Char8 as B

-- | Read a tab-separated table from standard input and print its transpose.
--
-- The input is split into rows on @'\n'@ and into cells on @'\t'@; after
-- 'transpose' the rows are re-joined with the same separators and written
-- with 'B.putStrLn', which appends a final newline.
main :: IO ()
main = B.getContents >>= B.putStrLn . render . transpose . parse
  where
    -- Rows are newline-delimited; cells within a row are tab-delimited.
    parse = map (B.split '\t') . B.split '\n'
    -- Inverse of 'parse': tabs between cells, newlines between rows.
    render = B.intercalate (B.pack "\n") . map (B.intercalate (B.pack "\t"))

## file-utils.sh
# Build the matrix one input file at a time, then transpose it.
# For every file named in list.txt, column $col is cut out and its lines are
# joined into a single row by `paste -s`; all rows are gzip-compressed into
# $tpfile. Several details are omitted, e.g. the rowname equality check.
# `IFS= read -r` keeps each file name exactly as written in list.txt.
while IFS= read -r f; do
    cut -f "$col" "$f" | paste -s
done < list.txt | gzip > "$tpfile"
# Conflicted, but: this transpose is the bottleneck process.
transpose -i "$tpfile" -o "$dst"

## graph.png

      
    Raw
  

              graph.png
            
          
## naive_hs.hs
-- | Transpose whitespace-separated text from stdin to stdout.
--
-- Every input line is broken into words, the resulting rows are passed
-- through 'transpose', and each output row is re-joined with single spaces.
main :: IO ()
main = interact transposeText
  where
    transposeText = unlines . map unwords . transpose . map words . lines

-- | Transpose a list of rows into a list of columns.
--
-- The result is truncated to the length of the shortest row, because
-- 'zipWith' stops at the shorter of its two arguments.
--
-- The empty matrix is handled explicitly: the fold is seeded with
-- @repeat []@, so without this case an empty input would yield an infinite
-- list of empty columns instead of @[]@.
transpose :: [[a]] -> [[a]]
transpose []   = []
transpose rows = foldr (zipWith (:)) (repeat []) rows

## numpy_transpose.py
# Load the gzip-compressed, tab-delimited table named by the first
# command-line argument, keeping every cell as a string.
data = numpy.loadtxt(
    gzip.open(sys.argv[1]),
    dtype='str',
    delimiter="\t",
)
# Write the transposed array to stdout: tab-delimited, cells formatted with %s.
numpy.savetxt(
    sys.stdout,
    data.T,
    delimiter="\t",
    fmt='%s',
)

## pandas_transpose.py
# Read the gzip-compressed, tab-separated table named by the first
# command-line argument with every column typed as str.
# na_filter=False keeps cells such as "NA", "NaN", "null" or "" as the literal
# text that was read. With the default filter pandas converts them to missing
# values even under dtype='str', and to_csv would then write them out empty,
# so transposing twice would not give back the original table.
df = pandas.read_table(
    gzip.open(sys.argv[1]), sep='\t', dtype='str', na_filter=False
)
# Emit the transposed frame to stdout as tab-separated text.
df.transpose().to_csv(sys.stdout, sep="\t")

## transpose.pl
# Read the whole tab-separated table from <> into an array of row arrays.
# The -1 limit makes split keep trailing empty fields; without it a row that
# ends in empty cells comes back shorter than the others.
my @data = map { chomp; [ split /\t/, $_, -1 ] } <>;
my @idx = 0..$#data;
# The first row decides how many output rows are printed. Guard against empty
# input so that an undefined first row is never dereferenced.
my $ncols = @data ? scalar @{ $data[0] } : 0;
# Output row $i is cell $i of every input row, joined with tabs.
for my $i (0 .. $ncols - 1) {
    print join("\t", map { $data[$_][$i] } @idx), "\n";
}

## transpose.R
# Read the gzip-compressed table named by args[[1]], forcing every column to
# be read as character.
input <- gzfile(args[[1]])
df <- read.table(input, colClasses = "character")
# Write the transposed table to the console: tab-separated, unquoted, with
# row names but without a column-name line.
write.table(
  t(df),
  file = "",
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = FALSE
)

## transpose.rb
# Read every input line, split it on whitespace, swap rows and columns, and
# print each resulting row with its cells joined by a single space.
rows = readlines.map { |line| line.split }
puts rows.transpose.map { |cells| cells.join(" ") }

## transpose.sh
# Decompress i.gz, transpose it with GNU datamash, and recompress into o.gz.
# A plain pipeline is used instead of process substitution: the shell does not
# wait for a `>(gzip ...)` substitution, so the command could return (and be
# timed) before o.gz was completely written. Note that the pipeline's exit
# status is gzip's unless `set -o pipefail` is in effect.
zcat i.gz | datamash transpose | gzip > o.gz

## transpose.tex
% vim: ts=2 sw=0 sts=-1 et ai cole=0 wrap
%^ vim modeline to set "tabs" to be two spaces and some other stuff

% Transposition presentation roughly translated to Beamer. Will probably compile
% if you have a distribution of TeX Live (I personally use latexmk for
% compilation).

% Come to think of it, minted also needs Pygments. If Pygments isn't available
% through a proper system package manager (it is -- use Homebrew), you can
% install it with `pip install --user Pygments`; then make sure that
% pygmentize ends up in your PATH.

% It's not a perfect translation, and it probably doesn't fully utilise
% Beamer's numerous features, but I think it does a fair job.

% use 16:9 aspect ratio instead of 4:3
% \documentclass[aspectratio=169]{beamer}
\documentclass{beamer}
% this is just a theme I personally like the look of
\usetheme{metropolis}
\title{Transposing a big matrix/text file}
\subtitle{Merging many columns to create a big matrix in a text file}
\author{Stijn van Dongen}
\institute{WTSI}
\date{\today}

% automatically insert title slides for sections, subsections, and
% subsubsections
\AtBeginSection{\frame{\sectionpage}}
\AtBeginSubsection{\frame{\subsectionpage}}
\AtBeginSubsubsection{\frame{\subsubsectionpage}}

% nicer tables with \toprule, \midrule, \bottomrule
% \usepackage{booktabs}

% format SI units and other numbers, eg with \num, \si, \SI
\usepackage{siunitx}

\newcommand\smallish{\fontsize{7pt}{7pt}\selectfont}

% listings of code, set to use an appropriate fontsize because everything in
% Beamer is HUGE
% Also some options to determine how it wraps code if it has to.
\usepackage{minted}
\setminted{breaklines,
           breakbytokenanywhere,
           % linenos,
           % chosen so that the listings mostly don't need to wrap
           fontsize=\smallish
}
% make annoying red fboxes around $ in haskell code go away
% \usemintedstyle{friendly}
\usemintedstyle{xcode}
% line numbers size (although they're not currently on)
\renewcommand\theFancyVerbLine{\smallish\arabic{FancyVerbLine}}

% show items that have not been uncovered yet as transparent, rather than
% hiding them completely
% \setbeamercovered{transparent}

\begin{document}
\begin{frame}
  \titlepage
\end{frame}

\begin{frame}
  \frametitle{Outline}
  \tableofcontents
\end{frame}

\begin{frame}
  \frametitle{Transposition}
  \begin{columns}
    \begin{column}{0.48\textwidth}
      % :r !python -c "print('\\\\\\\\\n'.join(' & '.join(map(str, range(i, i + 5))) for i in range(0, 15, 5)))"
      %^ vim command used to generate the table contents. you can recycle it by
      %navigating to the line in normal mode, and then issuing the key sequence
      % 0f:y$q:p<CR> where <CR> is a carriage return. Obviously needs Python.
      \centering
      \onslide<1->{
        \begin{tabular}{*{5}r}
          0 & 1 & 2 & 3 & 4\\
          5 & 6 & 7 & 8 & 9\\
          10 & 11 & 12 & 13 & 14
        \end{tabular}
      }
    \end{column}
    \begin{column}{0.48\textwidth}
      % :r !python -c "print('\\\\\\\\\n'.join(' & '.join(map(str, range(i, 11 + i, 5))) for i in range(5)))"
      \centering
      \onslide<2->{
        \begin{tabular}{rrr}
          0 & 5 & 10\\
          1 & 6 & 11\\
          2 & 7 & 12\\
          3 & 8 & 13\\
          4 & 9 & 14
        \end{tabular}
      }
    \end{column}
  \end{columns}
\end{frame}

\section{Transposing a big matrix in a text file}

\begin{frame}
  \frametitle{Transposing a big matrix in a text file}
  \begin{itemize}
    \item[{--}] 60k genes \(\times\) 20k samples
    \item[{--}] \(\num{1.2e9}\) fields
    \item[{--}] 3.5G compressed file
  \end{itemize}
  \vspace{2ex}
  \begin{itemize}
    \item many Smart-Seq2 samples (our use case 2018/19)
    \item \textbf{10x / hdf5} may make this data pattern obsolete (or just rare)
    \item sometimes required either by scientist or circumstance
    \item reuse old code \dots
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{2009}
  \begin{itemize}
    \item 2009, ArrayExpress miRNA project at EBI, dense tables
    \item custom solutions \texttt{transpose}
    \item \url{github.com/micans/reaper}
    \item not for the faint of heart (but
      \textcolor{red}{\texttt{valgrind}}-tested etc.)
  \end{itemize}
  \vspace{2ex}
  \begin{tabular}{*{2}{p{0.4\textwidth}}}
    \bfseries Benefits & \bfseries Costs \\
    fast, low memory &
    bespoke C code \\
    shoveling bytes &
    should be optimal (is it?) \\
    & ownership \\
  \end{tabular}
\end{frame}

\begin{frame}
  \frametitle{2009}
  \begin{itemize}
    \item \textcolor{red}{read matrix as a single string}
    \item \textcolor{red}{write transpose while walking array of ptr from sep to
      sep}
  \end{itemize}
  \begin{itemize}
    \item r/w gzipped data transparently (zlib)
    \item recognises header line with off-by-one field+tab or just field
    \item whatever else is needed \dots
    \item \textcolor{red}{\texttt{transpose(transpose(X)) == X}} (validation)
  \end{itemize}
  \vspace{3ex}
  \begin{Large}
    2019: how does it compare? Investigate \(\rightarrow\)
  \end{Large}
\end{frame}

\subsection{Comparisons}

\begin{frame}
  \ttfamily
  \begin{itemize}
    \item[\textunderscore\textunderscore] bash
      \footnote{\smallish \url{https://stackoverflow.com/questions/1729824/an-efficient-way-to-transpose-a-file-in-bash}}
    \item[\textunderscore\textunderscore] Python pandas
      \inputminted{python}{pandas_transpose.py}
    \item[\textunderscore\textunderscore] Python numpy
      \inputminted{python}{numpy_transpose.py}
    \item[\textunderscore\textunderscore] Vanilla python
      \inputminted{python}{vanilla_transpose.py}
  \end{itemize}
\end{frame}

\begin{frame}
  \ttfamily
  \begin{itemize}
    \item[\textunderscore\textunderscore] Haskell 1 (stackoverflow)
      \inputminted{haskell}{naive_hs.hs}
    \item[\textunderscore\textunderscore] Bytes aware Haskell
      \inputminted{haskell}{bytes_hs.hs}
  \end{itemize}
\end{frame}

\begin{frame}
  \ttfamily
  \begin{itemize}
    \item[\textunderscore\textunderscore] perl
      \inputminted{perl}{transpose.pl}
    \item[\textunderscore\textunderscore] R
      \inputminted{R}{transpose.R}
    \item[\textunderscore\textunderscore] \textcolor{red}{datamash (GNU tool)}
      \inputminted{bash}{transpose.sh}
    \item[\textunderscore\textunderscore] ruby (stackoverflow, very much not
      optimised)
      \inputminted{ruby}{transpose.rb}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Omitted solutions}
  \ttfamily
  \begin{itemize}
    \item[--] awk
    \item[--] jq
    \item[--] julia
    \item[--] bash
    \item[--] ruby (optimised)
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Transpose test case}
  \begin{itemize}
    \item \{ 10k, 20k, 30k, 40k, 50k, 60k \} \(\times\) 4671 matrix
    \item Largest test case:
      \begin{itemize}
        \item[\(\circ\)] 282M fields (84\% zeroes)
        \item[\(\circ\)] Compressed 125M
        \item[\(\circ\)] Uncompressed 666M
      \end{itemize}
  \end{itemize}
  \vspace{2ex}
  Note: read cells as strings, so that\\
  \texttt{transpose(transpose(X)) == X}\\
  (avoid rounding/truncating/NaN/NA/null/""/conversions)
\end{frame}

\begin{frame}
  % there's nothing stopping this from being a PDF/PS/<other vector format> file
  \centering
  \includegraphics[height=0.9\textheight]{graph.png}
\end{frame}

\section{Conclusions}

\begin{frame}
  \frametitle{Conclusions}
  \begin{itemize}
    \item[--] Pure Python, Haskell, datamash are effective, with different
      time/memory trade-offs.
    \item[--] Special purpose C code is highly effective, minimal memory
    \item[--] Python data frames, perl, ruby, awk, R best avoided
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Original problem: Creating a big matrix in a text file}
  Aggregation step after a parallelised pipeline:\\
  Combine 60k-element columns from thousands of result files.

  Python: slow churn reading files, high memory

  File-utils type approach:
  \inputminted{bash}{file-utils.sh}
\end{frame}

\begin{frame}
  \centering \Large
  \emph{Fin}
\end{frame}

\end{document}

## vanilla_transpose.py
# Split every line read from zin into whitespace-separated cells, then print
# the transposed rows tab-separated. zip() stops at the shortest row.
rows = [line.split() for line in zin.readlines()]
for column in zip(*rows):
    print("\t".join(column))
	import Data.List
	import qualified Data.ByteString.Char8 as B

	main :: IO ()
	main = do
	stdin <- B.getContents
	B.putStrLn $ (B.intercalate (B.pack "\n")
	. map (B.intercalate (B.pack "\t"))
	. transpose . map (B.split '\t')
	. B.split '\n') stdin
	(cat list.txt \| while read f; do # several
	cut -f $col “$f” \| paste -s # details
	) \| gzip > $tpfile # ommitted; e.g. rowname equality check.
	transpose -i $tpfile -o $dst # Conflicted, but: bottleneck process.
	main :: IO ()
	main = interact $ unlines . map unwords . transpose . map words . lines

	transpose :: [[a]] -> [[a]]
	transpose = foldr (zipWith (:)) (repeat [])
	data = numpy.loadtxt(gzip.open(sys.argv[1]), delimiter="\t", dtype='str')
	numpy.savetxt(sys.stdout, data.transpose(), fmt='%s', delimiter="\t")
	df = pandas.read_table(gzip.open(sys.argv[1]), sep='\t', dtype='str')
	df.transpose().to_csv(sys.stdout, sep="\t")
	my @data = map { chomp; [ split "\t" ] } <>;
	my @idx = 0..$#data;
	for (my $i = 0; $i < @{$data[0]}; $i++) {
	print join "\t", map { $data[$_][$i] } @idx;
	print "\n";
	}
	df <- read.table(gzfile(args[[1]]), colClasses=c('character'))
	write.table(t(df), "", sep="\t", quote=FALSE, row.names=TRUE, col.names=FALSE)
	% vim: ts=2 sw=0 sts=-1 et ai cole=0 wrap
	%^ vim modeline to set "tabs" to be two spaces and some other stuff

	% Transposition presentation roughly translated to Beamer. Will probably compile
	% if you have a distribution of TeX Live (I personally use latexmk for
	% compilation).

	% Come to think of it, minted also needs Pygments. If Pygments isn't available
	% through a proper system package manager (it is -- use Homebrew), you can
	% install it with `pip install --user Pygments`; then make sure that
	% pygmentize ends up in your PATH.

	% It's not a perfect translation, and it probably doesn't fully utilise
	% Beamer's numerous features, but I think it does a fair job.

	% use 16:9 aspect ratio instead of 4:3
	% \documentclass[aspectratio=169]{beamer}
	\documentclass{beamer}
	% this is just a theme I personally like the look of
	\usetheme{metropolis}
	\title{Transposing a big matrix/text file}
	\subtitle{Merging many columns to create a big matrix in a text file}
	\author{Stijn van Dongen}
	\institute{WTSI}
	\date{\today}

	% automatically insert title slides for sections, subsections, and
	% subsubsections
	\AtBeginSection{\frame{\sectionpage}}
	\AtBeginSubsection{\frame{\subsectionpage}}
	\AtBeginSubsubsection{\frame{\subsubsectionpage}}

	% nicer tables with \toprule, \midrule, \bottomrule
	% \usepackage{booktabs}

	% format SI units and other numbers, eg with \num, \si, \SI
	\usepackage{siunitx}

	\newcommand\smallish{\fontsize{7pt}{7pt}\selectfont}

	% listings of code, set to use an appropriate fontsize because everything in
	% Beamer is HUGE
	% Also some options to determine how it wraps code if it has to.
	\usepackage{minted}
	\setminted{breaklines,
	breakbytokenanywhere,
	% linenos,
	% chosen so that the listings mostly don't need to wrap
	fontsize=\smallish
	}
	% make annoying red fboxes around $ in haskell code go away
	% \usemintedstyle{friendly}
	\usemintedstyle{xcode}
	% line numbers size (although they're not currently on)
	\renewcommand\theFancyVerbLine{\smallish\arabic{FancyVerbLine}}

	% show items that have not been uncovered yet as transparent, rather than
	% hiding them completely
	% \setbeamercovered{transparent}

	\begin{document}
	\begin{frame}
	\titlepage
	\end{frame}

	\begin{frame}
	\frametitle{Outline}
	\tableofcontents
	\end{frame}

	\begin{frame}
	\frametitle{Transposition}
	\begin{columns}
	\begin{column}{0.48\textwidth}
	% :r !python -c "print('\\\\\\\\\n'.join(' & '.join(map(str, range(i, i + 5))) for i in range(0, 15, 5)))"
	%^ vim command used to generate the table contents. you can recycle it by
	%navigating to the line in normal mode, and then issuing the key sequence
	% 0f:y$q:p<CR> where <CR> is a carriage return. Obviously needs Python.
	\centering
	\onslide<1->{
	\begin{tabular}{*{5}r}
	0 & 1 & 2 & 3 & 4\\
	5 & 6 & 7 & 8 & 9\\
	10 & 11 & 12 & 13 & 14
	\end{tabular}
	}
	\end{column}
	\begin{column}{0.48\textwidth}
	% :r !python -c "print('\\\\\\\\\n'.join(' & '.join(map(str, range(i, 11 + i, 5))) for i in range(5)))"
	\centering
	\onslide<2->{
	\begin{tabular}{rrr}
	0 & 5 & 10\\
	1 & 6 & 11\\
	2 & 7 & 12\\
	3 & 8 & 13\\
	4 & 9 & 14
	\end{tabular}
	}
	\end{column}
	\end{columns}
	\end{frame}

	\section{Transposing a big matrix in a text file}

	\begin{frame}
	\frametitle{Transposing a big matrix in a text file}
	\begin{itemize}
	\item[{--}] 60k genes \(\times\) 20k samples
	\item[{--}] \(\num{1.2e9}\) fields
	\item[{--}] 3.5G compressed file
	\end{itemize}
	\vspace{2ex}
	\begin{itemize}
	\item many Smart-Seq2 samples (our use case 2018/19)
	\item \textbf{10x / hdf5} may make this data pattern obsolete (or just rare)
	\item sometimes required either by scientist or circumstance
	\item reuse old code \dots
	\end{itemize}
	\end{frame}

	\begin{frame}
	\frametitle{2009}
	\begin{itemize}
	\item 2009, ArrayExpress miRNA project at EBI, dense tables
	\item custom solutions \texttt{transpose}
	\item \url{github.com/micans/reaper}
	\item not for the faint of heart (but
	\textcolor{red}{\texttt{valgrind}}-tested etc.)
	\end{itemize}
	\vspace{2ex}
	\begin{tabular}{*{2}{p{0.4\textwidth}}}
	\bfseries Benefits & \bfseries Costs \\
	fast, low memory &
	bespoke C code \\
	shoveling bytes &
	should be optimal (is it?) \\
	& ownership \\
	\end{tabular}
	\end{frame}

	\begin{frame}
	\frametitle{2009}
	\begin{itemize}
	\item \textcolor{red}{read matrix as a single string}
	\item \textcolor{red}{write transpose while walking array of ptr from sep to
	sep}
	\end{itemize}
	\begin{itemize}
	\item r/w gzipped data transparently (zlib)
	\item recognises header line with off-by-one field+tab or just field
	\item whatever else is needed \dots
	\item \textcolor{red}{\texttt{transpose(transpose(X)) == X}} (validation)
	\end{itemize}
	\vspace{3ex}
	\begin{Large}
	2019: how does it compare? Investigate \(\rightarrow\)
	\end{Large}
	\end{frame}

	\subsection{Comparisons}

	\begin{frame}
	\ttfamily
	\begin{itemize}
	\item[\textunderscore\textunderscore] bash
	\footnote{\smallish \url{https://stackoverflow.com/questions/1729824/an-efficient-way-to-transpose-a-file-in-bash}}
	\item[\textunderscore\textunderscore] Python pandas
	\inputminted{python}{pandas_transpose.py}
	\item[\textunderscore\textunderscore] Python numpy
	\inputminted{python}{numpy_transpose.py}
	\item[\textunderscore\textunderscore] Vanilla python
	\inputminted{python}{vanilla_transpose.py}
	\end{itemize}
	\end{frame}

	\begin{frame}
	\ttfamily
	\begin{itemize}
	\item[\textunderscore\textunderscore] Haskell 1 (stackoverflow)
	\inputminted{haskell}{naive_hs.hs}
	\item[\textunderscore\textunderscore] Bytes aware Haskell
	\inputminted{haskell}{bytes_hs.hs}
	\end{itemize}
	\end{frame}

	\begin{frame}
	\ttfamily
	\begin{itemize}
	\item[\textunderscore\textunderscore] perl
	\inputminted{perl}{transpose.pl}
	\item[\textunderscore\textunderscore] R
	\inputminted{R}{transpose.R}
	\item[\textunderscore\textunderscore] \textcolor{red}{datamash (GNU tool)}
	\inputminted{bash}{transpose.sh}
	\item[\textunderscore\textunderscore] ruby (stackoverflow, very much not
	optimised)
	\inputminted{ruby}{transpose.rb}
	\end{itemize}
	\end{frame}

	\begin{frame}
	\frametitle{Omitted solutions}
	\ttfamily
	\begin{itemize}
	\item[--] awk
	\item[--] jq
	\item[--] julia
	\item[--] bash
	\item[--] ruby (optimised)
	\end{itemize}
	\end{frame}

	\begin{frame}
	\frametitle{Transpose test case}
	\begin{itemize}
	\item \{ 10k, 20k, 30k, 40k, 50k, 60k \} \(\times\) 4671 matrix
	\item Largest test case:
	\begin{itemize}
	\item[\(\circ\)] 282M fields (84\% zeroes)
	\item[\(\circ\)] Compressed 125M
	\item[\(\circ\)] Uncompressed 666M
	\end{itemize}
	\end{itemize}
	\vspace{2ex}
	Note: read cells as strings, so that\\
	\texttt{transpose(transpose(X)) == X}\\
	(avoid rounding/truncating/NaN/NA/null/""/conversions)
	\end{frame}

	\begin{frame}
	% there's nothing stopping this from being a PDF/PS/<other vector format> file
	\centering
	\includegraphics[height=0.9\textheight]{graph.png}
	\end{frame}

	\section{Conclusions}

	\begin{frame}
	\frametitle{Conclusions}
	\begin{itemize}
	\item[--] Pure Python, Haskell, datamash are effective, with different
	time/memory trade-offs.
	\item[--] Special purpose C code is highly effective, minimal memory
	\item[--] Python data frames, perl, ruby, awk, R best avoided
	\end{itemize}
	\end{frame}

	\begin{frame}
	\frametitle{Original problem: Creating a big matrix in a text file}
	Aggregation step after a parallelised pipeline:\\
	Combine 60k-element columns from thousands of result files.

	Python: slow churn reading files, high memory

	File-utils type approach:
	\inputminted{bash}{file-utils.sh}
	\end{frame}

	\begin{frame}
	\centering \Large
	\emph{Fin}
	\end{frame}

	\end{document}
	for c in zip(*(l.split() for l in zin.readlines())):
	print("\t".join(c))