\documentclass[14pt]{beamer}
\usepackage[utf8x]{inputenc}
\usepackage{default}
\usepackage{fontenc}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{hyperref}
\usepackage{alltt}
\usepackage{epstopdf}
\usepackage[style=numeric,defernumbers=true]{biblatex}
\useoutertheme{infolines}
%\usepackage{beamerthemebars}
\bibliography{imapcar2}
\newcommand{\ul}{\begin{itemize}}
\newcommand{\lu}{\end{itemize}}
\newcommand{\ft}[1]{\frametitle{#1}}
\newcommand{\slide}[2]{
\begin{frame}
\frametitle{#1}
\ul
#2
\lu
\end{frame}
}
\newcommand{\defbox}[1]{
\vfill
\begin{center}
\fbox{
\begin{minipage}{0.8\textwidth}
\scriptsize
\ul
#1
\lu
\end{minipage}
}
\end{center}
}
\date{\today}
\title{Semi-Reconfigurable Processors for Fast Image Analysis}
\author{Ben Kelly}
\begin{document}
\begin{frame}
\maketitle
\end{frame}
\begin{frame}
\frametitle{Summary}
\ul
\item Realtime image analysis
\item The IMAP architecture
\item IMAP-CE and IMAPCAR
\item IMAPCAR2
\lu
\end{frame}
\section{Realtime Image Analysis}
\begin{frame}
\frametitle{Realtime image analysis - what is it?}
\ul
\item Transformation and analysis of images in real time
\ul
\item Typically, this means 30fps or 60fps
\item At 30fps you have about 33ms to process each frame
\lu
\item Subset of \emph{realtime image processing}
\item Significantly more difficult than transformation alone
\lu
\end{frame}
\begin{frame}
\frametitle{Realtime image analysis - what is it good for?}
\ul
\item Obstacle detection and avoidance
\item Lane following
\item Threat detection
\item Object identification
\item In short: machine vision
\lu
\end{frame}
\begin{frame}
\frametitle{Realtime image analysis - the problem}
\ul
\item 30fps image analysis is not cheap
\item Embedded processors don't have the power
\item GPPs are too expensive and too power-hungry
\item ASICs are too inflexible
\lu
\defbox{
\item[GPP] General Purpose Processor
\item[ASIC] Application Specific Integrated Circuit
}
\end{frame}
\section{IMAP}
\begin{frame}
\frametitle{IMAP - the Integrated Memory Array Processor}
\ul
\item Described by Fujita et al. in 1995
\item Designed to quickly and cheaply perform image processing tasks
\item 8-bit SIMD RISC architecture
\item Intended to act as a coprocessor to a separate CPU
\lu
\defbox{
\item[RISC] Reduced Instruction Set Computer
\item[SIMD] Single Instruction Multiple Data
}
\end{frame}
\begin{frame}
\frametitle{IMAP - Internal Components}
\ul
\item 64 8-bit SIMD PEs
\item 2KB of IMEM per PE
\item Simple ring network connecting PEs
\item Tree network connecting external CPU to PEs
\lu
\defbox{
\item[PE] Processing Element - a SIMD miniprocessor
\item[IMEM] Internal Memory
}
\end{frame}
\begin{frame}
\frametitle{IMAP - Internal Design}
\ul
\item Each PE can only directly access its own registers and IMEM
\item Ring network lets PEs transfer data to registers of adjacent PEs
\lu
\begin{center}
\includegraphics[width=0.6\textwidth]{imap.pdf}
\end{center}
\end{frame}
\begin{frame}
\frametitle{IMAP - Programming Model}
\ul
\item \emph{One-Dimensional C} (1DC)
\item C programming language with data-parallel extensions
\ul
\item Description of data structures spread across IMEM
\item SIMD processing of these structures
\item Collection of results
\lu
\item Main code runs on the CPU; 1DC compiler automatically dispatches parallel operations to the PEs
\lu
\end{frame}
\section{IMAP-CE and IMAPCAR}
\begin{frame}
\frametitle{IMAP-CE}
\ul
\item Prototype IMAP implementation, developed by Kyo et al.
\item CPU is now integrated onto the chip as the \emph{Central Processor} (CP)
\item 128 PEs, 2KB of IMEM each
\ul
\item Designed to hold an entire 512x512 image in PE IMEM
\lu
\lu
\end{frame}
\begin{frame}
\frametitle{IMAP-CE - Internal Design}
\begin{center}
\includegraphics[width=0.8\textwidth]{imap-ce.png}
\end{center}
\end{frame}
\begin{frame}
\frametitle{IMAP-CE - Usage}
\ul
\item Builtin support for four types of image access
\ul
\item[(a)] Row-wise
\item[(b)] Row-systolic
\item[(c)] Slant-systolic
\item[(d)] Autonomous
\lu
\lu
\begin{center}
\includegraphics[width=0.8\textwidth]{pul.pdf}
\end{center}
\end{frame}
\begin{frame}
\frametitle{IMAPCAR}
\ul
\item Refinement of IMAP-CE, designed for use in automobiles
\item ROI and DMA upgrades, including ROI scaling
\item Video bus width tripled; IMAPCAR can handle three 512px-wide or two 768px-wide video streams simultaneously
\item Program and data memory protected by ECC and parity checks respectively
\lu
\defbox{
\item[ROI] Region of Interest
\item[DMA] Direct Memory Access
\item[ECC] Error Correction Code
}
\end{frame}
\begin{frame}
\frametitle{IMAPCAR In Practice}
\ul
\item Benchmarks:
\ul
\item 3x faster than IMAP-CE
\item Comparable power requirements
\lu
\lu
\begin{center}
\includegraphics[height=0.5\textheight]{overtake.png}
\end{center}
\end{frame}
\begin{frame}
\frametitle{IMAPCAR - Weaknesses}
\ul
\item Initial stages of image analysis are all SIMD
\item Once regions of interest are identified, they must be analyzed
\item Analysis is intrinsically MIMD
\item Problem: IMAPCAR has no MIMD support!
\item ROI analysis ends up happening in serial on the CP, with the PEs idle
\lu
\end{frame}
\begin{frame}
\frametitle{IMAPCAR - Possible Solutions}
\ul
\item Use multiple IMAPCAR chips
\ul
\item Cost and power draw increase proportionally
\item Extra IMAPCARs are idle when performing SIMD operations
\item All PEs are idle when performing MIMD operations
\lu
\item Add more CPs to the IMAPCAR
\ul
\item Greatly increases complexity
\item Still ``wastes'' the PEs
\lu
\lu
\end{frame}
\section{IMAPCAR2}
\begin{frame}
\frametitle{IMAPCAR2}
\ul
\item Successor to IMAPCAR, intended to address MIMD issue
\item Minor upgrades:
\ul
\item PEs and CP now use the same datapath and instruction set
\item IMEM amount doubled
\item Tiling capability
\item 16-bit addressing and instruction width
\lu
\lu
\end{frame}
\begin{frame}
\frametitle{IMAPCAR2 - MIMD Support}
\ul
\item PEs are grouped into sets of 4
\item Each group is augmented with hardware that lets them combine to function as an additional CP
\ul
\item IMEM becomes data and instruction caches
\item Extra ALUs become FPU components
\item PE 0 handles registers and instruction dispatch
\lu
\item Total hardware overhead is around 20\%
\lu
\defbox{
\item[ALU] Arithmetic/Logic Unit
\item[FPU] Floating Point Unit
}
\end{frame}
\begin{frame}
\frametitle{IMAPCAR2 - Internal Design}
\begin{center}
\includegraphics[height=0.8\textheight]{pu.png}
\end{center}
\end{frame}
\begin{frame}
\frametitle{IMAPCAR2 - Programming Model}
\ul
\item Like earlier IMAPs, uses 1DC
\item Additional extensions for controlling PUs
\item C API for PU usage is pthreads-compatible, i.e., shared-memory
\ul
\item Synchronization via shared structures - mutexes, semaphores, barriers
\item Communication by reading and writing known areas of memory
\lu
\lu
\end{frame}
\begin{frame}
\frametitle{IMAPCAR2 - Weaknesses}
\ul
\item Pthreads-alike programming model implies shared memory
\item However, the IMAPCAR2 has no cache coherency
\ul
\item Cache$\leftrightarrow$RAM transfers must be explicitly invoked
\lu
\item Trying to use IMAPCAR2 as a shared-memory system will not work
\item Unlike IMAPCAR's deficiencies, this can be fixed entirely in software by providing a message-passing API
\lu
\end{frame}
\begin{frame}
\frametitle{Conclusions}
\ul
\item IMAP is an old but still highly effective architecture for image processing
\item IMAPCAR2 shows promise as a modern refinement of that design
\item However, additional software support is needed to fully realize its potential
\lu
\nocite{*}
\end{frame}
\begin{frame}[allowframebreaks]{References}
% \frametitle{References}
% \nocite{Fujita:1997:GSP:522770.791783}
% \nocite{scalable-video-recognition-processor}
% \nocite{imap-architecture}
% \nocite{low-cost-mixed-mode-processor}
% \nocite{imap}
% \nocite{overtaking}
% \nocite{in-vehicle-vision-processors}
% \nocite{imapcar2-hotchips}
% \nocite{imapcar-in-vehicle-vision-processor}
%\bibliographystyle{acm}
{\tiny \printbibliography[omitnumbers=true]}
\end{frame}
\end{document}
\documentclass[a4paper,12pt]{article}
\usepackage[utf8x]{inputenc}
\usepackage{fullpage}
\parskip 1ex
\parindent 0ex
\usepackage{cite}
\usepackage{hyperref}
\usepackage{float}
\usepackage{wrapfig}
\usepackage{graphicx}
\usepackage{textcomp} % provides \textdegree, used below
%opening
\title{IMAP-based SIMD and SIMD/MIMD hybrid architectures for image processing}
\author{Ben Kelly}
\begin{document}
\maketitle
\begin{abstract}
Realtime image processing is a rapidly growing application field. It is especially important to the automotive industry, where a fast image processor can provide collision detection, lane following, and overtaking detection, among other features. Traditionally, special-purpose SIMD chips have been used for this purpose; however, modern image analysis techniques require true multiprocessing capability. To fill that need, hybrid SIMD/MIMD chips are starting to appear. This paper discusses the history of the IMAP architecture and the IMAP-CE, IMAPCAR, and IMAPCAR2 implementations of it, with a particular focus on the dynamic reconfiguration capabilities and programming model of the IMAPCAR2.
\end{abstract}
\section{Background}
Multimedia processors are now commonplace. Most development in this area focuses on audio/video transcoding hardware - devices to compress and decompress audio and video streams in real time. These can typically be implemented using comparatively cheap and efficient special-purpose hardware, with each chip implementing a single algorithm.
There is, however, increasing demand for real-time image processing and classification, especially in the domain of semi-autonomous vehicles. Hardware is needed that can not only transform captured images, but also analyze them - searching for obstacles, lane markers, and other features~\cite{in-vehicle-vision-processors}. Performing these operations in real time is beyond the capability of typical embedded processors, such as the ARM, especially when multiple video streams need to be processed. However, general purpose processors (GPPs) -- such as those found in modern desktops -- are far too large, expensive, and power-hungry; and application specific integrated circuits (ASICs) are too inflexible, requiring a new circuit for each image processing algorithm used -- and consequently a hardware redesign if algorithms need to be updated or added. It follows that to be useful for this purpose, a chip is required that is fully programmable, but without the drawbacks inherent in using an off-the-shelf GPP~\cite{scalable-video-recognition-processor}.
\section{IMAP}
\subsection{The Integrated Memory Array Processor}
A description of the IMAP architecture was first published by Fujita et al.~\cite{Fujita:1995:PIM:526253.791749} in 1995. It specifies a heavily SIMD-oriented design, consisting of a homogeneous array of processor elements (\emph{PE}s), each one possessing local memory (\emph{internal memory} or IMEM) containing a slice of the image data to be operated on. The intent of the design is to exploit the high degree of data-level parallelism inherent in most image processing operations - the image can be divided up, row-wise or column-wise, among the PEs, which then process the image data and produce results to be collected by a central processor. Since the PE array is fully programmable, a degree of flexibility not possible with ASICs is obtained; at the same time, the massively SIMD architecture allows most image processing tasks to be performed very quickly, while the simplicity of the individual PEs keeps manufacturing costs and power requirements down.
For communication between PEs, they are linked in a simple ring network, which permits each PE to exchange data, via certain registers, with the PEs directly to its left and right. This facilitates the implementation of algorithms that require data from groups of adjacent pixels - each PE can exchange information about the pixel it is currently processing with its neighbors, thus allowing such algorithms to be implemented without duplication of pixel data across IMEM blocks or expensive IMEM-to-IMEM transfers.
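The neighbor-exchange pattern described above can be modelled in plain C. This is a toy model, not 1DC and not the real hardware interface: each array slot stands in for one PE's current-pixel register, and the modular indexing stands in for the ring transfer, so a 3-wide horizontal filter needs no duplication of column data.

```c
#include <assert.h>

#define N 8 /* number of PEs, one image column each (toy width) */

/* Model of the ring network: to apply a 3-wide horizontal filter, each
 * PE only needs the current pixel of its left and right neighbours,
 * which the ring supplies register-to-register. The ring wraps, so the
 * edge PEs see each other as neighbours. */
void blur3(const int cur[N], int out[N])
{
    for (int p = 0; p < N; p++) {
        int left  = cur[(p + N - 1) % N]; /* value received from left PE  */
        int right = cur[(p + 1) % N];     /* value received from right PE */
        out[p] = (left + cur[p] + right) / 3;
    }
}
```

On the real chip the three values live in PE registers and the exchange happens in hardware; the point of the model is only that each PE touches nothing beyond its own pixel and the two ring-supplied neighbours.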
In addition to describing the general principles of the IMAP architecture, the paper demonstrates a sample implementation. This IMAP uses 64 eight-bit microprocessors as the PE array, with 4KB of IMEM attached to each PE. The system used for testing incorporated eight IMAPs (for a total of 512 PEs) controlled by a separate 16-bit processor. Benchmarks show it performing most simple image processing tasks in less than a millisecond, which is ample for real-time processing - a typical real-time application needs to handle video at 30fps, giving it 33ms to process each frame.
\subsection{One-Dimensional C}
In a later paper, Fujita et al.~\cite{Fujita:1997:GSP:522770.791783} describe a language intended for use with IMAP, \emph{one-dimensional C} (1DC). This is based on ANSI C89, but with extensions to facilitate data-parallel operation and alleviate the difficulty inherent in manually managing PEs. (Existing data-parallel C dialects were considered, but deemed unsuitable because they implicitly assume more powerful message-passing facilities in the underlying hardware than IMAP possesses.)
1DC adds a new declaration keyword, \emph{sep}, which is used to declare a variable that is spread across the PEs; storage for the variable is divided among PE IMEM, and operations on it are automatically dispatched to all of the PEs. It also adds new operators for inter-PE communication and result collection by the CPU, and a new flow control statement, \emph{mif}, used to perform operations only on PEs meeting some condition. In each case, the 1DC compiler emits machine code that handles the minutiae of reading and writing IMEM and PE registers and coordinating PE operations.
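A hypothetical fragment may make this concrete. The following is illustrative pseudocode built only from the keywords described above (\emph{sep} and \emph{mif}); it is not verified 1DC syntax, and the variable names are invented.

```c
/* Illustrative 1DC-style pseudocode -- sep and mif as described in the
 * text; exact 1DC grammar may differ. */
#define HEIGHT 240

sep unsigned char img[HEIGHT];  /* one column per PE, stored in its IMEM */
sep unsigned char bin[HEIGHT];  /* per-PE output column                  */

void binarize(void)
{
    int y;                      /* scalar loop runs on the CPU/CP */
    for (y = 0; y < HEIGHT; y++) {
        bin[y] = 0;             /* dispatched to all PEs at once  */
        mif (img[y] > 128)      /* body runs only on PEs whose pixel passes */
            bin[y] = 255;
    }
}
```

The scalar loop executes on the control processor, while every statement touching a \emph{sep} variable is dispatched to the whole PE array, one row of the image per iteration.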
\section{IMAP-CE}
\subsection{Hardware Design}
IMAP-CE is an IMAP implementation developed by Kyo et al.~\cite{imap-architecture}. Unlike the original IMAP design, which envisioned IMAP as a coprocessor attached to a separate CPU, it integrates the control processor (\emph{CP}) onto the chip. A single IMAP-CE processor, then, consists of 128 PEs with 2KB of IMEM each (divided into sixteen identical tiles of eight PEs each), a single CP, an external memory interface (EXTIF) connected to an external RAM bank, and a bank of shift registers used to hold incoming video data (see Fig. \ref{fig:imap-ce}).
\begin{wrapfigure}{r}{0.5\textwidth}
\begin{center}
\includegraphics[width=0.45\textwidth]{imap-internals.pdf}
\caption{Overview of IMAP-CE internal structure.}
\label{fig:imap-ce}
\end{center}
\end{wrapfigure}
The video shift registers are connected directly to IMEM; after each line of video data arrives, it is copied into IMEM for processing by the PEs. With 2KB of IMEM per PE, this allows it to process an entire 512x512px 8-bit image without ever accessing external memory (EMEM), at least in principle. Additionally, the shift registers are connected to video \emph{output} as well: the IMAP-CE can copy data from IMEM back into the VSRs and emit it as a video stream, allowing it not just to collect and report results in memory, but to output an annotated or completely transformed version of the original video.
When EMEM access is necessary, it is performed via DMA hardware in the EXTIF. It takes only a single clock cycle to initiate a DMA transfer; however, it takes sixteen cycles to transfer a complete row of image data between IMEM and EMEM. For this reason, part of IMEM is set aside as buffers for image transfer, allowing multiple rows of DMA to be queued at once and then processed in the background.
As with the earlier IMAP and IMAP-VISION systems described by Fujita et al., 1DC is used as the programming language for IMAP-CE. The implementation is of course different from the one used by Fujita et al., but the language itself is unchanged.
\subsection{Parallelization Techniques}
For the purpose of parallelizing existing image processing tasks, they were categorized based on how their memory access patterns could be represented using a \emph{pixel update line}, or PUL. Notionally, the PUL is a line that sweeps across the image, with each pixel being operated on by a PE as the line crosses it -- each column of pixels being stored in the IMEM of a different PE.
The first access pattern, \emph{row-wise} (see (a) in Fig. \ref{fig:pul}), is also the simplest: an entire row is processed by having each PE operate on the top pixel in its column, then each PE ``moves'' down one pixel. Once the entire image is processed, each PE has processed an entire column of the image.
The second, \emph{row-systolic} (b), is used when each PE needs to process a row rather than a column. Unlike row-wise, the initial layout of the PUL is diagonal, not horizontal, with PE 0 operating on the top pixel in its column and PE $n$ operating on the bottom pixel. As before, each PE operates on its current pixel and then moves down; however, before moving, the PE passes its current state one PE to the left using the ring network (with PE 0 sending its results to PE $n$). Furthermore, when selecting the next pixel, PEs that ``fall'' off the bottom of the image wrap around to the top. In this way, each PE gradually accumulates the results from an entire row, rather than column, of the image.
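The row-systolic sweep can be simulated in a few lines of C to check that it really does assemble per-row results. This is a software model under the assumptions of the text (one column per PE, a square toy image, accumulators passed one hop left per step with wraparound), not hardware code; the accumulator for row $q$ starts on the diagonal at PE $q$ and, after one full sweep, arrives back at PE $q$ holding the sum of row $q$.

```c
#include <assert.h>

#define N 8 /* number of PEs == toy image width == height */

/* Simulate the row-systolic PUL: at step t, the PE in column p operates
 * on row (p + t) mod N of its column, then every accumulator is passed
 * one PE to the left around the ring (PE 0 wraps to PE N-1). After N
 * steps each accumulator has visited every column of a single row. */
void row_systolic_sums(int img[N][N], int row_sum[N])
{
    int acc[N] = {0};   /* accumulator currently sitting at each PE */
    int next[N];

    for (int t = 0; t < N; t++) {
        for (int p = 0; p < N; p++)
            acc[p] += img[(p + t) % N][p];   /* pixel the PUL crosses */
        for (int p = 0; p < N; p++)          /* ring transfer: one hop left */
            next[p] = acc[(p + 1) % N];
        for (int p = 0; p < N; p++)
            acc[p] = next[p];
    }
    /* the accumulator for row q ends back at PE q */
    for (int p = 0; p < N; p++)
        row_sum[p] = acc[p];
}
```

Tracing one accumulator confirms the schedule: the state born at PE $q$ adds pixel $(q,q)$, hops left, adds $(q,q-1)$, and so on around the ring, touching exactly the pixels of row $q$.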
The third access pattern, \emph{slant-systolic} (c), is used for operations with simple adjacent-pixel data dependencies. It starts with only a single pixel in one corner being processed; once that is handled, the two adjacent pixels (one below, and one beside) can be processed; and so forth until the entire image is handled, with the number of active PEs starting at 1, peaking halfway through the image, and then falling back off.
The final access pattern, \emph{autonomous} (d), is used when the extent of the region(s) to be processed must be determined as the algorithm runs. In this mode, each PE has a section of memory reserved as a stack of pixels of interest; each operation may push more pixels onto this stack, or the stacks of neighboring PEs. The CP manages this process until all PE stacks are empty. (Do not confuse this with instruction-parallel processing; the PEs are still performing the same operations on the pixels they examine. It is only the choice of which pixels to operate on that varies between PEs.)
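The autonomous pattern is essentially a distributed worklist, which a sequential C model can illustrate. The sketch below assumes the structure described above (one stack of pending pixel rows per PE/column, pushes into a neighbour's stack crossing the ring) and uses a flood-fill style region count as the "same operation" every PE applies; it is a model of the control flow, not of the real hardware.

```c
#include <assert.h>
#include <stdbool.h>

#define W 8
#define H 8

/* Toy model of the autonomous access pattern: each "PE" owns one column
 * and a stack of pixel rows still to visit. Processing a pixel may push
 * its vertical neighbours onto the same stack and its horizontal
 * neighbours onto the adjacent PEs' stacks; the loop runs until every
 * stack is empty. All PEs apply the same test -- only the choice of
 * pixels differs, as in the text. */
int region_size(unsigned char img[H][W], int seed_row, int seed_col)
{
    int stack[W][4 * H + 2];             /* one worklist per PE/column */
    int top[W] = {0};
    bool seen[H][W] = {{false}};
    int count = 0;

    stack[seed_col][top[seed_col]++] = seed_row;

    for (;;) {
        bool any = false;
        for (int c = 0; c < W; c++) {    /* each PE pops one entry per sweep */
            if (top[c] == 0) continue;
            any = true;
            int r = stack[c][--top[c]];
            if (seen[r][c] || img[r][c] == 0) continue;
            seen[r][c] = true;
            count++;
            if (r > 0)     stack[c][top[c]++] = r - 1;       /* own stack  */
            if (r < H - 1) stack[c][top[c]++] = r + 1;
            if (c > 0)     stack[c - 1][top[c - 1]++] = r;   /* neighbours */
            if (c < W - 1) stack[c + 1][top[c + 1]++] = r;
        }
        if (!any) break;                 /* all PE stacks empty: done */
    }
    return count;
}
```

Each outer sweep corresponds to one round in which every PE with a non-empty stack processes one pixel; the CP's role is reduced to the "all stacks empty" termination check.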
\begin{figure}[h]
\begin{center}
\includegraphics[width=0.8\textwidth]{pul.pdf}
\caption{Memory access patterns based on a pixel update line.}
\label{fig:pul}
\end{center}
\end{figure}
\subsection{Performance}
For performance evaluation, several image processing kernels were run in three different environments: once on the IMAP-CE at 100MHz; once on an Intel Pentium 4 at 2.4GHz, compiled with the Intel optimizing compiler; and once on the same P4, but compiled from the IMAP-CE 1DC source code using an optimizing 1DC compiler (which makes use of the MMX SIMD instructions supported by the P4).
The IMAP-CE proved to be significantly faster at these image processing tasks, demonstrating an average speedup of 8 relative to the C implementation and 3 relative to the 1DC implementation with MMX. Furthermore, it had a sustained power requirement while doing so of only 2W, compared to approximately 100W for the P4.
It was also compared to three other data-parallel media processor chips - the Imagine, MorphoSys2, and VIRAM. In these comparisons it did not fare so well; although the most power- and space-efficient, it proved to be between 2 and 10 times slower. This is primarily attributable to its significantly slower clock rate, but also to the fact that the other three processors mentioned are all full 16-bit processors, whereas IMAP-CE still relies on 8-bit PEs. The authors suggest that expanding IMAP-CE to be fully 16-bit as well may be a worthwhile performance improvement.
\section{IMAPCAR}
\subsection{Design}
The IMAPCAR design, also described by Kyo et al.~\cite{imapcar-in-vehicle-vision-processor}, is an incremental improvement on IMAP-CE. It does not make the same sort of drastic changes to IMAP-CE that IMAP-CE made to IMAP; rather, it addresses the most easily correctable performance deficiencies in IMAP-CE, and adds the reliability features necessary for it to be safely used as an automotive vision processor.
To alleviate the slowness of EMEM access, as discussed above, the DMA capabilities of the EXTIF were upgraded. The DMA request queue was extended, making it possible to queue up twice as many DMA transfers in the background. Furthermore, scaling and translation support was added - when performing DMA operations on rectangular subsections of the image, the EXTIF is capable of performing simple up- or down-scaling operations and relocating the image region in memory as the DMA transfer occurs, freeing the PEs from the task.
In addition to the DMA upgrades, the number of video shift registers was tripled, allowing it to process three video streams of width 512px, or two video streams of width 640px or 768px, simultaneously. The interconnections among the SRs were also upgraded to support two different patterns for allocating video data among the PEs - one in which each PE gets a set of adjacent columns, and one in which each PE gets a set of columns evenly spaced within the frame. As with the IMAP-CE, the SRs operate on the video clock until a complete row has been buffered, then operate on the system clock during the horizontal-blank period to copy the row into or out of IMEM.
Various reliability enhancements were also made. The temperature tolerances were improved to meet the -40\textdegree{}C to +85\textdegree{}C range required for automotive use; the vibration tolerances were similarly increased. The SDRAM (dynamic RAM) originally used for EMEM was replaced with SSRAM (static RAM), and the CP instruction cache C\$ and EMEM were augmented with four bits of ECC (error correction code) per 32-bit instruction word. Finally, IMEM and the CP data cache D\$ were given one parity bit per byte of data. Parity failures, and errors uncorrectable by ECC, will cause the IMAPCAR to stop execution and raise an exception (ideally, to be handled by the vehicle's master control system).
\subsection{Performance}
Like IMAP-CE, IMAPCAR proves to be significantly faster than general-purpose processors for performing data-parallel image processing tasks. Furthermore - once program code is updated to take advantage of the new region of interest (ROI) scaling features added to the EXTIF - it also proves to be up to three times faster than the IMAP-CE for the same operations. This is due primarily to the increased DMA queue depth and the time saved by performing simple scaling operations during DMA rather than using the PEs for them before or afterwards.
Despite these improvements, the IMAPCAR also consumes less power - 2W maximum, compared to 2-4W for the IMAP-CE - and operates at a lower core voltage (1.2V compared to 1.8V), although the IO and EMEM voltage remains the same at 3.3V.
In real-world use, such as the overtaking-vehicle detection system described by Kyo et al. in 2007~\cite{overtaking}, the performance improvements relative to GPPs are not quite as dramatic as the image processing benchmarks would suggest; this is primarily attributable to the fact that in such a system, the IMAPCAR must necessarily spend some time performing serial operations with the PEs idle, losing the huge advantage that it has when performing data-parallel operations - becoming, in effect, a 100MHz processor rather than a 12.8GHz one. Nonetheless, it proves to be nearly three times as fast as a 3GHz GPP, and can perform a complete overtake-detection cycle in approximately 31ms, giving it a comfortable 2ms to spare per frame.
\section{IMAPCAR2}
\subsection{Overview}
As noted above, the IMAPCAR's greatest weakness is that when \emph{not} performing data-parallel operations, it effectively operates as a rather slow single-core RISC processor. This is compounded by the fact that most image analysis tasks conclude with region of interest analysis - the inspection of several previously-identified regions of the image - and this operation is generally \emph{not} data-parallel; each region may be using a completely different algorithm, and even when not, the ROI analysis algorithm is generally too complex to be treated as a SIMD operation over multiple regions. As a result, once reaching this stage, the IMAPCAR ends up examining each ROI in serial, relying primarily on the CP alone.
The obvious solution to this is to add some sort of MIMD (multiple instruction, multiple data - i.e., true multithreading) support to the IMAPCAR. While this could be done by adding more CPs, the increase in complexity, size, and cost is prohibitive. Instead, a dynamic reconfiguration approach was taken when designing its successor, the IMAPCAR2: the chip can operate in both SIMD and MIMD modes, re-using mostly the same hardware for both~\cite{low-cost-mixed-mode-processor}.
This was accomplished by dividing each tile of eight PEs into two groups of four, and then adding additional control circuitry to each group (see Fig. \ref{fig:imapcar2-tile}). This permits each group of four PEs to operate either as four SIMD processing elements, as in the original design, or - using the new hardware - as a single \emph{processing unit} or PU, with capabilities equivalent to those of the CP itself. This new hardware increases the size of each tile by approximately 20\% (and the gate count by approximately 10\%).
\begin{figure}[h]
\begin{center}
\includegraphics[width=0.9\textwidth]{PU.png}
\caption{Detail of one half of a reconfigurable PE tile (from \cite{low-cost-mixed-mode-processor}).}
\label{fig:imapcar2-tile}
\end{center}
\end{figure}
\subsection{Changes in CP and PE Design}
To support this SIMD/MIMD mode switching, some drastic changes - beyond the added control circuitry needed to support the mode switching in the first place - were made to the PE design. Not only was the amount of IMEM doubled (from 2KB to 4KB), but the entire datapath was replaced with a duplicate of the one present in the CP, and the instruction decoder revised to support a subset of the CP instruction set (only the CP has instructions for PE control, direct EMEM access, and dynamic reconfiguration).
As a result of this, each PE now has 23 16-bit general purpose registers and six special-purpose registers; one load/store unit, for accessing IMEM; and two arithmetic/logic units. Each ALU supports up to two operations per clock, provided they don't overlap in hardware usage; in ideal situations, the PE can now execute five instructions per clock - four arithmetic/logic operations and one load/store.
The instruction fetcher and decoder used by both the CP and the PEs has been upgraded to match; the shared instruction format is variable-width (16 or 32 bits per instruction), but has provisions for packing up to five 16-bit instructions that share common elements into a single 96-bit instruction that can be fetched and dispatched to the PE array in a single clock cycle.
Finally, the ring network used for communication among PEs has been renamed the \emph{N-ring}, and two additional ring networks have been added to the PE array, the \emph{M-ring} and \emph{C-ring}. The M-ring connects all of the PUs to a DMA controller attached to the CP, and is used for copying data between EMEM and the PU instruction and data caches; the C-ring connects all of the PUs to each other (and to the CP), and is used for message-passing between PUs. Unlike the N-ring, the C-ring contains additional selector hardware that lets it ``skip'' tiles, meaning that messages can be passed around the ring at a rate of one clock cycle per tile, rather than the one clock per PE that the N-ring is limited to; unlike the M-ring, the C-ring can also be used for message passes between PEs.
\subsection{Dynamic Reconfiguration} % THIS IS STILL THE COOLEST THING EVER
The chip supports three modes: \emph{SIMD mode}, in which all of the PEs are active; \emph{mixed mode}, in which at most half of the PUs are active (the ones in the ``lower half'' of each tile, specifically), while the remaining 64 PEs (the ``upper half'' ones) are also in use; and \emph{MIMD mode}, where more than half of the PEs are being used as PUs. The chip returns to SIMD mode when all PUs report that they have completed execution.
Reconfiguration for MIMD operation is initiated by a \emph{forkinit} instruction issued by the CP, which activates the connective circuitry in the selected PE groups. The CP then uses the \emph{forkp} and \emph{forkd} instructions to copy data into the PU instruction and data caches, respectively, and then finally the \emph{fork} instruction to start the PU running.
When operating in PU mode, nearly all of the resources of the four PEs are used to make up the PU. The datapath of PE0, being nearly identical to the CP datapath, is re-used entirely as the PU datapath. The IMEM blocks are combined to make up the 8KB instruction and data caches, and the registers from PE2 and PE3 are used for cache tags. Finally, the registers from PE1 and the ALUs from PEs 1 through 3 are combined to form the FPU.
In this mode, the IMEM is gone, and the data and instruction caches behave much more similarly to the caches found on GPPs - when the code executing on the PU attempts to access a page of memory, it is transparently copied from EMEM into the appropriate cache if not already present. However, this is a relatively slow operation, and may copy surrounding memory that is not required by the PU. To alleviate this, instructions are provided to explicitly transfer regions of memory between EMEM and the instruction and data caches. These instructions are doubly important, as the PUs have no cache coherency hardware - changes to the cache will not automatically be written back to EMEM, and changes to EMEM will not automatically be copied into the cache. Thus, explicit transfers must be used to update external memory, and fetch updates from it.
\subsection{Programming Model}
The SIMD programming model is identical to that of the IMAPCAR, using 1DC as the programming language. This section will therefore only discuss the programming model for the new MIMD mode.
In MIMD mode, the 1DC data-parallel extensions are unavailable to the PUs; they are programmed in plain ANSI C. Communication with other PUs and with the CP is performed with \emph{send} and \emph{recv} primitives, which implement one-to-one, one-to-many, and one-to-any message passing using the C-ring; communication with main memory happens automatically on a cache miss, or can be triggered explicitly using \emph{roiread} and \emph{roiwrite}, and uses the M-ring.
At the application level, a pthreads-compatible API is exposed, with synchronization primitives such as semaphores implemented using message passing. However, this has one major drawback: pthreads is designed around the assumption of shared memory. That assumption holds on single-core systems and on multi-core systems with cache coherency, but the IMAPCAR2's caches are not coherent. Thus, any communication more complex than simple synchronization (implemented internally with the message-passing operations) must be done using low-level message passing or ROI transfer operations; updating shared memory structures will not work, because once a page is copied into a cache it is no longer shared.
\subsection{Performance}
The IMAPCAR2 proves to be approximately twice as fast as the IMAPCAR, even when operating solely in SIMD mode. This is attributable primarily to the increased capabilities of the PEs (twice as much IMEM, 16-bit datapath, and more efficient instruction dispatch), all of which increase the amount of work that can be done per instruction and reduce the overall instruction count of each operation. An increase in the clock speed to 150MHz is also a major factor.~\cite{imapcar2-performance}
In MIMD mode, the speedup depends greatly on the operations being performed. For heavily parallelizable operations that previously had to be executed solely on the CP, speed improvements of up to 10x were seen when using all 32 PUs. In practice, performance tends to increase linearly up to 8 PUs; beyond that, M-ring and DMA contention becomes a serious bottleneck as all of the PUs attempt to access EMEM, and diminishing returns set in rapidly. Algorithms less dependent on EMEM access would likely show performance improvements closer to the theoretical maximum.
\subsection{Future Work}
The IMAPCAR2's biggest weakness is the mismatch between the MIMD programming environment presented to the programmer and the actual capabilities of the chip. The use of pthreads as an API encourages the programmer to adopt a shared-memory design, but in practice any use of shared memory must be done with great care and with explicit synchronization between cache and EMEM.
IMAPCAR2 could benefit greatly from a message-passing multithreading API built on top of, or replacing, the pthreads API. In practice, IMAPCAR2's non-coherent cache design and message-passing primitives more closely resemble the structure of a modern high-performance computing cluster than a shared-memory symmetric multiprocessing system, and for this reason Kyo et al. suggest MPI~\cite{mpi} or something like it as a suitable API for the IMAPCAR2. Pilot~\cite{pilot}, a much simpler API traditionally implemented on top of MPI, would also be a suitable choice, and a Pilot implementation on IMAPCAR2 is likely to be the subject of my thesis.
\section{Conclusion}
The IMAP architecture has proven to be a highly efficient and effective architecture for image processing tasks. Despite its simplicity, every implementation of it has delivered performance that completely outclasses GPPs in this problem domain and, because of that simplicity, low power requirements.
However, while ideal for data-parallel operations, IMAP's single-core design hurts it severely when performing instruction-parallel operations such as region-of-interest inspection. IMAPCAR2 addresses this by allowing SIMD processing elements to be reused as additional MIMD cores as needed.
IMAPCAR2's greatest weakness is its MIMD programming model, which uses a shared-memory API, despite the fact that the underlying hardware more closely resembles a local-memory system with message passing. Development of a message-passing API that more closely corresponds to the IMAPCAR2's capabilities could be a fruitful area for further research.
\pagebreak
\printbibliography
\end{document}