davestevens/publications.bib

## publications.bib
%% This BibTeX bibliography file was created using BibDesk.
%% http://bibdesk.sourceforge.net/


%% Created for dave stevens at 2012-03-21 13:28:29 +0000


%% Saved with string encoding Unicode (UTF-8)


%% Conference paper (ARCS 2011): customising the LE1 VLIW CMP for motion estimation.
%% Abstract de-hyphenated (PDF line-break artifacts removed); month uses the standard macro.
@inproceedings{le1_motion_estimation,
	Abstract = {This paper discusses the customization of the LE1 VLIW Chip Multiprocessor (CMP) for processing Motion Estimation (ME) algorithms in video coding applications. The LE1 is based on an 8-stage pipeline, configurable, extensible VLIW core implementing a partially-predicated Instruction Set Architecture (ISA) with support for pipelined, multi-input, multi-output (MIMO) custom instruction extensions and a shared-memory programmer's model. The CMP is parameterizable with respect to the number of processors, their architectural (issue) width, the setting of features at the micro-architecture level, such as the latency and the local memory system and the mix of the functional units. The results of this work demonstrate the efficiency of the processor when executing video coding kernels such as Motion Estimation; the CMP achieves near-linear speed-up in the computation of a number of such ME algorithms including Full Search, Three Step Search, Diamond Search, and PMVFAST. FPGA implementations are also presented.},
	Author = {Chouliaras, Vassilios and Lentaris, George and Reisis, Dionysis and Stevens, David},
	Booktitle = {Architecture of Computing Systems, 2011. ARCS 2011. 24th International Conference on},
	Date-Added = {2012-03-21 13:14:49 +0000},
	Date-Modified = {2012-03-21 13:25:47 +0000},
	Month = feb,
	Title = {Customizing a {VLIW} Chip Multiprocessor for Motion Estimation Algorithms},
	Year = {2011}}

%% Conference paper (ICECS 2009): LE1 VLIW processor evaluated on FFT execution.
%% Page range uses an en-dash (--); month uses the standard macro; acronyms in the title are brace-protected.
@inproceedings{le1_fft,
	Abstract = {This paper presents the setup and the evaluation of the LE1 configurable, extensible, multi-cluster VLIW processor system in FFT execution. The input code is a C implementation of the FFT algorithm and we evaluate its performance on the LE1 simulator for multiple CPU configurations (issue width, execution resource mix, custom instruction) and compiler optimizations (inlining, loop unrolling) in an effort to optimize the cycle count. We identify the prevailing LE1 configurations, with respect to the FFT cycle performance, their silicon area and the power dissipation. Finally, we compare these results to a fully systolic single datapath delay feedback (SDF) VLSI FFT architecture derived from the same C code.},
	Author = {Stevens, David and Glynn, Nicky and Galiatsatos, Panagiotis and Chouliaras, Vassilios and Reisis, Dionysis},
	Booktitle = {Electronics, Circuits, and Systems, 2009. ICECS 2009. 16th IEEE International Conference on},
	Date-Added = {2012-03-21 13:10:34 +0000},
	Date-Modified = {2012-03-21 13:26:23 +0000},
	Month = jun,
	Pages = {771--774},
	Title = {Evaluating the performance of a configurable, extensible {VLIW} processor in {FFT} execution},
	Year = {2009}}

%% Conference paper (ICECS 2009): fixed-point FastSLAM 2.0 on the LE1 VLIW processor.
%% Month uses the standard macro; mixed-case names in the title are brace-protected against style recasing.
@inproceedings{fastslam,
	Abstract = {This paper presents a fixed-point version of the FastSLAM 2.0 algorithm and describes its implementation on the LE1 configurable and extensible Very Long Instruction Word (VLIW) processor. In addition, modifications to the FastSLAM 2.0 algorithm have been made in order to enable fast execution on systems with limited resources. The resultant code has been executed on a cycle-accurate simulator of the processor to quantify the benefits of exploiting instruction level parallelism.},
	Author = {Moyers, Scott and Stevens, David and Chouliaras, Vassilios A. and Mulvaney, David J.},
	Booktitle = {Electronics, Circuits, and Systems, 2009. ICECS 2009. 16th IEEE International Conference on},
	Date-Added = {2012-03-21 13:06:14 +0000},
	Date-Modified = {2012-03-21 13:27:22 +0000},
	Month = jun,
	Title = {Implementation of a Fixed-Point {FastSLAM2.0} Algorithm on a Configurable and Extensible {VLIW} Processor},
	Year = {2009}}

%% Journal article (IEEE TBioCAS, 2011): BioThreads, an LE1-derived VLIW chip multiprocessor.
%% Stray period removed from the surname Echiadis; month uses the standard macro; title names are brace-protected.
@article{le1_biothreads,
	Abstract = {We discuss BioThreads, a novel, configurable, extensible system-on-chip multiprocessor and its use in accelerating biomedical signal processing applications such as imaging photoplethysmography (IPPG). BioThreads is derived from the LE1 open-source VLIW chip multiprocessor and efficiently handles instruction, data and thread-level parallelism. In addition, it supports a novel mechanism for the dynamic creation, and allocation of software threads to uncommitted processor cores by implementing key POSIX Threads primitives directly in hardware, as custom instructions. In this study, the BioThreads core is used to accelerate the calculation of the oxygen saturation map of living tissue in an experimental setup consisting of a high speed image acquisition system, connected to an FPGA board and to a host system. Results demonstrate near-linear acceleration of the core kernels of the target blood perfusion assessment with increasing number of hardware threads. The BioThreads processor was implemented on both standard-cell and FPGA technologies; in the first case and for an issue width of two, full real-time performance is achieved with 4 cores whereas on a mid-range Xilinx Virtex6 device this is achieved with 10 dual-issue cores. An 8-core LE1 VLIW FPGA prototype of the system achieved 240 times faster execution time than the scalar Microblaze processor demonstrating the scalability of the proposed solution to a state-of-the-art FPGA vendor provided soft CPU core.},
	Author = {Stevens, David and Chouliaras, Vassilios and Azorin-Peris, Vicente and Zheng, Jia and Echiadis, Angelos and Hu, Sijung},
	Date-Added = {2012-03-21 13:05:46 +0000},
	Date-Modified = {2012-03-21 13:27:52 +0000},
	Journal = {IEEE Transactions on Biomedical Circuits and Systems},
	Month = nov,
	Title = {{BioThreads}: A Novel {VLIW}-Based Chip Multiprocessor for Accelerating Biomedical Image Processing Applications},
	Year = {2011}}

%% Conference paper (IEEE Computer Society Annual Symposium on VLSI, 2010): LE1 with hardware PThreads support.
%% Page range uses an en-dash (--); month uses the standard macro; title names are brace-protected.
@inproceedings{le1_pthread,
	Abstract = {We discuss LE1, a parameterized VLIW Chip Multiprocessor (CMP) adhering to the shared memory programmers model. LE1's novelty lies in its ability to perform dynamic thread-spawning through hardware support for PThread-like primitives in addition to its substantial architectural and microarchitectural parameterization. Dynamic (hardware) thread creation is very fast and removes the need for an executive/OS, presenting to the application programmer a 'bare-metal' multiprocessor, capable of exploiting all forms of parallelism. The core LE1 CPU is a configurable, 8-stage pipeline VLIW engine with a proprietary Instruction Set Architecture (ISA) supporting both partial and full predication and pipelined, multi-input, multi-output (MIMO) instruction extensions. The LE1 CMP is parameterizable as to the number of processors, their issue capability, internal microarchitectural features, functional unit mix and latency and the local memory system architecture. Preliminary results indicate near-linear performance improvement when executing a threaded version of the Mandelbrot calculation on 2-way and 4-way processor configurations with a 256 KB, 4-way banked tightly-coupled memory system. Similar trends are seen when executing a threaded matrix multiplication benchmark. We present these findings along with VLSI implementations of 4-way, dual-issue and 3-way, quad issue multiprocessor configurations.},
	Author = {Stevens, David and Chouliaras, Vassilios},
	Booktitle = {IEEE Computer Society Annual Symposium on VLSI},
	Date-Added = {2012-03-21 13:05:46 +0000},
	Date-Modified = {2012-03-21 13:28:15 +0000},
	Month = jul,
	Pages = {122--126},
	Title = {{LE1}: A Parameterizable {VLIW} Chip-Multiprocessor with Hardware {PThreads} Support},
	Year = {2010}}
	%% This BibTeX bibliography file was created using BibDesk.
	%% http://bibdesk.sourceforge.net/


	%% Created for dave stevens at 2012-03-21 13:28:29 +0000


	%% Saved with string encoding Unicode (UTF-8)



	%% Second declaration of citation key le1_motion_estimation removed: the entry is already
	%% defined earlier in this file, and BibTeX reports "repeated entry" for a redeclared key.

	%% Second declaration of citation key le1_fft removed: the entry is already
	%% defined earlier in this file, and BibTeX reports "repeated entry" for a redeclared key.

	%% Second declaration of citation key fastslam removed: the entry is already
	%% defined earlier in this file, and BibTeX reports "repeated entry" for a redeclared key.

	%% Second declaration of citation key le1_biothreads removed: the entry is already
	%% defined earlier in this file, and BibTeX reports "repeated entry" for a redeclared key.

	%% Second declaration of citation key le1_pthread removed: the entry is already
	%% defined earlier in this file, and BibTeX reports "repeated entry" for a redeclared key.