mbitsnbites

## vector-extensions-for-tiny-risc.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              1 star
            
          
                mbitsnbites
                / vector-extensions-for-tiny-risc.md
            
            
              Last active
              February 7, 2022 08:03
            
          
    Vector extensions for tiny RISC machines

This is an idea for adding vector processing support to tiny RISC style microcontroller CPU:s (e.g. similar to FEMTORV32).
It could be especially beneficial for CPU:s without a pipeline, where each instruction normally takes 2+ clock
cycles to complete.
The purpose of vector processing is to reduce the number of clock cycles required for each operation:

Loop logic overhead is reduced (increment, compare, branch) as several data elements are processed in each loop iteration.


## perf.hpp
#include <chrono>
#include <iostream>

/// @brief Scoped performance measurement class.
///
/// To use this class, instantiate a @c perf_t object inside a scope. Once the
/// object goes out of scope, the elapsed time (in nanoseconds) will be printed
/// to stdout.
///
/// @code{.cpp}

## fast_sin.c
float fast_sin(float x) {
  // 1) Reduce periods of sin(x) to the range -PI/2 to PI/2.
  const int period = (int)(x * (1.0f / 3.141592654f) + 0.5f);
  const int negate = period & 1;
  x -= 3.141592654f * ((float)period - 0.5f);

  // 2) Use a Tailor series approximation in the range -PI/2 to PI/2.
  // See: https://en.wikipedia.org/wiki/Taylor_series#Approximation_error_and_convergence
  // sin(x) ≃ x - x³/3! + x⁵/5! - x⁷/7!
  // Note: 3! = 6, 5! = 120, 7! = 5040

## cpp11_parallel_for.cpp
#include <atomic>
#include <thread>

template <typename COUNT_T, typename BODY_T>
void parallel_for(const COUNT_T count, BODY_T body) {
  std::vector<std::thread> workers;
  std::atomic<COUNT_T> next_job_id(0);
  for (unsigned int i = std::thread::hardware_concurrency(); i > 0; --i) {
    workers.emplace_back([&]() {
      while (true) {

## divu32.c
#include <stdint.h>
#include <stdio.h>

void divu32(uint32_t N, uint32_t D) {
  uint32_t Q = 0;
  uint32_t R = 0;

  for (int i = 31; i >= 0; --i) {
    Q = Q << 1;
    R = R << 1;

## StreamingCPU.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                mbitsnbites
                / StreamingCPU.md
            
            
              Last active
              September 14, 2018 14:36
            
          
    Idea

Minimize logic in each pipeline stage to minimize design complexity and maxmize
clock speed (roughly the same as for MIPS, but more extreme).

No speculative branches.

Use branch delay slots.


No operand forwarding.

All instructions have the same latency.
I.e. every instruction has trailing "delay slots".


## raw_cast.hpp
#include <cstring>

template <typename T2, typename T1>
inline T2 raw_cast(const T1& x) {
  static_assert(sizeof(T2) == sizeof(T1), "raw_cast type sizes must match");
  T2 result;
  std::memcpy(static_cast<void*>(&result), static_cast<const void*>(&x), sizeof(T2));
  return result;
}

## fft.js
// This is a tiny radix-2 FFT implementation in JavaScript.
// The function takes a complex valued input signal, and performs an in-place
// Fast Fourier Transform (i.e. the result is returned in x_re, x_im). The
// function arguments can be any Array type (including typed arrays).
// Code size: <300 bytes after Closure Compiler.
function FFT(x_re, x_im) {
  var m = x_re.length / 2, k, X_re = [], X_im = [], Y_re = [], Y_im = [],
      a, b, tw_re, tw_im;

  for (k = 0; k < m; ++k) {
	#include <chrono>
	#include <iostream>

	/// @brief Scoped performance measurement class.
	///
	/// To use this class, instantiate a @c perf_t object inside a scope. Once the
	/// object goes out of scope, the elapsed time (in nanoseconds) will be printed
	/// to stdout.
	///
	/// @code{.cpp}
	float fast_sin(float x) {
	// 1) Reduce periods of sin(x) to the range -PI/2 to PI/2.
	const int period = (int)(x * (1.0f / 3.141592654f) + 0.5f);
	const int negate = period & 1;
	x -= 3.141592654f * ((float)period - 0.5f);

	// 2) Use a Tailor series approximation in the range -PI/2 to PI/2.
	// See: https://en.wikipedia.org/wiki/Taylor_series#Approximation_error_and_convergence
	// sin(x) ≃ x - x³/3! + x⁵/5! - x⁷/7!
	// Note: 3! = 6, 5! = 120, 7! = 5040
	#include <atomic>
	#include <thread>

	template <typename COUNT_T, typename BODY_T>
	void parallel_for(const COUNT_T count, BODY_T body) {
	std::vector<std::thread> workers;
	std::atomic<COUNT_T> next_job_id(0);
	for (unsigned int i = std::thread::hardware_concurrency(); i > 0; --i) {
	workers.emplace_back([&]() {
	while (true) {
	#include <stdint.h>
	#include <stdio.h>

	void divu32(uint32_t N, uint32_t D) {
	uint32_t Q = 0;
	uint32_t R = 0;

	for (int i = 31; i >= 0; --i) {
	Q = Q << 1;
	R = R << 1;
	#include <cstring>

	template <typename T2, typename T1>
	inline T2 raw_cast(const T1& x) {
	static_assert(sizeof(T2) == sizeof(T1), "raw_cast type sizes must match");
	T2 result;
	std::memcpy(static_cast<void>(&result), static_cast<const void>(&x), sizeof(T2));
	return result;
	}
	// This is a tiny radix-2 FFT implementation in JavaScript.
	// The function takes a complex valued input signal, and performs an in-place
	// Fast Fourier Transform (i.e. the result is returned in x_re, x_im). The
	// function arguments can be any Array type (including typed arrays).
	// Code size: <300 bytes after Closure Compiler.
	function FFT(x_re, x_im) {
	var m = x_re.length / 2, k, X_re = [], X_im = [], Y_re = [], Y_im = [],
	a, b, tw_re, tw_im;

	for (k = 0; k < m; ++k) {