@pervognsen
Last active June 22, 2017
// This was mostly done as a fun experiment to see how much I could minimize LUT count and combinational delays.
// Vivado synthesizes this to 17-18 LUTs for the default parameters (32-bit addresses, 8 KB cache, 32-byte cachelines).
// About half those LUTs are for the tag comparator. Note that the LUT count is carefully designed to be independent
// of the cacheline width by avoiding any output muxes (e.g. rather than forwarding the fill response, which would
// create an output mux between the fill response and the cache array, it just repeats the cache lookup). The downside
// of this general kind of design is that it takes 3 cycles to report a hit. You could easily do a design with combinational
// outputs that produces a result in 1 cycle, albeit with very restrictive setup times and combinational output delays.
//
// See if you can reduce the LUT count further! (Yes, it's silly, since the cacheline-to-word shifter is going to dominate.)
`define ASSERT(x)
module cache#(
    parameter ADDR_WIDTH = 32 - 5,              // 32-bit byte addresses minus 5 offset bits for 32-byte cachelines
    parameter DATA_WIDTH = 32*8,                // one 32-byte cacheline = 256 bits
    parameter CACHE_SIZE = 256,                 // 256 lines * 32 bytes = 8 KB
    parameter INDEX_WIDTH = $clog2(CACHE_SIZE),
    parameter TAG_WIDTH = ADDR_WIDTH - INDEX_WIDTH
)(
    input clk,
    input cache_req_enable,
    input [ADDR_WIDTH-1:0] cache_req_addr,
    output reg cache_req_ready,
    output reg cache_resp_enable,
    output [DATA_WIDTH-1:0] cache_resp_data,
    output reg mem_req_enable,
    output [ADDR_WIDTH-1:0] mem_req_addr,
    input mem_req_ready,
    input mem_resp_enable,
    input [DATA_WIDTH-1:0] mem_resp_data,
    output mem_resp_ready
);
    localparam IDLE = 0, START_READ = 1, READ = 2, HIT = 3, START_FILL = 4, FILL = 5;

    reg [DATA_WIDTH-1:0] data_array[0:CACHE_SIZE-1];
    reg [TAG_WIDTH-1:0] tag_array[0:CACHE_SIZE-1];
    reg [2:0] state;
    reg [ADDR_WIDTH-1:0] req_addr;
    reg [DATA_WIDTH-1:0] read_data;
    reg [TAG_WIDTH-1:0] read_tag;

    wire [TAG_WIDTH-1:0] req_tag = req_addr[ADDR_WIDTH-1:INDEX_WIDTH];
    wire [INDEX_WIDTH-1:0] req_index = req_addr[INDEX_WIDTH-1:0];
    wire [INDEX_WIDTH-1:0] cache_req_index = cache_req_addr[INDEX_WIDTH-1:0];

    assign cache_resp_data = read_data;
    assign mem_req_addr = req_addr;
    assign mem_resp_ready = 1;

    initial begin
        cache_resp_enable = 0;
        mem_req_enable = 0;
        cache_req_ready = 1;
        state = IDLE;
    end

    always @(posedge clk) begin
        case (state)
        // Wait for a request and latch its address.
        IDLE: begin
            `ASSERT(cache_req_ready)
            if (cache_req_enable) begin
                req_addr <= cache_req_addr;
                cache_req_ready <= 0;
                state <= START_READ;
            end
        end
        // Registered read of the data and tag arrays (maps cleanly to BRAM).
        START_READ: begin
            read_data <= data_array[req_index];
            read_tag <= tag_array[req_index];
            state <= READ;
        end
        // Tag compare: a hit responds directly, a miss starts a fill.
        READ: begin
            if (read_tag == req_tag) begin
                cache_resp_enable <= 1;
                state <= HIT;
            end else begin
                mem_req_enable <= 1;
                state <= START_FILL;
            end
        end
        // One-cycle response pulse, then accept the next request.
        HIT: begin
            cache_resp_enable <= 0;
            cache_req_ready <= 1;
            state <= IDLE;
        end
        // Hold the memory request until it's accepted.
        START_FILL: begin
            `ASSERT(mem_req_enable)
            if (mem_req_ready) begin
                mem_req_enable <= 0;
                state <= FILL;
            end
        end
        // Write the fill into the arrays, then repeat the lookup rather
        // than forwarding the response (avoids an output mux).
        FILL: begin
            `ASSERT(mem_resp_ready)
            if (mem_resp_enable) begin
                data_array[req_index] <= mem_resp_data;
                tag_array[req_index] <= req_tag;
                state <= START_READ;
            end
        end
        endcase
    end
endmodule
@jdryg commented Jun 21, 2017

Thanks for posting all these gists. They are really informative.

I have one question. Is it common to have a 256-bit response from memory? Is there another component in front of the cache which fills a 256-bit buffer and then activates mem_resp_enable?

I was wondering how much larger (in terms of LUTs, since that's what you're after) the result would be if the cache line were larger or the memory interface were narrower.

Sorry if the above sounds silly, but I don't have any FPGA experience. Just trying to understand how these things work :)

@pervognsen commented Jun 21, 2017

Hi! Typically you'd design these components at their natural width and insert separate bus serializers and deserializers as appropriate, depending on your bandwidth requirements and what widths are natural for the other connected components. In this specific instance, yeah, you'd almost certainly have an interstitial component between the cache and the system bus that takes the 256-bit-wide request, serializes it into a burst of eight 32-bit reads, and deserializes the responses with a shift buffer. It's also common to have a hierarchy of buses of diminishing bandwidth (e.g. a fat x64 interface to DRAM, a word-width interface to the system bus, a 16-bit-wide interface to a low-speed peripheral bus) with these sorts of up- and down-shifting gearboxes at the bridges.
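
For a concrete picture, here's a minimal sketch of such a down converter. All port names and the handshake here are assumptions modeled on the cache's enable/ready signals (none of this is part of the gist), and it assumes the narrow bus accepts pipelined requests and returns responses in order:

module downconverter#(
    parameter ADDR_WIDTH = 32 - 5,              // cacheline address, as in the cache above
    parameter WIDE = 256,
    parameter NARROW = 32,
    parameter BEATS = WIDE / NARROW,            // 8 beats for 256/32
    parameter COUNT_WIDTH = $clog2(BEATS)
)(
    input clk,
    // Wide side (would connect to the cache's mem_* ports).
    input wide_req_enable,
    input [ADDR_WIDTH-1:0] wide_req_addr,
    output reg wide_req_ready,
    output reg wide_resp_enable,
    output [WIDE-1:0] wide_resp_data,
    // Narrow side (would connect to the system bus).
    output reg narrow_req_enable,
    output [ADDR_WIDTH+COUNT_WIDTH-1:0] narrow_req_addr,
    input narrow_req_ready,
    input narrow_resp_enable,
    input [NARROW-1:0] narrow_resp_data
);
    reg [ADDR_WIDTH-1:0] req_addr;
    reg [COUNT_WIDTH-1:0] req_count, resp_count;
    reg [WIDE-1:0] shift_buffer;
    assign wide_resp_data = shift_buffer;
    // Word address = line address concatenated with the beat counter.
    assign narrow_req_addr = {req_addr, req_count};
    initial begin
        wide_req_ready = 1;
        wide_resp_enable = 0;
        narrow_req_enable = 0;
    end
    always @(posedge clk) begin
        wide_resp_enable <= 0;
        if (wide_req_enable && wide_req_ready) begin
            req_addr <= wide_req_addr;
            req_count <= 0;
            resp_count <= 0;
            wide_req_ready <= 0;
            narrow_req_enable <= 1;
        end
        if (narrow_req_enable && narrow_req_ready) begin
            req_count <= req_count + 1;
            if (req_count == BEATS - 1)
                narrow_req_enable <= 0;
        end
        if (narrow_resp_enable) begin
            // Shift each beat in from the top; after BEATS beats,
            // the first word ends up in the low bits.
            shift_buffer <= {narrow_resp_data, shift_buffer[WIDE-1:NARROW]};
            resp_count <= resp_count + 1;
            if (resp_count == BEATS - 1) begin
                wide_resp_enable <= 1;
                wide_req_ready <= 1;
            end
        end
    end
endmodule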

Regarding your second question, this design's LUT count actually doesn't depend on the cache line size or the memory interface width (though the flip-flop count and the BRAM required obviously do), but those parameters do affect adjacent units, like the shifting gearbox to the system bus, or the word selection mux that picks out a naturally aligned word from a cache line based on the lowest address bits. For a fixed word width, the latter's LUT count is proportional to the cache line size. If you have a 32-byte cache line, you have 32/4 = 8 words, so each output bit is an 8:1 mux. If you double the cache line width, you now have 64/4 = 16 words and need a 16:1 mux, which you build from two 8:1 muxes plus a 2:1 mux, so the logic resources roughly double (it's not quite n but n + lg(n) because of the +1 per doubling for the 2:1 mux). The scaling trend for the wiring is the same and is often even more significant for these kinds of big muxes. And it's straightforward to see that all of this is proportional to the word width as well, since what I described is a single bit slice of a word, and you need as many of those slices as there are word bits.
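
To make that scaling concrete, here's a minimal sketch of the word selection mux (the module and its names are illustrative, not part of the gist). Each output bit synthesizes to an 8:1 mux for a 256-bit line and 32-bit words, and doubling LINE_WIDTH doubles the mux inputs as described:

module word_select#(
    parameter LINE_WIDTH = 256,
    parameter WORD_WIDTH = 32,
    parameter SELECT_WIDTH = $clog2(LINE_WIDTH / WORD_WIDTH)
)(
    input [LINE_WIDTH-1:0] line,
    input [SELECT_WIDTH-1:0] word_index,    // lowest address bits above the byte offset
    output [WORD_WIDTH-1:0] word
);
    // An indexed part-select expresses the per-bit N:1 mux directly.
    assign word = line[word_index*WORD_WIDTH +: WORD_WIDTH];
endmodule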

@pervognsen commented Jun 21, 2017

Something Fabian said just reminded me of a reason not to design a component like this with a wide 256-bit memory interface and an external down converter. Namely, if you want to do anything in, say, word-sized units (like a bitwise permutation on fill to ease the later word selection muxes during reads), you can't pipeline it with the converter's work, since the interface stipulates that all 256 bits are presented at once. Another example of that is critical-word-first fills, so you can unblock the client immediately and then fill the remaining words over the next several cycles, again in pipeline fashion. And by a similar token, you're more limited in how the array BRAMs are structured, since you're expressing things in terms of 256-bit reads and writes rather than smaller units. If you're okay with filling at 32-bit granularity, you can easily do direct word reads as well with the right addressing scheme, obviating the 8:1 word selection mux. So I retract my statement that this kind of decoupling is a good idea in this instance, even if it's more modular and makes things cleaner.
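
For concreteness, here's a minimal sketch of the critical-word-first sequencing (illustrative only; the ports and names are assumptions, and none of this appears in the cache above). The fill counter starts at the requested word and wraps around the line, so the first beat can unblock the client while the rest streams in:

module fill_order#(
    parameter COUNT_WIDTH = 3                  // 8 words per 32-byte line
)(
    input clk,
    input start,                               // pulse to begin a fill
    input [COUNT_WIDTH-1:0] critical_word,     // low address bits of the missing word
    input beat,                                // pulse per accepted word request
    output [COUNT_WIDTH-1:0] fill_word,        // word index to request next
    output first_beat                          // high while requesting the critical word
);
    reg [COUNT_WIDTH-1:0] base;
    reg [COUNT_WIDTH-1:0] offset;
    // Wrapping add: request order is critical, critical+1, ..., mod 2^COUNT_WIDTH.
    assign fill_word = base + offset;
    assign first_beat = (offset == 0);
    always @(posedge clk) begin
        if (start) begin
            base <= critical_word;
            offset <= 0;
        end else if (beat) begin
            offset <= offset + 1;
        end
    end
endmodule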

@jdryg commented Jun 22, 2017

Thanks for the replies!

> Typically you'd design these components at their natural width and insert separate bus serializers and deserializers as appropriate, depending on your bandwidth requirements and what widths are natural for the other connected components. In this specific instance, yeah, you'd almost certainly have an interstitial component between the cache and the system bus that takes the 256-bit-wide request, serializes it into a burst of eight 32-bit reads, and deserializes the responses with a shift buffer.

> Regarding your second question, this design's LUT count actually doesn't depend on the cache line size or the memory interface width (though the flip-flop count and the BRAM required obviously do), but those parameters do affect adjacent units, like the shifting gearbox to the system bus, or the word selection mux that picks out a naturally aligned word from a cache line based on the lowest address bits.

What I was thinking of was a cache module that includes the serializer and deserializer you described, i.e. a memory address calculator (line start + word offset) plus a counter for the current word. The cache would stay in the FILL state for N cycles, where N = cacheline size / word size. The larger the ratio between the two, the wider the required counter, so more LUTs. That's why I asked whether the LUT count would change depending on those :)
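
For illustration, a minimal sketch of just that counter (hypothetical names throughout; the point is that the counter register and its terminal-count compare grow with $clog2(cacheline size / word size)):

module fill_counter#(
    parameter BEATS = 8,                       // cacheline size / word size
    parameter COUNT_WIDTH = $clog2(BEATS)
)(
    input clk,
    input filling,                             // high while the cache sits in FILL
    input mem_resp_enable,                     // one narrow word arrives per response
    output fill_done                           // high on the last word of the line
);
    reg [COUNT_WIDTH-1:0] count;
    assign fill_done = filling && mem_resp_enable && (count == BEATS - 1);
    always @(posedge clk) begin
        if (!filling)
            count <= 0;
        else if (mem_resp_enable)
            count <= count + 1;
    end
endmodule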

Is there a reason to have a separate component do the serialization and deserialization? Or is it just a matter of keeping things as simple as possible? Could this particular serializer instance in front of the cache be reused for anything other than this cache?

> Something Fabian said just reminded me of a reason not to design a component like this with a wide 256-bit memory interface and an external down converter. Namely, if you want to do anything in, say, word-sized units (like a bitwise permutation on fill to ease the later word selection muxes during reads), you can't pipeline it with the converter's work, since the interface stipulates that all 256 bits are presented at once. Another example of that is critical-word-first fills, so you can unblock the client immediately and then fill the remaining words over the next several cycles, again in pipeline fashion.

I forgot about the critical-word-first fill optimization. I guess having the address calculator inside the cache and fetching individual words from memory, as I mentioned above, would help in this case. Just have a special FILL_CRITICAL_WORD state that executes first and then switches to the existing FILL state (or something like that; I guess it depends on how your RAM behaves, whether it supports burst reads, and the size of those bursts).

Thanks again for the explanations. Looking forward to your streams/course to learn new stuff :)
