Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@buttercutter
Last active February 12, 2022 03:15
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save buttercutter/2c477b1dbc39f04a1271fb229e6d678c to your computer and use it in GitHub Desktop.
Save buttercutter/2c477b1dbc39f04a1271fb229e6d678c to your computer and use it in GitHub Desktop.
A simple DDR3 memory controller
[tasks]
proof
cover
[options]
proof: mode prove
proof: depth 10
cover: mode cover
cover: depth 40
cover: append 6
[engines]
smtbmc yices
# smtbmc boolector
# abc pdr
# aiger avy
# aiger suprove
[script]
read_verilog -formal -sv cells_sim.v
read_verilog -formal -sv ddr3_memory_controller.v
prep -top ddr3_memory_controller
# tribuf -logic
# synth -top ddr3_memory_controller
# iopadmap -bits -inpad IBUF O:PAD -outpad OBUF I:PAD -tinoutpad IOBUF ENA:O:I:PAD ddr3_memory_controller
[files]
ddr3_memory_controller.v
cells_sim.v
cells_ff.vh
cells_io.vh
// Credit : https://github.com/MartinGeisse/esdk2/blob/master/simsyn/orange-crab/src/mahdl/name/martingeisse/esdk/riscv/orange_crab/ddr3/RamController.mahdl
// Will simulate loopback transaction (write some data into RAM, then read those data back from RAM)
// with the verilog simulation model provided by Micron
// https://www.micron.com/products/dram/ddr3-sdram/part-catalog/mt41j128m16jt-125
// Later, formal verification will proceed with using Micron simulation model
`define HIGH_SPEED 1 // Minimum DDR3-1600 operating frequency >= 303MHz
`define MICRON_SIM 1 // micron simulation model
`define TESTBENCH 1 // for both micron simulation model and Xilinx ISIM simulator
`define VIVADO 1 // for 7-series and above
`define USE_x16 1
`define USE_SERDES 1
// `define TDQS 1
//`define RAM_SIZE_1GB
`define RAM_SIZE_2GB
//`define RAM_SIZE_4GB
`ifndef FORMAL
`ifdef HIGH_SPEED
// for internal logic analyzer
//`define USE_ILA 1
// for lattice ECP5 FPGA
//`define LATTICE 1
// for Xilinx Spartan-6 FPGA
`define XILINX 1
// for Altera MAX-10 FPGA
//`define ALTERA 1
`endif
`endif
//`ifndef XILINX
/* verilator lint_off VARHIDDEN */
localparam NUM_OF_DDR_STATES = 23;
// TIME_TZQINIT = 512
// See also 'COUNTER_INCREMENT_VALUE' on why some of the large timing variables are not used in this case
localparam MAX_WAIT_COUNT = 512;
/* verilator lint_on VARHIDDEN */
//`endif
// write data to RAM and then read them back from RAM
`define LOOPBACK 1
// https://www.systemverilog.io/ddr4-basics
module ddr3_memory_controller
#(
parameter NUM_OF_WRITE_DATA = 32, // 32 pieces of data are to be written to DRAM
parameter NUM_OF_READ_DATA = 32, // 32 pieces of data are to be read from DRAM
parameter DATA_BURST_LENGTH = 8, // eight data transfers per burst activity, please modify MR0 setting if none other than BL8
`ifdef USE_SERDES
// why 8 ? because of FPGA development board is using external 50 MHz crystal
// and the minimum operating frequency for Micron DDR3 memory is 303MHz
parameter SERDES_RATIO = 8,
`endif
parameter PICO_TO_NANO_CONVERSION_FACTOR = 1000, // 1ns = 1000ps
`ifndef HIGH_SPEED
parameter PERIOD_MARGIN = 10, // 10ps margin
parameter MAXIMUM_CK_PERIOD = 3300-PERIOD_MARGIN, // 3300ps which is defined by Micron simulation model
parameter DIVIDE_RATIO = 4, // master 'clk' signal is divided by 4 for DDR outgoing 'ck' signal, it is for 90 degree phase shift purpose.
// host clock period in ns
// clock period of 'clk' = 0.8225ns , clock period of 'ck' = 3.3ns
parameter CLK_PERIOD = $itor(MAXIMUM_CK_PERIOD/DIVIDE_RATIO)/$itor(PICO_TO_NANO_CONVERSION_FACTOR),
`else
parameter CLK_PERIOD = 20, // 20ns, 50MHz
parameter CLK_SERDES_PERIOD = 12, // 12ns, 83.333MHz
`endif
`ifdef TESTBENCH
`ifndef MICRON_SIM
parameter PERIOD_MARGIN = 10, // 10ps margin
parameter MAXIMUM_CK_PERIOD = 3300-PERIOD_MARGIN, // 3300ps which is defined by Micron simulation model
parameter DIVIDE_RATIO = 4, // master 'clk' signal is divided by 4 for DDR outgoing 'ck' signal, it is for 90 degree phase shift purpose.
`endif
`endif
`ifdef HIGH_SPEED
parameter CK_PERIOD = 3, // 333.333MHz from PLL, 1/333.333MHz = 3ns
`else
parameter CK_PERIOD = (CLK_PERIOD*DIVIDE_RATIO),
`endif
// for STATE_IDLE transition into STATE_REFRESH
// tREFI = 65*tRFC calculated using info from Micron dataheet, so tREFI > 8 * tRFC
// So it is entirely possible to do all 8 refresh commands inside one tREFI cycle
// since each refresh command will take tRFC cycle to finish
// See also https://www.systemverilog.io/understanding-ddr4-timing-parameters#refresh
/* verilator lint_off VARHIDDEN */
parameter MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED = 8, // 9 commands. one executed immediately, 8 more enqueued.
/* verilator lint_on VARHIDDEN */
`ifdef USE_x16
parameter DQS_BITWIDTH = 2,
`ifdef RAM_SIZE_1GB
parameter ADDRESS_BITWIDTH = 13,
`elsif RAM_SIZE_2GB
parameter ADDRESS_BITWIDTH = 14,
`elsif RAM_SIZE_4GB
parameter ADDRESS_BITWIDTH = 15,
`endif
`else
parameter DQS_BITWIDTH = 1,
`ifdef RAM_SIZE_1GB
parameter ADDRESS_BITWIDTH = 14,
`elsif RAM_SIZE_2GB
parameter ADDRESS_BITWIDTH = 15,
`elsif RAM_SIZE_4GB
parameter ADDRESS_BITWIDTH = 16,
`endif
`endif
parameter BANK_ADDRESS_BITWIDTH = 3, // 8 banks, and $clog2(8) = 3
`ifdef USE_x16
parameter DQ_BITWIDTH = 16 // bitwidth for each piece of data
`else
parameter DQ_BITWIDTH = 8 // bitwidth for each piece of data
`endif
)
(
// these are FPGA internal signals
input clk,
input reset,
input write_enable, // write to DDR memory
input read_enable, // read from DDR memory
input [BANK_ADDRESS_BITWIDTH+ADDRESS_BITWIDTH-1:0] i_user_data_address, // the DDR memory address for which the user wants to write/read the data
`ifdef USE_SERDES
input [DQ_BITWIDTH*SERDES_RATIO-1:0] data_to_ram, // data for which the user wants to write to DDR
output [DQ_BITWIDTH*SERDES_RATIO-1:0] data_from_ram, // the requested data from DDR RAM after read operation
`else
// TWO pieces of data bundled together due to double-data-rate requirement of DQ signal
input [(DQ_BITWIDTH << 1)-1:0] data_to_ram, // data to be written to DDR RAM
output [(DQ_BITWIDTH << 1)-1:0] data_from_ram, // the requested data being read from DDR RAM read operation
`endif
input [$clog2(MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED):0] user_desired_extra_read_or_write_cycles, // for the purpose of postponing refresh commands
`ifndef HIGH_SPEED
output clk_slow_posedge, // for dq phase shifting purpose
output clk180_slow_posedge, // for dq phase shifting purpose
`endif
// these are to be fed into external DDR3 memory
output [ADDRESS_BITWIDTH-1:0] address,
output [BANK_ADDRESS_BITWIDTH-1:0] bank_address,
`ifdef HIGH_SPEED
output ck_obuf, // CK
output ck_n_obuf, // CK#
`else
output ck, // CK
output ck_n, // CK#
`endif
`ifdef TESTBENCH
output ck_90,
output ck_270,
output [DQ_BITWIDTH-1:0] dq_iobuf_enable,
output ldqs_iobuf_enable,
output udqs_iobuf_enable,
output reg data_read_is_ongoing,
`endif
`ifdef HIGH_SPEED
output clk_serdes_data, // 83.333MHz with 0 phase shift
output clk_serdes, // 83.333MHz with 225 phase shift
output ck_180, // 333.333MHz with 180 phase shift
output reg locked_previous,
output need_to_assert_reset,
`endif
output ck_en, // CKE
output cs_n, // chip select signal
output odt, // on-die termination
output ras_n, // RAS#
output cas_n, // CAS#
output we_n, // WE#
output reset_n,
inout [DQ_BITWIDTH-1:0] dq, // Data input/output
// for coordinating with the user application on when to start DRAM write and read operation
output reg [$clog2(NUM_OF_DDR_STATES)-1:0] main_state,
output reg [$clog2(MAX_WAIT_COUNT):0] wait_count,
// Xilinx ILA could not probe port IO of IOBUF primitive, but could probe rest of the ports (ports I, O, and T)
`ifdef USE_ILA
output [DQ_BITWIDTH-1:0] dq_w, // port I
output [DQ_BITWIDTH-1:0] dq_r, // port O
output low_Priority_Refresh_Request,
output high_Priority_Refresh_Request,
// to propagate 'write_enable' and 'read_enable' signals during STATE_IDLE to STATE_WRITE and STATE_READ
output reg write_is_enabled,
output reg read_is_enabled,
output reg [$clog2(MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED):0] refresh_Queue,
output reg [($clog2(DIVIDE_RATIO_HALVED)-1):0] dqs_counter,
output dqs_rising_edge,
output dqs_falling_edge,
`endif
`ifdef USE_x16
output ldm, // lower-byte data mask
output udm, // upper-byte data mask
inout ldqs, // lower byte data strobe
inout ldqs_n,
inout udqs, // upper byte data strobe
inout udqs_n
`else
inout [DQS_BITWIDTH-1:0] dqs, // Data strobe
inout [DQS_BITWIDTH-1:0] dqs_n,
// driven to high-Z if TDQS termination function is disabled
// according to TN-41-06: DDR3 Termination Data Strobe (TDQS)
// Please as well look at TN-41-04: DDR3 Dynamic On-Die Termination Operation
`ifdef TDQS
inout [DQS_BITWIDTH-1:0] tdqs, // Termination data strobe, but can act as data-mask (DM) when TDQS function is disabled
`else
output [DQS_BITWIDTH-1:0] tdqs,
`endif
inout [DQS_BITWIDTH-1:0] tdqs_n
`endif
);
// When writes are done on bus with a data-width > 8, you are doing a single write for multiple bytes and
// then need to be able to indicate which bytes are valid and need to be updated in memory,
// which bytes should be ignored. That's the purpose of DM.
// It is allowed to have DM always pulled low (some boards are wired like this) but will make you loose
// the byte granularity on writes, your granularity is then on DRAM's burst words.
// DM is just here to have byte granularity on the write accesses
// (ie you only want to update some bytes of the DRAM word)
`ifndef USE_x16
`ifndef TDQS
assign tdqs = 0; // acts as DM
`endif
`endif
/*
reg previous_clk_en;
always @(posedge clk)
begin
if(reset) previous_clk_en <= 0;
previous_clk_en <= clk_en;
end
*/
// Commands truth table extracted from Micron specification document
/*
localparam MRS = (previous_clk_en) & (ck_en) & (~cs_n) & (~ras_n) & (~cas_n) & (~we_n);
localparam REF = (previous_clk_en) & (ck_en) & (~cs_n) & (~ras_n) & (~cas_n) & (we_n);
localparam PRE = (previous_clk_en) & (ck_en) & (~cs_n) & (~ras_n) & (cas_n) & (~we_n) & (~A10);
localparam PREA = (previous_clk_en) & (ck_en) & (~cs_n) & (~ras_n) & (cas_n) & (~we_n) & (A10);
localparam ACT = (previous_clk_en) & (ck_en) & (~cs_n) & (~ras_n) & (cas_n) & (we_n);
localparam WR = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (~we_n) & (~A10);
localparam WRS4 = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (~we_n) & (~A12) & (~A10);
localparam WRS8 = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (~we_n) & (A12) & (~A10);
localparam WRAP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (~we_n) & (A10);
localparam WRAPS4 = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (~we_n) & (~A12) & (A10);
localparam WRAPS8 = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (~we_n) & (A12) & (A10);
localparam RD = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (we_n) & (~A10);
localparam RDS4 = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (we_n) & (~A12) & (~A10);
localparam RDS8 = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (we_n) & (A12) & (~A10);
localparam RDAP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (we_n) & (A10);
localparam RDAPS4 = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (we_n) & (~A12) & (A10);
localparam RDAPS8 = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (~cas_n) & (we_n) & (A12) & (A10);
localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
localparam DES = (previous_clk_en) & (ck_en) & (cs_n);
localparam PDE = (previous_clk_en) & (~ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
localparam PDX = (~previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
localparam ZQCL = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (~we_n) & (A10);
localparam ZQCS = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (~we_n) & (~A10);
*/
// for the purpose of calculating DDR timing parameters such as tXPR, tRFC, ...
//reg [$clog2(MAX_WAIT_COUNT):0] wait_count;
// to synchronize signal in clk_serdes domain to ck_180 domain
wire [$clog2(MAX_WAIT_COUNT):0] wait_count_ck_180;
wire [$clog2(NUM_OF_DDR_STATES)-1:0] main_state_ck_180;
//reg [$clog2(NUM_OF_DDR_STATES)-1:0] main_state;
reg [$clog2(NUM_OF_DDR_STATES)-1:0] previous_main_state;
reg [$clog2(NUM_OF_DDR_STATES)-1:0] previous_main_state_ck_180;
// for PLL lock issue
reg [$clog2(NUM_OF_DDR_STATES)-1:0] state_to_be_restored;
localparam STATE_RESET = 0;
localparam STATE_RESET_FINISH = 1;
localparam STATE_ZQ_CALIBRATION = 23;
localparam STATE_IDLE = 24;
localparam STATE_ACTIVATE = 5;
localparam STATE_WRITE = 6;
localparam STATE_WRITE_AP = 7;
localparam STATE_WRITE_DATA = 8;
localparam STATE_READ = 9;
localparam STATE_READ_AP = 10;
localparam STATE_READ_DATA = 3; // smaller value to solve setup timing issue due to lesser comparison hardware
localparam STATE_PRECHARGE = 12;
localparam STATE_REFRESH = 13;
localparam STATE_WRITE_LEVELLING = 14;
localparam STATE_INIT_CLOCK_ENABLE = 15;
localparam STATE_INIT_MRS_2 = 16;
localparam STATE_INIT_MRS_3 = 17;
localparam STATE_INIT_MRS_1 = 18;
localparam STATE_INIT_MRS_0 = 19;
localparam STATE_WAIT_AFTER_MPR = 20;
localparam STATE_MRS3_TO_MRS1 = 21;
localparam STATE_PLL_LOCK_ISSUE = 22;
localparam STATE_READ_ACTUAL = 2;
localparam STATE_READ_AP_ACTUAL = 4;
// https://www.systemverilog.io/understanding-ddr4-timing-parameters
// TIME_INITIAL_CK_INACTIVE
localparam MAX_TIMING = (500000/CLK_SERDES_PERIOD); // just for initial development stage, will refine the value later
// just to avoid https://github.com/YosysHQ/yosys/issues/2718
`ifndef XILINX
localparam FIXED_POINT_BITWIDTH = $clog2(MAX_TIMING);
`else
localparam FIXED_POINT_BITWIDTH = 18;
`endif
`ifdef FORMAL
// just to make the cover() spends lesser time to complete
localparam TIME_INITIAL_RESET_ACTIVE = 2;
localparam TIME_INITIAL_CK_INACTIVE = 2;
localparam TIME_TZQINIT = 2;
localparam TIME_RL = 2;
localparam TIME_WL = 2;
localparam TIME_TBURST = 2;
localparam TIME_TXPR = 2;
localparam TIME_TMRD = 2;
localparam TIME_TMOD = 2;
localparam TIME_TRFC = 2;
localparam TIME_TREFI = 2;
localparam TIME_TDLLK = 2;
`else
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_INITIAL_RESET_ACTIVE = (200000/CLK_SERDES_PERIOD); // 200us = 200000ns, After the power is stable, RESET# must be LOW for at least 200µs to begin the initialization process.
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_INITIAL_CK_INACTIVE = (500000/CLK_SERDES_PERIOD); // 500us = 500000ns, After RESET# transitions HIGH, wait 500µs (minus one clock) with CKE LOW.
`ifdef RAM_SIZE_1GB
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TRFC = (110/CLK_SERDES_PERIOD); // minimum 110ns, Delay between the REFRESH command and the next valid command, except DES
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TXPR = ((10+110)/CLK_SERDES_PERIOD); // https://i.imgur.com/SAqPZzT.png, min. (greater of(10ns+tRFC = 120ns, 5 clocks))
`elsif RAM_SIZE_2GB
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TRFC = (160/CLK_SERDES_PERIOD);
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TXPR = ((10+160)/CLK_SERDES_PERIOD); // https://i.imgur.com/SAqPZzT.png, min. (greater of(10ns+tRFC = 170ns, 5 clocks))
`elsif RAM_SIZE_4GB
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TRFC = (260/CLK_SERDES_PERIOD);
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TXPR = ((10+260)/CLK_SERDES_PERIOD); // https://i.imgur.com/SAqPZzT.png, min. (greater of(10ns+tRFC = 270ns, 5 clocks))
`endif
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TREFI = (7800/CLK_SERDES_PERIOD); // 7.8?s = 7800ns, Maximum average periodic refresh
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TRAS = (35/CLK_SERDES_PERIOD); // minimum 35ns, ACTIVATE-to-PRECHARGE command period
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TRP = (13.91/CLK_SERDES_PERIOD); // minimum 13.91ns, Precharge time. The banks have to be precharged and idle for tRP before a REFRESH command can be applied
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TRCD = (13.91/CLK_SERDES_PERIOD); // minimum 13.91ns, Time RAS-to-CAS delay, ACT to RD/WR
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TWR = (15/CLK_SERDES_PERIOD); // Minimum 15ns, Write recovery time is the time interval between the end of a write data burst and the start of a precharge command. It allows sense amplifiers to restore data to cells.
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TFAW = (50/CLK_SERDES_PERIOD); // Minimum 50ns, Why Four Activate Window, not Five or Eight Activate Window ? For limiting high current drain over the period of tFAW time interval
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TIS = (0.195/CLK_SERDES_PERIOD); // Minimum 195ps, setup time
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TDLLK = (512*CK_PERIOD/CLK_SERDES_PERIOD); // tDLLK = 512 clock cycles, DLL locking time
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TZQINIT = (512*CK_PERIOD/CLK_SERDES_PERIOD); // tZQINIT = 512 clock cycles, ZQCL command calibration time for POWER-UP and RESET operation
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_RL = (5*CK_PERIOD/CLK_SERDES_PERIOD); // if DLL is disable, only CL=6 is supported. Since AL=0 for simplicity and RL=AL+CL , RL=5
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_WL = (5*CK_PERIOD/CLK_SERDES_PERIOD); // if DLL is disable, only CWL=6 is supported. Since AL=0 for simplicity and WL=AL+CWL , WL=5
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TBURST = ((DATA_BURST_LENGTH >> 1)*CK_PERIOD/CLK_SERDES_PERIOD); // each read or write commands will work on 8 different pieces of consecutive data. In other words, burst length is 8, and tburst = burst_length/2 with double data rate mechanism
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TMRD = (4*CK_PERIOD/CLK_SERDES_PERIOD); // tMRD = 4 clock cycles, Time MRS to MRS command Delay
localparam [FIXED_POINT_BITWIDTH-1:0] TIME_TMOD = (12*CK_PERIOD/CLK_SERDES_PERIOD); // tMOD = 12 clock cycles, Time MRS to non-MRS command Delay
`endif
localparam TIME_TWTR = 4; // Delay from start of internal WRITE transaction to internal READ command, MIN = greater of 4CK or 7.5ns;
localparam TIME_TDAL = TIME_TWR + TIME_TRP; // Auto precharge write recovery + precharge time
localparam TIME_TRPRE = 1; // this is for read pre-amble. It is the time between when the data strobe goes from non-valid (HIGH) to valid (LOW, initial drive level).
localparam TIME_TRPST = 1; // this is for read post-amble. It is the time from when the last valid data strobe to when the strobe goes to HIGH, non-drive level.
localparam TIME_TWPRE = 1; // this is for write pre-amble. It is the time between when the data strobe goes from non-valid (HIGH) to valid (LOW, initial drive level).
localparam TIME_TWPST = 1; // this is for write post-amble. It is the time from when the last valid data strobe to when the strobe goes to HIGH, non-drive level.
localparam TIME_TMPRR = 1; // this is for MPR System Read Calibration. It is the time between MULTIPURPOSE REGISTER READ burst end until mode register set for multipurpose register exit
localparam TIME_WRITE_COMMAND_TO_DQS_VALID = TIME_WL-TIME_TWPRE; // time between write command and valid DQS
localparam TIME_TCCD = (4*CK_PERIOD/CLK_SERDES_PERIOD); // CAS#-to-CAS# command delay, applicable for consecutive DRAM write or read operations
localparam ADDRESS_FOR_MODE_REGISTER_0 = 0;
localparam ADDRESS_FOR_MODE_REGISTER_1 = 1;
localparam ADDRESS_FOR_MODE_REGISTER_2 = 2;
localparam ADDRESS_FOR_MODE_REGISTER_3 = 3;
// Mode register 0 (MR0) settings
localparam MR0 = 2'b00; // Mode register set 0
localparam PRECHARGE_PD = 1'b1; // DLL on
localparam WRITE_RECOVERY = 3'b010; // WR = 6 , WR (cycles) = roundup (tWR [ns]/tCK [ns])
localparam DLL_RESET = 1'b1;
localparam CAS_LATENCY_46 = 3'b001;
localparam CAS_LATENCY_2 = 1'b0;
localparam CAS_LATENCY = {CAS_LATENCY_46, CAS_LATENCY_2}; // CL = 5
localparam READ_BURST_TYPE = 1'b0; // sequential burst
localparam BURST_LENGTH = 2'b0; // Fixed BL8
// Mode register 1 (MR1) settings
localparam MR1 = 2'b01; // Mode register set 1
localparam Q_OFF = 1'b0; // Output enabled
localparam TDQS = 1'b0; // TDQS disabled (x8 configuration only)
localparam RTT_9 = 1'b0;
localparam RTT_6 = 1'b0;
localparam RTT_2 = 1'b0;
localparam RTT = {RTT_9, RTT_6, RTT_2}; // on-die termination resistance value
localparam WL = 1'b0; // Write levelling disabled
localparam ODS_5 = 1'b0;
localparam ODS_2 = 1'b1;
localparam ODS = {ODS_5, ODS_2}; // Output drive strength set at 34 ohm
localparam AL = 2'b0; // Additive latency disabled
localparam DLL_EN = 1'b0; // DLL is enabled
// Mode register 3 (MR3) settings
localparam MPR_EN = 1'b1; // enables or disables Dataflow from MPR, in most cases it is a must to enable
localparam MPR_READ_FUNCTION = 2'b0; // Predefined data pattern for READ synchronization
localparam MPR_BITWIDTH_COMBINED = 3; // the three least-significant-bits of MR3
localparam A10 = 10; // address bit for auto-precharge option
localparam A12 = 12; // address bit for burst-chop option
localparam HIGH_REFRESH_QUEUE_THRESHOLD = 4;
reg MPR_ENABLE, MPR_Read_had_finished; // for use within MR3 finite state machine
`ifndef USE_ILA
wire [DQ_BITWIDTH-1:0] dq_w; // the output data stream is NOT serialized
`endif
`ifndef USE_ILA
wire [DQ_BITWIDTH-1:0] dq_r; // the input data stream is NOT serialized
`endif
// incoming signals from RAM
`ifdef USE_x16
wire ldqs_r;
wire ldqs_n_r;
wire udqs_r;
wire udqs_n_r;
`else
wire dqs_r;
wire dqs_n_r;
`endif
// outgoing signals to RAM
`ifdef USE_x16
wire ldqs_w;
wire ldqs_n_w;
wire udqs_w;
wire udqs_n_w;
`else
wire dqs_w;
wire dqs_n_w;
`endif
`ifndef HIGH_SPEED
// Purposes of Clock divider:
// 1. for developing correct logic first before making the DDR memory controller works in higher frequency,
// 2. to perform 90 degree phase shift on DQ signal with relative to DQS signal during data writing stage
// 3. to perform 180 degree phase shift (DDR mechanism of both DQS and DQ signals need to work on
// both posedge and negedge clk) for the next consecutive data
// See https://i.imgur.com/dnDwZul.png or
// https://www.markimicrowave.com/blog/top-7-ways-to-create-a-quadrature-90-phase-shift/
// See https://i.imgur.com/ZnBuofE.png or
// https://patentimages.storage.googleapis.com/0e/94/46/6fdcafc946e940/US5297181.pdf#page=3
// Will use digital PLL or https://stackoverflow.com/a/50172237/8776167 in later stage of the project
// See https://www.edaplayground.com/x/gXC for waveform simulation of the clock divider
reg clk_slow;
localparam DIVIDE_RATIO_HALVED = (DIVIDE_RATIO >> 1);
reg [($clog2(DIVIDE_RATIO_HALVED)-1):0] counter;
reg counter_reset;
always @(posedge clk)
begin
if(reset) counter_reset <= 1;
`ifndef XILINX
else counter_reset <= (counter == DIVIDE_RATIO_HALVED[0 +: $clog2(DIVIDE_RATIO_HALVED)] - 1'b1);
`else
else counter_reset <= (counter == DIVIDE_RATIO_HALVED[0 +: 1] - 1'b1);
`endif
end
always @(posedge clk)
begin
if(reset) counter <= 0;
else if(counter_reset) counter <= 1;
else counter <= counter + 1;
end
always @(posedge clk)
begin
if(reset) clk_slow <= 1;
else if(counter_reset)
clk_slow <= ~clk_slow;
end
assign ck = clk_slow;
assign ck_n = ~clk_slow;
wire clk90_slow_is_at_high = (~clk_slow && counter_reset) || (clk_slow && ~counter_reset);
wire clk90_slow_is_at_low = (clk_slow && counter_reset) || (~clk_slow && ~counter_reset);
wire clk90_slow_posedge = (clk_slow && counter_reset);
assign clk_slow_posedge = (clk_slow && ~counter_reset);
wire clk_slow_negedge = (~clk_slow && ~counter_reset);
wire clk180_slow = ~clk_slow; // simply inversion of the clk_slow signal will give 180 degree phase shift
assign clk180_slow_posedge = clk_slow_negedge;
`ifdef USE_x16
assign ldqs_w = clk_slow;
assign ldqs_n_w = ~clk_slow;
assign udqs_w = clk_slow;
assign udqs_n_w = ~clk_slow;
`else
assign dqs_w = clk_slow;
assign dqs_n_w = ~clk_slow;
`endif
`else
// wire clk_serdes_data;
// wire clk_serdes;
wire ck, ck_out;
`ifndef TESTBENCH
wire ck_90;
wire ck_270;
`endif
wire ck_180_out;
wire locked;
// for dynamic phase shift
reg psen;
wire psdone;
wire ck_dynamic_90, ck_dynamic_270;
wire locked_dynamic;
`ifdef XILINX
pll_ddr pll_static_clocks
( // Clock in ports
.clk(clk), // IN 50MHz
// Clock out ports
// SERDES_RATIO = 8, but 2 separate serdes are used due to double-data-rate restriction
// So, 333.333MHz divided by (SERDES_RATIO >> 1) equals 83.333MHz
.clk_serdes_data(clk_serdes_data), // OUT 83.333MHz, 0 phase shift, for DRAM data
.clk_serdes(clk_serdes), // OUT 83.333MHz, 225 phase shift, for DRAM command
.ck(ck), // OUT 333.333MHz, 0 phase shift
.ck_90(ck_90), // OUT 333.333MHz, 90 phase shift, for dq phase shifting purpose
.ck_180(ck_180), // OUT 333.333MHz, 180 phase shift
.ck_270(ck_270), // OUT 333.333MHz, 270 phase shift, for dq phase shifting purpose
// Status and control signals
.reset(reset), // IN
.locked(locked) // OUT
);
localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_DOMAIN = 4;
// to synchronize signal in ck_180 domain to ck domain
reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_DOMAIN-1:0] data_read_is_ongoing_ck;
reg data_read_is_ongoing_previous;
always @(posedge ck)
data_read_is_ongoing_previous <= data_read_is_ongoing_ck;
reg psdone_previous;
always @(posedge ck) psdone_previous <= psdone;
always @(posedge ck)
begin
// triggers the first phase shift enable request only during the start of read operation
if(~data_read_is_ongoing_previous && data_read_is_ongoing_ck) psen <= 1;
// Phase shifting is like changing PLL settings, so need to wait for new PLL lock in order to avoid
// Warning : Please wait for PSDONE signal before adjusting the Phase Shift
// asserts psen signal only when psdone is asserted low after asserted high previously
else if(psdone_previous && ~psdone) psen <= psdone;
// assert PSEN for one PSCLK cycle only and then wait for PSDONE to assert before performing
// another phase shift operation. Asserting PSEN for more than one PSCLK cycle can cause the DCM
// to phase shift in an unpredictable manner.
else psen <= 0;
end
localparam PLL_STATUS_BITWIDTH = 3;
`ifndef VIVADO
wire [PLL_STATUS_BITWIDTH-1:0] pll_read_status;
wire input_clk_stopped;
wire clk_valid;
`endif
// dynamic phase shift for incoming DQ bits
pll_tuneable pll_read
( // Clock in ports
.clk(clk), // IN 50MHz
// Clock out ports
.ck_dynamic_90(ck_dynamic_90), // OUT 333.333MHz, 90 phase shift, incoming DQ bit is not phase shifted
.ck_dynamic_270(ck_dynamic_270), // OUT 333.333MHz, 270 phase shift
// Dynamic phase shift ports
.psclk(udqs_r), // IN
.psen(psen), // IN
.psincdec(1'b1), // IN
.psdone(psdone), // OUT
// Status and control signals
.reset(reset), // IN
`ifdef VIVADO
.locked_dynamic(locked_dynamic) // OUT
`else
.locked_dynamic(locked_dynamic), // OUT
.status(pll_read_status), // OUT
.input_clk_stopped(input_clk_stopped), // OUT
.clk_valid(clk_valid) // OUT
`endif
);
// There is need for OBUF because if otherwise, the output of ODDR2_ck_out would be connected to
// FPGA fabric which is not allowed
OBUF #(
.DRIVE(12), // Specify the output drive strength
.IOSTANDARD("LVCMOS25"), // Specify the output I/O standard
.SLEW("SLOW") // Specify the output slew rate
)
OBUF_ck (
.O(ck_obuf), // Buffer output (connect directly to FPGA I/O pad)
.I(ck_out) // Buffer input
);
OBUF #(
.DRIVE(12), // Specify the output drive strength
.IOSTANDARD("LVCMOS25"), // Specify the output I/O standard
.SLEW("SLOW") // Specify the output slew rate
)
OBUF_ck_n (
.O(ck_n_obuf), // Buffer output (connect directly to FPGA I/O pad)
.I(ck_180_out) // Buffer input
);
// ODDR2: Input Double Data Rate Output Register with Set, Reset and Clock Enable.
// Spartan-6
// Xilinx HDL Libraries Guide, version 14.7
// As for why 'ck' and 'ck_180' signals are implemented using ODDR2 primitive,
// see https://forums.xilinx.com/t5/Other-FPGA-Architecture/Place-1198-Error-Route-cause-and-possible-solution/m-p/408489/highlight/true#M34528
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_ck_out(
.Q(ck_out), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(1'b1), // 1-bit DDR data input (associated with C0)
.D1(1'b0), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_ck_180_out(
.Q(ck_180_out), // 1-bit DDR output data
.C0(ck_180), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(1'b1), // 1-bit DDR data input (associated with C0)
.D1(1'b0), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
// DQS signals are of double-data-rate signals
`ifdef USE_x16
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_ldqs_w(
.Q(ldqs_w), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(1'b1), // 1-bit DDR data input (associated with C0)
.D1(1'b0), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_udqs_w(
.Q(udqs_w), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(1'b1), // 1-bit DDR data input (associated with C0)
.D1(1'b0), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_ldqs_n_w(
.Q(ldqs_n_w), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(1'b0), // 1-bit DDR data input (associated with C0)
.D1(1'b1), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_udqs_n_w(
.Q(udqs_n_w), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(1'b0), // 1-bit DDR data input (associated with C0)
.D1(1'b1), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
`else
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("SYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_dqs_w(
.Q(dqs_w), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(1'b1), // 1-bit DDR data input (associated with C0)
.D1(1'b0), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("SYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_dqs_n_w(
.Q(dqs_n_w), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(1'b1), // 1-bit DDR data input (associated with C0)
.D1(1'b0), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
`endif
`elsif ALTERA
pll_ddr pll_static_clocks
( // Clock in ports
.inclk0(clk), // IN 50MHz
// Clock out ports
//.clk_pll(clk_pll), // OUT 83.333MHz, 45 phase shift, for solving STA issues
// SERDES_RATIO = 8, but 2 separate serdes are used due to double-data-rate restriction
// So, 333.333MHz divided by (SERDES_RATIO >> 1) equals 83.333MHz
.c4(clk_serdes), // OUT 83.333MHz, 225 phase shift, for SERDES use
.c0(ck), // OUT 333.333MHz, 0 phase shift
.c1(ck_90), // OUT 333.333MHz, 90 phase shift, for dq phase shifting purpose
.c2(ck_180), // OUT 333.333MHz, 180 phase shift
.c3(ck_270), // OUT 333.333MHz, 270 phase shift, for dq phase shifting purpose
// Status and control signals
.areset(reset), // IN
.locked(locked) // OUT
);
// dynamic phase shift for incoming DQ bits
pll_tuneable
(
.areset(reset), // IN
.inclk0(clk), // IN 50 MHz
.pfdena(1'b1), // IN
.phasecounterselect(udqs_r), // IN
.phasestep(psen), // IN
.phaseupdown(1'b1), // IN
.scanclk(clk), // IN
.c0(ck_dynamic_90), // OUT 333.333MHz, 90 phase shift, incoming DQ bit is not phase shifted
.c1(ck_dynamic_270), // OUT 333.333MHz, 270 phase shift
.locked(locked_dynamic), // OUT
.phasedone(psdone) // OUT
);
`endif
`endif
// See https://www.micron.com/-/media/client/global/documents/products/technical-note/dram/tn4605.pdf#page=7
// for an overview on DQS Preamble and Postamble bits
`ifndef HIGH_SPEED
wire [(DQ_BITWIDTH >> 1)-1:0] ldq_w;
wire [(DQ_BITWIDTH >> 1)-1:0] udq_w;
reg dqs_is_at_high_previously;
reg dqs_is_at_low_previously;
`ifndef USE_ILA
`ifdef USE_x16
wire dqs_is_at_high = (ldqs_r & ~ldqs_n_r) || (udqs_r & ~udqs_n_r);
wire dqs_is_at_low = (~ldqs_r & ldqs_n_r) || (~udqs_r & udqs_n_r);
`else
wire dqs_is_at_high = (dqs & ~dqs_n);
wire dqs_is_at_low = (~dqs & dqs_n);
`endif
wire dqs_rising_edge = (dqs_is_at_low_previously && dqs_is_at_high);
wire dqs_falling_edge = (dqs_is_at_high_previously && dqs_is_at_low);
`else
`ifdef USE_x16
assign dqs_is_at_high = (ldqs_r & ~ldqs_n_r) || (udqs_r & ~udqs_n_r);
assign dqs_is_at_low = (~ldqs_r & ldqs_n_r) || (~udqs_r & udqs_n_r);
`else
assign dqs_is_at_high = (dqs & ~dqs_n);
assign dqs_is_at_low = (~dqs & dqs_n);
`endif
assign dqs_rising_edge = (dqs_is_at_low_previously && dqs_is_at_high);
assign dqs_falling_edge = (dqs_is_at_high_previously && dqs_is_at_low);
`endif
always @(posedge clk) dqs_is_at_high_previously <= dqs_is_at_high;
always @(posedge clk) dqs_is_at_low_previously <= dqs_is_at_low;
// For WRITE, we have to phase-shift DQS by 90 degrees and output the phase-shifted DQS to RAM
// phase-shifts the incoming dqs and dqs_n signals by 90 degrees
// with reference to outgoing 'ck' DDR signal
// the reason is to sample at the middle of incoming `dq` signal
`ifndef USE_ILA
reg [($clog2(DIVIDE_RATIO_HALVED)-1):0] dqs_counter;
`endif
always @(posedge clk)
begin
if(reset) dqs_counter <= 0;
else begin
// Due to PCB trace layout and high-speed DDR signal transmission,
// there is no alignment to any generic clock signal that we can depend upon,
// especially when data is coming back from the SDRAM chip.
// Thus, we could only depend upon incoming `DQS` signal to sample 'DQ' signal
if(dqs_rising_edge | dqs_falling_edge) dqs_counter <= 1;
else if(dqs_counter > 0)
dqs_counter <= dqs_counter + 1;
end
end
`ifndef XILINX
wire dqs_phase_shifted = (dqs_counter == DIVIDE_RATIO_HALVED[0 +: $clog2(DIVIDE_RATIO_HALVED)]);
`else
wire dqs_phase_shifted = (dqs_counter == DIVIDE_RATIO_HALVED[0 +: 2]);
`endif
wire dqs_n_phase_shifted = ~dqs_phase_shifted;
always @(posedge clk)
begin
if(reset) data_from_ram <= 0;
// 'dq_r' is sampled at its middle (thanks to 90 degree phase shift on dqs)
else if(dqs_phase_shifted & ~dqs_n_phase_shifted)
begin
`ifdef XILINX
data_from_ram <= dq_r;
`elsif LATTICE
data_from_ram <= dq_r;
`else // Micron DDR3 simulation model
data_from_ram <= dq;
`endif
end
end
`ifdef USE_x16
wire [(DQ_BITWIDTH >> 1)-1:0] ldq;
wire [(DQ_BITWIDTH >> 1)-1:0] udq;
assign ldq_w = data_to_ram[0 +: (DQ_BITWIDTH >> 1)];
assign udq_w = data_to_ram[(DQ_BITWIDTH >> 1) +: (DQ_BITWIDTH >> 1)];
assign dq_w = {udq_w, ldq_w};
`else
assign dq_w = data_to_ram; // input data stream of 'data_to_ram' is NOT serialized
`endif
`else
// bitslip and IODELAY phase shift delay calibration
// https://www.xilinx.com/support/documentation/application_notes/xapp1208-bitslip-logic.pdf#page=4
// https://www.xilinx.com/support/documentation/sw_manuals/xilinx14_7/spartan6_hdl.pdf#page=130
// https://www.xilinx.com/support/documentation/white_papers/wp249.pdf#page=5
// https://www.xilinx.com/support/documentation/ip_documentation/ultrascale_memory_ip/v1_4/pg150-ultrascale-memory-ip.pdf#page=361
// https://blog.elphel.com/2014/06/ddr3-memory-interface-on-xilinx-zynq-soc-free-software-compatible/
// Will use Micron built-in features (Write leveling, MPR_Read_function) to facilitate skew calibration
// See https://www.edaboard.com/threads/phase-detection-mechanism.398492/ for an
// understanding on how the dynamic(real-time) phase calibration mechanism works
/*
phase_detector #(.D(DQ_BITWIDTH)) // Set the number of inputs
pd_state_machine (
.use_phase_detector (use_phase_detector),
.busy (busy_data),
.valid (valid_data),
.inc_dec (incdec_data),
.reset (reset),
.gclk (gclk),
.debug_in (debug_in),
.cal_master (cal_data_master),
.cal_slave (cal_data_slave),
.rst_out (rst_data),
.ce (ce_data),
.inc (inc_data),
.debug (debug)
);
*/
/* PLL dynamic phase shift is used in lieu of IODELAY2 primitive
// for phase shift alignment between READ DQS strobe and 'ck' signal
wire delayed_dqs_r;
// See https://www.xilinx.com/support/documentation/user_guides/ug381.pdf#page=73
// Once BUSY is Low, the new delay value is operational.
wire idelay_is_busy;
reg idelay_is_busy_previously;
always @(posedge clk_serdes) idelay_is_busy_previously <= idelay_is_busy;
reg idelay_inc_dqs_r;
reg idelay_counter_enable;
// IODELAY2 primitive requires some initial hardware startup or warmup time
localparam IODELAY_STARTUP_BITWIDTH = 12;
reg [IODELAY_STARTUP_BITWIDTH-1:0] iodelay_startup_counter;
always @(posedge clk_serdes)
begin
if(reset) iodelay_startup_counter <= 0;
else iodelay_startup_counter <= iodelay_startup_counter + 1;
end
// xilinx demo example only needs iodelay_startup_counter[IODELAY_STARTUP_BITWIDTH-1]
// See https://github.com/promach/DDR/blob/main/phase_detector.v#L135-L137
// It is only static calibration as of now,
// will implement dynamic (real-time) phase calibration as project progresses
wire idelay_cal_dqs_r = &iodelay_startup_counter; // Wait for IODELAY to be available
IODELAY2 #(
.DATA_RATE ("DDR"), // <SDR>, DDR
.IDELAY_VALUE (0), // {0 ... 255}
.IDELAY2_VALUE (0), // {0 ... 255}
.IDELAY_MODE ("NORMAL" ), // NORMAL, PCI
.ODELAY_VALUE (0), // {0 ... 255}
.IDELAY_TYPE ("VARIABLE_FROM_ZERO"),// "DEFAULT", "DIFF_PHASE_DETECTOR", "FIXED", "VARIABLE_FROM_HALF_MAX", "VARIABLE_FROM_ZERO"
.COUNTER_WRAPAROUND ("WRAPAROUND" ), // <STAY_AT_LIMIT>, WRAPAROUND
.DELAY_SRC ("IDATAIN" ), // "IO", "IDATAIN", "ODATAIN"
.SERDES_MODE ("NONE") // <NONE>, MASTER, SLAVE
)
iodelay_dqs_r (
.IDATAIN (dqs_r), // data from primary IOB
.TOUT (), // tri-state signal to IOB
.DOUT (), // output data to IOB
.T (1'b1), // tri-state control from OLOGIC/OSERDES2
.ODATAIN (1'b0), // data from OLOGIC/OSERDES2
.DATAOUT (), // Delayed Data output, can only route to a register in ILOGIC
.DATAOUT2 (delayed_dqs_r), // Delayed Data output, can route to fabric
.IOCLK0 (ck_90), // High speed clock for calibration
.IOCLK1 (ck_270), // High speed clock for calibration
.CLK (clk), // Fabric clock (GCLK) for control signals
.CAL (idelay_cal_dqs_r), // Calibrate control signal
.INC (idelay_inc_dqs_r), // Increment counter
.CE (idelay_counter_enable), // Enable counter increment/decrement
.RST (idelay_is_busy_previously & (~idelay_is_busy)), // Reset delay line
.BUSY (idelay_is_busy) // output signal indicating sync circuit has finished / calibration has finished
);
*/
// RAM -> IOBUF (for inout) -> IDELAY (DQS Centering) -> IDDR2 (input DDR buffer) -> ISERDES
// OSERDES -> ODDR2 (output DDR buffer) -> ODELAY (DQS Centering) -> IOBUF (for inout) -> RAM
//assign dqs_r = (udqs_r | ldqs_r);
assign dqs_r = udqs_r; // iodelay input must come directly from IO pad, no FPGA fabric in between
// See https://www.eevblog.com/forum/fpga/ddr3-initialization-sequence-issue/msg3678799/#msg3678799
localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DYNAMIC_90_DOMAIN= 3;
localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DOMAIN = 3;
localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_180_DOMAIN = 3;
localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_270_DOMAIN = 3;
reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DOMAIN-1:0] need_to_assert_reset_ck;
reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_270_DOMAIN-1:0] need_to_assert_reset_ck_270;
reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DYNAMIC_90_DOMAIN-1:0] need_to_assert_reset_ck_dynamic_90;
// combines the interleaving 'dq_r_q0', 'dq_r_q1' DDR signals into a single SDR signal
reg [DQ_BITWIDTH-1:0] dq_r_q0;
reg [DQ_BITWIDTH-1:0] dq_r_q1;
wire [DQ_BITWIDTH-1:0] dq_r_q0_reg; // for STA
reg [DQ_BITWIDTH-1:0] dq_r_q0_reg_reg, dq_r_q0_reg_reg_reg; // for STA
always @(posedge ck) dq_r_q0 <= dq_r_q0_reg_reg_reg;
always @(posedge ck) dq_r_q0_reg_reg_reg <= dq_r_q0_reg_reg;
always @(posedge ck) dq_r_q0_reg_reg <= dq_r_q0_reg;
wire [DQ_BITWIDTH-1:0] dq_r_q1_reg; // for STA
reg [DQ_BITWIDTH-1:0] dq_r_q1_reg_reg, dq_r_q1_reg_reg_reg; // for STA
always @(posedge ck) dq_r_q1 <= dq_r_q1_reg_reg_reg;
always @(posedge ck) dq_r_q1_reg_reg_reg <= dq_r_q1_reg_reg;
always @(posedge ck) dq_r_q1_reg_reg <= dq_r_q1_reg;
// for synchronizing multi-bits signals from ck_dynamic_90 domain to ck domain
wire afifo_dq_r_q0_is_empty;
wire afifo_dq_r_q0_is_full;
reg [DQ_BITWIDTH-1:0] dq_r_q0_ck_dynamic_90;
async_fifo
#(
.WIDTH(DQ_BITWIDTH),
.NUM_ENTRIES(),
.TO_SIMPLIFY_FULL_LOGIC(1),
.TO_SIMPLIFY_EMPTY_LOGIC(1)
)
afifo_dq_r_q0
(
.write_reset(reset),
.read_reset(reset),
// Read.
.read_clk(ck),
.read_en(1'b1),
.read_data(dq_r_q0_reg),
.empty(afifo_dq_r_q0_is_empty),
// Write
.write_clk(ck_dynamic_90),
.write_en(1'b1),
.full(afifo_dq_r_q0_is_full),
.write_data(dq_r_q0_ck_dynamic_90)
);
// for synchronizing multi-bits signals from ck_dynamic_90 domain to ck domain
wire afifo_dq_r_q1_is_empty;
wire afifo_dq_r_q1_is_full;
reg [DQ_BITWIDTH-1:0] dq_r_q1_ck_dynamic_90;
async_fifo
#(
.WIDTH(DQ_BITWIDTH),
.NUM_ENTRIES(),
.TO_SIMPLIFY_FULL_LOGIC(1),
.TO_SIMPLIFY_EMPTY_LOGIC(1)
)
afifo_dq_r_q1
(
.write_reset(reset),
.read_reset(reset),
// Read.
.read_clk(ck),
.read_en(1'b1),
.read_data(dq_r_q1_reg),
.empty(afifo_dq_r_q1_is_empty),
// Write
.write_clk(ck_dynamic_270),
.write_en(1'b1),
.full(afifo_dq_r_q1_is_full),
.write_data(dq_r_q1_ck_dynamic_90)
);
`ifdef USE_SERDES
// splits 'dq_w_oserdes' SDR signal into two ('dq_w_d0', 'dq_w_d1') SDR signals for ODDR2
// Check the explanation below for the need of two separate OSERDES
reg [DQ_BITWIDTH-1:0] dq_w_d0;
reg [DQ_BITWIDTH-1:0] dq_w_d1;
reg [DQ_BITWIDTH-1:0] dq_w_d0_reg;
reg [DQ_BITWIDTH-1:0] dq_w_d1_reg;
reg [DQ_BITWIDTH-1:0] dq_w_d0_reg_reg;
reg [DQ_BITWIDTH-1:0] dq_w_d1_reg_reg;
wire [DQ_BITWIDTH-1:0] dq_w_oserdes_0; // associated with dqs_w
wire [DQ_BITWIDTH-1:0] dq_w_oserdes_1; // associated with dq_n_w
always @(posedge ck) dq_w_d0_reg <= dq_w_oserdes_0; // for C0, D0 of ODDR2 primitive
always @(posedge ck) dq_w_d1_reg <= dq_w_oserdes_1; // for C1, D1 of ODDR2 primitive
// for DQ signal starting position on AL alignment for DRAM write operation
// See https://www.edaboard.com/threads/additive-latency-for-dram-read-and-write-commands.400678/
always @(posedge ck) dq_w_d0_reg_reg <= dq_w_d0_reg;
always @(posedge ck) dq_w_d1_reg_reg <= dq_w_d1_reg;
always @(posedge ck) dq_w_d0 <= dq_w_d0_reg_reg;
always @(posedge ck) dq_w_d1 <= dq_w_d1_reg_reg;
// always @(*) dq_w_d0 <= dq_w_oserdes_0;
// always @(*) dq_w_d1 <= dq_w_oserdes_1;
// why need IOSERDES primitives ?
// because you want a memory transaction rate much higher than the main clock frequency
// but you don't want to require a very high main clock frequency
// send a write of 8w bits to the memory controller,
// which is similar to bundling multiple transactions into one wider one,
// and the memory controller issues 8 writes of w bits to the memory,
// where w is the data width of your memory interface. (w == DQ_BITWIDTH)
// This literally means SERDES_RATIO=8
// localparam SERDES_RATIO = 8;
localparam EVEN_RATIO = 2;
//reg [DQ_BITWIDTH-1:0] dq_r_iserdes;
// The following way of combining dq_r_q0 and dq_r_q1 back into a single signal will not work
// for high DDR3 RAM frequency. Besides, never use clock-related signal for combinational logic
// See the rationale for having two separate deserializer module to handle this instead
//always @(dq_r_q0, dq_r_q1, delayed_dqs_r)
// dq_r_iserdes <= (delayed_dqs_r) ? dq_r_q0: dq_r_q1;
// if you want to build your own serdeses feeding from IDDR, you cannot clump dq_r_q0 and dq_r_q1 back
// into a single signal and feed this signal to your single serdes.
// You will need to build two separate serdeses - one for dq_r_q0, and another one for dq_r_q1.
wire [(DQ_BITWIDTH*(SERDES_RATIO >> 1))-1:0] data_out_iserdes_0;
wire [(DQ_BITWIDTH*(SERDES_RATIO >> 1))-1:0] data_out_iserdes_1;
reg [DQ_BITWIDTH*SERDES_RATIO-1:0] data_from_ram_clk_serdes_data;
genvar data_index_iserdes;
generate
for(data_index_iserdes = 0; data_index_iserdes < (DQ_BITWIDTH*SERDES_RATIO);
data_index_iserdes = data_index_iserdes + DQ_BITWIDTH)
begin: data_from_ram_combine_loop
// the use of $rtoi and $floor functions are to limit the bit range of 'data_index_iserdes'
// since 'data_out_iserdes_0' and 'data_out_iserdes_1' are half the size of
// 'data_from_ram_clk_serdes_data'
always @(*)
begin
if(((data_index_iserdes/DQ_BITWIDTH) % EVEN_RATIO) == 0)
begin
data_from_ram_clk_serdes_data[data_index_iserdes +: DQ_BITWIDTH] <=
data_out_iserdes_0[DQ_BITWIDTH *
$rtoi($floor(data_index_iserdes/(DQ_BITWIDTH << 1)))
+: DQ_BITWIDTH];
end
else begin
data_from_ram_clk_serdes_data[data_index_iserdes +: DQ_BITWIDTH] <=
data_out_iserdes_1[DQ_BITWIDTH *
$rtoi($floor(data_index_iserdes/(DQ_BITWIDTH << 1)))
+: DQ_BITWIDTH];
end
end
end
endgenerate
deserializer #(.D(DQ_BITWIDTH), .S(SERDES_RATIO >> 1), .INITIAL_S((SERDES_RATIO >> 1) - 1))
dq_iserdes_0
(
.reset(need_to_assert_reset_ck[NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DOMAIN-1]),
// fast clock domain
.high_speed_clock(ck),
.data_in(dq_r_q0),
// slow clock domain
.data_out(data_out_iserdes_0)
);
deserializer #(.D(DQ_BITWIDTH), .S(SERDES_RATIO >> 1), .INITIAL_S((SERDES_RATIO >> 1) - 1))
dq_iserdes_1
(
.reset(need_to_assert_reset_ck[NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DOMAIN-1]),
// fast clock domain
.high_speed_clock(ck),
.data_in(dq_r_q1),
// slow clock domain
.data_out(data_out_iserdes_1)
);
// There is need to use two separate OSERDES because ODDR2 expects its D0 and D1 inputs to be
// presented to it at a DDR clock rate of 303MHz (D0 at posedge of 303MHz, D1 at negedge of 303MHz),
// where 303MHz is the minimum DDR3 RAM working frequency.
// However, one single SDR OSERDES alone could not fulfill this data rate requirement of ODDR2
// For example, a 8:1 DDR OSERDES which takes 8 inputs D0,D1,D2,D3,D4,D5,D6,D7 and output them serially
// The values supplied by D0,D2,D4,D6 are clocked out on the rising edge
// The values supplied by D1,D3,D5,D7 are clocked out on the falling edge
// You can then create two 4:1 SDR OSERDES modules.
// One of the 2 modules will take D0,D2,D4,D6 inputs and output them serially.
// You route its output to the D0 pin of the ODDR.
// The other will output D1,D3,D5,D7 serially. You route its output to the D1 pin of the ODDR.
// But this is only if you write your own OSERDES.
// The vendor-specific hardware OSERDES will have built-in DDR mode.
// Even if you put it in SDR mode, it cannot be routed to ODDR because ODDR and OSERDES are two
// incarnations of the same OLOGIC block.
reg [(DQ_BITWIDTH*(SERDES_RATIO >> 1))-1:0] data_in_oserdes_0;
reg [(DQ_BITWIDTH*(SERDES_RATIO >> 1))-1:0] data_in_oserdes_1;
genvar data_index_oserdes;
generate
for(data_index_oserdes = 0; data_index_oserdes < (DQ_BITWIDTH*SERDES_RATIO);
data_index_oserdes = data_index_oserdes + DQ_BITWIDTH)
begin: data_to_ram_split_loop
// the use of $rtoi and $floor functions are to limit the bit range of 'data_index_oserdes'
// since 'data_in_oserdes_0' and 'data_in_oserdes_1' are half the size of 'data_to_ram'
always @(*)
begin
if(((data_index_oserdes/DQ_BITWIDTH) % EVEN_RATIO) == 0)
begin
data_in_oserdes_0[DQ_BITWIDTH *
$rtoi($floor(data_index_oserdes/(DQ_BITWIDTH << 1)))
+: DQ_BITWIDTH] <=
data_to_ram[data_index_oserdes +: DQ_BITWIDTH];
end
else begin
data_in_oserdes_1[DQ_BITWIDTH *
$rtoi($floor(data_index_oserdes/(DQ_BITWIDTH << 1)))
+: DQ_BITWIDTH] <=
data_to_ram[data_index_oserdes +: DQ_BITWIDTH];
end
end
end
endgenerate
serializer #(.D(DQ_BITWIDTH), .S(SERDES_RATIO >> 1), .INITIAL_S(0))
dq_oserdes_0
(
.reset(need_to_assert_reset_ck[NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DOMAIN-1]),
// slow clock domain
.data_in(data_in_oserdes_0),
// fast clock domain
.high_speed_clock(ck),
.data_out(dq_w_oserdes_0)
);
serializer #(.D(DQ_BITWIDTH), .S(SERDES_RATIO >> 1), .INITIAL_S(0))
dq_oserdes_1
(
.reset(need_to_assert_reset_ck[NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DOMAIN-1]),
// slow clock domain
.data_in(data_in_oserdes_1),
// fast clock domain
.high_speed_clock(ck),
.data_out(dq_w_oserdes_1)
);
// The following Xilinx-specific IOSERDES primitives are not used due to placement blockage restrictions
// See https://forums.xilinx.com/t5/Implementation/Xilinx-ISE-implementation-stage-issues/m-p/1255587/highlight/true#M30717
// or https://www.eevblog.com/forum/fpga/ddr3-initialization-sequence-issue/msg3592301/#msg3592301
// DDR Data Reception Using Two BUFIO2s
// See Figure 6 of https://www.xilinx.com/support/documentation/application_notes/xapp1064.pdf#page=5
/*
wire rxioclkp;
wire rxioclkn;
wire rx_serdesstrobe;
wire gclk_iserdes;
wire clkin_p_iserdes = (udqs_r | ldqs_r);
wire clkin_n_iserdes = (udqs_n_r | ldqs_n_r);
serdes_1_to_n_clk_ddr_s8_diff #(.S(SERDES_RATIO))
dqs_iserdes
(
.clkin_p(clkin_p_iserdes),
.clkin_n(clkin_n_iserdes),
.rxioclkp(rxioclkp),
.rxioclkn(rxioclkn),
.rx_serdesstrobe(rx_serdesstrobe),
.rx_bufg_x1(gclk_iserdes)
);
serdes_1_to_n_data_ddr_s8_diff #(.D(DQ_BITWIDTH), .S(SERDES_RATIO))
dq_iserdes
(
.use_phase_detector(1'b1),
.datain_p(dq_r),
.datain_n(),
.rxioclkp(rxioclkp),
.rxioclkn(rxioclkn),
.rxserdesstrobe(rx_serdesstrobe),
.reset(reset),
.gclk(gclk_iserdes),
.bitslip(1'b1),
.debug_in(2'b00),
.data_out(data_from_ram),
.debug(debug_dq_serdes)
);
// DDR Data Transmission Using Two BUFIO2s
// See Figure 18 of https://www.xilinx.com/support/documentation/application_notes/xapp1064.pdf#page=17
wire txioclkp;
wire txioclkn;
wire txserdesstrobe;
wire gclk_oserdes;
clock_generator_ddr_s8_diff #(.S(SERDES_RATIO))
dqs_oserdes
(
.clkin_p(clk),
.clkin_n(),
.ioclkap(txioclkp),
.ioclkan(txioclkn),
.serdesstrobea(txserdesstrobe),
.ioclkbp(),
.ioclkbn(),
.serdesstrobeb(),
.gclk(gclk_oserdes)
);
serdes_n_to_1_ddr_s8_diff #(.D(DQ_BITWIDTH), .S(SERDES_RATIO))
dq_oserdes
(
.txioclkp(txioclkp),
.txioclkn(txioclkn),
.txserdesstrobe(txserdesstrobe),
.reset(reset),
.gclk(gclk_oserdes),
.datain(data_to_ram),
.dataout_p(dq_w),
.dataout_n()
);
*/
// to synchronize signal in clk_serdes_data domain to ck domain
reg [DQ_BITWIDTH*SERDES_RATIO-1:0] data_from_ram_ck;
`else
wire [DQ_BITWIDTH-1:0] dq_w_d0 = data_to_ram[0 +: DQ_BITWIDTH];
wire [DQ_BITWIDTH-1:0] dq_w_d1 = data_to_ram[DQ_BITWIDTH +: DQ_BITWIDTH];
// to synchronize signal in clk_serdes_data domain to ck domain
reg [(DQ_BITWIDTH << 1)-1:0] data_from_ram_ck;
wire [(DQ_BITWIDTH << 1)-1:0] data_from_ram_clk_serdes_data =
{dq_r_q1, dq_r_q0};
`endif
assign data_from_ram = data_from_ram_clk_serdes_data;
// wire data_read_is_ongoing = ((wait_count > TIME_RL-TIME_TRPRE) &&
// ((main_state == STATE_READ) || (main_state == STATE_READ_AP))) ||
// (main_state == STATE_READ_DATA);
// for pipelining in order to feed valid non-X incoming DQ bits into deserializer module
localparam NUM_OF_READ_PIPELINE_REGISTER_ADDED = 15; // for 'dq_iobuf_en' and 'dqs_iobuf_en'
`ifndef TESTBENCH
reg data_read_is_ongoing;
`endif
wire data_write_is_ongoing = ((wait_count > TIME_WRITE_COMMAND_TO_DQS_VALID) &&
((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP))) ||
(main_state == STATE_WRITE_DATA);
`endif
`ifdef LATTICE
// look for BB primitive in this lattice document :
// http://www.latticesemi.com/-/media/LatticeSemi/Documents/UserManuals/EI/FPGALibrariesReferenceGuide33.ashx?document_id=50790
// we cannot have tristate signal inside the logic of an ECP5. tristates only work at the I/O boundary.
// So, need to split up the read/write signals and have logic to handle these as two separate paths
// that meet at the I/O boundary at the BB primitive.
`ifndef USE_x16
TRELLIS_IO BB_dqs (
.B(dqs),
.I(dqs_w),
.T(data_read_is_ongoing),
.O(dqs_r)
);
TRELLIS_IO BB_dqs_n (
.B(dqs_n),
.I(dqs_n_w),
.T(data_read_is_ongoing),
.O(dqs_n_r)
);
`else // DQS strobes, the following IOBUF instantiations just use all available x16 bandwidth
TRELLIS_IO BB_ldqs (
.B(ldqs),
.I(ldqs_w),
.T(data_read_is_ongoing),
.O(ldqs_r)
);
TRELLIS_IO BB_ldqs_n (
.B(ldqs_n),
.I(ldqs_n_w),
.T(data_read_is_ongoing),
.O(ldqs_n_r)
);
TRELLIS_IO BB_udqs (
.B(udqs),
.I(udqs_w),
.T(data_read_is_ongoing),
.O(udqs_r)
);
TRELLIS_IO BB_udqs_n (
.B(udqs_n),
.I(udqs_n_w),
.T(data_read_is_ongoing),
.O(udqs_n_r)
);
`endif
generate
genvar dq_index; // to indicate the bit position of DQ signal
for(dq_index = 0; dq_index < DQ_BITWIDTH; dq_index = dq_index + 1)
begin : dq_tristate_io
TRELLIS_IO BB_dq (
.B(dq[dq_index]),
.I(dq_w[dq_index]),
.T(data_read_is_ongoing),
.O(dq_r[dq_index])
);
end
endgenerate
`endif
`ifdef ALTERA
// https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/hb/max-10/archives/ug-m10-gpio-15.1.pdf#page=47
// we cannot have tristate signal inside the logic of FPGA. tristates only work at the I/O boundary.
// So, need to split up the read/write signals and have logic to handle these as two separate paths
// that meet at the I/O boundary at the GPIO primitive.
`ifndef USE_x16
IOBUF BB_dqs (
.inclock(ck_dynamic_90),
.outclock(ck_90),
.pad_io(dqs),
.pad_io_b(dqs_n),
.oe(data_read_is_ongoing),
.dout(2'b10), // {dqs_w, dqs_n_w}
.din({dqs_r_1, dqs_r_0})
);
`else // DQS strobes, the following IOBUF instantiations just use all available x16 bandwidth
IOBUF BB_ldqs (
.inclock(ck_dynamic_90),
.outclock(ck_90),
.pad_io(ldqs),
.pad_io_b(ldqs_n),
.oe(data_read_is_ongoing),
.dout(2'b10), // {ldqs_w, ldqs_n_w}
.din({ldqs_r_1, ldqs_r_0})
);
IOBUF BB_udqs (
.inclock(ck_dynamic_90),
.outclock(ck_90),
.pad_io(udqs),
.pad_io_b(udqs_n),
.oe(data_read_is_ongoing),
.dout(2'b10), // {udqs_w, udqs_n_w}
.din({udqs_r_1, udqs_r_0})
);
`endif
generate
genvar dq_index; // to indicate the bit position of DQ signal
for(dq_index = 0; dq_index < (DQ_BITWIDTH >> 1); dq_index = dq_index + 1)
begin : dq_tristate_io
IOBUF_DQ BB_dq (
.inclock(ck_dynamic_90),
.outclock(ck),
.pad_io(dq[dq_index]),
//.pad_io_b(), // DQ signal is not differential type
.oe(data_read_is_ongoing),
.dout({dq_w_1[dq_index], dq_w_0[dq_index]}),
.din({dq_r_1[dq_index], dq_r_0[dq_index]})
);
end
endgenerate
`endif
// https://www.eevblog.com/forum/fpga/ddr3-initialization-sequence-issue/msg3668329/#msg3668329
localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_90_DOMAIN = 3;
// to synchronize signal in ck_180 domain to ck_90 domain
reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_90_DOMAIN-1:0] data_read_is_ongoing_90;
genvar ff_ck_180_ck_90;
generate
for(ff_ck_180_ck_90 = 0;
ff_ck_180_ck_90 < NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_90_DOMAIN;
ff_ck_180_ck_90 = ff_ck_180_ck_90 + 1)
begin: ck_180_to_ck_90
always @(posedge ck_90)
begin
if(reset) data_read_is_ongoing_90[ff_ck_180_ck_90] <= 0;
else begin
if(ff_ck_180_ck_90 == 0) data_read_is_ongoing_90[ff_ck_180_ck_90] <= data_read_is_ongoing;
else data_read_is_ongoing_90[ff_ck_180_ck_90] <= data_read_is_ongoing_90[ff_ck_180_ck_90-1];
end
end
end
endgenerate
`ifdef XILINX
`ifndef TESTBENCH
wire ldqs_iobuf_enable, udqs_iobuf_enable;
`endif
wire ldqs_n_iobuf_enable, udqs_n_iobuf_enable;
`ifndef TESTBENCH
wire [DQ_BITWIDTH-1:0] dq_iobuf_enable;
`endif
//wire [DQ_BITWIDTH-1:0] delayed_dq_r;
//wire [DQ_BITWIDTH-1:0] delayed_dq_w;
// https://www.xilinx.com/support/documentation/sw_manuals/xilinx14_7/spartan6_hdl.pdf#page=126
`ifndef USE_x16
IOBUF IO_dqs (
.IO(dqs),
.I(dqs_w),
.T(data_read_is_ongoing),
.O(dqs_r)
);
IOBUF IO_dqs_n (
.IO(dqs_n),
.I(dqs_n_w),
.T(data_read_is_ongoing),
.O(dqs_n_r)
);
`else // DQS strobes, the following IOBUF instantiations just use all available x16 bandwidth
IOBUF IO_ldqs (
.IO(ldqs),
.I(ldqs_w),
.T(ldqs_iobuf_enable),
.O(ldqs_r)
);
IOBUF IO_ldqs_n (
.IO(ldqs_n),
.I(ldqs_n_w),
.T(ldqs_n_iobuf_enable),
.O(ldqs_n_r)
);
IOBUF IO_udqs (
.IO(udqs),
.I(udqs_w),
.T(udqs_iobuf_enable),
.O(udqs_r)
);
IOBUF IO_udqs_n (
.IO(udqs_n),
.I(udqs_n_w),
.T(udqs_n_iobuf_enable),
.O(udqs_n_r)
);
// localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_DOMAIN = 4;
// to synchronize signal in ck_180 domain to ck domain
// reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_DOMAIN-1:0] data_read_is_ongoing_ck;
genvar ff_ck_180_ck;
generate
for(ff_ck_180_ck = 0; ff_ck_180_ck < NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_DOMAIN;
ff_ck_180_ck = ff_ck_180_ck + 1)
begin: ck_180_to_ck
always @(posedge ck)
begin
if(reset) data_read_is_ongoing_ck[ff_ck_180_ck] <= 0;
else begin
if(ff_ck_180_ck == 0)
// for tRPRE , needed for the incoming read preamble bits
// dqs tri-state buffer enable signal is connected to 'data_read_is_ongoing_ck'
data_read_is_ongoing_ck[ff_ck_180_ck] <= data_read_is_ongoing;
else data_read_is_ongoing_ck[ff_ck_180_ck] <= data_read_is_ongoing_ck[ff_ck_180_ck-1];
end
end
end
endgenerate
reg [NUM_OF_READ_PIPELINE_REGISTER_ADDED-1:0] dqs_iobuf_en;
genvar ff_dqs_iobuf_en;
generate
for(ff_dqs_iobuf_en = 0;
ff_dqs_iobuf_en < NUM_OF_READ_PIPELINE_REGISTER_ADDED;
ff_dqs_iobuf_en = ff_dqs_iobuf_en + 1)
begin: dqs_iobuf_en_pipeline
always @(posedge ck)
begin
if(reset) dqs_iobuf_en[ff_dqs_iobuf_en] <= 0;
else begin
if(ff_dqs_iobuf_en == 0)
dqs_iobuf_en[ff_dqs_iobuf_en] <=
data_read_is_ongoing_ck[NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_DOMAIN-1];
else dqs_iobuf_en[ff_dqs_iobuf_en] <= dqs_iobuf_en[ff_dqs_iobuf_en-1];
end
end
end
endgenerate
localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_270_DOMAIN = 3;
// to synchronize signal in ck_180 domain to ck_270 domain
reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_270_DOMAIN-1:0] data_read_is_ongoing_270;
genvar ff_ck_180_ck_270;
generate
for(ff_ck_180_ck_270 = 0;
ff_ck_180_ck_270 < NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_270_DOMAIN;
ff_ck_180_ck_270 = ff_ck_180_ck_270 + 1)
begin: ck_to_ck_270
always @(posedge ck_270)
begin
if(reset) data_read_is_ongoing_270[ff_ck_180_ck_270] <= 0;
else begin
if(ff_ck_180_ck_270 == 0) data_read_is_ongoing_270[ff_ck_180_ck_270] <= data_read_is_ongoing;
else data_read_is_ongoing_270[ff_ck_180_ck_270] <= data_read_is_ongoing_270[ff_ck_180_ck_270-1];
end
end
end
endgenerate
reg [NUM_OF_READ_PIPELINE_REGISTER_ADDED:0] dq_iobuf_en;
genvar ff_dq_iobuf_en;
generate
for(ff_dq_iobuf_en = 0;
ff_dq_iobuf_en <= NUM_OF_READ_PIPELINE_REGISTER_ADDED;
ff_dq_iobuf_en = ff_dq_iobuf_en + 1)
begin: dq_iobuf_en_pipeline
always @(posedge ck_270)
begin
if(reset) dq_iobuf_en[ff_dq_iobuf_en] <= 0;
else begin
if(ff_dq_iobuf_en == 0)
dq_iobuf_en[ff_dq_iobuf_en] <=
data_read_is_ongoing_270[NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_270_DOMAIN-1];
else dq_iobuf_en[ff_dq_iobuf_en] <= dq_iobuf_en[ff_dq_iobuf_en-1];
end
end
end
endgenerate
// see https://www.xilinx.com/support/documentation/user_guides/ug381.pdf#page=61
// 'data_read_is_ongoing' signal is not of double-data-rate signals,
// but it is connected to T port of IOBUF where its I port is fed in with double-data-rate DQS signals,
// thus the purpose of having the following ODDR2 primitives
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_ldqs_iobuf_en(
.Q(ldqs_iobuf_enable), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(dqs_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED-1]), // 1-bit DDR data input (associated with C0)
.D1(dqs_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED-1]), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_ldqs_n_iobuf_en(
.Q(ldqs_n_iobuf_enable), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(dqs_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED-1]), // 1-bit DDR data input (associated with C0)
.D1(dqs_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED-1]), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_udqs_iobuf_en(
.Q(udqs_iobuf_enable), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(dqs_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED-1]), // 1-bit DDR data input (associated with C0)
.D1(dqs_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED-1]), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
ODDR2 #(
.DDR_ALIGNMENT("C0"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_udqs_n_iobuf_en(
.Q(udqs_n_iobuf_enable), // 1-bit DDR output data
.C0(ck), // 1-bit clock input
.C1(ck_180), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(dqs_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED-1]), // 1-bit DDR data input (associated with C0)
.D1(dqs_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED-1]), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
`endif
generate
genvar dq_index; // to indicate the bit position of DQ signal
for(dq_index = 0; dq_index < DQ_BITWIDTH; dq_index = dq_index + 1)
begin : dq_io
// RAM -> IOBUF (for inout) -> IDELAY (DQS Centering) -> IDDR2 (input DDR buffer) -> ISERDES
// OSERDES -> ODDR2 (output DDR buffer) -> ODELAY (DQS Centering) -> IOBUF (for inout) -> RAM
IOBUF IO_dq (
.IO(dq[dq_index]),
.I(dq_w[dq_index]), // already phase-shifted by 90 degrees
.T(dq_iobuf_enable[dq_index]),
.O(dq_r[dq_index]) // not phase-shifted by 90 degrees yet
);
// As for why 'dq_iobuf_enable' signal is implemented using ODDR2 primitive,
// see https://www.xilinx.com/support/documentation/user_guides/ug381.pdf#page=61
// ODDR2: Input Double Data Rate Output Register with Set, Reset and Clock Enable.
// Spartan-6
// Xilinx HDL Libraries Guide, version 14.7
ODDR2 #(
.DDR_ALIGNMENT("C1"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_dq_iobuf_en(
.Q(dq_iobuf_enable[dq_index]), // 1-bit DDR output data
.C0(ck_90), // 1-bit clock input
.C1(ck_270), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(dq_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED]), // 1-bit DDR data input (associated with C0)
.D1(dq_iobuf_en[NUM_OF_READ_PIPELINE_REGISTER_ADDED]), // 1-bit DDR data input (associated with C1)
.R(reset), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
// End of ODDR2_inst instantiation
// IODDR2 primitives are needed because the 'dq' signals are of double-data-rate
// https://www.xilinx.com/support/documentation/sw_manuals/xilinx14_7/spartan6_hdl.pdf#page=123
// IDDR2: Input Double Data Rate Input Register with Set, Reset and Clock Enable.
// Spartan-6
// Xilinx HDL Libraries Guide, version 14.7
/*
IDDR2 #(
.DDR_ALIGNMENT("NONE"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT_Q0(1'b0), // Sets initial state of the Q0 output to 1'b0 or 1'b1
.INIT_Q1(1'b0), // Sets initial state of the Q1 output to 1'b0 or 1'b1
.SRTYPE("SYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
IDDR2_dq_r(
.Q0(dq_r_q0[dq_index]), // 1-bit output captured with C0 clock
.Q1(dq_r_q1[dq_index]), // 1-bit output captured with C1 clock
.C0(ck_dynamic_90), // 1-bit clock input
.C1(ck_dynamic_270), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D(dq_r[dq_index]), // 1-bit DDR data input
.R(reset), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
// End of IDDR2_inst instantiation
*/
// https://www.xilinx.com/support/documentation/user_guides/ug381.pdf#page=51
// IDDR2 is re-coded in verilog fabric due to same clock restriction of IODDR which leads to routing issue
// the use of ck_dynamic_90 instead of ck_dynamic is due to the reason:
// for DQ centering, incoming DQ bits have 0 phase shift with respect to incoming DQS strobe
always @(posedge ck_dynamic_90)
begin
if(reset) dq_r_q0_ck_dynamic_90[dq_index] <= 0;
else dq_r_q0_ck_dynamic_90[dq_index] <= dq_r[dq_index];
end
always @(negedge ck_dynamic_90) // always @(posedge ck_dynamic_270)
begin
if(reset) dq_r_q1_ck_dynamic_90[dq_index] <= 0;
else dq_r_q1_ck_dynamic_90[dq_index] <= dq_r[dq_index];
end
// ODDR2: Input Double Data Rate Output Register with Set, Reset and Clock Enable.
// Spartan-6
// Xilinx HDL Libraries Guide, version 14.7
ODDR2 #(
.DDR_ALIGNMENT("C1"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("ASYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_dq_w(
.Q(dq_w[dq_index]), // 1-bit DDR output data
.C0(ck_90), // 1-bit clock input
.C1(ck_270), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(dq_w_d0[dq_index]), // 1-bit DDR data input (associated with C0)
.D1(dq_w_d1[dq_index]), // 1-bit DDR data input (associated with C1)
.R(reset), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
// End of ODDR2_inst instantiation
// IODELAY2 primitive is not used due to some internal hardware issues as described in
// https://www.xilinx.com/support/answers/38408.html
/*
// See https://www.xilinx.com/support/documentation/user_guides/ug381.pdf#page=51
// xilinx specs says "Only possible when the two BUFGs are common for both input and output" or similar
// This means that the input and output clocks must be the same
// Or, the read sampling clock and the write transmitter clock must be the same (denoted as ck)
// In other words, while you read the MPR_Read_function test calibration pattern from DDR3 chip,
// you shift DQS to be in phase with the clock and maintain it that way.
// You know how big is the shift, so you know how much you need to shift DQ to move it to the point
// where the sampling clock will be centred in the DQ bit.
// Note: "VARIABLE_FROM_HALF_MAX" is used to emulate 90 degrees phase shift.
IODELAY2 #(
.DATA_RATE ("DDR"), // <SDR>, DDR
.IDELAY_VALUE (0), // {0 ... 255}
.IDELAY2_VALUE (0), // {0 ... 255}
.IDELAY_MODE ("NORMAL" ), // NORMAL, PCI
.ODELAY_VALUE (0), // {0 ... 255}
.IDELAY_TYPE ("VARIABLE_FROM_HALF_MAX"),// "DEFAULT", "DIFF_PHASE_DETECTOR", "FIXED", "VARIABLE_FROM_HALF_MAX", "VARIABLE_FROM_ZERO"
.COUNTER_WRAPAROUND ("WRAPAROUND" ), // <STAY_AT_LIMIT>, WRAPAROUND
.DELAY_SRC ("IDATAIN" ), // "IO", "IDATAIN", "ODATAIN"
.SERDES_MODE ("NONE") // <NONE>, MASTER, SLAVE
)
iodelay_dq_r (
.IDATAIN (dq_r[dq_index]), // data from primary IOB
.TOUT (), // tri-state signal to IOB
.DOUT (), // output data to IOB
.T (1'b1), // tri-state control from OLOGIC/OSERDES2
.ODATAIN (1'b0), // data from OLOGIC/OSERDES2
.DATAOUT (delayed_dq_r[dq_index]), // Output data 1 to ILOGIC/ISERDES2
.DATAOUT2 (), // Output data 2 to ILOGIC/ISERDES2
.IOCLK0 (ck_90), // High speed clock for calibration
.IOCLK1 (ck_270), // High speed clock for calibration
.CLK (clk), // Fabric clock (GCLK) for control signals
// Note that my read clock is parallel for all DQ bits as well as the DQS.
// I do not have any individual tuning skew adjustments on any of the DQ pins.
// Everything is sampled and transmitted in parallel.
// In other words, the parallel DQ bits group is assumed to be length-matched
// So, all DQ bits will experience the exact same delay value (similar CAL, INC signals)
// See https://www.eevblog.com/forum/fpga/ddr3-initialization-sequence-issue/msg3601621/#msg3601621
// Might need to change this calibration decision in later part of project for further improvement
.CAL (idelay_cal_dqs_r), // Calibrate control signal
.INC (idelay_inc_dqs_r), // Increment counter
.CE (idelay_counter_enable), // Enable counter increment/decrement
.RST (idelay_is_busy_previously & (~idelay_is_busy)), // Reset delay line
.BUSY () // output signal indicating sync circuit has finished / calibration has finished
);
*/
/*
The following ODELAY for dq_w is not used.
Reason: The output of ODDR2 primitive needs to connect to the I port of IOBUF primitive
by bypassing ODELAY in order to avoid ERROR:PACK 2530 error from ISE tool
// Initially the RAM controller uses ck_90 to drive DQ bits directly to IOBUF without using ODELAY.
// However, there is some underlying xilinx spartan-6 hardware limitations where this is not possible.
// The output from ODDR2 primitive can only be routed to ILOGIC, IODELAY, and IOB
// the IODELAY2 primitives for DQ bits could not be shared between read and write operations
// because if they are to be shared, they would be some combinational logic to select between
// read and write operations which is not helpful at all for read operations.
// Note that for read pipeline, IDELAY is used before ISERDES, which means any extra logic for input of
// IDELAY will slow things down significantly until the read operations might fail to calibrate delay
IODELAY2 #(
.DATA_RATE ("DDR"), // <SDR>, DDR
.IDELAY_VALUE (0), // {0 ... 255}
.IDELAY2_VALUE (0), // {0 ... 255}
.IDELAY_MODE ("NORMAL" ), // NORMAL, PCI
.ODELAY_VALUE (0), // {0 ... 255}
.IDELAY_TYPE ("VARIABLE_FROM_HALF_MAX"),// "DEFAULT", "DIFF_PHASE_DETECTOR", "FIXED", "VARIABLE_FROM_HALF_MAX", "VARIABLE_FROM_ZERO"
.COUNTER_WRAPAROUND ("WRAPAROUND" ), // <STAY_AT_LIMIT>, WRAPAROUND
.DELAY_SRC ("IDATAIN" ), // "IO", "IDATAIN", "ODATAIN"
.SERDES_MODE ("NONE") // <NONE>, MASTER, SLAVE
)
iodelay_dq_w (
.IDATAIN (dq_w[dq_index]), // data from primary IOB
.TOUT (), // tri-state signal to IOB
.DOUT (), // output data to IOB
.T (1'b1), // tri-state control from OLOGIC/OSERDES2
.ODATAIN (1'b0), // data from OLOGIC/OSERDES2
.DATAOUT (delayed_dq_w[dq_index]), // Output data 1 to ILOGIC/ISERDES2
.DATAOUT2 (), // Output data 2 to ILOGIC/ISERDES2
.IOCLK0 (ck), // High speed clock for calibration
.IOCLK1 (ck_180), // High speed clock for calibration
.CLK (clk), // Fabric clock (GCLK) for control signals
// Note that my read clock is parallel for all DQ bits as well as the DQS.
// I do not have any individual tuning skew adjustments on any of the DQ pins.
// Everything is sampled and transmitted in parallel.
// In other words, the parallel DQ bits group is assumed to be length-matched
// So, all DQ bits will experience the exact same delay value (similar CAL, INC signals)
// See https://www.eevblog.com/forum/fpga/ddr3-initialization-sequence-issue/msg3601621/#msg3601621
// Might need to change this calibration decision in later part of project for further improvement
.CAL (idelay_cal_dqs_r), // Calibrate control signal
.INC (idelay_inc_dqs_r), // Increment counter
.CE (idelay_counter_enable), // Enable counter increment/decrement
.RST (idelay_is_busy_previously & (~idelay_is_busy)), // Reset delay line
.BUSY () // output signal indicating sync circuit has finished / calibration has finished
);
*/
end
endgenerate
`endif
`ifndef HIGH_SPEED
`ifndef USE_x16
assign dqs = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
dqs_w : {DQS_BITWIDTH{1'bz}}; // dqs value of 1'bz is for input
// assign dqs_r = dqs; // only for formal modelling of tri-state logic
assign dqs_n = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
dqs_n_w : {DQS_BITWIDTH{1'bz}}; // dqs value of 1'bz is for input
// assign dqs_n_r = dqs_n; // only for formal modelling of tri-state logic
assign dq = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
dq_w : {DQ_BITWIDTH{1'bz}}; // dq value of 1'bz is for input
// assign dq_r = dq; // only for formal modelling of tri-state logic
`else
assign ldqs = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
ldqs_w : {(DQS_BITWIDTH >> 1){1'bz}}; // dqs value of 1'bz is for input
// assign ldqs_r = ldqs; // only for formal modelling of tri-state logic
assign ldqs_n = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
ldqs_n_w : {(DQS_BITWIDTH >> 1){1'bz}}; // dqs value of 1'bz is for input
// assign ldqs_n_r = ldqs_n; // only for formal modelling of tri-state logic
assign ldq = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
ldq_w : {(DQ_BITWIDTH >> 1){1'bz}}; // dq value of 1'bz is for input
// assign ldq_r = ldq; // only for formal modelling of tri-state logic
assign udqs = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
udqs_w : {(DQS_BITWIDTH >> 1){1'bz}}; // dqs value of 1'bz is for input
// assign udqs_r = udqs; // only for formal modelling of tri-state logic
assign udqs_n = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
udqs_n_w : {(DQS_BITWIDTH >> 1){1'bz}}; // dqs value of 1'bz is for input
// assign udqs_n_r = udqs_n; // only for formal modelling of tri-state logic
assign udq = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
udq_w : {(DQ_BITWIDTH >> 1){1'bz}}; // dq value of 1'bz is for input
// assign udq_r = udq; // only for formal modelling of tri-state logic
assign dq = {udq, ldq};
assign dqs = {udqs, ldqs};
assign dqs_n = {udqs_n, ldqs_n};
`endif
`endif
`ifdef FORMAL
initial assume(reset);
/*
reg reset_extended;
always @(posedge clk_serdes)
begin
if(reset) reset_extended <= 1;
else reset_extended <= reset;
end
always @(posedge clk_serdes) // reset extender
begin
if(($past(reset) == 1) && (reset_extended) && (!$past(reset_extended))) assume(reset);
end
*/
assign dqs = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
dqs_w : {DQS_BITWIDTH{1'bz}}; // dqs value of 1'bz is for input
assign dqs_r = dqs; // only for formal modelling of tri-state logic
assign dqs_n = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
dqs_n_w : {DQS_BITWIDTH{1'bz}}; // dqs value of 1'bz is for input
assign dqs_n_r = dqs_n; // only for formal modelling of tri-state logic
assign dq = ((main_state == STATE_WRITE) || (main_state == STATE_WRITE_AP) ||
(main_state == STATE_WRITE_DATA)) ?
dq_w : {DQ_BITWIDTH{1'bz}}; // dq value of 1'bz is for input
assign dq_r = dq; // only for formal modelling of tri-state logic
reg first_clock_had_passed;
initial first_clock_had_passed = 0;
always @(posedge clk_serdes)
begin
if(reset) first_clock_had_passed <= 0;
else first_clock_had_passed <= 1;
end
always @(posedge clk_serdes)
begin
if(first_clock_had_passed)
begin
// cover(main_state == STATE_RESET_FINISH);
// cover(main_state == STATE_INIT_CLOCK_ENABLE);
// cover(main_state == STATE_INIT_MRS_2);
// cover(main_state == STATE_INIT_MRS_3);
// cover(main_state == STATE_ZQ_CALIBRATION);
cover(main_state == STATE_READ_DATA); // to obtain a RAM read transaction waveform
cover(main_state == STATE_WRITE_DATA); // to obtain a RAM write transaction waveform
end
end
always @(posedge clk_serdes)
begin
if(data_write_is_ongoing)
begin
assert(dqs == dqs_w);
end
else assert(dqs == dqs_r);
end
always @(posedge clk_serdes)
begin
if(data_write_is_ongoing)
begin
assert(dqs_n == dqs_n_w);
end
else assert(dqs_n == dqs_n_r);
end
always @(posedge clk_serdes)
begin
if(data_write_is_ongoing)
begin
assert(dq == dq_w);
end
else assert(dq == dq_r);
end
`endif
`ifndef USE_ILA
reg [$clog2(MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED):0] refresh_Queue;
`endif
// It is not a must that all 8 postponed REF-commands have to be executed inside a single tREFI
`ifdef USE_ILA
assign low_Priority_Refresh_Request = (refresh_Queue != MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED);
assign high_Priority_Refresh_Request = (refresh_Queue >= HIGH_REFRESH_QUEUE_THRESHOLD);
`else
wire low_Priority_Refresh_Request = (refresh_Queue != MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED);
wire high_Priority_Refresh_Request = (refresh_Queue >= HIGH_REFRESH_QUEUE_THRESHOLD);
`endif
`ifndef USE_ILA
// to propagate 'write_enable' and 'read_enable' signals during STATE_IDLE to STATE_WRITE and STATE_READ
reg write_is_enabled;
reg read_is_enabled;
`endif
`ifdef USE_x16
// no data masking
assign ldm = 0;
assign udm = 0;
`endif
reg [$clog2(MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED*TIME_TREFI)-1:0] postponed_refresh_timing_count;
reg [$clog2(TIME_TREFI)-1:0] refresh_timing_count;
wire extra_read_or_write_cycles_had_passed // to allow burst read or write operations to proceed first
= (postponed_refresh_timing_count ==
`ifndef XILINX
user_desired_extra_read_or_write_cycles*TIME_TREFI[0 +: $clog2(TIME_TREFI)]); // for verilator warning
`else
user_desired_extra_read_or_write_cycles*TIME_TREFI[0 +: 9]);
`endif
wire it_is_time_to_do_refresh_now // tREFI is the "average" interval between REFRESH commands
`ifndef XILINX
= (refresh_timing_count == TIME_TREFI[0 +: $clog2(TIME_TREFI)]); // for verilator warning
`else
= (refresh_timing_count == TIME_TREFI[0 +: 9]);
`endif
/* PLL dynamic phase shift is used in lieu of IODELAY2 primitive
`ifdef HIGH_SPEED
// for phase-shifting incoming read DQS strobe with respect to 'ck' signal
localparam JITTER_MARGIN_FOR_DQS_SAMPLING = 2;
reg dqs_delay_sampling_margin;
reg previous_delayed_dqs_r;
`endif
*/
`ifdef HIGH_SPEED
reg need_to_assert_reset_clk;
always @(posedge clk) // 'clk_serdes' or 'ck' is only turned on after clk is turned on
begin
if(reset) need_to_assert_reset_clk <= 1;
else if(locked_previous) need_to_assert_reset_clk <= 0;
end
`ifdef USE_SERDES
always @(posedge clk_serdes) locked_previous <= locked;
`else
always @(posedge ck) locked_previous <= locked;
`endif
// localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DOMAIN = 3;
// to synchronize signal in clk domain to ck domain
// reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DOMAIN-1:0] need_to_assert_reset_ck;
genvar ff_clk_ck;
generate
for(ff_clk_ck = 0; ff_clk_ck < NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DOMAIN;
ff_clk_ck = ff_clk_ck + 1)
begin: clk_to_ck
always @(posedge ck)
begin
if(reset) need_to_assert_reset_ck[ff_clk_ck] <= 0;
else begin
if(ff_clk_ck == 0)
need_to_assert_reset_ck[ff_clk_ck] <= need_to_assert_reset_clk;
else need_to_assert_reset_ck[ff_clk_ck] <=
need_to_assert_reset_ck[ff_clk_ck-1];
end
end
end
endgenerate
// localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_270_DOMAIN = 3;
// to synchronize signal in clk domain to ck_270 domain
// reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_270_DOMAIN-1:0] need_to_assert_reset_ck_270;
genvar ff_clk_ck_270;
generate
for(ff_clk_ck_270 = 0;
ff_clk_ck_270 < NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_270_DOMAIN;
ff_clk_ck_270 = ff_clk_ck_270 + 1)
begin: clk_to_ck_270
always @(posedge ck_270)
begin
if(reset) need_to_assert_reset_ck_270[ff_clk_ck_270] <= 0;
else begin
if(ff_clk_ck_270 == 0)
need_to_assert_reset_ck_270[ff_clk_ck_270] <= need_to_assert_reset_clk;
else need_to_assert_reset_ck_270[ff_clk_ck_270] <=
need_to_assert_reset_ck_270[ff_clk_ck_270-1];
end
end
end
endgenerate
// localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DYNAMIC_90_DOMAIN = 3;
// to synchronize signal in clk domain to ck_dynamic_90 domain
// reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DYNAMIC_90_DOMAIN-1:0] need_to_assert_reset_ck_dynamic_90;
genvar ff_clk_ck_dynamic_90;
generate
for(ff_clk_ck_dynamic_90 = 0; ff_clk_ck_dynamic_90 < NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DYNAMIC_90_DOMAIN;
ff_clk_ck_dynamic_90 = ff_clk_ck_dynamic_90 + 1)
begin: clk_to_ck_dynamic_90
always @(posedge ck_dynamic_90)
begin
if(reset) need_to_assert_reset_ck_dynamic_90[ff_clk_ck_dynamic_90] <= 0;
else begin
if(ff_clk_ck_dynamic_90 == 0)
need_to_assert_reset_ck_dynamic_90[ff_clk_ck_dynamic_90] <= need_to_assert_reset_clk;
else need_to_assert_reset_ck_dynamic_90[ff_clk_ck_dynamic_90] <=
need_to_assert_reset_ck_dynamic_90[ff_clk_ck_dynamic_90-1];
end
end
end
endgenerate
assign need_to_assert_reset =
need_to_assert_reset_ck_dynamic_90[NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_DOMAIN_TO_CK_DYNAMIC_90_DOMAIN-1];
// to solve STA setup timing violation due to 'wait_count'
localparam [FIXED_POINT_BITWIDTH-1:0] COUNTER_INCREMENT_VALUE = 512;
reg [$clog2(COUNTER_INCREMENT_VALUE):0] counter_state;
reg [$clog2(MAX_TIMING/COUNTER_INCREMENT_VALUE):0] num_of_increment_done;
// See https://www.eevblog.com/forum/fpga/brianhg_ddr3_controller-open-source-ddr3-controller/msg3805064/#msg3805064
// for an explanation of using half-rate on the commands generation,
// but still achieving full-rate DRAM commands transaction with the usage of
// either (an OSERDES with a serialization factor of 2) or (2 words ck/2 in, ck out FIFO),
// and command enqueue/dequeue signal which take into account of the number of ck cycles had passed.
// This is to get around the STA setup timing violation issues related to commands generation block.
// to generate a signal that only enqueues the 333.333MHz FIFO with 83.333MHz input ONCE
// 333.333MHz (ck_180) and 83.333MHz (clk_serdes) have the same 180 phase shift and are generated from same PLL
// hence eliminates the need for asynchronous FIFO and its complicated CDC issue
reg enqueue_dram_command_bits;
reg previous_enqueue_dram_command_bits;
always @(posedge ck_180)
previous_enqueue_dram_command_bits <= enqueue_dram_command_bits;
wire fifo_command_is_empty;
// https://www.eevblog.com/forum/fpga/ddr3-initialization-sequence-issue/msg3668329/#msg3668329
localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_SERDES_DOMAIN_TO_CK_180_DOMAIN = 3;
// to synchronize signal in clk_serdes domain to ck_180 domain
reg [NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_SERDES_DOMAIN_TO_CK_180_DOMAIN-1:0] enqueue_dram_command_bits_ck_180;
genvar ff_clk_serdes_ck_180;
generate
for(ff_clk_serdes_ck_180 = 0;
ff_clk_serdes_ck_180 < NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_SERDES_DOMAIN_TO_CK_180_DOMAIN;
ff_clk_serdes_ck_180 = ff_clk_serdes_ck_180 + 1)
begin: clk_serdes_to_ck_180
always @(posedge ck_180)
begin
if(reset)
begin
enqueue_dram_command_bits_ck_180[ff_clk_serdes_ck_180] <= 0;
end
else begin
if(ff_clk_serdes_ck_180 == 0)
begin
enqueue_dram_command_bits_ck_180[ff_clk_serdes_ck_180] <= enqueue_dram_command_bits;
end
else begin
enqueue_dram_command_bits_ck_180[ff_clk_serdes_ck_180] <=
enqueue_dram_command_bits_ck_180[ff_clk_serdes_ck_180-1];
end
end
end
end
endgenerate
parameter NUM_OF_DRAM_COMMAND_BITS = 7;
// prepends the DDR command signals with "r_" so as to
// differentiate between the stored FIFO signals and actual signals sent to DRAM
reg r_ck_en, r_cs_n, r_ras_n, r_cas_n, r_we_n, r_reset_n, r_odt;
reg [ADDRESS_BITWIDTH-1:0] r_address;
reg [BANK_ADDRESS_BITWIDTH-1:0] r_bank_address;
// {ck_en, cs_n, ras_n, cas_n, we_n, reset_n, odt, address, bank_address}
reg [NUM_OF_DRAM_COMMAND_BITS-1:0] dram_command_bits_to_be_sent_to_dram;
reg [NUM_OF_DRAM_COMMAND_BITS-1:0] dram_command_bits_sent_to_dram; // data alignment due to write AL latency
reg [ADDRESS_BITWIDTH-1:0] dram_address_bits_to_be_sent_to_dram;
reg [ADDRESS_BITWIDTH-1:0] dram_address_bits_sent_to_dram; // data alignment due to write AL latency
reg [BANK_ADDRESS_BITWIDTH-1:0] dram_bank_address_bits_to_be_sent_to_dram;
reg [BANK_ADDRESS_BITWIDTH-1:0] dram_bank_address_bits_sent_to_dram; // data alignment due to write AL latency
wire [NUM_OF_DRAM_COMMAND_BITS-1:0] fifo_command_dequeue_value;
wire [NUM_OF_DRAM_COMMAND_BITS-1:0] NOP_DRAM_COMMAND_BITS =
// keeps the values of 'r_ck_en' and 'r_reset_n' since they are at logic '0' during DRAM initialization
{r_ck_en, 1'b0, 1'b1, 1'b1, 1'b1, r_reset_n, 1'b0};
wire [NUM_OF_DRAM_COMMAND_BITS-1:0] dram_command_bits_clk_serdes =
{r_ck_en, r_cs_n, r_ras_n, r_cas_n, r_we_n, r_reset_n, r_odt};
wire [NUM_OF_DRAM_COMMAND_BITS-1:0] dram_command_bits_ck_180;
wire [ADDRESS_BITWIDTH-1:0] dram_address_bits_ck_180;
wire [BANK_ADDRESS_BITWIDTH-1:0] dram_bank_address_bits_ck_180;
// for synchronizing multi-bits signals from clk_serdes domain to ck_180 domain
wire afifo_main_state_is_empty;
wire afifo_main_state_is_full;
wire afifo_dram_command_bits_is_empty;
wire afifo_dram_command_bits_is_full;
wire afifo_dram_address_bits_is_empty;
wire afifo_dram_address_bits_is_full;
wire afifo_dram_bank_address_bits_is_empty;
wire afifo_dram_bank_address_bits_is_full;
parameter CLOCK_FACTOR_BETWEEN_CLK_SERDES_AND_CK = CLK_SERDES_PERIOD/CK_PERIOD;
//parameter num_of_afifo_main_state_entries = 1 << $clog2(CLOCK_FACTOR_BETWEEN_CLK_SERDES_AND_CK);
async_fifo
#(
.WIDTH($clog2(NUM_OF_DDR_STATES)),
.NUM_ENTRIES(),
.TO_SIMPLIFY_FULL_LOGIC(1),
.TO_SIMPLIFY_EMPTY_LOGIC(0)
)
afifo_main_state
(
.write_reset(reset),
.read_reset(reset),
// Read.
.read_clk(ck_180),
.read_en(1'b1),
.read_data(main_state_ck_180),
.empty(afifo_main_state_is_empty),
// Write
.write_clk(clk_serdes),
.write_en(1'b1),
.full(afifo_main_state_is_full),
.write_data(main_state)
);
async_fifo
#(
.WIDTH(NUM_OF_DRAM_COMMAND_BITS),
.NUM_ENTRIES(),
.TO_SIMPLIFY_FULL_LOGIC(1),
.TO_SIMPLIFY_EMPTY_LOGIC(0)
)
afifo_dram_command_bits
(
.write_reset(reset),
.read_reset(reset),
// Read.
.read_clk(ck_180),
.read_en(1'b1),
.read_data(dram_command_bits_ck_180),
.empty(afifo_dram_command_bits_is_empty),
// Write
.write_clk(clk_serdes),
.write_en(1'b1),
.full(afifo_dram_command_bits_is_full),
.write_data(dram_command_bits_clk_serdes)
);
async_fifo
#(
.WIDTH(ADDRESS_BITWIDTH),
.NUM_ENTRIES(),
.TO_SIMPLIFY_FULL_LOGIC(1),
.TO_SIMPLIFY_EMPTY_LOGIC(0)
)
afifo_dram_address_bits
(
.write_reset(reset),
.read_reset(reset),
// Read.
.read_clk(ck_180),
.read_en(1'b1),
.read_data(dram_address_bits_ck_180),
.empty(afifo_dram_address_bits_is_empty),
// Write
.write_clk(clk_serdes),
.write_en(1'b1),
.full(afifo_dram_address_bits_is_full),
.write_data(r_address)
);
async_fifo
#(
.WIDTH(BANK_ADDRESS_BITWIDTH),
.NUM_ENTRIES(),
.TO_SIMPLIFY_FULL_LOGIC(1),
.TO_SIMPLIFY_EMPTY_LOGIC(0)
)
afifo_dram_bank_address_bits
(
.write_reset(reset),
.read_reset(reset),
// Read.
.read_clk(ck_180),
.read_en(1'b1),
.read_data(dram_bank_address_bits_ck_180),
.empty(afifo_dram_bank_address_bits_is_empty),
// Write
.write_clk(clk_serdes),
.write_en(1'b1),
.full(afifo_dram_bank_address_bits_is_full),
.write_data(r_bank_address)
);
async_fifo
#(
.WIDTH($clog2(MAX_WAIT_COUNT)+1),
.NUM_ENTRIES(),
.TO_SIMPLIFY_FULL_LOGIC(1),
.TO_SIMPLIFY_EMPTY_LOGIC(0)
)
afifo_wait_count
(
.write_reset(reset),
.read_reset(reset),
// Read.
.read_clk(ck_180),
.read_en(1'b1),
.read_data(wait_count_ck_180),
.empty(afifo_wait_count_is_empty),
// Write
.write_clk(clk_serdes),
.write_en(1'b1),
.full(afifo_wait_count_is_full),
.write_data(wait_count)
);
reg is_STATE_READ_AP;
always @(posedge ck_180) is_STATE_READ_AP <= (main_state == STATE_READ_AP);
reg about_to_issue_rdap_command;
always @(posedge ck_180)
begin
about_to_issue_rdap_command <=
(wait_count_ck_180 == (NUM_OF_READ_PIPELINE_REGISTER_ADDED +
NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_90_DOMAIN-1));
end
reg issue_actual_rdap_command_now, previous_issue_actual_rdap_command_now;
always @(posedge ck_180)
issue_actual_rdap_command_now <= (is_STATE_READ_AP && about_to_issue_rdap_command);
always @(posedge ck_180)
previous_issue_actual_rdap_command_now <= issue_actual_rdap_command_now;
reg after_new_command_is_issued;
reg main_state_remains_the_same;
reg no_need_to_issue_rdap_command;
reg previous_enqueue_dram_command_bits_ck_180;
always @(posedge ck_180)
previous_enqueue_dram_command_bits_ck_180 <=
enqueue_dram_command_bits_ck_180[NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_SERDES_DOMAIN_TO_CK_180_DOMAIN-1];
always @(posedge ck_180)
main_state_remains_the_same <=
//(enqueue_dram_command_bits_ck_180[NUM_OF_FF_SYNCHRONIZERS_FOR_CLK_SERDES_DOMAIN_TO_CK_180_DOMAIN-1] &&
// ~previous_enqueue_dram_command_bits_ck_180); // not used due to post P&R STA setup timing violation
(previous_main_state_ck_180 == main_state_ck_180);
always @(posedge ck_180) // rdap command needs to be issued only ONCE
no_need_to_issue_rdap_command <= (issue_actual_rdap_command_now == previous_issue_actual_rdap_command_now);
always @(posedge ck_180)
after_new_command_is_issued <= (main_state_remains_the_same);// && (no_need_to_issue_rdap_command);
always @(posedge ck_180)
begin
if(after_new_command_is_issued)
begin
dram_command_bits_to_be_sent_to_dram <= NOP_DRAM_COMMAND_BITS; // sends NOP command to DRAM
dram_address_bits_to_be_sent_to_dram <= 0; // don't care in NOP
dram_bank_address_bits_to_be_sent_to_dram <= 0; // don't care in NOP
//else dram_command_bits_to_be_sent_to_dram <= fifo_command_dequeue_value; // keep the DRAM command unchanged
end
else begin
dram_command_bits_to_be_sent_to_dram <= dram_command_bits_ck_180; // new DRAM command
dram_address_bits_to_be_sent_to_dram <= dram_address_bits_ck_180; // new DRAM address
dram_bank_address_bits_to_be_sent_to_dram <= dram_bank_address_bits_ck_180; // new DRAM bank address
end
end
// data alignment due to write AL latency
always @(posedge ck_180) dram_command_bits_sent_to_dram <= dram_command_bits_to_be_sent_to_dram;
always @(posedge ck_180) dram_address_bits_sent_to_dram <= dram_address_bits_to_be_sent_to_dram;
always @(posedge ck_180) dram_bank_address_bits_sent_to_dram <= dram_bank_address_bits_to_be_sent_to_dram;
assign {ck_en, cs_n, ras_n, cas_n, we_n, reset_n, odt, address, bank_address} =
{dram_command_bits_sent_to_dram, dram_address_bits_sent_to_dram, dram_bank_address_bits_sent_to_dram};
// the purpose of using FIFO instead of just a register is
// to allow stuffing multiple user request commands where permitted in between command execution inside DRAM
// One example would be where other banks may be activated while a write command was just sent
// and a write burst is taking place.
sync_fifo
#(
.WIDTH(NUM_OF_DRAM_COMMAND_BITS),
.SIZE(4),
.ALMOST_FULL_THRESHOLD(1)
//.ALMOST_EMPTY_THRESHOLD(1)
)
fifo_command
(
.clk(ck_180), // 333.333MHz
.reset(reset),
.full(),
.almost_full(),
// such that 83.333MHz signal is only sampled once, assuming no immediate consecutive DRAM commands
.enqueue_en(~previous_enqueue_dram_command_bits & enqueue_dram_command_bits),
.enqueue_value(dram_command_bits),
.empty(fifo_command_is_empty),
//.almost_empty(),
// it is always dequeued to satisfy DRAM manufacturer timing,
// but need to change for tRRD (ACTIVATE command when write burst is still ongoing) later
.dequeue_en(1'b1),
.dequeue_value(fifo_command_dequeue_value)
);
always @(posedge ck_180)
begin
if(reset) previous_main_state_ck_180 <= STATE_RESET;
else previous_main_state_ck_180 <= main_state_ck_180;
end
`endif
reg [$clog2(NUM_OF_WRITE_DATA/DATA_BURST_LENGTH):0] num_of_data_write_burst_had_finished;
reg [$clog2(NUM_OF_READ_DATA/DATA_BURST_LENGTH):0] num_of_data_read_burst_had_finished;
`ifdef HIGH_SPEED
always @(posedge clk_serdes) // 83.333MHz
`else
always @(posedge clk)
`endif
begin
if(reset)
begin
main_state <= STATE_RESET;
previous_main_state <= STATE_RESET;
enqueue_dram_command_bits <= 1;
r_ck_en <= 0;
// low-level signals (except reset_n) are asserted high initially
r_cs_n <= 1;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
// 200 us is required before RST_N goes inactive.
// CKE must be maintained inactive for 10 ns before RST_N goes inactive.
r_reset_n <= 0;
r_odt <= 0;
r_address <= 0;
r_bank_address <= 0;
wait_count <= 0;
counter_state <= 0;
num_of_increment_done <= 0;
refresh_Queue <= 0;
postponed_refresh_timing_count <= 0;
refresh_timing_count <= 0;
MPR_ENABLE <= 0;
MPR_Read_had_finished <= 0;
write_is_enabled <= 0;
read_is_enabled <= 0;
num_of_data_write_burst_had_finished <= 0;
num_of_data_read_burst_had_finished <= 0;
data_read_is_ongoing <= 0;
/* PLL dynamic phase shift is used in lieu of IODELAY2 primitive
`ifdef HIGH_SPEED
// such that the first phase delay calibration iteration does not abort
dqs_delay_sampling_margin <= JITTER_MARGIN_FOR_DQS_SAMPLING;
idelay_inc_dqs_r <= 0;
idelay_counter_enable <= 0;
`endif
*/
end
`ifdef HIGH_SPEED
else
`else
// DDR signals are 90 degrees phase-shifted in advance
// with reference to outgoing 'clk' (clk_slow) signal to DDR RAM
// such that all outgoing DDR signals are sampled in the middle of during posedge(ck)
// For more info, see the initialization sequence : https://i.imgur.com/JClPQ6G.png
// since clocked always block only updates the new data at the next clock cycle,
// clk90_slow_posedge is used instead of clk180_slow_posedge to produce a new data
// that is 180 degree phase-shifted, for which the data will be sampled in the middle by 'clk_slow' ('clk')
// Since DIVIDE_RATIO=4, so in half clock period for fast 'ck' signal, there are 2 slow 'clk' cycles
// Therefore, clk90_slow_posedge is 1 'clk' cycle in advance/early with comparison to clk180_slow_posedge
// The purpose of doing so is to have larger setup and hold timing margin for positive edge of clk_slow,
// while still obeying DDR3 datasheet specifications
else if(clk90_slow_posedge) // generates new data at 180 degrees before positive edge of clk_slow
`endif
begin
if(write_enable) write_is_enabled <= 1;
if(read_enable) read_is_enabled <= 1;
data_read_is_ongoing <= 0;
wait_count <= wait_count + 1;
previous_main_state <= main_state;
/* PLL dynamic phase shift is used in lieu of IODELAY2 primitive
`ifdef HIGH_SPEED
previous_delayed_dqs_r <= delayed_dqs_r;
`endif
*/
if(extra_read_or_write_cycles_had_passed) postponed_refresh_timing_count <= 0;
else postponed_refresh_timing_count <= postponed_refresh_timing_count + 1;
if(it_is_time_to_do_refresh_now) refresh_timing_count <= 0;
else refresh_timing_count <= refresh_timing_count + 1;
if(~locked) // PLL outputs are not locked to desired frequencies
begin
main_state <= STATE_PLL_LOCK_ISSUE; // PLL debug state
end
// defaults the command signals high & only pulse low for the 1 clock when need to issue a command.
r_cs_n <= 1;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
enqueue_dram_command_bits <= 0;
// https://i.imgur.com/VUdYasX.png
// See https://www.systemverilog.io/ddr4-initialization-and-calibration
case(main_state)
// reset active, wait for 200us, reset inactive, wait for 500us, CKE=1,
// then, wait for tXPR = 10ns + tRFC = 10ns + 110ns (tRFC of 1GB memory = 110ns),
// Then the MRS commands begin.
STATE_RESET : // https://i.imgur.com/ePuqhsY.png
begin
r_ck_en <= 0;
//if(wait_count[$clog2(TIME_INITIAL_RESET_ACTIVE):0] > TIME_INITIAL_RESET_ACTIVE-1)
if(num_of_increment_done[$clog2(TIME_INITIAL_RESET_ACTIVE/COUNTER_INCREMENT_VALUE):0] >
(TIME_INITIAL_RESET_ACTIVE/COUNTER_INCREMENT_VALUE))
begin
r_reset_n <= 1; // reset inactive
main_state <= STATE_RESET_FINISH;
wait_count <= 0;
counter_state <= 0;
num_of_increment_done <= 0;
enqueue_dram_command_bits <= 1;
end
else begin
r_reset_n <= 0; // reset active
main_state <= STATE_RESET;
enqueue_dram_command_bits <= 0;
end
// The following code is trying to solve the setup timing violation brought by
// large comparison hardware for signal with long bitwidth such as 'wait_count'
// In other words, the following code is doing increment for the 'wait_count' signal
// in multiple consecutive stages
if(counter_state == COUNTER_INCREMENT_VALUE)
begin
counter_state <= 1;
num_of_increment_done <= num_of_increment_done + 1;
end
else begin
counter_state <= counter_state + 1;
end
end
STATE_RESET_FINISH :
begin
// ODT must be driven LOW at least tIS prior to CKE being registered HIGH.
// For tIS, see https://i.imgur.com/kiJI0pY.png or
// the section "Command and Address Setup, Hold, and Derating" inside
// https://media-www.micron.com/-/media/client/global/documents/products/data-sheet/dram/ddr3/2gb_ddr3_sdram.pdf#page=99
// as well as the JESD79-3F DDR3 SDRAM Standard which adds further derating which means
// another 25 ps to account for the earlier reference point
r_odt <= 0; // tIs = 195ps (170ps+25ps) , this does not affect anything at low speed testing mode
//if(wait_count > TIME_INITIAL_CK_INACTIVE-1)
if(num_of_increment_done[$clog2(TIME_INITIAL_CK_INACTIVE/COUNTER_INCREMENT_VALUE):0] >
(TIME_INITIAL_CK_INACTIVE/COUNTER_INCREMENT_VALUE))
begin
r_ck_en <= 1; // CK active
main_state <= STATE_INIT_CLOCK_ENABLE;
wait_count <= 0;
counter_state <= 0;
num_of_increment_done <= 0;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_RESET_FINISH;
enqueue_dram_command_bits <= 0;
end
// The following code is trying to solve the setup timing violation brought by
// large comparison hardware for signal with long bitwidth such as 'wait_count'
// In other words, the following code is doing increment for the 'wait_count' signal
// in multiple consecutive stages
if(counter_state == COUNTER_INCREMENT_VALUE)
begin
counter_state <= 1;
num_of_increment_done <= num_of_increment_done + 1;
end
else begin
counter_state <= counter_state + 1;
end
end
STATE_INIT_CLOCK_ENABLE :
begin
r_ck_en <= 1; // CK active
// The clock must be present and valid for at least 10ns (and a minimum of five clocks)
if(wait_count > TIME_TXPR-1)
begin
// prepare necessary parameters for next state
main_state <= STATE_INIT_MRS_2;
r_bank_address <= ADDRESS_FOR_MODE_REGISTER_2;
r_address <= 0; // CWL=5; ASR disabled; SRT=normal; dynamic ODT disabled
wait_count <= 0;
// no more NOP command in next 'ck' cycle, transition to MR2 command
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 0;
r_we_n <= 0;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_INIT_CLOCK_ENABLE;
enqueue_dram_command_bits <= 0;
end
end
STATE_INIT_MRS_2 :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating MRS command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
// CWL=5; ASR disabled; SRT=normal; dynamic ODT disabled
r_address <= 0;
if(wait_count > TIME_TMRD-1)
begin
// prepare necessary parameters for MR3 state
main_state <= STATE_INIT_MRS_3;
r_bank_address <= ADDRESS_FOR_MODE_REGISTER_3;
// MPR Read function enabled
r_address <= {{(ADDRESS_BITWIDTH-MPR_BITWIDTH_COMBINED){1'b0}},
MPR_ENABLE, MPR_READ_FUNCTION};
wait_count <= 0;
// no more NOP command in next 'ck' cycle, transition to MR3 command
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 0;
r_we_n <= 0;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_INIT_MRS_2;
r_bank_address <= ADDRESS_FOR_MODE_REGISTER_2;
enqueue_dram_command_bits <= 0;
end
end
STATE_MRS3_TO_MRS1 :
begin
if(wait_count > TIME_TMRD-1) begin
// prepare necessary parameters for next MRS
main_state <= STATE_INIT_MRS_1;
r_bank_address <= ADDRESS_FOR_MODE_REGISTER_1;
wait_count <= 0;
// no more NOP command in next 'ck' cycle, transition to MR1 command
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 0;
r_we_n <= 0;
enqueue_dram_command_bits <= 1;
`ifdef USE_x16
`ifdef RAM_SIZE_1GB
r_address <= {Q_OFF, TDQS, 1'b0, RTT_9, 1'b0, WL, RTT_6, ODS_5, AL, RTT_2, ODS_2, DLL_EN};
`elsif RAM_SIZE_2GB
r_address <= {1'b0, Q_OFF, TDQS, 1'b0, RTT_9, 1'b0, WL, RTT_6, ODS_5, AL, RTT_2, ODS_2, DLL_EN};
`elsif RAM_SIZE_4GB
r_address <= {2'b0, Q_OFF, TDQS, 1'b0, RTT_9, 1'b0, WL, RTT_6, ODS_5, AL, RTT_2, ODS_2, DLL_EN};
`endif
`else
`ifdef RAM_SIZE_1GB
r_address <= {1'b0, Q_OFF, TDQS, 1'b0, RTT_9, 1'b0, WL, RTT_6, ODS_5, AL, RTT_2, ODS_2, DLL_EN};
`elsif RAM_SIZE_2GB
r_address <= {2'b0, Q_OFF, TDQS, 1'b0, RTT_9, 1'b0, WL, RTT_6, ODS_5, AL, RTT_2, ODS_2, DLL_EN};
`elsif RAM_SIZE_4GB
r_address <= {MR1[0], 2'b0, Q_OFF, TDQS, 1'b0, RTT_9, 1'b0, WL, RTT_6, ODS_5, AL, RTT_2, ODS_2, DLL_EN};
`endif
`endif
end
else begin
main_state <= STATE_MRS3_TO_MRS1;
enqueue_dram_command_bits <= 0;
end
end
STATE_WAIT_AFTER_MPR :
begin
// NOP command in next 'ck' cycle, transition to IDLE command
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
if(wait_count[$clog2(TIME_TMOD):0] > TIME_TMOD-1) begin
main_state <= STATE_IDLE;
wait_count <= 0;
MPR_Read_had_finished <= 1;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_WAIT_AFTER_MPR;
enqueue_dram_command_bits <= 0;
end
end
STATE_INIT_MRS_3 :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating MRS command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
if(MPR_ENABLE == 0)
begin
// finished MPR System Read Calibration, just returned from STATE_READ_DATA
if((previous_main_state == STATE_READ_DATA) || MPR_Read_had_finished)
begin
MPR_Read_had_finished <= 1;
main_state <= STATE_WAIT_AFTER_MPR;
end
// must fully initialize the DDR3 chip, right past the ZQCL before we can read the MPR.
// See Figure 48 on the DDR RAM initialization sequence
// See https://www.eevblog.com/forum/fpga/ddr3-initialization-sequence-issue/msg3599352/#msg3599352
else begin
main_state <= STATE_MRS3_TO_MRS1;
end
enqueue_dram_command_bits <= 0;
end
// Issues READ command at tMOD after MRS command is issued
// See Figure 59 or https://i.imgur.com/K1qrMME.png
else if(wait_count > TIME_TMOD-1) begin
// MPR System READ calibration is a must for all Micron DDR RAM,
// still issue NOP command in next 'ck' cycle due to some FF synchronizer chain delay
// but transition to RDAP state first
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
enqueue_dram_command_bits <= 1;
main_state <= STATE_READ_AP;
r_address <= 0; // required by spec, see Figure 59 or https://i.imgur.com/K1qrMME.png
/*
• A[1:0] must be set to 00 as the burst order is fixed per nibble.
• A2 selects the burst order: BL8, A2 is set to 0, and the burst order is fixed to 0, 1, 2, 3, 4, 5, 6, 7.
• A[9:3] are "Don't Care."
• A10 is "Don't Care."
• A11 is "Don't Care."
• A12: Selects burst chop mode on-the-fly, if enabled within MR0.
• A13 is a "Don't Care"
• BA[2:0] are "Don't Care."
*/
wait_count <= 0;
end
else begin
main_state <= STATE_INIT_MRS_3;
enqueue_dram_command_bits <= 0;
end
end
STATE_INIT_MRS_1 :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating MRS command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
// enable DLL; 34ohm output driver; no additive latency (AL); write leveling disabled;
// termination resistors disabled; TDQS disabled; output enabled
// Note: Write leveling : See https://i.imgur.com/mKY1Sra.png
// Note: AL can be used somehow to save a few cycles when you ACTIVATE multiple banks
// interleaved, but since this is really high-end optimisation,
// it is set to value of 0 for now.
// See https://blog.csdn.net/xingqingly/article/details/48997879 and
// https://application-notes.digchip.com/024/24-19971.pdf for more context on AL
//r_address <= {1'b0, MR1, 2'b0, Q_OFF, TDQS, 1'b0, RTT_9, 1'b0, WL, RTT_6, ODS_5, AL, RTT_2, ODS_2, DLL_EN};
if(wait_count > TIME_TMRD-1)
begin
// prepare necessary parameters for next state
main_state <= STATE_INIT_MRS_0;
r_bank_address <= ADDRESS_FOR_MODE_REGISTER_0;
`ifdef USE_x16
`ifdef RAM_SIZE_1GB
r_address <= {PRECHARGE_PD, WRITE_RECOVERY, DLL_RESET, 1'b0, CAS_LATENCY_46,
READ_BURST_TYPE, CAS_LATENCY_2, BURST_LENGTH};
`elsif RAM_SIZE_2GB
r_address <= {1'b0, PRECHARGE_PD, WRITE_RECOVERY, DLL_RESET, 1'b0, CAS_LATENCY_46,
READ_BURST_TYPE, CAS_LATENCY_2, BURST_LENGTH};
`elsif RAM_SIZE_4GB
r_address <= {2'b0, PRECHARGE_PD, WRITE_RECOVERY, DLL_RESET, 1'b0, CAS_LATENCY_46,
READ_BURST_TYPE, CAS_LATENCY_2, BURST_LENGTH};
`endif
`else
`ifdef RAM_SIZE_1GB
r_address <= {1'b0, PRECHARGE_PD, WRITE_RECOVERY, DLL_RESET, 1'b0, CAS_LATENCY_46,
READ_BURST_TYPE, CAS_LATENCY_2, BURST_LENGTH};
`elsif RAM_SIZE_2GB
r_address <= {2'b0, PRECHARGE_PD, WRITE_RECOVERY, DLL_RESET, 1'b0, CAS_LATENCY_46,
READ_BURST_TYPE, CAS_LATENCY_2, BURST_LENGTH};
`elsif RAM_SIZE_4GB
r_address <= {MR0[0], 2'b0, PRECHARGE_PD, WRITE_RECOVERY, DLL_RESET, 1'b0, CAS_LATENCY_46,
READ_BURST_TYPE, CAS_LATENCY_2, BURST_LENGTH};
`endif
`endif
wait_count <= 0;
// no more NOP command in next 'ck' cycle, transition to MR0 command
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 0;
r_we_n <= 0;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_INIT_MRS_1;
r_bank_address <= ADDRESS_FOR_MODE_REGISTER_1;
enqueue_dram_command_bits <= 0;
end
end
STATE_INIT_MRS_0 :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating MRS command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
// fixed burst length 8; sequential burst; CL=5; DLL reset yes
// write recovery=5; precharge PD: DLL on
// write recovery: WR(cycles) = roundup ( tWR (ns)/ tCK (ns) )
// tWR sets the number of clock cycles between the completion of a valid write operation and
// before an active bank can be precharged
// DLL reset: see https://www.issi.com/WW/pdf/EN-I002-Clock%20Consideration_QUAD&DDR2.pdf
// when initialising the RAM for the first time, the memory controller's clock outputs are
// usually disabled, so the RAM is "running" at 0 Hz (it's not running)
// after enabling the clock outputs, the DLL in the RAM needs to "lock" to the clock signal.
// A DLL reset "unlocks" the DLL, so that it can lock again to the current clock speed.
// If you enable "DLL reset" in MR0, then you must wait for tDLLK before using any functions
// that require the DLL (read commands or ODT synchronous operations)
// The DLL is used to generate DQS. For read commands, the DRAM drives DQ and DQS pins, and
// uses the DLL to maintain a 90 degrees phase shift between DQ and DQS
// tDLLK (512) cycles of clock input are required to lock the DLL.
// CL=5 is not supported with the DLL disabled according to the Micron spec.
// The Micron spec says something about DQSCK "starting earlier" with the DLL off and
// this seems to mean that we actually have CL=4 when CL=5 is configured.
// See https://i.imgur.com/iuS45ld.png where tDQSCK starts AL + CL - 1 cycles
// after the READ command.
//r_address <= {1'b0, MR0, 2'b0, PRECHARGE_PD, WRITE_RECOVERY, DLL_RESET, 1'b0, CAS_LATENCY_46,
// READ_BURST_TYPE, CAS_LATENCY_2, BURST_LENGTH};
if(wait_count > TIME_TMOD-1)
begin
main_state <= STATE_ZQ_CALIBRATION;
wait_count <= 0;
// no more NOP command in next 'ck' cycle, transition to ZQCL command
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 0;
r_address[A10] <= 1;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_INIT_MRS_0;
r_bank_address <= ADDRESS_FOR_MODE_REGISTER_0;
enqueue_dram_command_bits <= 0;
end
end
STATE_ZQ_CALIBRATION : // https://i.imgur.com/n4VU0MF.png
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ZQCL command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
if(wait_count > TIME_TZQINIT-1)
begin
MPR_ENABLE <= MPR_EN; // turns on MPR System Read Calibration
if(MPR_EN) main_state <= STATE_PRECHARGE;
else main_state <= STATE_IDLE;
wait_count <= 0;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_ZQ_CALIBRATION;
enqueue_dram_command_bits <= 0;
end
end
STATE_IDLE :
begin
// for simplicity, idle state coding will only transit to STATE_ACTIVATE and STATE_REFRESH
// will implement state transition to STATE_WRITE_LEVELLING and STATE_SELF_REFRESH later
// Rationale behind the priority encoder logic coding below:
// We can queue (or postpone) up to maximum 8 REFRESH commands inside the RAM.
// If 8 are queued, there's a high priority request.
// If 4-7 are queued, there's a low-priority request.
// If 0-3 are queued, no more are needed (both request signals are false).
// So READ/WRITE normally go first and refreshes are done while no READ/WRITE are pending,
// unless there is a danger that the queue underflows,
// in which case it becomes a high-priority request and READ/WRITE have to wait.
// So, in summary, it is to overcome the performance penalty due to refresh lockout at the
// higher densities
if((refresh_Queue == 0) &&
(user_desired_extra_read_or_write_cycles <= MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED))
begin
refresh_Queue <= user_desired_extra_read_or_write_cycles;
end
if (high_Priority_Refresh_Request)
begin
// need to do PRECHARGE before REFRESH, see tRP
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 1;
r_we_n <= 0;
r_address[A10] <= 0;
main_state <= STATE_PRECHARGE;
enqueue_dram_command_bits <= 1;
wait_count <= 0;
end
else if (write_is_enabled | read_is_enabled)
begin
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 1;
r_we_n <= 1;
r_bank_address <= i_user_data_address[ADDRESS_BITWIDTH +: BANK_ADDRESS_BITWIDTH];
main_state <= STATE_ACTIVATE;
enqueue_dram_command_bits <= 1;
wait_count <= 0;
end
else if (low_Priority_Refresh_Request)
begin
// need to do PRECHARGE before REFRESH, see tRP
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 1;
r_we_n <= 0;
r_address[A10] <= 0;
main_state <= STATE_PRECHARGE;
enqueue_dram_command_bits <= 1;
wait_count <= 0;
end
else begin
main_state <= STATE_IDLE;
enqueue_dram_command_bits <= 0;
end
end
STATE_ACTIVATE :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ACT command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
// need to make sure that 'i_user_data_address' remains unchanged for at least tRRD
// because according to the definition of tRAS and tRC, it is legal within the same bank,
// to issue either ACTIVATE or REFRESH when bank is idle, and PRECHARGE when a row is open
// So, we have to keep track of what state each bank is in and which row is currently active
// will implement multiple consecutive ACT commands (TIME_RRD) in later stage of project
// However, tRRD mentioned "Time ACT to ACT, different banks, no PRE between" ?
r_bank_address <= i_user_data_address[ADDRESS_BITWIDTH +: BANK_ADDRESS_BITWIDTH];
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
1'b1, // use auto-precharge, but it is don't care in this state
i_user_data_address[A10-1:0]
};
// auto-precharge (AP) is easier for now. In the end it will be manually precharging
// (since many read/write commands may use the same row) but for now, simple is better
if(wait_count > TIME_TRCD-1)
begin
if(write_is_enabled) // write operation has higher priority during loopback test
begin
// no more NOP command in next 'ck' cycle, transition to WRAP command
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 0;
r_we_n <= 0;
`ifdef LOOPBACK
// for data loopback, auto-precharge will close the bank,
// which means read operation could not proceeed without reopening the bank
main_state <= STATE_WRITE;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
1'b0, // A10 : no auto-precharge
i_user_data_address[A10-1:0]
};
`else
main_state <= STATE_WRITE_AP;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
1'b1, // A10 : use auto-precharge
i_user_data_address[A10-1:0]
};
`endif
wait_count <= 0;
enqueue_dram_command_bits <= 1;
end
else if(read_is_enabled)
begin
// still issue NOP command in next 'ck' cycle due to some FF synchronizer chain delay
// but transition to RDAP state first
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
main_state <= STATE_READ_AP;
wait_count <= 0;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_ACTIVATE;
enqueue_dram_command_bits <= 0;
end
end
else begin
main_state <= STATE_ACTIVATE;
enqueue_dram_command_bits <= 0;
end
end
STATE_WRITE :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ACT command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
1'b0, // A10 : no auto-precharge
i_user_data_address[A10-1:0]
};
if(wait_count >= TIME_TCCD-1)
begin
main_state <= STATE_WRITE_DATA;
wait_count <= 0;
// minus 1 to avoid one extra data write burst operation
if(num_of_data_write_burst_had_finished == (NUM_OF_WRITE_DATA/DATA_BURST_LENGTH)-1)
begin
// finished all intended data write bursts
write_is_enabled <= 0;
// do not reset the following value here to zero to avoid restarting data write bursts
//num_of_data_write_burst_had_finished <= 0;
end
else begin
// continues data write bursts
//write_is_enabled <= 1;
num_of_data_write_burst_had_finished <= num_of_data_write_burst_had_finished + 1;
// issues WR command again
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 0;
r_we_n <= 0;
end
end
else begin
main_state <= STATE_WRITE;
end
end
STATE_WRITE_AP :
begin
// https://www.systemverilog.io/understanding-ddr4-timing-parameters#write
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ACT command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
1'b1, // A10 : use auto-precharge
i_user_data_address[A10-1:0]
};
if(wait_count >= TIME_TCCD-1)
begin
main_state <= STATE_WRITE_DATA;
wait_count <= 0;
// minus 1 to avoid one extra data write burst operation
if(num_of_data_write_burst_had_finished == (NUM_OF_WRITE_DATA/DATA_BURST_LENGTH)-1)
begin
// finished all intended data write bursts
write_is_enabled <= 0;
// do not reset the following value here to zero to avoid restarting data write bursts
//num_of_data_write_burst_had_finished <= 0;
end
else begin
// continues data write bursts
//write_is_enabled <= 1;
num_of_data_write_burst_had_finished <= num_of_data_write_burst_had_finished + 1;
// issues WR command again
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 0;
r_we_n <= 0;
end
end
else begin
main_state <= STATE_WRITE_AP;
end
end
STATE_WRITE_DATA :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ACT command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
`ifdef LOOPBACK
1'b0, // A10 : no auto-precharge
`else
1'b1, // A10 : use auto-precharge
`endif
i_user_data_address[A10-1:0]
};
enqueue_dram_command_bits <= 0;
if(wait_count > (TIME_TBURST+TIME_TDAL)-1)
begin
`ifdef LOOPBACK
// still issue NOP command in next 'ck' cycle due to some FF synchronizer chain delay
// but transition to RD state first
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
main_state <= STATE_READ;
wait_count <= 0;
`else
main_state <= STATE_IDLE;
wait_count <= 0;
`endif
write_is_enabled <= 0;
num_of_data_write_burst_had_finished <= 0;
end
else if(wait_count >= TIME_TBURST-1) // just finished a single data write burst
begin
// minus 1 to avoid one extra data write burst operation
if(num_of_data_write_burst_had_finished == (NUM_OF_WRITE_DATA/DATA_BURST_LENGTH)-1)
begin
// finished all intended data write bursts
main_state <= STATE_WRITE_DATA;
write_is_enabled <= 0;
// do not reset the following value here to zero to avoid restarting data write bursts
//num_of_data_write_burst_had_finished <= 0;
end
else begin
// continues data write bursts
//write_is_enabled <= 1;
wait_count <= 0;
num_of_data_write_burst_had_finished <= num_of_data_write_burst_had_finished + 1;
`ifdef LOOPBACK
main_state <= STATE_WRITE;
`else
main_state <= STATE_WRITE_AP;
`endif
// issues WR command again
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 0;
r_we_n <= 0;
end
end
end
STATE_READ :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ACT command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
1'b0, // A10 : no auto-precharge
i_user_data_address[A10-1:0]
};
write_is_enabled <= 0;
if(wait_count ==
(NUM_OF_READ_PIPELINE_REGISTER_ADDED+
NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_90_DOMAIN)-1)
begin
main_state <= STATE_READ_ACTUAL;
// for tRPRE , needed for the incoming read preamble bits
data_read_is_ongoing <= 1;
end
else begin
main_state <= STATE_READ;
enqueue_dram_command_bits <= 0;
end
end
STATE_READ_ACTUAL :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ACT command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
1'b0, // A10 : no auto-precharge
i_user_data_address[A10-1:0]
};
write_is_enabled <= 0;
if(wait_count >= TIME_TCCD-1)
begin
main_state <= STATE_READ_DATA;
wait_count <= 0;
if(num_of_data_read_burst_had_finished ==
(NUM_OF_READ_DATA/DATA_BURST_LENGTH))
begin
// finished all intended data read bursts
read_is_enabled <= 0;
// do not reset the following value here to zero to avoid restarting data read bursts
//num_of_data_read_burst_had_finished <= 0;
end
else begin
// continues data read bursts
//read_is_enabled <= 1;
num_of_data_read_burst_had_finished <= num_of_data_read_burst_had_finished + 1;
data_read_is_ongoing <= 1;
// issues RD command again
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 0;
r_we_n <= 1;
end
end
else begin
main_state <= STATE_READ_ACTUAL;
end
end
STATE_READ_AP :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ACT command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
1'b1, // A10 : use auto-precharge
i_user_data_address[A10-1:0]
};
write_is_enabled <= 0;
if(wait_count ==
(NUM_OF_READ_PIPELINE_REGISTER_ADDED+
NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_90_DOMAIN)-1)
begin
main_state <= STATE_READ_AP_ACTUAL;
// for tRPRE , needed for the incoming read preamble bits
data_read_is_ongoing <= 1;
end
else begin
main_state <= STATE_READ_AP;
enqueue_dram_command_bits <= 0;
end
end
STATE_READ_AP_ACTUAL :
begin
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ACT command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
1'b1, // A10 : use auto-precharge
i_user_data_address[A10-1:0]
};
write_is_enabled <= 0;
if(wait_count >= (TIME_RL-TIME_TRPRE))
begin
// issues RD command again
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 0;
r_we_n <= 1;
main_state <= STATE_READ_DATA;
wait_count <= 0;
data_read_is_ongoing <= 1;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_READ_AP_ACTUAL;
enqueue_dram_command_bits <= 0;
end
end
STATE_READ_DATA :
begin
// See https://patents.google.com/patent/US7911857B1/en for pre-amble detection circuit
// For read, we get the unshifted DQS from the RAM and have to phase-shift it ourselves before
// using it as a clock strobe signal to sample (or capture) DQ signal
enqueue_dram_command_bits <= 0;
write_is_enabled <= 0;
r_address <= // column address
{
i_user_data_address[(A12+1) +: (ADDRESS_BITWIDTH-A12-1)],
1'b1, // A12 : no burst-chop
i_user_data_address[A10+1],
`ifdef LOOPBACK
1'b0, // A10 : no auto-precharge
`else
1'b1, // A10 : use auto-precharge
`endif
i_user_data_address[A10-1:0]
};
if(wait_count > (TIME_TBURST + TIME_TRPST + TIME_TMPRR)-1)
begin
if(~MPR_Read_had_finished) // MPR System Read Calibration is not done previously
begin
main_state <= STATE_INIT_MRS_3;
// MPR_ENABLE is already set to ZERO in the next-IF block
// MPR Read function disabled
r_address <= {{(ADDRESS_BITWIDTH-MPR_BITWIDTH_COMBINED){1'b0}},
MPR_ENABLE, MPR_READ_FUNCTION};
// no more NOP command in next 'ck' cycle, transition to MR3 command
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 0;
r_we_n <= 0;
wait_count <= 0;
enqueue_dram_command_bits <= 1;
end
end
else if(wait_count >= TIME_TBURST-1) // just finished a single data read burst
begin
if(num_of_data_read_burst_had_finished ==
(NUM_OF_READ_DATA/DATA_BURST_LENGTH))
begin
// finished all intended data write bursts
main_state <= STATE_READ_DATA;
read_is_enabled <= 0;
// do not reset the following value here to zero to avoid restarting data read bursts
//num_of_data_read_burst_had_finished <= 0;
end
else begin
MPR_ENABLE <= 1'b0; // prepares to turn off MPR System Read Calibration mode after READ_DATA command finished
if(MPR_Read_had_finished) // MPR System Read Calibration is already done previously
begin
// continues data read bursts
//read_is_enabled <= 1;
wait_count <= 0;
num_of_data_read_burst_had_finished <= num_of_data_read_burst_had_finished + 1;
`ifdef LOOPBACK
main_state <= STATE_READ_ACTUAL;
`else
main_state <= STATE_READ_AP_ACTUAL;
`endif
data_read_is_ongoing <= 1;
// issues RD command again
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 0;
r_we_n <= 1;
end
else begin
main_state <= STATE_READ_DATA;
// NOP command in next 'ck' cycle
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
end
end
end
`ifdef HIGH_SPEED
else begin
main_state <= STATE_READ_DATA;
// no change in DRAM command
r_ck_en <= r_ck_en;
r_cs_n <= r_cs_n;
r_ras_n <= r_ras_n;
r_cas_n <= r_cas_n;
r_we_n <= r_we_n;
/*
Your DQS IO logic is clocked by a clock. You need to align DQS to this clock.
If you sample DQS with the rising edge of the clock, you can get different responses:
1. If you get always '0' which means that the clock rising edge already happened,
but DQS rising edge didn't. DQS needs to be moved earlier by decreasing DQS delay.
2. If you get always '1' which means that the clock rising edge happens after DQS edge.
Therefore, DQS's delay must be increased.
3. If you're somewhere in the middle (in the jitter zone) then DQS and clock are aligned.
Of course, you don't need DQS data, you only need DQ data. Therefore you adjust DQ delays
the same as DQS - every time you increase DQS delay, you also increase DQ delay as well.
Every time you decrease DQS delay you decrease DQ delay. This way, if DQS shifts, you shift
the DQ sampling point to follow DQS.
*/
/* PLL dynamic phase shift is used in lieu of IODELAY2 primitive
if(MPR_ENABLE)
begin
// samples the delayed version of dqs_r for continous feedback to IDELAY2 primitive
if(~delayed_dqs_r & ~previous_delayed_dqs_r)
begin
idelay_inc_dqs_r <= 0; // 1st case : decrements delay value
dqs_delay_sampling_margin <= dqs_delay_sampling_margin - 1;
end
else if(delayed_dqs_r & previous_delayed_dqs_r)
begin
idelay_inc_dqs_r <= 1; // 2nd case : increments delay value
dqs_delay_sampling_margin <= dqs_delay_sampling_margin + 1;
end
// see 3rd case
if(dqs_delay_sampling_margin < JITTER_MARGIN_FOR_DQS_SAMPLING)
idelay_counter_enable <= 0; // disables delay feedback process, calibration is done
else idelay_counter_enable <= 1; // enables delay feedback process
end*/
end
`endif
end
STATE_PRECHARGE :
begin
// need to do PRECHARGE before REFRESH, see tRP
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 1;
r_we_n <= 0;
r_address[A10] <= 1; // precharge ALL banks
if(wait_count > TIME_TRP-1)
begin
if(MPR_ENABLE) // MPR System Read Calibration has higher priority
begin
// prepare necessary parameters for next state
main_state <= STATE_INIT_MRS_3;
r_bank_address <= ADDRESS_FOR_MODE_REGISTER_3;
// MPR Read function enabled
r_address <= {{(ADDRESS_BITWIDTH-MPR_BITWIDTH_COMBINED){1'b0}},
MPR_ENABLE, MPR_READ_FUNCTION};
wait_count <= 0;
// no more NOP command in next 'ck' cycle, transition to MR3 command
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 0;
r_we_n <= 0;
enqueue_dram_command_bits <= 1;
end
else begin
main_state <= STATE_REFRESH;
wait_count <= 0;
// no more NOP command in next 'ck' cycle, transition to REF command
r_ck_en <= 1;
r_cs_n <= 0;
r_ras_n <= 0;
r_cas_n <= 0;
r_we_n <= 1;
enqueue_dram_command_bits <= 1;
end
end
else begin
main_state <= STATE_PRECHARGE;
enqueue_dram_command_bits <= 0;
end
end
STATE_REFRESH :
begin
// https://www.systemverilog.io/understanding-ddr4-timing-parameters#refresh
// As for why the maximum absolute interval between any REFRESH command and the next REFRESH
// command is nine times the maximum average interval refresh rate (9x tREFI), we are allowed
// to deviate from sending refresh to a DRAM chip by up to 9x the nominal period in a chain of
// up to 8 refresh commands that are queued to the chip to ensure the data held doesn't decay.
// So we can send a spree of refresh commands, then wait some time (9x the nominal period)
// then send another spree because that works out to about the nominal period and the refresh
// scheduler in the DRAM will do the rest
// the max active -> precharge delay (tRAS) is also 9*tREFI, as we need to be precharged to
// issue a refresh, so if we leave the precharge command any later, the max refresh constraints
// would not be obeyed anymore
r_ck_en <= 1;
// localparam NOP = (previous_clk_en) & (ck_en) & (~cs_n) & (ras_n) & (cas_n) & (we_n);
// only a single, non-repeating ACT command is executed, and followed by NOP commands
r_cs_n <= 0;
r_ras_n <= 1;
r_cas_n <= 1;
r_we_n <= 1;
enqueue_dram_command_bits <= 0;
if(refresh_Queue > 0)
refresh_Queue <= refresh_Queue - 1; // a countdown trigger for precharge/refresh operation
if(wait_count > TIME_TRFC-1)
begin
main_state <= STATE_IDLE;
wait_count <= 0;
end
else begin
main_state <= STATE_REFRESH;
end
end
STATE_WRITE_LEVELLING :
begin
end
STATE_PLL_LOCK_ISSUE :
begin
if(previous_main_state != STATE_PLL_LOCK_ISSUE) // just encountered PLL issue
state_to_be_restored <= previous_main_state; // for restoring state before entering PLL debug state
else if(locked) // PLL outputs are now properly locked to their desired frequencies
main_state <= state_to_be_restored; // continues at where the FSM is previously paused
end
default : main_state <= STATE_IDLE;
endcase
end
end
endmodule
`define HIGH_SPEED 1 // Minimum DDR3-1600 operating frequency >= 303MHz
`define MICRON_SIM 1 // micron simulation model
`define TESTBENCH 1 // for both micron simulation model and Xilinx ISIM simulator
`define VIVADO 1 // for 7-series and above
`define USE_x16 1
`define USE_SERDES 1
// `define TDQS 1
//`define RAM_SIZE_1GB
`define RAM_SIZE_2GB
//`define RAM_SIZE_4GB
`ifndef FORMAL
`ifdef HIGH_SPEED
// for internal logic analyzer
//`define USE_ILA 1
// for lattice ECP5 FPGA
//`define LATTICE 1
// for Xilinx Spartan-6 FPGA
`define XILINX 1
// for Altera MAX-10 FPGA
//`define ALTERA 1
`endif
`endif
`ifdef MICRON_SIM
// follows Micron simulation model
`timescale 1ps / 1ps // time-unit = 1 ps, precision = 1 ps
`endif
// write data to RAM and then read them back from RAM
`define LOOPBACK 1
`ifdef LOOPBACK
`ifndef FORMAL
`ifndef MICRON_SIM
// data loopback requires ILA capability to check data integrity
//`define USE_ILA 1
`endif
`endif
`endif
module test_ddr3_memory_controller
#(
parameter NUM_OF_WRITE_DATA = 32, // 32 pieces of data are to be written to DRAM
parameter NUM_OF_READ_DATA = 32, // 32 pieces of data are to be read from DRAM
parameter DATA_BURST_LENGTH = 8, // eight data transfers per burst activity, please modify MR0 setting if none other than BL8
`ifdef USE_SERDES
// why 8 ? because of FPGA development board is using external 50 MHz crystal
// and the minimum operating frequency for Micron DDR3 memory is 303MHz
parameter SERDES_RATIO = 8,
`endif
parameter PICO_TO_NANO_CONVERSION_FACTOR = 1000, // 1ns = 1000ps
`ifndef HIGH_SPEED
parameter PERIOD_MARGIN = 10, // 10ps margin
parameter MAXIMUM_CK_PERIOD = 3300-PERIOD_MARGIN, // 3300ps which is defined by Micron simulation model
parameter DIVIDE_RATIO = 4, // master 'clk' signal is divided by 4 for DDR outgoing 'ck' signal, it is for 90 degree phase shift purpose.
// host clock period in ns
// clock period of 'clk' = 0.8225ns , clock period of 'ck' = 3.3ns
parameter CLK_PERIOD = $itor(MAXIMUM_CK_PERIOD/DIVIDE_RATIO)/$itor(PICO_TO_NANO_CONVERSION_FACTOR),
`else
parameter CLK_PERIOD = 20, // 20ns, 50MHz
parameter CLK_SERDES_PERIOD = 12, // 12ns, 83.333MHz
`endif
`ifdef TESTBENCH
`ifndef MICRON_SIM
parameter PERIOD_MARGIN = 10, // 10ps margin
parameter MAXIMUM_CK_PERIOD = 3300-PERIOD_MARGIN, // 3300ps which is defined by Micron simulation model
parameter DIVIDE_RATIO = 4, // master 'clk' signal is divided by 4 for DDR outgoing 'ck' signal, it is for 90 degree phase shift purpose.
`endif
`endif
`ifdef HIGH_SPEED
parameter CK_PERIOD = 3, // 333.333MHz from PLL, 1/333.333MHz = 3ns
`else
parameter CK_PERIOD = (CLK_PERIOD*DIVIDE_RATIO),
`endif
`ifdef USE_x16
parameter DM_BITWIDTH = 2,
parameter DQS_BITWIDTH = 2,
`ifdef RAM_SIZE_1GB
parameter ADDRESS_BITWIDTH = 13,
`elsif RAM_SIZE_2GB
parameter ADDRESS_BITWIDTH = 14,
`elsif RAM_SIZE_4GB
parameter ADDRESS_BITWIDTH = 15,
`endif
`else
parameter DM_BITWIDTH = 1,
parameter DQS_BITWIDTH = 1,
`ifdef RAM_SIZE_1GB
parameter ADDRESS_BITWIDTH = 14,
`elsif RAM_SIZE_2GB
parameter ADDRESS_BITWIDTH = 15,
`elsif RAM_SIZE_4GB
parameter ADDRESS_BITWIDTH = 16,
`endif
`endif
parameter BANK_ADDRESS_BITWIDTH = 3, // 8 banks, and $clog2(8) = 3
`ifdef USE_x16
parameter DQ_BITWIDTH = 16 // bitwidth for each piece of data
`else
parameter DQ_BITWIDTH = 8 // bitwidth for each piece of data
`endif
)
(
`ifndef MICRON_SIM
// these are FPGA internal signals
input clk,
input resetn, // negation polarity due to pull-down tact switch
output done, // finished DDR write and read operations in loopback mechaism
output led_test, // just to test whether bitstream works or not
// these are to be fed into external DDR3 memory
output [ADDRESS_BITWIDTH-1:0] address,
output [BANK_ADDRESS_BITWIDTH-1:0] bank_address,
output ck, // CK
output ck_n, // CK#
output ck_en, // CKE
output cs_n, // chip select signal
output odt, // on-die termination
output ras_n, // RAS#
output cas_n, // CAS#
output we_n, // WE#
output reset_n,
inout [DQ_BITWIDTH-1:0] dq, // Data input/output
`ifdef USE_x16
output ldm, // lower-byte data mask, to be asserted HIGH during data write activities into RAM
output udm, // upper-byte data mask, to be asserted HIGH during data write activities into RAM
inout ldqs, // lower byte data strobe
inout ldqs_n,
inout udqs, // upper byte data strobe
inout udqs_n
`else
inout [DQS_BITWIDTH-1:0] dqs, // Data strobe
inout [DQS_BITWIDTH-1:0] dqs_n,
// driven to high-Z if TDQS termination function is disabled
// according to TN-41-06: DDR3 Termination Data Strobe (TDQS)
// Please as well look at TN-41-04: DDR3 Dynamic On-Die Termination Operation
`ifdef TDQS
inout [DQS_BITWIDTH-1:0] tdqs, // Termination data strobe, but can act as data-mask (DM) when TDQS function is disabled
`else
output [DQS_BITWIDTH-1:0] tdqs,
`endif
inout [DQS_BITWIDTH-1:0] tdqs_n
`endif
`endif
);
`ifdef HIGH_SPEED
// for clk_serdes clock domain
wire clk_serdes_data; // 83.333MHz with 0 phase shift
wire clk_serdes; // 83.333MHz with 225 phase shift
wire ck_180; // 333.333MHz with 180 phase shift
wire locked_previous;
wire need_to_assert_reset;
`endif
/* verilator lint_off VARHIDDEN */
localparam NUM_OF_DDR_STATES = 23;
// TIME_TZQINIT = 512
// See also 'COUNTER_INCREMENT_VALUE' on why some of the large timing variables are not used in this case
localparam MAX_WAIT_COUNT = 512;
/* verilator lint_on VARHIDDEN */
// https://www.systemverilog.io/understanding-ddr4-timing-parameters
// TIME_INITIAL_CK_INACTIVE
localparam MAX_TIMING = (500000/CLK_SERDES_PERIOD); // just for initial development stage, will refine the value later
localparam STATE_WAIT_AFTER_MPR = 20;
localparam STATE_IDLE = 24;
localparam STATE_ACTIVATE = 5;
localparam STATE_WRITE = 6;
localparam STATE_WRITE_AP = 7;
localparam STATE_WRITE_DATA = 8;
localparam STATE_READ = 9;
localparam STATE_READ_AP = 10;
localparam STATE_READ_ACTUAL = 2;
localparam STATE_READ_AP_ACTUAL = 4;
localparam STATE_READ_DATA = 3; // smaller value to solve setup timing issue due to lesser comparison hardware
wire [$clog2(NUM_OF_DDR_STATES)-1:0] main_state;
reg [$clog2(NUM_OF_DDR_STATES)-1:0] previous_main_state;
reg [$clog2(NUM_OF_DDR_STATES)-1:0] previous_previous_main_state;
always @(posedge clk_serdes) previous_main_state <= main_state;
always @(posedge clk_serdes) previous_previous_main_state <= previous_main_state;
wire [$clog2(MAX_WAIT_COUNT):0] wait_count;
// for STATE_IDLE transition into STATE_REFRESH
parameter MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED = 8; // 9 commands. one executed immediately, 8 more enqueued.
`ifndef MICRON_SIM
assign led_test = resetn; // because of light LED polarity, '1' will turn off LED, '0' will turn on LED
`else
wire done; // finished DDR write and read operations in loopback mechaism
// these are to be fed into external DDR3 memory
wire [ADDRESS_BITWIDTH-1:0] address;
wire [BANK_ADDRESS_BITWIDTH-1:0] bank_address;
wire ck; // CK
wire ck_n; // CK#
wire ck_en; // CKE
wire cs_n; // chip select signal
wire odt; // on-die termination
wire ras_n; // RAS#
wire cas_n; // CAS#
wire we_n; // WE#
wire reset_n;
wire [DQ_BITWIDTH-1:0] dq; // Data input/output
`ifdef USE_x16
wire ldm; // lower-byte data mask, to be asserted HIGH during data write activities into RAM
wire udm; // upper-byte data mask, to be asserted HIGH during data write activities into RAM
wire [(DQS_BITWIDTH >> 1)-1:0] ldqs; // lower byte data strobe
wire [(DQS_BITWIDTH >> 1)-1:0] ldqs_n;
wire [(DQS_BITWIDTH >> 1)-1:0] udqs; // upper byte data strobe
wire [(DQS_BITWIDTH >> 1)-1:0] udqs_n;
wire [DM_BITWIDTH-1:0] dm = {udm, ldm};
// wire [DQS_BITWIDTH-1:0] dqs = {udqs, ldqs};
// wire [DQS_BITWIDTH-1:0] dqs_n = {udqs_n, ldqs_n};
`else
wire [DQS_BITWIDTH-1:0] dqs; // Data strobe
wire [DQS_BITWIDTH-1:0] dqs_n;
// driven to high-Z if TDQS termination function is disabled
// according to TN-41-06: DDR3 Termination Data Strobe (TDQS)
// Please as well look at TN-41-04: DDR3 Dynamic On-Die Termination Operation
`ifdef TDQS
wire [DQS_BITWIDTH-1:0] tdqs; // Termination data strobe, but can act as data-mask (DM) when TDQS function is disabled
`else
wire [DQS_BITWIDTH-1:0] tdqs;
`endif
wire [DQS_BITWIDTH-1:0] tdqs_n;
`endif
`endif
`ifdef TESTBENCH
// Micron simulation model is using `timescale 1ps / 1ps
// duration for each bit = 1 * timescale = 1 * 1ps = 1ps
// but the following coding is also for Xilinx ISIM simulator
// reset for at least 3 CLKIN clock cycles (requirement by Xilinx PLL IP core)
localparam RESET_TIMING = 3 * CLK_PERIOD;
localparam STOP_TIMING = 900_000; // 900us = 900_000ns
// clock and reset signals generation for simulation testbench
reg clk_sim;
reg resetn_sim;
initial begin
$dumpfile("ddr3.vcd");
$dumpvars(0, test_ddr3_memory_controller);
clk_sim <= 1'b0;
resetn_sim <= 1'b1;
@(posedge clk_sim);
resetn_sim <= 1'b0; // asserts master reset signal
repeat((RESET_TIMING/CLK_PERIOD)+1) @(posedge clk_sim); // +1 because of division usually uses floor()
resetn_sim <= 1'b1; // releases master reset signal
repeat(STOP_TIMING/CLK_PERIOD) @(posedge clk_sim); // minimum runtime
$stop;
end
// note that sensitive list is omitted in always block
// therefore always-block run forever
// clock period = 20ns , frequency = 50MHz
always #((CLK_PERIOD*PICO_TO_NANO_CONVERSION_FACTOR)/2) clk_sim = ~clk_sim; // clock edge transition every half clock cycle period
wire reset = ~resetn_sim; // just for convenience of verilog syntax
`else
wire reset = ~resetn; // just for convenience of verilog syntax
`endif
wire [$clog2(MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED):0] user_desired_extra_read_or_write_cycles; // for the purpose of postponing refresh commands
assign user_desired_extra_read_or_write_cycles = MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED;
// phase-shift dq_w, dq_n_w signals by 90 degree with reference to clk_slow ('ck') before sending to RAM
// such that dq signals are sampled right at its middle by dqs signals
// the purpose is for dq signal integrity at high speed PCB trace
`ifndef HIGH_SPEED
wire clk_slow_posedge; // for dq phase shifting purpose
wire clk180_slow_posedge; // for dq phase shifting purpose
`endif
`ifdef TESTBENCH
wire ck_90; // for dq phase shifting purpose
wire ck_270;
wire [DQ_BITWIDTH-1:0] dq_iobuf_enable;
wire udqs_iobuf_enable;
wire ldqs_iobuf_enable;
wire data_read_is_ongoing;
`endif
reg write_enable, read_enable;
reg previous_write_enable, previous_previous_write_enable;
always @(posedge clk_serdes)
begin
previous_write_enable <= write_enable;
previous_previous_write_enable <= previous_write_enable;
end
reg done_writing, done_reading;
reg time_to_send_out_data_to_dram;
reg time_to_send_out_address_to_dram_during_write;
reg [BANK_ADDRESS_BITWIDTH+ADDRESS_BITWIDTH-1:0] i_user_data_address; // the DDR memory address for which the user wants to write/read the data
`ifdef USE_SERDES
wire [DQ_BITWIDTH*SERDES_RATIO-1:0] data_to_ram; // data for which the user wants to write/read to/from DDR
wire [DQ_BITWIDTH*SERDES_RATIO-1:0] data_from_ram; // the requested data from DDR RAM after read operation
reg [DQ_BITWIDTH*SERDES_RATIO-1:0] data_to_ram_clk_serdes;
wire [DQ_BITWIDTH*SERDES_RATIO-1:0] data_from_ram_clk_serdes;
`else
// TWO pieces of data bundled together due to double-data-rate requirement of DQ signal
reg [(DQ_BITWIDTH << 1)-1:0] data_to_ram; // data to be written to DDR RAM
wire [(DQ_BITWIDTH << 1)-1:0] data_from_ram; // the requested data being read from DDR RAM
`endif
`ifdef USE_SERDES
// for synchronizing multi-bits signals from clk_serdes domain to clk_serdes_data domain
wire afifo_data_to_ram_clk_serdes_is_empty;
wire afifo_data_to_ram_clk_serdes_is_full;
async_fifo
#(
.WIDTH(DQ_BITWIDTH*SERDES_RATIO),
.NUM_ENTRIES(),
.TO_SIMPLIFY_FULL_LOGIC(1),
.TO_SIMPLIFY_EMPTY_LOGIC(1)
)
afifo_data_to_ram_serdes
(
.write_reset(reset),
.read_reset(reset),
// Read.
.read_clk(clk_serdes_data),
.read_en(1'b1),
.read_data(data_to_ram),
.empty(afifo_data_to_ram_clk_serdes_is_empty),
// Write
.write_clk(clk_serdes),
.write_en(1'b1),
.full(afifo_data_to_ram_clk_serdes_is_full),
.write_data(data_to_ram_clk_serdes)
);
// for synchronizing multi-bits signals from clk_serdes_data domain to clk_serdes domain
wire afifo_data_from_ram_clk_serdes_is_empty;
wire afifo_data_from_ram_clk_serdes_is_full;
async_fifo
#(
.WIDTH(DQ_BITWIDTH*SERDES_RATIO),
.NUM_ENTRIES(),
.TO_SIMPLIFY_FULL_LOGIC(1),
.TO_SIMPLIFY_EMPTY_LOGIC(1)
)
afifo_data_from_ram_serdes
(
.write_reset(reset),
.read_reset(reset),
// Read.
.read_clk(clk_serdes),
.read_en(1'b1),
.read_data(data_from_ram_clk_serdes),
.empty(afifo_data_from_ram_clk_serdes_is_empty),
// Write
.write_clk(clk_serdes_data),
.write_en(1'b1),
.full(afifo_data_from_ram_clk_serdes_is_full),
.write_data(data_from_ram) // data_from_ram_clk_serdes_data
);
`endif
always @(posedge clk_serdes)
begin
if(reset) time_to_send_out_data_to_dram <= 0;
else begin
time_to_send_out_data_to_dram <=
((previous_main_state == STATE_ACTIVATE) && (previous_write_enable)) ||
`ifdef LOOPBACK
(previous_main_state == STATE_WRITE) ||
`else
(previous_main_state == STATE_WRITE_AP) ||
`endif
(previous_main_state == STATE_WRITE_DATA);
end
end
always @(posedge clk_serdes)
begin
if(reset) time_to_send_out_address_to_dram_during_write <= 0;
else begin
time_to_send_out_address_to_dram_during_write <=
((main_state == STATE_ACTIVATE) && (write_enable)) ||
`ifdef LOOPBACK
(main_state == STATE_WRITE) ||
`else
(main_state == STATE_WRITE_AP) ||
`endif
(main_state == STATE_WRITE_DATA);
end
end
// for pipelining in order to feed valid non-X incoming DQ bits into deserializer module
localparam NUM_OF_READ_PIPELINE_REGISTER_ADDED = 15; // for 'dq_iobuf_en' and 'dqs_iobuf_en'
// https://www.eevblog.com/forum/fpga/ddr3-initialization-sequence-issue/msg3668329/#msg3668329
localparam NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_90_DOMAIN = 3;
reg time_to_send_out_address_to_dram_during_read;
always @(posedge clk_serdes)
begin
if(reset) time_to_send_out_address_to_dram_during_read <= 0;
else begin
time_to_send_out_address_to_dram_during_read <=
`ifdef LOOPBACK
(main_state == STATE_READ_ACTUAL) ||
((main_state == STATE_READ) &&
(wait_count >= (NUM_OF_READ_PIPELINE_REGISTER_ADDED+
NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_90_DOMAIN)-1)) ||
`else
(main_state == STATE_READ_AP_ACTUAL) ||
((main_state == STATE_READ_AP) &&
(wait_count >= (NUM_OF_READ_PIPELINE_REGISTER_ADDED+
NUM_OF_FF_SYNCHRONIZERS_FOR_CK_180_DOMAIN_TO_CK_90_DOMAIN)-1)) ||
`endif
(main_state == STATE_READ_DATA);
end
end
`ifdef LOOPBACK
reg [DQ_BITWIDTH-1:0] test_data;
assign done = (done_writing & done_reading); // finish a data loopback transaction
localparam STARTING_VALUE_OF_TEST_DATA = 5; // starts from 5
`ifdef USE_SERDES
genvar data_write_index;
generate
for(data_write_index = 0; data_write_index < SERDES_RATIO;
data_write_index = data_write_index + 1)
begin: data_write_loop
`endif
`ifdef HIGH_SPEED
`ifdef USE_SERDES
always @(posedge clk_serdes)
`else
always @(posedge ck_180) // positive edge of ck_180 is right after WRITE operation starts
`endif
`else
`ifdef TESTBENCH
always @(posedge clk_sim)
`else
always @(posedge clk)
`endif
`endif
begin
if(reset)
begin
`ifdef USE_SERDES
data_to_ram_clk_serdes[DQ_BITWIDTH*data_write_index +: DQ_BITWIDTH] <= 0;
`else
data_to_ram <= 0;
`endif
test_data <= STARTING_VALUE_OF_TEST_DATA;
end
else if(
`ifndef HIGH_SPEED
// Since this is always block which updates new data in next clock cycle,
// and DIVIDE_RATIO=4 which means there are 2 'clk' cycles in each half period of a 'clk_slow' cycle,
// the following immediate single line of code will update new data
// both at 90 degrees before and after positive edge of 'ck'
(clk180_slow_posedge | clk_slow_posedge) &&
`endif
(~done_writing) &&
// write operation has higher priority in loopback mechanism
(time_to_send_out_data_to_dram)) // starts preparing for DRAM write operation
begin
`ifdef USE_SERDES
`ifdef USE_x16
data_to_ram_clk_serdes[DQ_BITWIDTH*data_write_index +: DQ_BITWIDTH] <=
{test_data + data_write_index + 1, test_data + data_write_index};
`else
data_to_ram_clk_serdes[DQ_BITWIDTH*data_write_index +: DQ_BITWIDTH] <=
test_data + data_write_index;
`endif
`else
`ifdef USE_x16
data_to_ram <= {test_data+1, test_data};
`else
data_to_ram <= test_data;
`endif
`endif
`ifdef USE_SERDES
test_data <= test_data + SERDES_RATIO;
`else
test_data <= test_data + UNIQUE_DQ_OUTPUT;
`endif
end
else if((done_writing) && (main_state == STATE_READ_DATA)) begin // read operation
end
end
`ifdef USE_SERDES
end
endgenerate
`endif
// such that DQ output signal will have unique value every DQS cycle
// this is due to double-data-rate nature of DQ
localparam UNIQUE_DQ_OUTPUT = 2;
`ifdef HIGH_SPEED
`ifdef USE_SERDES
always @(posedge clk_serdes)
`else
always @(posedge ck_180) // positive edge of ck_180 is right after WRITE operation starts
`endif
`else
`ifdef TESTBENCH
always @(posedge clk_sim)
`else
always @(posedge clk)
`endif
`endif
begin
if(reset)
begin
i_user_data_address <= 0;
write_enable <= 1; // writes data first
read_enable <= 0;
done_writing <= 0;
done_reading <= 0;
end
else if(
`ifndef HIGH_SPEED
// Since this is always block which updates new data in next clock cycle,
// and DIVIDE_RATIO=4 which means there are 2 'clk' cycles in each half period of a 'clk_slow' cycle,
// the following immediate single line of code will update new data
// both at 90 degrees before and after positive edge of 'ck'
(clk180_slow_posedge | clk_slow_posedge) &&
`endif
(~done_writing) &&
// write operation has higher priority in loopback mechanism
(time_to_send_out_address_to_dram_during_write)) // starts preparing for DRAM write operation
begin
// According to https://media-www.micron.com/-/media/client/global/documents/products/data-sheet/dram/ddr3/2gb_ddr3_sdram.pdf?rev=4bc67ac3a6f34250a2b73cb9db8c5502#page=139
// address[2:0] are designated as starting column address,
// So, need to increment DRAM address by an amount of BURST_LENGTH instead of just 1
i_user_data_address <= i_user_data_address + DATA_BURST_LENGTH;
write_enable <= (test_data < (STARTING_VALUE_OF_TEST_DATA+NUM_OF_WRITE_DATA-DATA_BURST_LENGTH)); // writes up to 'NUM_OF_WRITE_DATA' pieces of data
read_enable <= (test_data >= (STARTING_VALUE_OF_TEST_DATA+NUM_OF_WRITE_DATA-DATA_BURST_LENGTH)); // starts the readback operation
done_writing <= (test_data >= (STARTING_VALUE_OF_TEST_DATA+NUM_OF_WRITE_DATA-DATA_BURST_LENGTH)); // stops writing since readback operation starts
done_reading <= 0;
if(test_data >= (STARTING_VALUE_OF_TEST_DATA+NUM_OF_WRITE_DATA-DATA_BURST_LENGTH)) // finished writing data
begin
i_user_data_address <= 0; // read from the first piece of data written
read_enable <= 1; // prepare to read data
end
end
else if((done_writing) && // (~done_reading) && // no need extra logic
time_to_send_out_address_to_dram_during_read) // starts preparing for DRAM read operation
begin
i_user_data_address <= i_user_data_address + DATA_BURST_LENGTH;
//test_data <= 0; // not related to DDR read operation, only for DDR write operation
write_enable <= 0;
if(done) read_enable <= 0; // already finished reading all data
else read_enable <= 1;
done_writing <= done_writing;
/* the following logic is wrong given that data_from_ram is derived from the
external DQ bits PCBA trace which is susceptible to PVT deviation
if(data_from_ram[0 +: DQ_BITWIDTH] >=
(STARTING_VALUE_OF_TEST_DATA+NUM_OF_READ_DATA-DATA_BURST_LENGTH))
begin
done_reading <= 1;
end
*/
end
end
`endif
`ifdef USE_ILA
wire [DQ_BITWIDTH-1:0] dq_r; // port O of IOBUF primitive
wire [DQ_BITWIDTH-1:0] dq_w; // port I of IOBUF primitive
wire low_Priority_Refresh_Request;
wire high_Priority_Refresh_Request;
// to propagate 'write_enable' and 'read_enable' signals during STATE_IDLE to STATE_WRITE and STATE_READ
wire write_is_enabled;
wire read_is_enabled;
wire dqs_rising_edge;
wire dqs_falling_edge;
wire [$clog2(MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED):0] refresh_Queue;
wire [($clog2(DIVIDE_RATIO_HALVED)-1):0] dqs_counter;
`ifdef XILINX
// Added to solve https://forums.xilinx.com/t5/Vivado-Debug-and-Power/Chipscope-ILA-Please-ensure-that-all-the-pins-used-in-the/m-p/1237451
wire [35:0] CONTROL0;
wire [35:0] CONTROL1;
wire [35:0] CONTROL2;
wire [35:0] CONTROL3;
wire [35:0] CONTROL4;
wire [35:0] CONTROL5;
icon icon_inst (
.CONTROL0(CONTROL0), // INOUT BUS [35:0]
.CONTROL1(CONTROL1), // INOUT BUS [35:0]
.CONTROL2(CONTROL2), // INOUT BUS [35:0]
.CONTROL3(CONTROL3), // INOUT BUS [35:0]
.CONTROL4(CONTROL4), // INOUT BUS [35:0]
.CONTROL5(CONTROL5) // INOUT BUS [35:0]
);
ila_1_bit ila_write_enable (
.CONTROL(CONTROL0), // INOUT BUS [35:0]
.CLK(clk), // IN
.TRIG0(write_enable) // IN BUS [0:0]
);
ila_1_bit ila_done (
.CONTROL(CONTROL1), // INOUT BUS [35:0]
.CLK(clk), // IN
.TRIG0(done) // IN BUS [0:0]
);
ila_1_bit ila_ck_n (
.CONTROL(CONTROL2), // INOUT BUS [35:0]
.CLK(clk), // IN
.TRIG0(ck_n) // IN BUS [0:0]
);
ila_16_bits ila_dq_w (
.CONTROL(CONTROL3), // INOUT BUS [35:0]
.CLK(clk), // IN
.TRIG0(dq_w) // IN BUS [15:0]
);
ila_16_bits ila_states_and_commands (
.CONTROL(CONTROL4), // INOUT BUS [35:0]
.CLK(clk), // IN
.TRIG0({low_Priority_Refresh_Request, high_Priority_Refresh_Request,
write_is_enabled, read_is_enabled, write_enable, read_enable,
main_state, ck_en, cs_n, ras_n, cas_n, we_n}) // IN BUS [15:0]
);
ila_64_bits ila_states_and_wait_count (
.CONTROL(CONTROL5), // INOUT BUS [35:0]
.CLK(clk), // IN
.TRIG0({data_to_ram, data_from_ram, low_Priority_Refresh_Request, high_Priority_Refresh_Request,
write_enable, read_enable, dqs_counter, dqs_rising_edge, dqs_falling_edge,
main_state, wait_count, refresh_Queue}) // IN BUS [63:0]
);
`else
// https://github.com/promach/internal_logic_analyzer
localparam DIVIDE_RATIO_HALVED = (DIVIDE_RATIO >> 1);
`endif
`endif
ddr3_memory_controller
#(
.NUM_OF_WRITE_DATA(NUM_OF_WRITE_DATA),
.NUM_OF_READ_DATA(NUM_OF_READ_DATA),
.DATA_BURST_LENGTH(DATA_BURST_LENGTH),
.MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED(MAX_NUM_OF_REFRESH_COMMANDS_POSTPONED)
`ifdef USE_SERDES
, .SERDES_RATIO(SERDES_RATIO)
`endif
)
ddr3_control
(
// these are FPGA internal signals
`ifdef TESTBENCH
.clk(clk_sim),
`else
.clk(clk),
`endif
.reset(reset), // reset for entire system
.write_enable(write_enable), // write to DDR memory
.read_enable(read_enable), // read from DDR memory
.i_user_data_address(i_user_data_address), // the DDR memory address for which the user wants to write/read the data
.data_to_ram(data_to_ram), // data for which the user wants to write to DDR RAM
.data_from_ram(data_from_ram), // the requested data from DDR RAM after read operation
.user_desired_extra_read_or_write_cycles(user_desired_extra_read_or_write_cycles), // for the purpose of postponing refresh commands
`ifndef HIGH_SPEED
.clk_slow_posedge(clk_slow_posedge), // for dq phase shifting purpose
.clk180_slow_posedge(clk180_slow_posedge), // for dq phase shifting purpose
`endif
// these are to be fed into external DDR3 memory
.address(address),
.bank_address(bank_address),
`ifdef HIGH_SPEED
.ck_obuf(ck), // CK
.ck_n_obuf(ck_n), // CK#
`else
.ck(ck), // CK
.ck_n(ck_n), // CK#
`endif
`ifdef TESTBENCH
.ck_90(ck_90),
.ck_270(ck_270),
.dq_iobuf_enable(dq_iobuf_enable),
.udqs_iobuf_enable(udqs_iobuf_enable),
.ldqs_iobuf_enable(ldqs_iobuf_enable),
.data_read_is_ongoing(data_read_is_ongoing),
`endif
`ifdef HIGH_SPEED
.clk_serdes_data(clk_serdes_data), // 83.333MHz with 0 phase shift
.clk_serdes(clk_serdes), // 83.333MHz with 225 phase shift
.ck_180(ck_180), // 333.333MHz with 180 phase shift
.locked_previous(locked_previous),
.need_to_assert_reset(need_to_assert_reset),
`endif
.ck_en(ck_en), // CKE
.cs_n(cs_n), // chip select signal
.odt(odt), // on-die termination
.ras_n(ras_n), // RAS#
.cas_n(cas_n), // CAS#
.we_n(we_n), // WE#
.reset_n(reset_n), // reset only for RAM
.dq(dq), // Data input/output
.main_state(main_state),
.wait_count(wait_count),
`ifdef USE_ILA
.dq_w(dq_w),
.dq_r(dq_r),
.low_Priority_Refresh_Request(low_Priority_Refresh_Request),
.high_Priority_Refresh_Request(high_Priority_Refresh_Request),
.write_is_enabled(write_is_enabled),
.read_is_enabled(read_is_enabled),
.refresh_Queue(refresh_Queue),
.dqs_counter(dqs_counter),
.dqs_rising_edge(dqs_rising_edge),
.dqs_falling_edge(dqs_falling_edge),
`endif
`ifdef USE_x16
.ldm(ldm), // lower-byte data mask, to be asserted HIGH during data write activities into RAM
.udm(udm), // upper-byte data mask, to be asserted HIGH during data write activities into RAM
.ldqs(ldqs), // lower byte data strobe
.ldqs_n(ldqs_n),
.udqs(udqs), // upper byte data strobe
.udqs_n(udqs_n)
`else
.dqs(dqs), // Data strobe
.dqs_n(dqs_n),
// driven to high-Z if TDQS termination function is disabled
// according to TN-41-06: DDR3 Termination Data Strobe (TDQS)
// Please as well look at TN-41-04: DDR3 Dynamic On-Die Termination Operation
.tdqs(tdqs), // Termination data strobe, but can act as data-mask (DM) when TDQS function is disabled
.tdqs_n(tdqs_n)
`endif
);
`ifdef MICRON_SIM
// Micron simulation model
ddr3 mem(
.rst_n(reset_n),
.ck(ck),
.ck_n(ck_n),
.cke(ck_en),
.cs_n(cs_n),
.ras_n(ras_n),
.cas_n(cas_n),
.we_n(we_n),
.dm_tdqs(dm),
.ba(bank_address),
.addr(address),
.dq(dq),
.dqs({udqs, ldqs}),
.dqs_n({udqs_n, ldqs_n}),
.tdqs_n(tdqs_n),
.odt(odt)
);
`elsif TESTBENCH
// to emulate DQS and DQ signals coming out from DDR3 RAM
wire [DQ_BITWIDTH-1:0] test_dq_w;
reg [DQ_BITWIDTH-1:0] test_dq_w_d0;
reg [DQ_BITWIDTH-1:0] test_dq_w_d1;
wire [DQS_BITWIDTH-1:0] test_dqs_w;
always @(posedge ck_90)
begin
if(~reset_n | ~(|dq_iobuf_enable)) test_dq_w_d1 <= 1;
else if(data_read_is_ongoing) test_dq_w_d1 <= test_dq_w_d0 + 1;
end
always @(posedge ck_270)
begin
if(~reset_n | ~(|dq_iobuf_enable)) test_dq_w_d0 <= 0;
else if(data_read_is_ongoing) test_dq_w_d0 <= test_dq_w_d1 + 1;
end
// DQS and DQ signals are of double-data-rate signals
`ifdef XILINX
genvar test_dq_index;
generate
for(test_dq_index = 0; test_dq_index < DQ_BITWIDTH; test_dq_index = test_dq_index + 1)
begin: test_dq_io
IOBUF IO_test_dq (
.IO(dq[test_dq_index]),
.I(test_dq_w[test_dq_index]),
.T(~dq_iobuf_enable[test_dq_index]),
.O() // no need to connect since the code is only emulating DDR3 RAM emitting out DQ bit
);
ODDR2 #(
.DDR_ALIGNMENT("NONE"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("SYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_test_dq_w(
.Q(test_dq_w[test_dq_index]), // 1-bit DDR output data
.C0(ck_90), // 1-bit clock input
.C1(ck_270), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(test_dq_w_d1[test_dq_index]), // 1-bit DDR data input (associated with C0)
.D1(test_dq_w_d0[test_dq_index]), // 1-bit DDR data input (associated with C1)
.R(reset), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
end
endgenerate
`ifdef USE_x16
IOBUF IO_test_udqs (
.IO(udqs),
.I(test_dqs_w[1]),
.T(~udqs_iobuf_enable),
.O() // no need to connect since the code is only emulating DDR3 RAM emitting out DQS strobe
);
IOBUF IO_test_ldqs (
.IO(ldqs),
.I(test_dqs_w[0]),
.T(~ldqs_iobuf_enable),
.O() // no need to connect since the code is only emulating DDR3 RAM emitting out DQS strobe
);
`else
IOBUF IO_test_dqs (
.IO(dqs),
.I(test_dqs_w),
.T(~dqs_iobuf_enable),
.O() // no need to connect since the code is only emulating DDR3 RAM emitting out DQS strobe
);
`endif
genvar test_dqs_index;
generate
for(test_dqs_index = 0; test_dqs_index < DQS_BITWIDTH; test_dqs_index = test_dqs_index + 1)
begin: test_dqs_io
ODDR2 #(
.DDR_ALIGNMENT("NONE"), // Sets output alignment to "NONE", "C0" or "C1"
.INIT(1'b0), // Sets initial state of the Q output to 1'b0 or 1'b1
.SRTYPE("SYNC") // Specifies "SYNC" or "ASYNC" set/reset
)
ODDR2_test_dqs_w(
.Q(test_dqs_w[test_dqs_index]), // 1-bit DDR output data
.C0(ck_90), // 1-bit clock input
.C1(ck_270), // 1-bit clock input
.CE(1'b1), // 1-bit clock enable input
.D0(1'b1), // 1-bit DDR data input (associated with C0)
.D1(1'b0), // 1-bit DDR data input (associated with C1)
.R(1'b0), // 1-bit reset input
.S(1'b0) // 1-bit set input
);
end
endgenerate
`endif
`endif
endmodule
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment