diff --git a/CHANGELOG.md b/CHANGELOG.md index 529361b68..40ca1055c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - `VLXE` and `VSXE` need to wait that the SlideAddrGenA opreq is free before being issued by the lane sequencer to the operand requester stage - Do not trap instructions with no operands in the main sequencer - Commit a reduction only after a grant from the VRF + - Decouple `cmdBuffer` and `dataBuffer` depth parameters in the operand queues ### Added @@ -104,6 +105,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Add support for vector mask population count and find first set bit instructions: `vcpop.m`, `vfirst.m` - Add Spyglass linting script - Add parametrized support for Fixed-Point math + - Add support for Barber's Pole VRF Layout ### Changed @@ -134,6 +136,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Adapt `fdotproduct` to `dotproduct` structure - Pre-calculate next-cycle `aligned_start_address` in `addrgen` for timing reasons - Add `is_reduct` signal to the operand queues, to gate the neutral value filling + - Handle WAW and WAR `vload` hazards in the `VLDU` without stalling the main sequencer + - Reductions are no longer treated as widening instructions with respect to WAW hazards in the operand requesters + - `slide1x` instructions are no longer stalled in the main sequencer; the hazard is handled downstream ## 2.2.0 - 2021-11-02 diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 593967a7e..6c45d77ff 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -86,6 +86,7 @@ package ara_pkg; localparam int unsigned ValuInsnQueueDepth = 4; localparam int unsigned VlduInsnQueueDepth = 4; localparam int unsigned VstuInsnQueueDepth = 4; + localparam int unsigned VaddrgenInsnQueueDepth = 4; localparam int unsigned SlduInsnQueueDepth = 2; localparam int unsigned NoneInsnQueueDepth = 1; // Ara supports MaskuInsnQueueDepth = 1 only. 
@@ -299,6 +300,8 @@ package ara_pkg; logic wide_fp_imm; // Resizing of FP conversions resize_e cvt_resize; + // Widening and vslide1x instructions have different hazard stall policies + logic special_hazard; // Vector machine metadata vlen_t vl; @@ -396,6 +399,8 @@ package ara_pkg; logic wide_fp_imm; // Resizing of FP conversions resize_e cvt_resize; + // Widening and vslide1x instructions have different hazard stall policies + logic special_hazard; // Vector machine metadata vlen_t vl; @@ -877,11 +882,6 @@ package ara_pkg; // Each lane has eight VRF banks localparam int unsigned NrVRFBanksPerLane = 8; - // Find the starting address of a vector register vid - function automatic logic [63:0] vaddr(logic [4:0] vid, int NrLanes); - vaddr = vid * (VLENB / NrLanes / 8); - endfunction: vaddr - // Differenciate between SLDU and ADDRGEN operands from opqueue typedef enum logic { ALU_SLDU = 1'b0, @@ -898,6 +898,7 @@ package ara_pkg; logic scale_vl; // Rescale vl taking into account the new and old EEW resize_e cvt_resize; // Resizing of FP conversions + logic special_hazard; // Widening and vslide1x instructions have different hazard stall policies logic is_reduct; // Is this a reduction? diff --git a/hardware/include/ara_vaddr.svh b/hardware/include/ara_vaddr.svh new file mode 100644 index 000000000..3cd9f9ce8 --- /dev/null +++ b/hardware/include/ara_vaddr.svh @@ -0,0 +1,80 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matteo Perotti +// Description: +// Ara's functions to calculate VRF addresses. Not in the package +// since the functions depend on `NrLanes` + +// All the functions to support a Barber-Pole VRF layout + +// Find the starting VRF address of a vector register vid +function automatic vaddr_t vaddr(logic [4:0] vid, int NrLanes); + // This is not an adder, it's only wires. 
+ // (this holds if VLENB / NrLanes >= NrVRFBanksPerLane^2) + vaddr = vid * (VLENB / NrLanes / NrVRFBanksPerLane) + vid[VaddrBankWidth-1:0]; +endfunction: vaddr + +// Return the physical address of the next element of a certain vector +function automatic vaddr_t next_vaddr(vaddr_t vaddr, logic [4:0] vid); + // vaddr msbs -> byte index in a bank + logic [VaddrIdxWidth-1:VaddrBankWidth] index, old_index; + // vaddr lsbs -> bank index + logic [VaddrBankWidth-1:0] bank; + + index = vaddr[VaddrIdxWidth-1:VaddrBankWidth]; + bank = vaddr[VaddrBankWidth-1:0]; + + old_index = index; + + // Increment bank counter + bank += 1; + if (bank == vid[VaddrBankWidth-1:0]) + // Wrap around + index += 1; + + // If we change vreg, the start element position is +1 (LMUL > 1) + // This is important for B layout consistency among different LMUL + // or when inactive element policy is "undisturbed" + if (index[VaddrVregWidth] != old_index[VaddrVregWidth]) + bank += 1; + + return {index, bank}; +endfunction + +// Initialize with an offset (necessary with vslideup) +function automatic vaddr_t vaddr_offset(vaddr_t vaddr, vaddr_t off, logic [4:0] vid); + // vaddr msbs -> byte index in a bank + logic [VaddrIdxWidth-1:VaddrBankWidth] index, old_index; + // vaddr lsbs -> bank index + logic [VaddrBankWidth-1:0] bank, old_bank; + + index = vaddr[VaddrIdxWidth-1:VaddrBankWidth]; + bank = vaddr[VaddrBankWidth-1:0]; + + old_index = index; + old_bank = bank; + + // Increment bank counter + index += off[VaddrIdxWidth-1:VaddrBankWidth]; + bank += off[VaddrBankWidth-1:0]; + // Support vstart != 0: don't hypothesize that old_bank == vid[VaddrBankWidth-1:0] + // Wrap around if we meet vid[VaddrBankWidth-1:0] during the addition + if (old_bank > vid[VaddrBankWidth-1:0]) begin + if (bank >= vid[VaddrBankWidth-1:0] && bank < old_bank) + // Wrap around + index += 1; + end else if (old_bank < vid[VaddrBankWidth-1:0]) begin + if (bank >= vid[VaddrBankWidth-1:0] || bank < old_bank) + // Wrap around + index += 1; + 
end + + // If we change vreg, the start element position is +1 + // for every reg passed (LMUL > 1). The max reg id delta is 7 + // with LMUL == 8. + bank += index[VaddrVregWidth +: 3] - old_index[VaddrVregWidth +: 3]; + + return {index, bank}; +endfunction diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 7668fef06..4bc110a5d 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -137,6 +137,10 @@ module ara import ara_pkg::*; #( logic [NrVInsn-1:0][NrVInsn-1:0] global_hazard_table; // Ready for lane 0 (scalar operand fwd) logic pe_scalar_resp_ready; + // VLDU Hazard checking + vid_t vldu_commit_id; + logic vldu_commit_id_valid; + logic vldu_hazard; // Mask unit operands elen_t [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand; @@ -178,7 +182,11 @@ module ara import ara_pkg::*; #( // Interface with the address generator .addrgen_ack_i (addrgen_ack ), .addrgen_error_i (addrgen_error ), - .addrgen_error_vl_i (addrgen_error_vl ) + .addrgen_error_vl_i (addrgen_error_vl ), + // Interface with the VLDU for hazard handling + .vldu_commit_id_i (vldu_commit_id ), + .vldu_commit_id_valid_i(vldu_commit_id_valid ), + .vldu_hazard_o (vldu_hazard ) ); // Scalar move support @@ -234,9 +242,9 @@ module ara import ara_pkg::*; #( for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_lanes lane #( - .NrLanes (NrLanes ), - .FPUSupport (FPUSupport ), - .FixPtSupport(FixPtSupport) + .NrLanes (NrLanes ), + .FPUSupport (FPUSupport ), + .FixPtSupport (FixPtSupport ) ) i_lane ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -344,6 +352,9 @@ module ara import ara_pkg::*; #( .addrgen_ack_o (addrgen_ack ), .addrgen_error_o (addrgen_error ), .addrgen_error_vl_o (addrgen_error_vl ), + .commit_id_o (vldu_commit_id ), + .commit_id_valid_o (vldu_commit_id_valid ), + .hazard_i (vldu_hazard ), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), @@ -377,8 +388,7 @@ module ara import ara_pkg::*; #( logic sldu_mask_ready; sldu #( - .NrLanes(NrLanes), - 
.vaddr_t(vaddr_t) + .NrLanes(NrLanes) ) i_sldu ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -413,8 +423,7 @@ module ara import ara_pkg::*; #( ///////////////// masku #( - .NrLanes(NrLanes), - .vaddr_t(vaddr_t) + .NrLanes(NrLanes) ) i_masku ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 6fe3783dc..22c1cd5b1 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -681,6 +681,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin ara_req_d.op = ara_pkg::VWREDSUM; @@ -690,6 +691,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1300,6 +1302,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt8; ara_req_d.eew_vs2 = eew_q[insn.varith_type.rs2]; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW64) || @@ -1310,6 +1313,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt8; ara_req_d.eew_vs2 = eew_q[insn.varith_type.rs2]; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW64) || @@ -1320,6 +1324,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt4; ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW32) || 
@@ -1329,6 +1334,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt4; ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW32) || @@ -1338,6 +1344,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) @@ -1347,6 +1354,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) @@ -1394,6 +1402,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; @@ -1402,6 +1411,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; @@ -1410,6 +1420,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; @@ -1418,6 +1429,7 @@ module ara_dispatcher import 
ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; @@ -1427,6 +1439,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; @@ -1436,6 +1449,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; @@ -1445,6 +1459,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; @@ -1454,6 +1469,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; @@ -1462,6 +1478,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; @@ -1470,6 +1487,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; 
ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; @@ -1478,6 +1496,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; @@ -1508,6 +1527,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1567,6 +1587,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; // If stride > vl, the vslideup has no effects if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; @@ -1577,6 +1599,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; end 6'b010000: begin // VRXUNARY0 // vmv.s.x @@ -1625,6 +1649,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; @@ -1633,6 +1658,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; 
ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; @@ -1641,6 +1667,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; @@ -1649,6 +1676,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; @@ -1658,6 +1686,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; @@ -1667,6 +1696,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; @@ -1676,6 +1706,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; @@ -1685,6 +1716,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; @@ 
-1693,6 +1725,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; @@ -1701,6 +1734,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; @@ -1709,6 +1743,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; @@ -1719,6 +1754,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; @@ -1729,6 +1765,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111110: begin // VWMACCUS ara_req_d.op = ara_pkg::VMACC; @@ -1739,6 +1776,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; @@ -1749,6 +1787,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = 
OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1883,6 +1922,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000: begin // Widening VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1890,6 +1930,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01001: begin // Widening VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1897,6 +1938,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01010: begin // Widening VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1904,6 +1946,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01011: begin // Widening VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1911,6 +1954,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01100: begin // Widening VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1918,6 +1962,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; 
#( 5'b01110: begin // Widening VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1925,6 +1970,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01111: begin // Widening VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -2032,6 +2078,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VFWREDUSUM ara_req_d.op = ara_pkg::VFWREDUSUM; @@ -2041,7 +2089,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.cvt_resize = resize_e'(2'b00); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; @@ -2050,6 +2099,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VFWREDOSUM ara_req_d.op = ara_pkg::VFWREDOSUM; @@ -2059,7 +2110,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vs1 = 
vtype_q.vsew.next(); - ara_req_d.cvt_resize = resize_e'(2'b00); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; @@ -2069,6 +2121,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; @@ -2078,6 +2132,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; @@ -2085,6 +2141,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; @@ -2094,6 +2152,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; @@ -2103,6 +2163,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111110: begin // VFWMSAC ara_req_d.op = 
ara_pkg::VFMSAC; @@ -2112,6 +2174,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; @@ -2121,6 +2185,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -2217,6 +2283,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; // If stride > vl, the vslideup has no effects if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; @@ -2224,9 +2292,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001111: begin // vfslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; - // Request will need reshuffling - ara_req_d.scale_vl = 1'b1; + ara_req_d.eew_vs2 = vtype_q.vsew; + // Request will need reshuffling + ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; end 6'b010000: begin // VRFUNARY0 // vmv.s.f diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 348c01107..539e3d2b3 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -41,7 +41,11 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Interface with the Address Generation 
input logic addrgen_ack_i, input logic addrgen_error_i, - input vlen_t addrgen_error_vl_i + input vlen_t addrgen_error_vl_i, + // Interface with the VLDU to handle load WAW and WAR hazards + input vid_t vldu_commit_id_i, + input logic vldu_commit_id_valid_i, + output logic vldu_hazard_o ); /////////////////////////////////// @@ -261,6 +265,9 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i write_list_d = write_list_q; global_hazard_table_d = global_hazard_table_o; + // No hazard check requested + vldu_hazard_o = 1'b0; + // Maintain request pe_req_d = '0; pe_req_valid_d = 1'b0; @@ -354,6 +361,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i fp_rm : ara_req_i.fp_rm, wide_fp_imm : ara_req_i.wide_fp_imm, cvt_resize : ara_req_i.cvt_resize, + special_hazard: ara_req_i.special_hazard, scale_vl : ara_req_i.scale_vl, vl : ara_req_i.vl, vstart : ara_req_i.vstart, @@ -370,12 +378,17 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i pe_req_d.hazard_vs1 | pe_req_d.hazard_vs2; // We only issue instructions that take no operands if they have no hazards. + // Exception to this rule: loads, as they are super common. WAW and WAR hazards + // on load instructions are handled in the VLDU. 
// Moreover, SLIDE instructions cannot be always chained // ToDo: optimize the case for vslide1down, vslide1up (wait 2 cycles, then chain) - if (!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && - |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} || - (pe_req_d.op == VSLIDEUP && |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || - (pe_req_d.op == VSLIDEDOWN && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) + if ((!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && + |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} && + !(is_load(pe_req_d.op))) || + (pe_req_d.op == VSLIDEUP && !pe_req_d.use_scalar_op && + |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || + (pe_req_d.op == VSLIDEDOWN && !pe_req_d.use_scalar_op && + |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) begin ara_req_ready_o = 1'b0; pe_req_valid_d = 1'b0; @@ -453,6 +466,18 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i end endcase + // Load-related hazards handling + // Loads are masters on the x-bar to write the in-lane VRF. Nevertheless, + // they can have WAR or WAW dependencies. When there is a load in the load + // unit, its hazard bit is always checked and cleared here as soon as the + // dependency does not exist anymore. Whenever the hazard bit is set, + // the load cannot issue requests. + // It's safe to pipeline vldu_hazard_o if the timing is tight. 
+ // (if so, add a sync signal) + if (vldu_commit_id_valid_i) begin + vldu_hazard_o = |global_hazard_table_o[vldu_commit_id_i]; + end + // Update the global hazard table for (int id = 0; id < NrVInsn; id++) global_hazard_table_d[id] &= vinsn_running_d; end : p_sequencer diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv index d12c71345..a786cabfe 100644 --- a/hardware/src/lane/lane.sv +++ b/hardware/src/lane/lane.sv @@ -191,8 +191,7 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( operand_requester #( .NrBanks(NrVRFBanksPerLane), - .NrLanes(NrLanes ), - .vaddr_t(vaddr_t ) + .NrLanes(NrLanes ) ) i_operand_requester ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 722bab7a5..cee688f18 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -240,42 +240,44 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: unique case (pe_req.vfu) VFU_Alu: begin operand_request_i[AluA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - eew : pe_req.eew_vs1, + id : pe_req.id, + vs : pe_req.vs1, + eew : pe_req.eew_vs1, // If reductions and vl == 0, we must replace with neutral values - conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // In case of reduction, AluA opqueue will keep the scalar element - vl : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 
1'b1 : 0, - target_fu : ALU_SLDU, - default : '0 + vl : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, + target_fu : ALU_SLDU, + default : '0 }; operand_request_push[AluA] = pe_req.use_vs1; operand_request_i[AluB] = '{ - id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, // If reductions and vl == 0, we must replace with neutral values - conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, - target_fu : ALU_SLDU, - default : '0 + vl : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 
1'b1 : 0, + target_fu : ALU_SLDU, + default : '0 }; operand_request_push[AluB] = pe_req.use_vs2; @@ -298,66 +300,69 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: end VFU_MFpu: begin operand_request_i[MulFPUA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - eew : pe_req.eew_vs1, + id : pe_req.id, + vs : pe_req.vs1, + eew : pe_req.eew_vs1, // If reductions and vl == 0, we must replace with neutral values - conv : pe_req.conversion_vs1, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : pe_req.conversion_vs1, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default : '0 + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUA] = pe_req.use_vs1; operand_request_i[MulFPUB] = '{ - id : pe_req.id, - vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, - eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vd_op : pe_req.eew_vs2, + id : pe_req.id, + vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, + eew : pe_req.swap_vs2_vd_op ? 
pe_req.eew_vd_op : pe_req.eew_vs2, // If reductions and vl == 0, we must replace with neutral values - conv : pe_req.conversion_vs2, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : pe_req.conversion_vs2, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : (pe_req.swap_vs2_vd_op ? + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : (pe_req.swap_vs2_vd_op ? pe_req.hazard_vd : (pe_req.hazard_vs2 | pe_req.hazard_vd)), - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default: '0 + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUB] = pe_req.swap_vs2_vd_op ? pe_req.use_vd_op : pe_req.use_vs2; operand_request_i[MulFPUC] = '{ - id : pe_req.id, - vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, - eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, - conv : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, + id : pe_req.id, + vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, + eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, + conv : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. 
So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req.vtype, - hazard : pe_req.swap_vs2_vd_op ? + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req.vtype, + hazard : pe_req.swap_vs2_vd_op ? (pe_req.hazard_vs2 | pe_req.hazard_vd) : pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default : '0 + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUC] = pe_req.swap_vs2_vd_op ? pe_req.use_vs2 : pe_req.use_vd_op; @@ -399,17 +404,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Load indexed operand_request_i[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, - target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, - default : '0 + id : pe_req_i.id, + vs : pe_req_i.vs2, + eew : pe_req_i.eew_vs2, + conv : pe_req_i.conversion_vs2, + target_fu : MFPU_ADDRGEN, + special_hazard : pe_req.special_hazard, + vl : pe_req_i.vl / NrLanes, + scale_vl : pe_req_i.scale_vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req_i.vtype, + hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
@@ -455,17 +461,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Store indexed operand_request_i[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, - target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, - default : '0 + id : pe_req_i.id, + vs : pe_req_i.vs2, + eew : pe_req_i.eew_vs2, + conv : pe_req_i.conversion_vs2, + target_fu : MFPU_ADDRGEN, + special_hazard : pe_req.special_hazard, + vl : pe_req_i.vl / NrLanes, + scale_vl : pe_req_i.scale_vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req_i.vtype, + hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. @@ -476,16 +483,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VFU_SlideUnit: begin operand_request_i[SlideAddrGenA] = '{ - id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, - conv : pe_req.conversion_vs2, - target_fu: ALU_SLDU, - scale_vl : pe_req.scale_vl, - vtype : pe_req.vtype, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, - default : '0 + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, + conv : pe_req.conversion_vs2, + target_fu : ALU_SLDU, + special_hazard : pe_req.special_hazard, + scale_vl : pe_req.scale_vl, + vtype : pe_req.vtype, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, + default : '0 }; operand_request_push[SlideAddrGenA] = pe_req.use_vs2; diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv index fe40a291b..72c8202e1 100644 --- a/hardware/src/lane/operand_queue.sv +++ b/hardware/src/lane/operand_queue.sv @@ -9,7 
+9,8 @@ // need it. module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; #( - parameter int unsigned BufferDepth = 2, + parameter int unsigned CmdBufDepth = 2, + parameter int unsigned DataBufDepth = 2, parameter int unsigned NrSlaves = 1, parameter int unsigned NrLanes = 0, // Support for floating-point data types @@ -52,7 +53,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i logic cmd_pop; fifo_v3 #( - .DEPTH(BufferDepth ), + .DEPTH(CmdBufDepth ), .dtype(operand_queue_cmd_t) ) i_cmd_buffer ( .clk_i (clk_i ), @@ -79,8 +80,8 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i logic ibuf_pop; fifo_v3 #( - .DEPTH (BufferDepth), - .DATA_WIDTH(DataWidth ) + .DEPTH (DataBufDepth), + .DATA_WIDTH(DataWidth ) ) i_input_buffer ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -98,7 +99,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // We used a credit based system, to ensure that the FIFO is always // able to accept a request. - logic [idx_width(BufferDepth):0] ibuf_usage_d, ibuf_usage_q; + logic [idx_width(DataBufDepth):0] ibuf_usage_d, ibuf_usage_q; always_comb begin: p_ibuf_usage // Maintain state @@ -110,7 +111,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i if (ibuf_pop) ibuf_usage_d -= 1; // Are we ready? 
- operand_queue_ready_o = (ibuf_usage_q != BufferDepth); + operand_queue_ready_o = (ibuf_usage_q != DataBufDepth); end always_ff @(posedge clk_i or negedge rst_ni) begin: p_ibuf_usage_ff diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv index dab636d07..5ed714522 100644 --- a/hardware/src/lane/operand_queues_stage.sv +++ b/hardware/src/lane/operand_queues_stage.sv @@ -52,14 +52,15 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math /////////// operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportIntExt4(1'b1 ), - .SupportIntExt8(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b0 ) + .CmdBufDepth (ValuInsnQueueDepth), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportIntExt4(1'b1 ), + .SupportIntExt8(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b0 ) ) i_operand_queue_alu_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -77,14 +78,15 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ); operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportIntExt4(1'b1 ), - .SupportIntExt8(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b1 ) + .CmdBufDepth (ValuInsnQueueDepth), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportIntExt4(1'b1 ), + .SupportIntExt8(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b1 ) ) i_operand_queue_alu_b ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -106,12 +108,13 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ////////////////////// operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b0 ) + .CmdBufDepth (MfpuInsnQueueDepth 
), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b0 ) ) i_operand_queue_mfpu_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -129,12 +132,13 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ); operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b1 ) + .CmdBufDepth (MfpuInsnQueueDepth ), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b1 ) ) i_operand_queue_mfpu_b ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -152,12 +156,13 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ); operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b1 ) + .CmdBufDepth (MfpuInsnQueueDepth ), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b1 ) ) i_operand_queue_mfpu_c ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -179,9 +184,10 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math /////////////////////// operand_queue #( - .BufferDepth(2 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ) + .CmdBufDepth (VstuInsnQueueDepth + MaskuInsnQueueDepth), + .DataBufDepth (2 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ) ) i_operand_queue_st_mask_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -203,9 +209,10 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ****************/ operand_queue #( - .BufferDepth(2 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ) + .CmdBufDepth (VlduInsnQueueDepth), + .DataBufDepth (2 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ) ) 
i_operand_queue_slide_addrgen_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -227,11 +234,12 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ///////////////// operand_queue #( - .BufferDepth(1 ), - .FPUSupport (FPUSupport), - .SupportIntExt2(1'b1), - .SupportIntExt4(1'b1), - .SupportIntExt8(1'b1), + .CmdBufDepth (MaskuInsnQueueDepth), + .DataBufDepth (1 ), + .FPUSupport (FPUSupport ), + .SupportIntExt2(1'b1 ), + .SupportIntExt4(1'b1 ), + .SupportIntExt8(1'b1 ), .NrLanes (NrLanes ) ) i_operand_queue_mask_b ( .clk_i (clk_i ), @@ -250,8 +258,9 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ); operand_queue #( - .BufferDepth(1 ), - .NrLanes (NrLanes ) + .CmdBufDepth (MaskuInsnQueueDepth), + .DataBufDepth (1 ), + .NrLanes (NrLanes ) ) i_operand_queue_mask_m ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 54590fbc3..3e85a58b3 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -9,11 +9,17 @@ // queues. This stage also includes the VRF arbiter. 
module operand_requester import ara_pkg::*; import rvv_pkg::*; #( - parameter int unsigned NrLanes = 0, - parameter int unsigned NrBanks = 0, // Number of banks in the vector register file - parameter type vaddr_t = logic, // Type used to address vector register file elements + parameter int unsigned NrLanes = 0, + parameter int unsigned NrBanks = 0, // Number of banks in the vector register file + // Type used to address vector register file elements + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! - localparam type strb_t = logic[$bits(elen_t)/8-1:0] + localparam type strb_t = logic[$bits(elen_t)/8-1:0] ) ( input logic clk_i, input logic rst_ni, @@ -76,6 +82,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( output logic ldu_result_final_gnt_o ); + `include "../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; //////////////////////// @@ -233,6 +241,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( vid_t id; // Address of the next element to be read vaddr_t addr; + // Source reg LSbs (useful for barber's pole) + logic [idx_width(NrBanks)-1:0] vs; // How many elements remain to be read vlen_t len; // Element width @@ -245,15 +255,27 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // In case of a WAW with a previous instruction, // read once every two writes of the previous instruction logic is_widening; + // Does this instruction have a special hazard protocol? 
+ logic special_hazard; // One-bit counters logic [NrVInsn-1:0] waw_hazard_counter; } requester_d, requester_q; + // Asserted if the SLDU requester is registering a new instruction + logic new_sldu_insn; + logic has_stalled_d, has_stalled_q; // Is there a hazard during this cycle? + // WAW with widening instructions are special: wait for 2 writes instead of 1 + // Slide1Up/Down with hazards should wait one cycle before being handled normally logic stall; - assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & - (~{NrVInsn{requester_q.is_widening}} | requester_q.waw_hazard_counter))); + assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & ((~{NrVInsn{requester_q.is_widening}} & + requester_q.special_hazard) | requester_q.waw_hazard_counter))) | + (~has_stalled_q & requester_q.special_hazard & |requester_q.hazard); + + // For every instruction, it signals if the requester has already stalled once + // This is needed for vslide1x stall handling + assign has_stalled_d = new_sldu_insn ? 1'b0 : (stall ? 1'b1 : has_stalled_q); // Did we get a grant? 
logic [NrBanks-1:0] operand_requester_gnt; @@ -269,6 +291,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( state_d = state_q; requester_d = requester_q; + new_sldu_insn = 1'b0; + // Make no requests to the VRF operand_payload[requester] = '0; for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester] = 1'b0; @@ -288,6 +312,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the request operand_request_ready_o[requester] = 1'b1; + // New slide unit instruction incoming + if (requester == (NrOperandQueues + VFU_SlideUnit)) + new_sldu_insn = 1'b1; + // Send a command to the operand queue operand_queue_cmd_o[requester] = '{ eew : operand_request_i[requester].eew, @@ -312,22 +340,25 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Store the request requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), + id : operand_request_i[requester].id, + addr : vaddr_offset(vaddr(operand_request_i[requester].vs, NrLanes), + vaddr_t'(operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), operand_request_i[requester].vs), + vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], // For memory operations, the number of elements initially refers to the new EEW (vsew here), // but the requester must refer to the old EEW (eew here) // This reasoning cannot be applied also to widening instructions, which modify vsew // treating it as the EEW of vd - len : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE, + len : (operand_request_i[requester].scale_vl) ? + ((operand_request_i[requester].vl << + operand_request_i[requester].vtype.vsew) >> + operand_request_i[requester].eew) : + operand_request_i[requester].vl, + vew : operand_request_i[requester].eew, + hazard : operand_request_i[requester].hazard, + is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE && + operand_request_i[requester].special_hazard, + special_hazard : operand_request_i[requester].special_hazard, default: '0 }; // The length should be at least one after the rescaling @@ -363,7 +394,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Received a grant. if (|operand_requester_gnt) begin // Bump the address pointer - requester_d.addr = requester_q.addr + 1'b1; + requester_d.addr = next_vaddr(requester_q.addr, requester_q.vs); // We read less than 64 bits worth of elements if (requester_q.len < (1 << (int'(EW64) - int'(requester_q.vew)))) @@ -381,6 +412,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the request operand_request_ready_o[requester] = 1'b1; + // New slide unit instruction incoming + if (requester == (NrOperandQueues + VFU_SlideUnit)) + new_sldu_insn = 1'b1; + // Send a command to the operand queue operand_queue_cmd_o[requester] = '{ eew : operand_request_i[requester].eew, @@ -401,18 +436,22 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Store the request requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - len : 
(operand_request_i[requester].scale_vl) ? - ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - default: '0 + id : operand_request_i[requester].id, + addr : vaddr_offset(vaddr(operand_request_i[requester].vs, NrLanes), + vaddr_t'(operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), operand_request_i[requester].vs), + vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], + len : (operand_request_i[requester].scale_vl) ? + ((operand_request_i[requester].vl << + operand_request_i[requester].vtype.vsew) >> + operand_request_i[requester].eew) : + operand_request_i[requester].vl, + vew : operand_request_i[requester].eew, + hazard : operand_request_i[requester].hazard, + is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE && + operand_request_i[requester].special_hazard, + special_hazard : operand_request_i[requester].special_hazard, + default : '0 }; // The length should be at least one after the rescaling if (requester_d.len == '0) @@ -428,11 +467,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - state_q <= IDLE; - requester_q <= '0; + state_q <= IDLE; + requester_q <= '0; + has_stalled_q <= 1'b0; end else begin - state_q <= state_d; - requester_q <= requester_d; + state_q <= state_d; + requester_q <= requester_d; + has_stalled_q <= has_stalled_d; end end end : gen_operand_requester diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 386caca74..7cc93f3d8 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -8,15 +8,20 @@ // in a SIMD fashion, always operating on 64 bits. 
module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; #( - parameter int unsigned NrLanes = 0, + parameter int unsigned NrLanes = 0, // Support for fixed-point data types - parameter logic FixPtSupport = FixedPointEnable, + parameter logic FixPtSupport = FixedPointEnable, // Type used to address vector register file elements - parameter type vaddr_t = logic, + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! - localparam int unsigned DataWidth = $bits(elen_t), - localparam int unsigned StrbWidth = DataWidth/8, - localparam type strb_t = logic [StrbWidth-1:0] + localparam int unsigned DataWidth = $bits(elen_t), + localparam int unsigned StrbWidth = DataWidth/8, + localparam type strb_t = logic [StrbWidth-1:0] ) ( input logic clk_i, input logic rst_ni, @@ -55,6 +60,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; output logic mask_ready_o ); + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; ///////////// @@ -137,6 +145,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; logic mask; } payload_t; + vaddr_t addr_d, addr_q; + // Result queue payload_t [ResultQueueDepth-1:0] result_queue_d, result_queue_q; logic [ResultQueueDepth-1:0] result_queue_valid_d, result_queue_valid_q; @@ -424,6 +434,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; reduction_rx_cnt_d = reduction_rx_cnt_q; sldu_transactions_cnt_d = sldu_transactions_cnt_q; red_hs_synch_d = red_hs_synch_q; + addr_d = addr_q; 
alu_red_valid_o = 1'b0; sldu_alu_ready_d = 1'b0; simd_red_cnt_max_d = simd_red_cnt_max_q; @@ -474,8 +485,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; mask_ready_o = !vinsn_issue_q.vm; // Store the result in the result queue + addr_d = next_vaddr(addr_q, vinsn_issue_q.vd); result_queue_d[result_queue_write_pnt_q].wdata = result_queue_q[result_queue_write_pnt_q].wdata | valu_result; - result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_issue_q.vd, NrLanes) + ((vinsn_issue_q.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue_q.vtype.vsew)); + result_queue_d[result_queue_write_pnt_q].addr = addr_q; result_queue_d[result_queue_write_pnt_q].id = vinsn_issue_q.id; result_queue_d[result_queue_write_pnt_q].mask = vinsn_issue_q.vfu == VFU_MaskUnit; if (!narrowing(vinsn_issue_q.op) || !narrowing_select_q) @@ -531,6 +543,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; else vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; + // Change starting address when we issue a new instruction + // Since this unit is not pipelined and elements written in the + // result queue belong to vinsn_issue_q + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes); + // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) @@ -830,6 +847,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; red_hs_synch_d = 1'b1; // Allow the first valid issue_cnt_d = vfu_operation_i.vl; + // Initialize the starting address for the next instruction + addr_d = vaddr(vfu_operation_i.vd, NrLanes); if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) issue_cnt_d = vfu_operation_i.vl; else begin @@ -877,6 +896,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; simd_red_cnt_max_q <= '0; alu_red_ready_q <= 1'b0; 
alu_vxsat_q <= '0; + addr_q <= '0; end else begin issue_cnt_q <= issue_cnt_d; commit_cnt_q <= commit_cnt_d; @@ -890,6 +910,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; simd_red_cnt_max_q <= simd_red_cnt_max_d; alu_red_ready_q <= alu_red_ready_i; alu_vxsat_q <= alu_vxsat_d; + addr_q <= addr_d; end end diff --git a/hardware/src/lane/vector_fus_stage.sv b/hardware/src/lane/vector_fus_stage.sv index 6eb28e7c2..42b6a347e 100644 --- a/hardware/src/lane/vector_fus_stage.sv +++ b/hardware/src/lane/vector_fus_stage.sv @@ -96,9 +96,8 @@ module vector_fus_stage import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg ////////////////// valu #( - .NrLanes(NrLanes), - .FixPtSupport(FixPtSupport), - .vaddr_t(vaddr_t) + .NrLanes (NrLanes ), + .FixPtSupport(FixPtSupport) ) i_valu ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -143,10 +142,9 @@ module vector_fus_stage import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg /////////////////// vmfpu #( - .NrLanes (NrLanes ), - .FPUSupport(FPUSupport), - .FixPtSupport(FixPtSupport), - .vaddr_t (vaddr_t ) + .NrLanes (NrLanes ), + .FPUSupport (FPUSupport ), + .FixPtSupport(FixPtSupport) ) i_vmfpu ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index c4ffc6d72..81c729864 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -9,17 +9,22 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; import cf_math_pkg::idx_width; #( - parameter int unsigned NrLanes = 0, + parameter int unsigned NrLanes = 0, // Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, // Support for fixed-point data types - parameter logic FixPtSupport = FixedPointEnable, + parameter logic FixPtSupport = FixedPointEnable, // Type used to address vector register file elements - parameter type vaddr_t = logic, + 
localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! - localparam int unsigned DataWidth = $bits(elen_t), - localparam int unsigned StrbWidth = DataWidth/8, - localparam type strb_t = logic [DataWidth/8-1:0] + localparam int unsigned DataWidth = $bits(elen_t), + localparam int unsigned StrbWidth = DataWidth/8, + localparam type strb_t = logic [DataWidth/8-1:0] ) ( input logic clk_i, input logic rst_ni, @@ -61,6 +66,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; output logic mask_ready_o ); + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + //////////////////////////////// // Vector instruction queue // //////////////////////////////// @@ -180,6 +188,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Helper signals // ////////////////////// + vaddr_t addr_d, addr_q; + logic vinsn_issue_mul, vinsn_issue_div, vinsn_issue_fpu; assign vinsn_issue_mul = vinsn_issue_q.op inside {[VMUL:VSMUL]}; @@ -1044,6 +1054,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; reduction_rx_cnt_d = reduction_rx_cnt_q; sldu_transactions_cnt_d = sldu_transactions_cnt_q; red_hs_synch_d = red_hs_synch_q; + addr_d = addr_q; mfpu_red_valid_o = 1'b0; sldu_mfpu_ready_d = 1'b0; simd_red_cnt_max_d = simd_red_cnt_max_q; @@ -1218,9 +1229,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; to_process_cnt_d = (narrowing(vinsn_processing_q.cvt_resize)) ? 
(to_process_cnt_q - processed_element_cnt_narrow) : (to_process_cnt_q - processed_element_cnt); // Store the result in the result queue + addr_d = next_vaddr(addr_q, vinsn_processing_q.vd); result_queue_d[result_queue_write_pnt_q].id = vinsn_processing_q.id; - result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_processing_q.vd, NrLanes) + - ((vinsn_processing_q.vl - to_process_cnt_q) >> (int'(EW64) - vinsn_processing_q.vtype.vsew)); + result_queue_d[result_queue_write_pnt_q].addr = addr_q; // FP narrowing instructions pack the result in two different cycles, and only some 16-bit slices are active if (narrowing(vinsn_processing_q.cvt_resize)) begin for (int b = 0; b < 4; b++) begin @@ -1275,6 +1286,10 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; if (vinsn_queue_d.processing_cnt != 0) to_process_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vl; + + // Update the address for the results of the next cycles since they belong + // to the next instruction + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vd, NrLanes); end end end @@ -1695,6 +1710,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; if (vinsn_queue_d.processing_cnt != 0) to_process_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vl; + // Update the starting address for the next instruction + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vd, NrLanes); + // Bump issue counter and pointers vinsn_queue_d.issue_cnt -= 1; if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) vinsn_queue_d.issue_pnt = '0; @@ -1833,9 +1851,12 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; osum_issue_cnt_d = '0; issue_cnt_d = vfu_operation_i.vl; end - if (vinsn_queue_d.processing_cnt == '0) to_process_cnt_d = vfu_operation_i.vl; - if (vinsn_queue_d.commit_cnt == '0) commit_cnt_d = - is_reduction(vfu_operation_i.op) ? 
1 : vfu_operation_i.vl; + if (vinsn_queue_d.processing_cnt == '0) begin + to_process_cnt_d = vfu_operation_i.vl; + // A new instruction to process; update the starting address + addr_d = vaddr(vfu_operation_i.vd, NrLanes); + end + if (vinsn_queue_d.commit_cnt == '0) commit_cnt_d = is_reduction(vfu_operation_i.op) ? 1 : vfu_operation_i.vl; // Floating-Point re-encoding for widening operations // Enabled only for the supported formats if (FPUSupport != FPUSupportNone) begin @@ -1902,6 +1923,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; intra_op_rx_cnt_q <= '0; osum_issue_cnt_q <= '0; mfpu_vxsat_q <= '0; + addr_q <= '0; end else begin issue_cnt_q <= issue_cnt_d; to_process_cnt_q <= to_process_cnt_d; @@ -1925,6 +1947,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; intra_op_rx_cnt_q <= intra_op_rx_cnt_d; osum_issue_cnt_q <= osum_issue_cnt_d; mfpu_vxsat_q <= mfpu_vxsat_d; + addr_q <= addr_d; end end diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index afea302f6..58cc11f1b 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -10,12 +10,18 @@ // predicated instructions. module masku import ara_pkg::*; import rvv_pkg::*; #( - parameter int unsigned NrLanes = 0, - parameter type vaddr_t = logic, // Type used to address vector register file elements + parameter int unsigned NrLanes = 0, + // Address of an element in the lane's VRF + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! 
- localparam int unsigned DataWidth = $bits(elen_t), // Width of the lane datapath - localparam int unsigned StrbWidth = DataWidth/8, - localparam type strb_t = logic [StrbWidth-1:0] // Byte-strobe type + localparam int unsigned DataWidth = $bits(elen_t), // Width of the lane datapath + localparam int unsigned StrbWidth = DataWidth/8, + localparam type strb_t = logic [StrbWidth-1:0] // Byte-strobe type ) ( input logic clk_i, input logic rst_ni, @@ -48,6 +54,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( input logic sldu_mask_ready_i ); + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; //////////////// @@ -142,6 +151,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // There is a mask queue per lane, holding the operands that were not // yet used by the corresponding lane. + vaddr_t addr_d, addr_q; + // Mask queue strb_t [MaskQueueDepth-1:0][NrLanes-1:0] mask_queue_d, mask_queue_q; logic [MaskQueueDepth-1:0][NrLanes-1:0] mask_queue_valid_d, mask_queue_valid_q; @@ -647,6 +658,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( popcount_d = popcount_q; vfirst_count_d = vfirst_count_q; + addr_d = addr_q; + mask_queue_d = mask_queue_q; mask_queue_valid_d = mask_queue_valid_q; mask_queue_write_pnt_d = mask_queue_write_pnt_q; @@ -732,6 +745,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( else mask_queue_write_pnt_d = mask_queue_write_pnt_q + 1; + // Increment write-back address + addr_d = next_vaddr(addr_q, vinsn_issue.vd); + // Account for the operands that were issued read_cnt_d = read_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); if (read_cnt_q < NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew))) @@ -838,9 +854,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_d[result_queue_write_pnt_q][lane] = '{ wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata | alu_result[lane], be : (vinsn_issue.op inside {[VMSBF:VID]}) ? 
'1 : be(element_cnt, vinsn_issue.vtype.vsew), - addr : (vinsn_issue.op inside {[VMSBF:VID]}) ? vaddr(vinsn_issue.vd, NrLanes) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes) + - (((vinsn_issue.vl - issue_cnt_q) / NrLanes / DataWidth)), - id : vinsn_issue.id + addr : addr_q, + id : vinsn_issue.id }; end @@ -848,6 +863,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (vinsn_issue.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]}) begin vrf_pnt_d = vrf_pnt_q + (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew)); + // Increment write-back address + addr_d = next_vaddr(addr_q, vinsn_issue.vd); + // Filled-up a word, or finished execution if (vrf_pnt_d == DataWidth*NrLanes || vrf_pnt_d >= issue_cnt_q) begin result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; @@ -1077,6 +1095,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( issue_cnt_d = pe_req_i.vl; read_cnt_d = pe_req_i.vl; + // Initialize the starting address of the next instruction + addr_d = vaddr(pe_req_i.vd, NrLanes); + // Trim skipped words if (pe_req_i.op == VSLIDEUP) begin issue_cnt_d -= vlen_t'(trimmed_stride); @@ -1131,6 +1152,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_final_gnt_q <= '0; popcount_q <= '0; vfirst_count_q <= '0; + addr_q <= '0; end else begin vinsn_running_q <= vinsn_running_d; read_cnt_q <= read_cnt_d; @@ -1142,6 +1164,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_final_gnt_q <= result_final_gnt_d; popcount_q <= popcount_d; vfirst_count_q <= vfirst_count_d; + addr_q <= addr_d; end end diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv index 9c06c3ac5..7439bed7a 100644 --- a/hardware/src/sldu/sldu.sv +++ b/hardware/src/sldu/sldu.sv @@ -8,12 +8,18 @@ // instructions, which need access to the whole Vector Register File. 
module sldu import ara_pkg::*; import rvv_pkg::*; #( - parameter int unsigned NrLanes = 0, - parameter type vaddr_t = logic, // Type used to address vector register file elements + parameter int unsigned NrLanes = 0, + // Address of an element in the lane's VRF + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! - localparam int unsigned DataWidth = $bits(elen_t), // Width of the lane datapath - localparam int unsigned StrbWidth = DataWidth/8, - localparam type strb_t = logic [StrbWidth-1:0] // Byte-strobe type + localparam int unsigned DataWidth = $bits(elen_t), // Width of the lane datapath + localparam int unsigned StrbWidth = DataWidth/8, + localparam type strb_t = logic [StrbWidth-1:0] // Byte-strobe type ) ( input logic clk_i, input logic rst_ni, @@ -46,6 +52,9 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( `include "common_cells/registers.svh" + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; //////////////////////////////// @@ -108,6 +117,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( localparam int unsigned ResultQueueDepth = 2; + vaddr_t addr_d, addr_q; + // There is a result queue per lane, holding the results that were not // yet accepted by the corresponding lane. 
typedef struct packed { @@ -220,6 +231,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( out_pnt_d = out_pnt_q; vrf_pnt_d = vrf_pnt_q; state_d = state_q; + addr_d = addr_q; result_queue_d = result_queue_q; result_queue_valid_d = result_queue_valid_q; @@ -268,6 +280,9 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Start writing at the middle of the destination vector vrf_pnt_d = vinsn_issue_q.stride >> $clog2(8*NrLanes); + // Advance the starting address by the same offset (vrf_pnt_d) + addr_d = vaddr_offset(addr_q, vrf_pnt_d, vinsn_issue_q.vd); + // Go to SLIDE_RUN_VSLIDE1UP_FIRST_WORD if this is a vslide1up instruction if (vinsn_issue_q.use_scalar_op) state_d = SLIDE_RUN_VSLIDE1UP_FIRST_WORD; @@ -349,8 +364,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Initialize id and addr fields of the result queue requests for (int lane = 0; lane < NrLanes; lane++) begin result_queue_d[result_queue_write_pnt_q][lane].id = vinsn_issue_q.id; - result_queue_d[result_queue_write_pnt_q][lane].addr = - vaddr(vinsn_issue_q.vd, NrLanes) + vrf_pnt_q; + result_queue_d[result_queue_write_pnt_q][lane].addr = addr_q; end // Bump pointers (reductions always finish in one shot) @@ -409,8 +423,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( if (vinsn_issue_q.op inside {VSLIDEUP, VSLIDEDOWN}) mask_ready_o = !vinsn_issue_q.vm; - // Increment VRF address - vrf_pnt_d = vrf_pnt_q + 1; + // Increment write-back address + addr_d = vaddr_offset(addr_q, 1, vinsn_issue_q.vd); // Send result to the VRF result_queue_cnt_d += 1; @@ -466,6 +480,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Increment vector instruction queue pointers and counters vinsn_queue_d.issue_pnt += 1; vinsn_queue_d.issue_cnt -= 1; + + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes); end end end @@ -500,6 +516,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Increment vector instruction queue pointers and counters vinsn_queue_d.issue_pnt += 1; vinsn_queue_d.issue_cnt -= 1; 
+ + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes); end end SLIDE_WAIT_OSUM: begin @@ -607,6 +625,9 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // VSLIDE1UP always writes at least 1 element if (pe_req_i.op == VSLIDEUP && !pe_req_i.use_scalar_op) issue_cnt_d -= vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].stride; + + // Initialize the starting address for the next instruction + addr_d = vaddr(pe_req_i.vd, NrLanes); end if (vinsn_queue_d.commit_cnt == '0) begin commit_cnt_d = pe_req_i.op inside {VSLIDEUP, VSLIDEDOWN} @@ -638,6 +659,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( pe_resp_o <= '0; result_final_gnt_q <= '0; red_stride_cnt_q <= 1; + addr_q <= '0; end else begin vinsn_running_q <= vinsn_running_d; issue_cnt_q <= issue_cnt_d; @@ -649,6 +671,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( pe_resp_o <= pe_resp; result_final_gnt_q <= result_final_gnt_d; red_stride_cnt_q <= red_stride_cnt_d; + addr_q <= addr_d; end end diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index def21df8e..2fbe05e55 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -89,8 +89,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( logic axi_addrgen_queue_empty; fifo_v3 #( - .DEPTH(4 ), - .dtype(addrgen_axi_req_t) + .DEPTH(VaddrgenInsnQueueDepth), + .dtype(addrgen_axi_req_t ) ) i_addrgen_req_queue ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 51042ed8e..6f94d9ec1 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -8,16 +8,22 @@ // upon receiving vector memory operations. 
module vldu import ara_pkg::*; import rvv_pkg::*; #( - parameter int unsigned NrLanes = 0, - parameter type vaddr_t = logic, // Type used to address vector register file elements + parameter int unsigned NrLanes = 0, + // Address of an element in the lane's VRF + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // AXI Interface parameters - parameter int unsigned AxiDataWidth = 0, - parameter int unsigned AxiAddrWidth = 0, - parameter type axi_r_t = logic, + parameter int unsigned AxiDataWidth = 0, + parameter int unsigned AxiAddrWidth = 0, + parameter type axi_r_t = logic, // Dependant parameters. DO NOT CHANGE! - localparam int DataWidth = $bits(elen_t), - localparam type strb_t = logic[DataWidth/8-1:0], - localparam type axi_addr_t = logic [AxiAddrWidth-1:0] + localparam int DataWidth = $bits(elen_t), + localparam type strb_t = logic[DataWidth/8-1:0], + localparam type axi_addr_t = logic [AxiAddrWidth-1:0] ) ( input logic clk_i, input logic rst_ni, @@ -33,6 +39,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrVInsn-1:0] pe_vinsn_running_i, output logic pe_req_ready_o, output pe_resp_t pe_resp_o, + // Hazard handling to main sequencer + output vid_t commit_id_o, + output logic commit_id_valid_o, + input logic hazard_i, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, input logic axi_addrgen_req_valid_i, @@ -51,7 +61,11 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( output logic mask_ready_o ); + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; + import axi_pkg::beat_lower_byte; 
import axi_pkg::beat_upper_byte; import axi_pkg::BURST_INCR; @@ -101,6 +115,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( logic vinsn_commit_valid; assign vinsn_commit = vinsn_queue_q.vinsn[vinsn_queue_q.commit_pnt]; assign vinsn_commit_valid = (vinsn_queue_q.commit_cnt != '0); + // To the main sequencer, for hazard checking + assign commit_id_valid_o = vinsn_commit_valid; + assign commit_id_o = vinsn_commit.id; always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin @@ -118,6 +135,8 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( localparam int unsigned ResultQueueDepth = 2; + vaddr_t addr_d, addr_q; + // There is a result queue per lane, holding the results that were not // yet accepted by the corresponding lane. typedef struct packed { @@ -197,6 +216,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d = vinsn_queue_q; issue_cnt_d = issue_cnt_q; commit_cnt_d = commit_cnt_q; + addr_d = addr_q; len_d = len_q; r_pnt_d = r_pnt_q; @@ -286,9 +306,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Initialize id and addr fields of the result queue requests for (int lane = 0; lane < NrLanes; lane++) begin result_queue_d[result_queue_write_pnt_q][lane].id = vinsn_issue_q.id; - result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) + - (((vinsn_issue_q.vl - (issue_cnt_q >> int'(vinsn_issue_q.vtype.vsew))) / NrLanes) >> - (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); + result_queue_d[result_queue_write_pnt_q][lane].addr = addr_q; end end @@ -304,6 +322,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Trigger the request signal result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; + // Increase the address + addr_d = next_vaddr(addr_q, vinsn_issue_q.vd); + // Acknowledge the mask operands mask_ready_o = !vinsn_issue_q.vm; @@ -342,6 +363,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( else vinsn_queue_d.issue_pnt += 1; + // Modify the next 
instruction's address + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes); + // Prepare for the next vector instruction if (vinsn_queue_d.issue_cnt != 0) issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl << int'(vinsn_queue_q.vinsn[ @@ -354,7 +378,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( ////////////////////////////////// for (int lane = 0; lane < NrLanes; lane++) begin: result_write - ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane]; + // Create a request only if there are no more hazards on vd (check vs1 since the info about + // hazard vd is also there) + ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane] && + !vinsn_commit.hazard_vs1; ldu_result_addr_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].addr; ldu_result_id_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].id; ldu_result_wdata_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].wdata; @@ -415,6 +442,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d.commit_pnt].vtype.vsew); end + // Update the Vd hazard bit for the current instruction + // hazard_vs1, hazard_vs2, hazard_vm all contain the info about hazard_vd, so work on one of them (vs1) + if (commit_id_valid_o) vinsn_queue_d.vinsn[vinsn_queue_q.commit_pnt].hazard_vs1 &= {NrVInsn{hazard_i}}; + ////////////////////////////// // Accept new instruction // ////////////////////////////// @@ -425,8 +456,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_running_d[pe_req_i.id] = 1'b1; // Initialize counters - if (vinsn_queue_d.issue_cnt == '0) + if (vinsn_queue_d.issue_cnt == '0) begin issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); + addr_d = vaddr(pe_req_i.vd, NrLanes); + end if (vinsn_queue_d.commit_cnt == '0) commit_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); @@ -447,6 +480,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vrf_pnt_q <= '0; pe_resp_o <= '0; 
result_final_gnt_q <= '0; + addr_q <= '0; end else begin vinsn_running_q <= vinsn_running_d; issue_cnt_q <= issue_cnt_d; @@ -456,6 +490,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vrf_pnt_q <= vrf_pnt_d; pe_resp_o <= pe_resp; result_final_gnt_q <= result_final_gnt_d; + addr_q <= addr_d; end end diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index aa2e05283..c86b7ee15 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -44,6 +44,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( output logic addrgen_ack_o, output logic addrgen_error_o, output vlen_t addrgen_error_vl_o, + output vid_t commit_id_o, + output logic commit_id_valid_o, + input logic hazard_i, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ -155,8 +158,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .AxiAddrWidth(AxiAddrWidth), .AxiDataWidth(AxiDataWidth), .axi_r_t (axi_r_t ), - .NrLanes (NrLanes ), - .vaddr_t (vaddr_t ) + .NrLanes (NrLanes ) ) i_vldu ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -172,6 +174,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_vinsn_running_i (pe_vinsn_running_i ), .pe_req_ready_o (pe_req_ready_o[OffsetLoad]), .pe_resp_o (pe_resp_o[OffsetLoad] ), + .commit_id_o (commit_id_o ), + .commit_id_valid_o (commit_id_valid_o ), + .hazard_i (hazard_i ), // Interface with the address generator .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ),