From 830540b7574f57c2bf155cf4d75cd1849549eb0b Mon Sep 17 00:00:00 2001 From: Florian Zaruba <zarubaf@iis.ee.ethz.ch> Date: Sat, 20 Apr 2019 18:53:16 +0200 Subject: [PATCH] frontend: Clean-up instruction frontend The instruction frontend has become an increasingly messy part and needed cleaning-up. The current solution contains 2 x 32 bit instruction data fifos and 1 x 64 bit address fifo. Hence, it should be significantly more area efficient than the previous one. The interface to `id_stage` is a ready/valid handshake. The credit based system has been replaced in favour of a replay mechanism as it was very brittle and overly pessimistic. Branch-prediction has been cleaned up: The front-end was also partially predicting on jumps, this could have potentially led to performance bugs if the branch detection wasn't correct in the frontend. --- Makefile | 3 +- include/ariane_pkg.sv | 54 +-- src/ariane.sv | 31 +- src/branch_unit.sv | 91 ++--- src/ex_stage.sv | 4 + src/frontend/bht.sv | 71 ++-- src/frontend/btb.sv | 53 ++- src/frontend/frontend.sv | 758 ++++++++++++++++-------------------- src/frontend/instr_queue.sv | 353 +++++++++++++++++ src/frontend/instr_scan.sv | 45 ++- src/id_stage.sv | 115 +++--- src/instr_realign.sv | 358 +++++++++++++++++ src/instr_realigner.sv | 252 ------------ tb/ariane_soc_pkg.sv | 3 + 14 files changed, 1296 insertions(+), 895 deletions(-) create mode 100644 src/frontend/instr_queue.sv create mode 100644 src/instr_realign.sv delete mode 100644 src/instr_realigner.sv diff --git a/Makefile b/Makefile index 4dc8febf..ad845921 100644 --- a/Makefile +++ b/Makefile @@ -146,6 +146,7 @@ src := $(filter-out src/ariane_regfile.sv, $(wildcard src/*.sv)) \ src/axi/src/axi_delayer.sv \ src/axi/src/axi_to_axi_lite.sv \ src/fpga-support/rtl/SyncSpRamBeNx64.sv \ + src/common_cells/src/unread.sv \ src/common_cells/src/sync.sv \ src/common_cells/src/cdc_2phase.sv \ src/common_cells/src/spill_register.sv \ @@ -157,6 +158,7 @@ src := $(filter-out 
src/ariane_regfile.sv, $(wildcard src/*.sv)) \ src/common_cells/src/deprecated/fifo_v2.sv \ src/common_cells/src/fifo_v3.sv \ src/common_cells/src/lzc.sv \ + src/common_cells/src/popcount.sv \ src/common_cells/src/rr_arb_tree.sv \ src/common_cells/src/deprecated/rrarbiter.sv \ src/common_cells/src/stream_delay.sv \ @@ -361,7 +363,6 @@ verilate_command := $(verilator) -Wno-UNOPTFLAT \ -Wno-style \ $(if $(PROFILE),--stats --stats-vars --profile-cfuncs,) \ - -Wno-lint \ $(if $(DEBUG),--trace --trace-structs,) \ -LDFLAGS "-L$(RISCV)/lib -Wl,-rpath,$(RISCV)/lib -lfesvr$(if $(PROFILE), -g -pg,)" \ -CFLAGS "$(CFLAGS)$(if $(PROFILE), -g -pg,)" -Wall --cc --vpi \ diff --git a/include/ariane_pkg.sv b/include/ariane_pkg.sv index 55061956..12f24eb5 100644 --- a/include/ariane_pkg.sv +++ b/include/ariane_pkg.sv @@ -34,6 +34,9 @@ package ariane_pkg; localparam NrMaxRules = 16; typedef struct packed { + int RASDepth; + int BTBEntries; + int BHTEntries; // PMAs int NrNonIdempotentRules; // Number of non idempotent rules logic [NrMaxRules-1:0][63:0] NonIdempotentAddrBase; // base which needs to match @@ -52,6 +55,9 @@ package ariane_pkg; } ariane_cfg_t; localparam ariane_cfg_t ArianeDefaultConfig = '{ + RASDepth: 2, + BTBEntries: 32, + BHTEntries: 128, // idempotent region NrNonIdempotentRules: 2, NonIdempotentAddrBase: {64'b0, 64'b0}, @@ -75,6 +81,9 @@ package ariane_pkg; function automatic void check_cfg (ariane_cfg_t Cfg); // pragma translate_off `ifndef VERILATOR + assert(Cfg.RASDepth > 0); + assert(2**$clog2(Cfg.BTBEntries) == Cfg.BTBEntries); + assert(2**$clog2(Cfg.BHTEntries) == Cfg.BHTEntries); assert(Cfg.NrNonIdempotentRules <= NrMaxRules); assert(Cfg.NrExecuteRegionRules <= NrMaxRules); assert(Cfg.NrCachedRegionRules <= NrMaxRules); @@ -131,9 +140,6 @@ package ariane_pkg; localparam TRANS_ID_BITS = $clog2(NR_SB_ENTRIES); // depending on the number of scoreboard entries we need that many bits // to uniquely identify the entry in the scoreboard localparam ASID_WIDTH = 1; - 
localparam BTB_ENTRIES = 64; - localparam BHT_ENTRIES = 128; - localparam RAS_DEPTH = 2; localparam BITS_SATURATION_COUNTER = 2; localparam NR_COMMIT_PORTS = 2; @@ -142,8 +148,8 @@ package ariane_pkg; localparam ISSUE_WIDTH = 1; // amount of pipeline registers inserted for load/store return path // this can be tuned to trade-off IPC vs. cycle time - localparam NR_LOAD_PIPE_REGS = 1; - localparam NR_STORE_PIPE_REGS = 0; + localparam int unsigned NR_LOAD_PIPE_REGS = 1; + localparam int unsigned NR_STORE_PIPE_REGS = 0; // depth of store-buffers, this needs to be a power of two localparam int unsigned DEPTH_SPEC = 4; @@ -281,7 +287,7 @@ package ariane_pkg; // --------------- // leave as is (fails with >8 entries and wider fetch width) - localparam int unsigned FETCH_FIFO_DEPTH = 8; + localparam int unsigned FETCH_FIFO_DEPTH = 4; localparam int unsigned FETCH_WIDTH = 32; // maximum instructions we can fetch on one request (we support compressed instructions) localparam int unsigned INSTR_PER_FETCH = FETCH_WIDTH / 16; @@ -295,18 +301,24 @@ package ariane_pkg; logic valid; } exception_t; - typedef enum logic [1:0] { BHT, BTB, RAS } cf_t; + typedef enum logic [2:0] { + NoCF, // No control flow prediction + Branch, // Branch + Jump, // Jump to address from immediate + JumpR, // Jump to address from registers + Return // Return Address Prediction + } cf_t; // branch-predict // this is the struct we get back from ex stage and we will use it to update // all the necessary data structures + // bp_resolve_t typedef struct packed { logic valid; // prediction with all its values is valid - logic [63:0] pc; // pc of predict or mis-predict + logic [63:0] pc; // PC of predict or mis-predict logic [63:0] target_address; // target address at which to jump, or not logic is_mispredict; // set if this was a mis-predict logic is_taken; // branch is taken - // in the lower 16 bit of the word cf_t cf_type; // Type of control flow change } bp_resolve_t; @@ -314,11 +326,8 @@ package 
ariane_pkg; // this is the struct which we will inject into the pipeline to guide the various // units towards the correct branch decision and resolve typedef struct packed { - logic valid; // this is a valid hint + cf_t cf; // type of control flow prediction logic [63:0] predict_address; // target address at which to jump, or not - logic predict_taken; // branch is taken - // in the lower 16 bit of the word - cf_t cf_type; // Type of control flow change } branchpredict_sbe_t; typedef struct packed { @@ -340,14 +349,12 @@ package ariane_pkg; typedef struct packed { logic valid; logic [63:0] pc; // update at PC - logic mispredict; logic taken; } bht_update_t; typedef struct packed { logic valid; logic taken; - logic strongly_taken; } bht_prediction_t; typedef enum logic[3:0] { @@ -444,7 +451,7 @@ package ariane_pkg; // comparisons LTS, LTU, GES, GEU, EQ, NE, // jumps - JALR, + JALR, BRANCH, // set lower than operations SLTS, SLTU, // CSR functions @@ -482,6 +489,13 @@ package ariane_pkg; logic [TRANS_ID_BITS-1:0] trans_id; } fu_data_t; + function automatic logic is_branch (input fu_op op); + unique case (op) inside + EQ, NE, LTS, GES, LTU, GEU: return 1'b1; + default : return 1'b0; // all other ops + endcase + endfunction; + // ------------------------------- // Extract Src/Dst FP Reg from Op // ------------------------------- @@ -570,14 +584,6 @@ package ariane_pkg; // --------------- // IF/ID Stage // --------------- - typedef struct packed { - logic [63:0] address; // the address of the instructions from below - logic [FETCH_WIDTH-1:0] instruction; // instruction word - branchpredict_sbe_t branch_predict; // this field contains branch prediction information regarding the forward branch path - logic [INSTR_PER_FETCH-1:0] bp_taken; // at which instruction is this branch taken? 
- logic page_fault; // an instruction page fault happened - } frontend_fetch_t; - // store the decompressed instruction typedef struct packed { logic [63:0] address; // the address of the instructions from below diff --git a/src/ariane.sv b/src/ariane.sv index 4c78e02e..edd23033 100644 --- a/src/ariane.sv +++ b/src/ariane.sv @@ -1,4 +1,4 @@ -// Copyright 2018 ETH Zurich and University of Bologna. +// Copyright 2017-2019 ETH Zurich and University of Bologna. // Copyright and related rights are licensed under the Solderpad Hardware // License, Version 0.51 (the "License"); you may not use this file except in // compliance with the License. You may obtain a copy of the License at @@ -59,9 +59,9 @@ module ariane #( // -------------- // IF <-> ID // -------------- - frontend_fetch_t fetch_entry_if_id; + fetch_entry_t fetch_entry_if_id; logic fetch_valid_if_id; - logic decode_ack_id_if; + logic fetch_ready_id_if; // -------------- // ID <-> ISSUE @@ -220,7 +220,7 @@ module ariane #( // Frontend // -------------- frontend #( - .DmBaseAddress ( ArianeCfg.DmBaseAddress ) + .ArianeCfg ( ArianeCfg ) ) i_frontend ( .flush_i ( flush_ctrl_if ), // not entirely correct .flush_bp_i ( 1'b0 ), @@ -238,7 +238,7 @@ module ariane #( .ex_valid_i ( ex_commit.valid ), .fetch_entry_o ( fetch_entry_if_id ), .fetch_entry_valid_o ( fetch_valid_if_id ), - .fetch_ack_i ( decode_ack_id_if ), + .fetch_entry_ready_i ( fetch_ready_id_if ), .* ); @@ -246,11 +246,14 @@ module ariane #( // ID // --------- id_stage id_stage_i ( - .debug_req_i, + .clk_i, + .rst_ni, .flush_i ( flush_ctrl_if ), + .debug_req_i, + .fetch_entry_i ( fetch_entry_if_id ), .fetch_entry_valid_i ( fetch_valid_if_id ), - .decoded_instr_ack_o ( decode_ack_id_if ), + .fetch_entry_ready_o ( fetch_ready_id_if ), .issue_entry_o ( issue_entry_id_issue ), .issue_entry_valid_o ( issue_entry_valid_id_issue ), @@ -260,13 +263,12 @@ module ariane #( .priv_lvl_i ( priv_lvl ), .fs_i ( fs ), .frm_i ( frm_csr_id_issue_ex ), + .irq_i ( irq_i ), + 
.irq_ctrl_i ( irq_ctrl_csr_id ), .debug_mode_i ( debug_mode ), .tvm_i ( tvm_csr_id ), .tw_i ( tw_csr_id ), - .tsr_i ( tsr_csr_id ), - .irq_i ( irq_i ), - .irq_ctrl_i ( irq_ctrl_csr_id ), - .* + .tsr_i ( tsr_csr_id ) ); // --------- @@ -334,6 +336,7 @@ module ariane #( ) ex_stage_i ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), + .debug_mode_i ( debug_mode ), .flush_i ( flush_ctrl_ex ), .fu_data_i ( fu_data_id_ex ), .pc_i ( pc_id_ex ), @@ -708,9 +711,9 @@ module ariane #( assign tracer_if.flush_unissued = flush_unissued_instr_ctrl_id; assign tracer_if.flush = flush_ctrl_ex; // fetch - assign tracer_if.instruction = id_stage_i.instr_realigner_i.fetch_entry_o.instruction; - assign tracer_if.fetch_valid = id_stage_i.instr_realigner_i.fetch_entry_valid_o; - assign tracer_if.fetch_ack = id_stage_i.instr_realigner_i.fetch_ack_i; + assign tracer_if.instruction = id_stage_i.fetch_entry_i.instruction; + assign tracer_if.fetch_valid = id_stage_i.fetch_entry_valid_i; + assign tracer_if.fetch_ack = id_stage_i.fetch_entry_ready_o; // Issue assign tracer_if.issue_ack = issue_stage_i.i_scoreboard.issue_ack_i; assign tracer_if.issue_sbe = issue_stage_i.i_scoreboard.issue_instr_o; diff --git a/src/branch_unit.sv b/src/branch_unit.sv index 3f5e1e84..3f3774d5 100644 --- a/src/branch_unit.sv +++ b/src/branch_unit.sv @@ -12,10 +12,11 @@ // Date: 09.05.2017 // Description: Branch target calculation and comparison -import ariane_pkg::*; - module branch_unit ( - input fu_data_t fu_data_i, + input logic clk_i, + input logic rst_ni, + input logic debug_mode_i, + input ariane_pkg::fu_data_t fu_data_i, input logic [63:0] pc_i, // PC of instruction input logic is_compressed_instr_i, input logic fu_valid_i, // any functional unit is valid, check that there is no accidental mis-predict @@ -23,83 +24,62 @@ module branch_unit ( input logic branch_comp_res_i, // branch comparison result from ALU output logic [63:0] branch_result_o, - input branchpredict_sbe_t branch_predict_i, // this is the address we 
predicted - output bp_resolve_t resolved_branch_o, // this is the actual address we are targeting + input ariane_pkg::branchpredict_sbe_t branch_predict_i, // this is the address we predicted + output ariane_pkg::bp_resolve_t resolved_branch_o, // this is the actual address we are targeting output logic resolve_branch_o, // to ID to clear that we resolved the branch and we can // accept new entries to the scoreboard - output exception_t branch_exception_o // branch exception out + output ariane_pkg::exception_t branch_exception_o // branch exception out ); logic [63:0] target_address; logic [63:0] next_pc; - // here we handle the various possibilities of mis-predicts + // here we handle the various possibilities of mis-predicts always_comb begin : mispredict_handler // set the jump base, for JALR we need to look at the register, for all other control flow instructions we can take the current PC automatic logic [63:0] jump_base; - jump_base = (fu_data_i.operator == JALR) ? fu_data_i.operand_a : pc_i; + // TODO(zarubaf): The ALU can be used to calculate the branch target + jump_base = (fu_data_i.operator == ariane_pkg::JALR) ? fu_data_i.operand_a : pc_i; + target_address = 64'b0; resolve_branch_o = 1'b0; resolved_branch_o.target_address = 64'b0; resolved_branch_o.is_taken = 1'b0; resolved_branch_o.valid = branch_valid_i; resolved_branch_o.is_mispredict = 1'b0; - resolved_branch_o.cf_type = branch_predict_i.cf_type; + resolved_branch_o.cf_type = branch_predict_i.cf; // calculate next PC, depending on whether the instruction is compressed or not this may be different + // TODO(zarubaf): We already calculate this a couple of times, maybe re-use? next_pc = pc_i + ((is_compressed_instr_i) ? 
64'h2 : 64'h4); // calculate target address simple 64 bit addition target_address = $unsigned($signed(jump_base) + $signed(fu_data_i.imm)); // on a JALR we are supposed to reset the LSB to 0 (according to the specification) - if (fu_data_i.operator == JALR) - target_address[0] = 1'b0; - // if we need to put the branch target address in a destination register, output it here to WB + if (fu_data_i.operator == ariane_pkg::JALR) target_address[0] = 1'b0; + // we need to put the branch target address into rd, this is the result of this unit branch_result_o = next_pc; - - // save PC - we need this to get the target row in the branch target buffer - // we play this trick with the branch instruction which wraps a word boundary: - // /---------- Place the prediction on this PC - // \/ - // ____________________________________________________ - // |branch [15:0] | branch[31:16] | compressed 1[15:0] | - // |____________________________________________________ - // This will relief the pre-fetcher to re-fetch partially fetched unaligned branch instructions e.g.: - // we don't have a back arch between the pre-fetcher and decoder/instruction FIFO. - resolved_branch_o.pc = (is_compressed_instr_i || pc_i[1] == 1'b0) ? pc_i : ({pc_i[63:2], 2'b0} + 64'h4); - + resolved_branch_o.pc = pc_i; + // There are only two sources of mispredicts: + // 1. Branches + // 2. Jumps to register addresses if (branch_valid_i) begin - // write target address which goes to pc gen + // write target address which goes to PC Gen resolved_branch_o.target_address = (branch_comp_res_i) ? 
target_address : next_pc; - resolved_branch_o.is_taken = branch_comp_res_i; - // we've detected a branch in ID with the following parameters - // we mis-predicted e.g.: the predicted address is unequal to the actual address - if (target_address[0] == 1'b0) begin - // we've got a valid branch prediction - if (branch_predict_i.valid) begin - // if the outcome doesn't match we've got a mis-predict - if (branch_predict_i.predict_taken != branch_comp_res_i) begin - resolved_branch_o.is_mispredict = 1'b1; - end - // check if the address of the predict taken branch is correct - if (branch_predict_i.predict_taken && target_address != branch_predict_i.predict_address) begin - resolved_branch_o.is_mispredict = 1'b1; - end - // branch-prediction didn't do anything (e.g.: it fetched PC + 2/4), so if this branch is taken - // we also have a mis-predict - end else begin - if (branch_comp_res_i) begin - resolved_branch_o.is_mispredict = 1'b1; - end - end + resolved_branch_o.is_taken = branch_comp_res_i; + // check the outcome of the branch speculation + if (ariane_pkg::is_branch(fu_data_i.operator) && branch_comp_res_i != (branch_predict_i.cf == ariane_pkg::Branch)) begin + // we mis-predicted the outcome + // if the outcome doesn't match we've got a mis-predict + resolved_branch_o.is_mispredict = 1'b1; + resolved_branch_o.cf_type = ariane_pkg::Branch; + end + if (fu_data_i.operator == ariane_pkg::JALR + // check if the address of the jump register is correct and that we actually predicted + && (branch_predict_i.cf == ariane_pkg::NoCF || target_address != branch_predict_i.predict_address)) begin + resolved_branch_o.is_mispredict = 1'b1; + // update BTB only if this wasn't a return + if (branch_predict_i.cf != ariane_pkg::Return) resolved_branch_o.cf_type = ariane_pkg::JumpR; end // to resolve the branch in ID resolve_branch_o = 1'b1; - // the other case would be that this instruction was no branch but branch prediction thought that it was one - // this is essentially also a 
mis-predict - end else if (fu_valid_i && branch_predict_i.valid && branch_predict_i.predict_taken) begin - // re-set the branch to the next PC - resolved_branch_o.is_mispredict = 1'b1; - resolved_branch_o.target_address = next_pc; - resolved_branch_o.valid = 1'b1; - resolve_branch_o = 1'b1; end end // use ALU exception signal for storing instruction fetch exceptions if @@ -109,7 +89,6 @@ module branch_unit ( branch_exception_o.valid = 1'b0; branch_exception_o.tval = pc_i; // only throw exception if this is indeed a branch - if (branch_valid_i && target_address[0] != 1'b0) - branch_exception_o.valid = 1'b1; + if (branch_valid_i && target_address[0] != 1'b0) branch_exception_o.valid = 1'b1; end endmodule diff --git a/src/ex_stage.sv b/src/ex_stage.sv index e255bdfc..7ef8093b 100644 --- a/src/ex_stage.sv +++ b/src/ex_stage.sv @@ -21,6 +21,7 @@ module ex_stage #( input logic clk_i, // Clock input logic rst_ni, // Asynchronous reset active low input logic flush_i, + input logic debug_mode_i, input fu_data_t fu_data_i, input logic [63:0] pc_i, // PC of current instruction @@ -143,6 +144,9 @@ module ex_stage #( // we don't silence the branch unit as this is already critical and we do // not want to add another layer of logic branch_unit branch_unit_i ( + .clk_i, + .rst_ni, + .debug_mode_i, .fu_data_i, .pc_i, .is_compressed_instr_i, diff --git a/src/frontend/bht.sv b/src/frontend/bht.sv index a49f007b..5b9d67b8 100644 --- a/src/frontend/bht.sv +++ b/src/frontend/bht.sv @@ -1,4 +1,4 @@ -//Copyright (C) 2018 to present, +// Copyright 2018 - 2019 ETH Zurich and University of Bologna. // Copyright and related rights are licensed under the Solderpad Hardware // License, Version 2.0 (the "License"); you may not use this file except in // compliance with the License. 
You may obtain a copy of the License at @@ -6,7 +6,8 @@ // or agreed to in writing, software, hardware and materials distributed under // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License.// +// specific language governing permissions and limitations under the License. +// // Author: Florian Zaruba, ETH Zurich // Date: 08.02.2018 // Migrated: Luis Vitorio Cargnini, IEEE @@ -20,65 +21,81 @@ module bht #( input logic rst_ni, input logic flush_i, input logic debug_mode_i, - input logic [63:0] vpc_i, input ariane_pkg::bht_update_t bht_update_i, - output ariane_pkg::bht_prediction_t bht_prediction_o + // we potentially need INSTR_PER_FETCH predictions/cycle + output ariane_pkg::bht_prediction_t [ariane_pkg::INSTR_PER_FETCH-1:0] bht_prediction_o ); - localparam OFFSET = 2; // we are using compressed instructions so do not use the lower 2 bits for prediction - localparam ANTIALIAS_BITS = 8; + // the last bit is always zero, we don't need it for indexing + localparam OFFSET = 1; + // re-shape the branch history table + localparam NR_ROWS = NR_ENTRIES / ariane_pkg::INSTR_PER_FETCH; + // number of bits needed to index the row + localparam ROW_ADDR_BITS = $clog2(ariane_pkg::INSTR_PER_FETCH); // number of bits we should use for prediction - localparam PREDICTION_BITS = $clog2(NR_ENTRIES) + OFFSET; + localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS; + // we are not interested in all bits of the address + unread i_unread (.d_i(|vpc_i)); struct packed { logic valid; logic [1:0] saturation_counter; - } bht_d[NR_ENTRIES-1:0], bht_q[NR_ENTRIES-1:0]; + } bht_d[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0], bht_q[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0]; + + logic [$clog2(NR_ROWS)-1:0] index, update_pc; + logic [ROW_ADDR_BITS-1:0] update_row_index; + logic [1:0] saturation_counter; - 
logic [$clog2(NR_ENTRIES)-1:0] index, update_pc; - logic [1:0] saturation_counter; + assign index = vpc_i[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET]; + assign update_pc = bht_update_i.pc[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET]; + assign update_row_index = bht_update_i.pc[ROW_ADDR_BITS + OFFSET - 1:OFFSET]; - assign index = vpc_i[PREDICTION_BITS - 1:OFFSET]; - assign update_pc = bht_update_i.pc[PREDICTION_BITS - 1:OFFSET]; // prediction assignment - assign bht_prediction_o.valid = bht_q[index].valid; - assign bht_prediction_o.taken = bht_q[index].saturation_counter == 2'b10; - assign bht_prediction_o.strongly_taken = (bht_q[index].saturation_counter == 2'b11); + for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_bht_output + assign bht_prediction_o[i].valid = bht_q[index][i].valid; + assign bht_prediction_o[i].taken = bht_q[index][i].saturation_counter[1] == 1'b1; + end + always_comb begin : update_bht bht_d = bht_q; - saturation_counter = bht_q[update_pc].saturation_counter; + saturation_counter = bht_q[update_pc][update_row_index].saturation_counter; if (bht_update_i.valid && !debug_mode_i) begin - bht_d[update_pc].valid = 1'b1; + bht_d[update_pc][update_row_index].valid = 1'b1; if (saturation_counter == 2'b11) begin // we can safely decrease it - if (~bht_update_i.taken) - bht_d[update_pc].saturation_counter = saturation_counter - 1; + if (!bht_update_i.taken) + bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1; // then check if it saturated in the negative regime e.g.: branch not taken end else if (saturation_counter == 2'b00) begin // we can safely increase it if (bht_update_i.taken) - bht_d[update_pc].saturation_counter = saturation_counter + 1; + bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1; end else begin // otherwise we are not in any boundaries and can decrease or increase it if (bht_update_i.taken) - bht_d[update_pc].saturation_counter = saturation_counter + 1; + 
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1; else - bht_d[update_pc].saturation_counter = saturation_counter - 1; + bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1; end end end always_ff @(posedge clk_i or negedge rst_ni) begin - if (~rst_ni) begin - for (int unsigned i = 0; i < NR_ENTRIES; i++) - bht_q[i] <= '0; + if (!rst_ni) begin + for (int unsigned i = 0; i < NR_ENTRIES; i++) begin + for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin + bht_q[i][j] <= '0; + end + end end else begin // evict all entries if (flush_i) begin for (int i = 0; i < NR_ENTRIES; i++) begin - bht_q[i].valid <= 1'b0; - bht_q[i].saturation_counter <= 2'b10; + for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin + bht_q[i][j].valid <= 1'b0; + bht_q[i][j].saturation_counter <= 2'b10; + end end end else begin bht_q <= bht_d; diff --git a/src/frontend/btb.sv b/src/frontend/btb.sv index 3c4ebc87..cbd4309d 100644 --- a/src/frontend/btb.sv +++ b/src/frontend/btb.sv @@ -1,4 +1,4 @@ -//Copyright (C) 2018 to present, +// Copyright 2018 - 2019 ETH Zurich and University of Bologna. // Copyright and related rights are licensed under the Solderpad Hardware // License, Version 2.0 (the "License"); you may not use this file except in // compliance with the License. 
You may obtain a copy of the License at @@ -13,10 +13,6 @@ // Migrated: Luis Vitorio Cargnini, IEEE // Date: 09.06.2018 -// ------------------------------ -// Branch Prediction -// ------------------------------ - // branch target buffer module btb #( parameter int NR_ENTRIES = 8 @@ -28,23 +24,36 @@ module btb #( input logic [63:0] vpc_i, // virtual PC from IF stage input ariane_pkg::btb_update_t btb_update_i, // update btb with this information - output ariane_pkg::btb_prediction_t btb_prediction_o // prediction from btb + output ariane_pkg::btb_prediction_t [ariane_pkg::INSTR_PER_FETCH-1:0] btb_prediction_o // prediction from btb ); - // number of bits which are not used for indexing - localparam OFFSET = 1; // we are using compressed instructions so do use the lower 2 bits for prediction - localparam ANTIALIAS_BITS = 8; + // the last bit is always zero, we don't need it for indexing + localparam OFFSET = 1; + // re-shape the branch history table + localparam NR_ROWS = NR_ENTRIES / ariane_pkg::INSTR_PER_FETCH; + // number of bits needed to index the row + localparam ROW_ADDR_BITS = $clog2(ariane_pkg::INSTR_PER_FETCH); // number of bits we should use for prediction - localparam PREDICTION_BITS = $clog2(NR_ENTRIES) + OFFSET; + localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS; + // prevent aliasing to degrade performance + localparam ANTIALIAS_BITS = 8; + // we are not interested in all bits of the address + unread i_unread (.d_i(|vpc_i)); + // typedef for all branch target entries // we may want to try to put a tag field that fills the rest of the PC in-order to mitigate aliasing effects - ariane_pkg::btb_prediction_t btb_d [NR_ENTRIES-1:0], btb_q [NR_ENTRIES-1:0]; - logic [$clog2(NR_ENTRIES)-1:0] index, update_pc; + ariane_pkg::btb_prediction_t btb_d [NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0], + btb_q [NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0]; + logic [$clog2(NR_ROWS)-1:0] index, update_pc; + logic [ROW_ADDR_BITS-1:0] 
update_row_index; - assign index = vpc_i[PREDICTION_BITS - 1:OFFSET]; - assign update_pc = btb_update_i.pc[PREDICTION_BITS - 1:OFFSET]; + assign index = vpc_i[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET]; + assign update_pc = btb_update_i.pc[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET]; + assign update_row_index = btb_update_i.pc[ROW_ADDR_BITS + OFFSET - 1:OFFSET]; // output matching prediction - assign btb_prediction_o = btb_q[index]; + for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_btb_output + assign btb_prediction_o[i] = btb_q[index][i]; // workaround + end // ------------------------- // Update Branch Prediction @@ -54,23 +63,25 @@ module btb #( btb_d = btb_q; if (btb_update_i.valid && !debug_mode_i) begin - btb_d[update_pc].valid = 1'b1; + btb_d[update_pc][update_row_index].valid = 1'b1; // the target address is simply updated - btb_d[update_pc].target_address = btb_update_i.target_address; + btb_d[update_pc][update_row_index].target_address = btb_update_i.target_address; end end // sequential process always_ff @(posedge clk_i or negedge rst_ni) begin - if (~rst_ni) begin + if (!rst_ni) begin // Bias the branches to be taken upon first arrival - for (int i = 0; i < NR_ENTRIES; i++) + for (int i = 0; i < NR_ROWS; i++) btb_q[i] <= '{default: 0}; end else begin // evict all entries if (flush_i) begin - for (int i = 0; i < NR_ENTRIES; i++) begin - btb_q[i].valid <= 1'b0; + for (int i = 0; i < NR_ROWS; i++) begin + for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin + btb_q[i][j].valid <= 1'b0; + end end end else begin btb_q <= btb_d; diff --git a/src/frontend/frontend.sv b/src/frontend/frontend.sv index 8518b261..18a41f1f 100644 --- a/src/frontend/frontend.sv +++ b/src/frontend/frontend.sv @@ -11,61 +11,66 @@ // Author: Florian Zaruba, ETH Zurich // Date: 08.02.2018 // Description: Ariane Instruction Fetch Frontend - - +// +// This module interfaces with the instruction cache, handles control +// change request from the back-end and 
does branch prediction. import ariane_pkg::*; module frontend #( - parameter logic [63:0] DmBaseAddress = 64'h0 // debug module base address + parameter ariane_pkg::ariane_cfg_t ArianeCfg = ariane_pkg::ArianeDefaultConfig ) ( - input logic clk_i, // Clock - input logic rst_ni, // Asynchronous reset active low - input logic flush_i, // flush request for PCGEN - input logic flush_bp_i, // flush branch prediction - input logic debug_mode_i, - // global input - input logic [63:0] boot_addr_i, - // Set a new PC - // mispredict - input bp_resolve_t resolved_branch_i, // from controller signaling a branch_predict -> update BTB - // from commit, when flushing the whole pipeline - input logic set_pc_commit_i, // Take the PC from commit stage - input logic [63:0] pc_commit_i, // PC of instruction in commit stage - // CSR input - input logic [63:0] epc_i, // exception PC which we need to return to - input logic eret_i, // return from exception - input logic [63:0] trap_vector_base_i, // base of trap vector - input logic ex_valid_i, // exception is valid - from commit - input logic set_debug_pc_i, // jump to debug address - // Instruction Fetch - input icache_dreq_o_t icache_dreq_i, - output icache_dreq_i_t icache_dreq_o, - // instruction output port -> to processor back-end - output frontend_fetch_t fetch_entry_o, // fetch entry containing all relevant data for the ID stage - output logic fetch_entry_valid_o, // instruction in IF is valid - input logic fetch_ack_i // ID acknowledged this instruction + input logic clk_i, // Clock + input logic rst_ni, // Asynchronous reset active low + input logic flush_i, // flush request for PCGEN + input logic flush_bp_i, // flush branch prediction + input logic debug_mode_i, + // global input + input logic [63:0] boot_addr_i, + // Set a new PC + // mispredict + input bp_resolve_t resolved_branch_i, // from controller signaling a branch_predict -> update BTB + // from commit, when flushing the whole pipeline + input logic set_pc_commit_i, 
// Take the PC from commit stage + input logic [63:0] pc_commit_i, // PC of instruction in commit stage + // CSR input + input logic [63:0] epc_i, // exception PC which we need to return to + input logic eret_i, // return from exception + input logic [63:0] trap_vector_base_i, // base of trap vector + input logic ex_valid_i, // exception is valid - from commit + input logic set_debug_pc_i, // jump to debug address + // Instruction Fetch + output icache_dreq_i_t icache_dreq_o, + input icache_dreq_o_t icache_dreq_i, + // instruction output port -> to processor back-end + output fetch_entry_t fetch_entry_o, // fetch entry containing all relevant data for the ID stage + output logic fetch_entry_valid_o, // instruction in IF is valid + input logic fetch_entry_ready_i // ID acknowledged this instruction ); - // Registers - logic [31:0] icache_data_q; - logic icache_valid_q; - logic icache_ex_valid_q; - logic instruction_valid; - logic [INSTR_PER_FETCH-1:0] instr_is_compressed; - - logic [63:0] icache_vaddr_q; - // BHT, BTB and RAS prediction - bht_prediction_t bht_prediction; - btb_prediction_t btb_prediction; - ras_t ras_predict; - bht_update_t bht_update; - btb_update_t btb_update; - logic ras_push, ras_pop; - logic [63:0] ras_update; - + // Instruction Cache Registers, from I$ + logic [FETCH_WIDTH-1:0] icache_data_q; + logic icache_valid_q; + logic icache_ex_valid_q; + logic [63:0] icache_vaddr_q; + logic instr_queue_ready; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_queue_consumed; + // upper-most branch-prediction from last cycle + btb_prediction_t btb_q; + bht_prediction_t bht_q; // instruction fetch is ready logic if_ready; logic [63:0] npc_d, npc_q; // next PC - logic npc_rst_load_q; //indicates whether we come out of reset (then we need to load boot_addr_i) + + // indicates whether we come out of reset (then we need to load boot_addr_i) + logic npc_rst_load_q; + + logic replay; + logic [63:0] replay_addr; + + // shift amount + logic 
[$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] shamt; + // address will always be 16 bit aligned, make this explicit here + assign shamt = icache_dreq_i.vaddr[$clog2(ariane_pkg::INSTR_PER_FETCH):1]; + // ----------------------- // Ctrl Flow Speculation // ----------------------- @@ -74,209 +79,185 @@ module frontend #( rvi_jalr, rvi_jump; logic [INSTR_PER_FETCH-1:0][63:0] rvi_imm; // RVC branching - logic [INSTR_PER_FETCH-1:0] is_rvc; logic [INSTR_PER_FETCH-1:0] rvc_branch, rvc_jump, rvc_jr, rvc_return, rvc_jalr, rvc_call; logic [INSTR_PER_FETCH-1:0][63:0] rvc_imm; // re-aligned instruction and address (coming from cache - combinationally) logic [INSTR_PER_FETCH-1:0][31:0] instr; logic [INSTR_PER_FETCH-1:0][63:0] addr; + logic [INSTR_PER_FETCH-1:0] instruction_valid; + // BHT, BTB and RAS prediction + bht_prediction_t [INSTR_PER_FETCH-1:0] bht_prediction; + btb_prediction_t [INSTR_PER_FETCH-1:0] btb_prediction; + bht_prediction_t [INSTR_PER_FETCH-1:0] bht_prediction_shifted; + btb_prediction_t [INSTR_PER_FETCH-1:0] btb_prediction_shifted; + ras_t ras_predict; - logic [63:0] bp_vaddr; - logic bp_valid; // we have a valid branch-prediction - logic is_mispredict; - // branch-prediction which we inject into the pipeline - branchpredict_sbe_t bp_sbe; - // fetch fifo credit system - logic fifo_valid, fifo_ready, fifo_empty, fifo_pop; - logic s2_eff_kill, issue_req, s2_in_flight_d, s2_in_flight_q; - logic [$clog2(FETCH_FIFO_DEPTH):0] fifo_credits_d; - logic [$clog2(FETCH_FIFO_DEPTH):0] fifo_credits_q; - - // save the unaligned part of the instruction to this ff - logic [15:0] unaligned_instr_d, unaligned_instr_q; - // the last instruction was unaligned - logic unaligned_d, unaligned_q; - // register to save the unaligned address - logic [63:0] unaligned_address_d, unaligned_address_q; + // branch-predict update + logic is_mispredict; + logic ras_push, ras_pop; + logic [63:0] ras_update; - for (genvar i = 0; i < INSTR_PER_FETCH; i ++) begin - // LSB != 2'b11 - assign 
instr_is_compressed[i] = ~&icache_data_q[i * 16 +: 2]; + // Instruction FIFO + logic [63:0] predict_address; + cf_t [ariane_pkg::INSTR_PER_FETCH-1:0] cf_type; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] taken_rvi_cf; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] taken_rvc_cf; + + logic serving_unaligned; + // Re-align instructions + instr_realign i_instr_realign ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .flush_i ( icache_dreq_o.kill_s2 ), + .valid_i ( icache_valid_q ), + .serving_unaligned_o ( serving_unaligned ), + .address_i ( icache_vaddr_q ), + .data_i ( icache_data_q ), + .valid_o ( instruction_valid ), + .addr_o ( addr ), + .instr_o ( instr ) + ); + // -------------------- + // Branch Prediction + // -------------------- + // select the right branch prediction result + // in case we are serving an unaligned instruction in instr[0] we need to take + // the prediction we saved from the previous fetch + assign bht_prediction_shifted[0] = (serving_unaligned) ? bht_q : bht_prediction[0]; + assign btb_prediction_shifted[0] = (serving_unaligned) ? 
btb_q : btb_prediction[0]; + // for all other predictions we can use the generated address to index + // into the branch prediction data structures + for (genvar i = 1; i < INSTR_PER_FETCH; i++) begin : gen_prediction_address + assign bht_prediction_shifted[i] = bht_prediction[addr[i][$clog2(INSTR_PER_FETCH):1]]; + assign btb_prediction_shifted[i] = btb_prediction[addr[i][$clog2(INSTR_PER_FETCH):1]]; end + // for the return address stack it doesn't matter as we have the + // address of the call/return already + logic bp_valid; - // Soft-realignment to do branch-prediction - always_comb begin : re_align - unaligned_d = unaligned_q; - unaligned_address_d = unaligned_address_q; - unaligned_instr_d = unaligned_instr_q; - instruction_valid = icache_valid_q; - - // 32-bit can contain 2 instructions - instr[0] = icache_data_q; - addr[0] = icache_vaddr_q; - - instr[1] = '0; - addr[1] = {icache_vaddr_q[63:2], 2'b10}; - - if (icache_valid_q) begin - // last instruction was unaligned - if (unaligned_q) begin - instr[0] = {icache_data_q[15:0], unaligned_instr_q}; - addr[0] = unaligned_address_q; + logic [INSTR_PER_FETCH-1:0] is_branch; + logic [INSTR_PER_FETCH-1:0] is_call; + logic [INSTR_PER_FETCH-1:0] is_jump; + logic [INSTR_PER_FETCH-1:0] is_return; + logic [INSTR_PER_FETCH-1:0] is_jalr; - unaligned_address_d = {icache_vaddr_q[63:2], 2'b10}; - unaligned_instr_d = icache_data_q[31:16]; // save the upper bits for next cycle - - // check if this is instruction is still unaligned e.g.: it is not compressed - // if its compressed re-set unaligned flag - // for 32 bit we can simply check the next instruction and whether it is compressed or not - // if it is compressed the next fetch will contain an aligned instruction - if (instr_is_compressed[1]) begin - unaligned_d = 1'b0; - instr[1] = {16'b0, icache_data_q[31:16]}; - end - end else if (instr_is_compressed[0]) begin // instruction zero is RVC - // is instruction 1 also compressed - // yes?
-> no problem, no -> we've got an unaligned instruction - if (instr_is_compressed[1]) begin - instr[1] = {16'b0, icache_data_q[31:16]}; - end else begin - unaligned_instr_d = icache_data_q[31:16]; - unaligned_address_d = {icache_vaddr_q[63:2], 2'b10}; - unaligned_d = 1'b1; - end - end // else -> normal fetch - end - - // we started to fetch on a unaligned boundary with a whole instruction -> wait until we've - // received the next instruction - if (icache_valid_q && icache_vaddr_q[1] && !instr_is_compressed[1]) begin - instruction_valid = 1'b0; - unaligned_d = 1'b1; - unaligned_address_d = {icache_vaddr_q[63:2], 2'b10}; - unaligned_instr_d = icache_data_q[31:16]; - end - - // if we killed the consecutive fetch we are starting on a clean slate - if (icache_dreq_o.kill_s2) begin - unaligned_d = 1'b0; - end + for (genvar i = 0; i < INSTR_PER_FETCH; i++) begin + // branch history table -> BHT + assign is_branch[i] = instruction_valid[i] & (rvi_branch[i] | rvc_branch[i]); + // function calls -> RAS + assign is_call[i] = instruction_valid[i] & (rvi_call[i] | rvc_call[i]); + // function return -> RAS + assign is_return[i] = instruction_valid[i] & (rvi_return[i] | rvc_return[i]); + // unconditional jumps with known target -> immediately resolved + assign is_jump[i] = instruction_valid[i] & (rvi_jump[i] | rvc_jump[i]); + // unconditional jumps with unknown target -> BTB + assign is_jalr[i] = instruction_valid[i] & ~is_return[i] & ~is_call[i] & (rvi_jalr[i] | rvc_jalr[i] | rvc_jr[i]); end - - logic [INSTR_PER_FETCH:0] taken; - // control front-end + branch-prediction - always_comb begin : frontend_ctrl - automatic logic take_rvi_cf; // take the control flow change (non-compressed) - automatic logic take_rvc_cf; // take the control flow change (compressed) - - take_rvi_cf = 1'b0; - take_rvc_cf = 1'b0; - ras_pop = 1'b0; - ras_push = 1'b0; - ras_update = '0; - taken = '0; - take_rvi_cf = 1'b0; - - bp_vaddr = '0; // predicted address - bp_valid = 1'b0; // prediction is valid - - 
bp_sbe.cf_type = RAS; - - // only predict if the response is valid - if (instruction_valid) begin - // look at instruction 0, 1, 2, ... - for (int unsigned i = 0; i < INSTR_PER_FETCH; i++) begin - // only speculate if the previous instruction was not taken - if (!taken[i]) begin - // function call - ras_push = rvi_call[i] | rvc_call[i]; - ras_update = addr[i] + (rvc_call[i] ? 2 : 4); - - // Branch Prediction - **speculative** - if (rvi_branch[i] || rvc_branch[i]) begin - bp_sbe.cf_type = BHT; - // dynamic prediction valid? - if (bht_prediction.valid) begin - take_rvi_cf = rvi_branch[i] & (bht_prediction.taken | bht_prediction.strongly_taken); - take_rvc_cf = rvc_branch[i] & (bht_prediction.taken | bht_prediction.strongly_taken); - // default to static prediction - end else begin - // set if immediate is negative - static prediction - take_rvi_cf = rvi_branch[i] & rvi_imm[i][63]; - take_rvc_cf = rvc_branch[i] & rvc_imm[i][63]; - end - end - - // unconditional jumps - if (rvi_jump[i] || rvc_jump[i]) begin - take_rvi_cf = rvi_jump[i]; - take_rvc_cf = rvc_jump[i]; - end - - // to take this jump we need a valid prediction target **speculative** - if ((rvi_jalr[i] || rvc_jalr[i]) && ~(rvi_call[i] || rvc_call[i])) begin - bp_sbe.cf_type = BTB; - if (btb_prediction.valid) begin - bp_vaddr = btb_prediction.target_address; - taken[i+1] = 1'b1; - end - end - - // is it a return and the RAS contains a valid prediction? 
**speculative** - if ((rvi_return[i] || rvc_return[i]) && ras_predict.valid) begin - bp_vaddr = ras_predict.ra; - ras_pop = 1'b1; - taken[i+1] = 1'b1; - bp_sbe.cf_type = RAS; - end - - if (take_rvi_cf) begin - taken[i+1] = 1'b1; - bp_vaddr = addr[i] + rvi_imm[i]; - end - - if (take_rvc_cf) begin - taken[i+1] = 1'b1; - bp_vaddr = addr[i] + rvc_imm[i]; - end - - // we are not interested in the lower instruction - if (icache_vaddr_q[1]) begin - taken[1] = 1'b0; - // TODO(zarubaf): that seems to be overly pessimistic - ras_pop = 1'b0; - ras_push = 1'b0; - end - end + // taken/not taken + always_comb begin + taken_rvi_cf = '0; + taken_rvc_cf = '0; + predict_address = '0; + + for (int i = 0; i < INSTR_PER_FETCH; i++) cf_type[i] = ariane_pkg::NoCF; + + ras_push = 1'b0; + ras_pop = 1'b0; + ras_update = '0; + + // lower-most prediction gets precedence + for (int i = INSTR_PER_FETCH - 1; i >= 0 ; i--) begin + unique case ({is_branch[i], is_return[i], is_jump[i], is_jalr[i]}) + 4'b0000:; // regular instruction e.g.: no branch + // unconditional jump to register, we need the BTB to resolve this + 4'b0001: begin + ras_pop = 1'b0; + ras_push = 1'b0; + if (btb_prediction_shifted[i].valid) begin + predict_address = btb_prediction_shifted[i].target_address; + cf_type[i] = ariane_pkg::JumpR; end - end - - bp_valid = |taken; - // assemble scoreboard entry - bp_sbe.valid = bp_valid; - bp_sbe.predict_address = bp_vaddr; - bp_sbe.predict_taken = bp_valid; + end + // it's an unconditional jump to an immediate + 4'b0010: begin + ras_pop = 1'b0; + ras_push = 1'b0; + taken_rvi_cf[i] = rvi_jump[i]; + taken_rvc_cf[i] = rvc_jump[i]; + cf_type[i] = ariane_pkg::Jump; + end + // return + 4'b0100: begin + // make sure to only alter the RAS if we actually consumed the instruction + ras_pop = ras_predict.valid & instr_queue_consumed[i]; + ras_push = 1'b0; + predict_address = ras_predict.ra; + cf_type[i] = ariane_pkg::Return; + end + // branch prediction + 4'b1000: begin + ras_pop = 1'b0; + ras_push =
1'b0; + // if we have a valid dynamic prediction use it + if (bht_prediction_shifted[i].valid) begin + taken_rvi_cf[i] = rvi_branch[i] & bht_prediction_shifted[i].taken; + taken_rvc_cf[i] = rvc_branch[i] & bht_prediction_shifted[i].taken; + // otherwise default to static prediction + end else begin + // set if immediate is negative - static prediction + taken_rvi_cf[i] = rvi_branch[i] & rvi_imm[i][63]; + taken_rvc_cf[i] = rvc_branch[i] & rvc_imm[i][63]; + end + if (taken_rvi_cf[i] || taken_rvc_cf[i]) cf_type[i] = ariane_pkg::Branch; + end + default:; + // default: $error("Decoded more than one control flow"); + endcase + // if this instruction, in addition, is a call, save the resulting address + // but only if we actually consumed the address + if (is_call[i]) begin + ras_push = instr_queue_consumed[i]; + ras_update = addr[i] + (rvc_call[i] ? 2 : 4); + end + // calculate the jump target address + if (taken_rvc_cf[i] || taken_rvi_cf[i]) begin + predict_address = addr[i] + (taken_rvc_cf[i] ? rvc_imm[i] : rvi_imm[i]); + end + end + end + // or reduce struct + always_comb begin + bp_valid = 1'b0; + for (int i = 0; i < INSTR_PER_FETCH; i++) bp_valid |= (cf_type[i] != NoCF); end - assign is_mispredict = resolved_branch_i.valid & resolved_branch_i.is_mispredict; - // we mis-predicted so kill the icache request and the fetch queue - assign icache_dreq_o.kill_s1 = is_mispredict | flush_i; - // if we have a valid branch-prediction we need to kill the last cache request + + // Cache interface + assign icache_dreq_o.req = instr_queue_ready; + assign if_ready = icache_dreq_i.ready & instr_queue_ready; + // We need to flush the cache pipeline if: + // 1. We mispredicted + // 2. Want to flush the whole processor front-end + // 3. 
Need to replay an instruction because the fetch-fifo was full + assign icache_dreq_o.kill_s1 = is_mispredict | flush_i | replay; + // if we have a valid branch-prediction we need to only kill the last cache request + // also if we killed the first stage we also need to kill the second stage (inclusive flush) assign icache_dreq_o.kill_s2 = icache_dreq_o.kill_s1 | bp_valid; - assign fifo_valid = icache_valid_q; - // ---------------------------------------- // Update Control Flow Predictions - // ---------------------------------------- - // BHT - assign bht_update.valid = resolved_branch_i.valid & (resolved_branch_i.cf_type == BHT); + bht_update_t bht_update; + btb_update_t btb_update; + + assign bht_update.valid = resolved_branch_i.valid + & (resolved_branch_i.cf_type == ariane_pkg::Branch); assign bht_update.pc = resolved_branch_i.pc; - assign bht_update.mispredict = resolved_branch_i.is_mispredict; assign bht_update.taken = resolved_branch_i.is_taken; - // BTB - assign btb_update.valid = resolved_branch_i.valid & (resolved_branch_i.cf_type == BTB); + // only update mispredicted branches e.g. no returns from the RAS + assign btb_update.valid = resolved_branch_i.valid + & resolved_branch_i.is_mispredict + & (resolved_branch_i.cf_type == ariane_pkg::JumpR); assign btb_update.pc = resolved_branch_i.pc; assign btb_update.target_address = resolved_branch_i.target_address; @@ -284,7 +265,7 @@ module frontend #( // Next PC // ------------------- // next PC (NPC) can come from (in order of precedence): - // 0. Default assignment + // 0. Default assignment/replay instruction // 1. Branch Predict taken // 2. Control flow change request (misprediction) // 3. Return from environment call @@ -293,211 +274,160 @@ module frontend #( // Mis-predict handling is a little bit different // select PC a.k.a PC Gen always_comb begin : npc_select - automatic logic [63:0] fetch_address; - - // check whether we come out of reset - // this is a workaround. 
some tools have issues - // having boot_addr_i in the asynchronous - // reset assignment to npc_q, even though - // boot_addr_i will be assigned a constant - // on the top-level. - if (npc_rst_load_q) begin - npc_d = boot_addr_i; - fetch_address = boot_addr_i; - end else begin - fetch_address = npc_q; - // keep stable by default - npc_d = npc_q; - end - - // ------------------------------- - // 1. Branch Prediction - // ------------------------------- - if (bp_valid) begin - fetch_address = bp_vaddr; - npc_d = bp_vaddr; - end - // ------------------------------- - // 0. Default assignment - // ------------------------------- - if (if_ready) begin - npc_d = {fetch_address[63:2], 2'b0} + 'h4; - end - // ------------------------------- - // 2. Control flow change request - // ------------------------------- - if (is_mispredict) begin - npc_d = resolved_branch_i.target_address; - end - // ------------------------------- - // 3. Return from environment call - // ------------------------------- - if (eret_i) begin - npc_d = epc_i; - end - // ------------------------------- - // 4. Exception/Interrupt - // ------------------------------- - if (ex_valid_i) begin - npc_d = trap_vector_base_i; - end - // ----------------------------------------------- - // 5. Pipeline Flush because of CSR side effects - // ----------------------------------------------- - // On a pipeline flush start fetching from the next address - // of the instruction in the commit stage - if (set_pc_commit_i) begin - // we came here from a flush request of a CSR instruction or AMO, - // as CSR or AMO instructions do not exist in a compressed form - // we can unconditionally do PC + 4 here - // TODO(zarubaf) This adder can at least be merged with the one in the csr_regfile stage - npc_d = pc_commit_i + 64'h4; - end - // ------------------------------- - // 6. 
Debug - // ------------------------------- - // enter debug on a hard-coded base-address - if (set_debug_pc_i) begin - npc_d = DmBaseAddress + dm::HaltAddress; - end - - icache_dreq_o.vaddr = fetch_address; + automatic logic [63:0] fetch_address; + // check whether we come out of reset + // this is a workaround. some tools have issues + // having boot_addr_i in the asynchronous + // reset assignment to npc_q, even though + // boot_addr_i will be assigned a constant + // on the top-level. + if (npc_rst_load_q) begin + npc_d = boot_addr_i; + fetch_address = boot_addr_i; + end else begin + fetch_address = npc_q; + // keep stable by default + npc_d = npc_q; + end + // 0. Branch Prediction + if (bp_valid) begin + fetch_address = predict_address; + npc_d = predict_address; + end + // 1. Default assignment + if (if_ready) npc_d = {fetch_address[63:2], 2'b0} + 'h4; + // 2. Replay instruction fetch + if (replay) npc_d = replay_addr; + // 3. Control flow change request + if (is_mispredict) npc_d = resolved_branch_i.target_address; + // 4. Return from environment call + if (eret_i) npc_d = epc_i; + // 5. Exception/Interrupt + if (ex_valid_i) npc_d = trap_vector_base_i; + // 6. Pipeline Flush because of CSR side effects + // On a pipeline flush start fetching from the next address + // of the instruction in the commit stage + // we came here from a flush request of a CSR instruction or AMO, + // as CSR or AMO instructions do not exist in a compressed form + // we can unconditionally do PC + 4 here + // TODO(zarubaf) This adder can at least be merged with the one in the csr_regfile stage + if (set_pc_commit_i) npc_d = pc_commit_i + 64'h4; + // 7. Debug + // enter debug on a hard-coded base-address + if (set_debug_pc_i) npc_d = ArianeCfg.DmBaseAddress + dm::HaltAddress; + icache_dreq_o.vaddr = fetch_address; end - // ------------------- - // Credit-based fetch FIFO flow ctrl - // ------------------- - assign fifo_credits_d = (flush_i) ? 
FETCH_FIFO_DEPTH : - fifo_credits_q + fifo_pop + s2_eff_kill - issue_req; - - // check whether there is a request in flight that is being killed now - // if this is the case, we need to increment the credit by 1 - assign s2_eff_kill = s2_in_flight_q & icache_dreq_o.kill_s2; - assign s2_in_flight_d = (flush_i) ? 1'b0 : - (issue_req) ? 1'b1 : - (icache_dreq_i.valid) ? 1'b0 : - s2_in_flight_q; - - // only enable counter if current request is not being killed - assign issue_req = if_ready & (~icache_dreq_o.kill_s1); - assign fifo_pop = fetch_ack_i & fetch_entry_valid_o; - assign fifo_ready = (|fifo_credits_q); - assign if_ready = icache_dreq_i.ready & fifo_ready; - assign icache_dreq_o.req = fifo_ready; - assign fetch_entry_valid_o = ~fifo_empty; - - -//pragma translate_off -`ifndef VERILATOR - fetch_fifo_credits0 : assert property ( - @(posedge clk_i) disable iff (~rst_ni) (fifo_credits_q <= FETCH_FIFO_DEPTH)) - else $fatal(1,"[frontend] fetch fifo credits must be <= FETCH_FIFO_DEPTH!"); - initial begin - assert (FETCH_FIFO_DEPTH <= 8) else $fatal(1,"[frontend] fetch fifo deeper than 8 not supported"); - assert (FETCH_WIDTH == 32) else $fatal(1,"[frontend] fetch width != not supported"); - end -`endif -//pragma translate_on + logic [FETCH_WIDTH-1:0] icache_data; + // re-align the cache line + assign icache_data = icache_dreq_i.data >> {shamt, 4'b0}; always_ff @(posedge clk_i or negedge rst_ni) begin - if (~rst_ni) begin - npc_q <= '0; - npc_rst_load_q <= 1'b1; - icache_data_q <= '0; - icache_valid_q <= 1'b0; - icache_vaddr_q <= 'b0; - icache_ex_valid_q <= 1'b0; - unaligned_q <= 1'b0; - unaligned_address_q <= '0; - unaligned_instr_q <= '0; - fifo_credits_q <= FETCH_FIFO_DEPTH; - s2_in_flight_q <= 1'b0; - end else begin - npc_rst_load_q <= 1'b0; - npc_q <= npc_d; - icache_data_q <= icache_dreq_i.data; - icache_valid_q <= icache_dreq_i.valid; - icache_vaddr_q <= icache_dreq_i.vaddr; - icache_ex_valid_q <= icache_dreq_i.ex.valid; - unaligned_q <= unaligned_d; - 
unaligned_address_q <= unaligned_address_d; - unaligned_instr_q <= unaligned_instr_d; - fifo_credits_q <= fifo_credits_d; - s2_in_flight_q <= s2_in_flight_d; + if (!rst_ni) begin + npc_rst_load_q <= 1'b1; + npc_q <= '0; + icache_data_q <= '0; + icache_valid_q <= 1'b0; + icache_vaddr_q <= 'b0; + icache_ex_valid_q <= 1'b0; + btb_q <= '0; + bht_q <= '0; + end else begin + npc_rst_load_q <= 1'b0; + npc_q <= npc_d; + icache_valid_q <= icache_dreq_i.valid; + if (icache_dreq_i.valid) begin + icache_data_q <= icache_data; + icache_vaddr_q <= icache_dreq_i.vaddr; + icache_ex_valid_q <= icache_dreq_i.ex; + // save the uppermost prediction + btb_q <= btb_prediction[INSTR_PER_FETCH-1]; + bht_q <= bht_prediction[INSTR_PER_FETCH-1]; end + end end ras #( - .DEPTH ( RAS_DEPTH ) + .DEPTH ( ArianeCfg.RASDepth ) ) i_ras ( - .clk_i, - .rst_ni, - .flush_i( flush_bp_i ), - .push_i ( ras_push ), - .pop_i ( ras_pop ), - .data_i ( ras_update ), - .data_o ( ras_predict ) + .clk_i, + .rst_ni, + .flush_i( flush_bp_i ), + .push_i ( ras_push ), + .pop_i ( ras_pop ), + .data_i ( ras_update ), + .data_o ( ras_predict ) ); btb #( - .NR_ENTRIES ( BTB_ENTRIES ) + .NR_ENTRIES ( ArianeCfg.BTBEntries ) ) i_btb ( - .clk_i, - .rst_ni, - .flush_i ( flush_bp_i ), - .debug_mode_i, - .vpc_i ( icache_vaddr_q ), - .btb_update_i ( btb_update ), - .btb_prediction_o ( btb_prediction ) + .clk_i, + .rst_ni, + .flush_i ( flush_bp_i ), + .debug_mode_i, + .vpc_i ( icache_vaddr_q ), + .btb_update_i ( btb_update ), + .btb_prediction_o ( btb_prediction ) ); bht #( - .NR_ENTRIES ( BHT_ENTRIES ) + .NR_ENTRIES ( ArianeCfg.BHTEntries ) ) i_bht ( - .clk_i, - .rst_ni, - .flush_i ( flush_bp_i ), - .debug_mode_i, - .vpc_i ( icache_vaddr_q ), - .bht_update_i ( bht_update ), - .bht_prediction_o ( bht_prediction ) + .clk_i, + .rst_ni, + .flush_i ( flush_bp_i ), + .debug_mode_i, + .vpc_i ( icache_vaddr_q ), + .bht_update_i ( bht_update ), + .bht_prediction_o ( bht_prediction ) ); - for (genvar i = 0; i < INSTR_PER_FETCH; i++) begin 
- instr_scan i_instr_scan ( - .instr_i ( instr[i] ), - .is_rvc_o ( is_rvc[i] ), - .rvi_return_o ( rvi_return[i] ), - .rvi_call_o ( rvi_call[i] ), - .rvi_branch_o ( rvi_branch[i] ), - .rvi_jalr_o ( rvi_jalr[i] ), - .rvi_jump_o ( rvi_jump[i] ), - .rvi_imm_o ( rvi_imm[i] ), - .rvc_branch_o ( rvc_branch[i] ), - .rvc_jump_o ( rvc_jump[i] ), - .rvc_jr_o ( rvc_jr[i] ), - .rvc_return_o ( rvc_return[i] ), - .rvc_jalr_o ( rvc_jalr[i] ), - .rvc_call_o ( rvc_call[i] ), - .rvc_imm_o ( rvc_imm[i] ) - ); + // we need to inspect up to INSTR_PER_FETCH instructions for branches + // and jumps + for (genvar i = 0; i < INSTR_PER_FETCH; i++) begin : gen_instr_scan + instr_scan i_instr_scan ( + .instr_i ( instr[i] ), + .rvi_return_o ( rvi_return[i] ), + .rvi_call_o ( rvi_call[i] ), + .rvi_branch_o ( rvi_branch[i] ), + .rvi_jalr_o ( rvi_jalr[i] ), + .rvi_jump_o ( rvi_jump[i] ), + .rvi_imm_o ( rvi_imm[i] ), + .rvc_branch_o ( rvc_branch[i] ), + .rvc_jump_o ( rvc_jump[i] ), + .rvc_jr_o ( rvc_jr[i] ), + .rvc_return_o ( rvc_return[i] ), + .rvc_jalr_o ( rvc_jalr[i] ), + .rvc_call_o ( rvc_call[i] ), + .rvc_imm_o ( rvc_imm[i] ) + ); end - fifo_v3 #( - .DEPTH ( 8 ), - .dtype ( frontend_fetch_t ) - ) i_fetch_fifo ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .flush_i ( flush_i ), - .testmode_i ( 1'b0 ), - .full_o ( ), - .empty_o ( fifo_empty ), - .usage_o ( ), - .data_i ( {icache_vaddr_q, icache_data_q, bp_sbe, taken[INSTR_PER_FETCH:1], icache_ex_valid_q} ), - .push_i ( fifo_valid ), - .data_o ( fetch_entry_o ), - .pop_i ( fifo_pop ) + instr_queue i_instr_queue ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .flush_i ( flush_i ), + .instr_i ( instr ), // from re-aligner + .addr_i ( addr ), // from re-aligner + .exception_i ( icache_ex_valid_q ), // from I$ + .predict_address_i ( predict_address ), + .cf_type_i ( cf_type ), + .valid_i ( instruction_valid ), // from re-aligner + .consumed_o ( instr_queue_consumed ), + .ready_o ( instr_queue_ready ), + .replay_o ( replay ), + .replay_addr_o ( 
replay_addr ), + .fetch_entry_o ( fetch_entry_o ), // to back-end + .fetch_entry_valid_o ( fetch_entry_valid_o ), // to back-end + .fetch_entry_ready_i ( fetch_entry_ready_i ) // to back-end ); + // pragma translate_off + `ifndef VERILATOR + initial begin + assert (FETCH_WIDTH == 32 || FETCH_WIDTH == 64) else $fatal("[frontend] fetch width != not supported"); + end + `endif + // pragma translate_on endmodule diff --git a/src/frontend/instr_queue.sv b/src/frontend/instr_queue.sv new file mode 100644 index 00000000..57a0e190 --- /dev/null +++ b/src/frontend/instr_queue.sv @@ -0,0 +1,353 @@ +// Copyright 2018 - 2019 ETH Zurich and University of Bologna. +// Copyright and related rights are licensed under the Solderpad Hardware +// License, Version 0.51 (the "License"); you may not use this file except in +// compliance with the License. You may obtain a copy of the License at +// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +// +// Author: Florian Zaruba, ETH Zurich +// Date: 26.10.2018 + +// Description: Instruction Queue, separates instruction front-end from processor +// back-end. +// +// This is an optimized instruction queue which supports the handling of +// compressed instructions (16 bit instructions). Internally it is organized as +// FETCH_ENTRY x 32 bit queues which are filled in a consecutive manner. Two pointers +// (`idx_is_q` and `idx_ds_q`) point into the fill port and the read port. The read port +// is designed so that it will easily allow for multiple issue implementation. +// The input supports arbitrary power of two instruction fetch widths.
+// +// The queue supports handling of branch prediction and will take care of +// only saving a valid instruction stream. +// +// Furthermore it contains a replay interface in case the instruction queue +// is already full. As instructions are in general easily replayed this should +// increase the efficiency as I$ misses are potentially hidden. This stands in +// contrast to pessimistic actions (early stalling) or credit based approaches. +// Credit based systems might be difficult to implement with the current system +// as we do not exactly know how much space we are going to need in the fifos +// as each instruction can take either one or two slots. +// +// So the consumed/valid interface degenerates to an `information` interface. If the +// upstream circuit keeps pushing, the queue will discard the information +// and start replaying from the point where it could last manage to accept instructions. +// +// The instruction front-end will stop issuing instructions as soon as the +// fifo is full. This will gate the logic if the processor is e.g.: halted +// +// TODO(zarubaf): The instruction queues can be reduced to 16 bit. Potentially +// the replay mechanism gets more complicated as it can be that a 32 bit instruction +// can not be pushed at once.
+ +module instr_queue ( + input logic clk_i, + input logic rst_ni, + input logic flush_i, + input logic [ariane_pkg::INSTR_PER_FETCH-1:0][31:0] instr_i, + input logic [ariane_pkg::INSTR_PER_FETCH-1:0][63:0] addr_i, + input logic [ariane_pkg::INSTR_PER_FETCH-1:0] valid_i, + output logic ready_o, + output logic [ariane_pkg::INSTR_PER_FETCH-1:0] consumed_o, + // we've encountered an exception, at this point the only possible exceptions are page-table faults + input logic exception_i, + // branch predict + input logic [63:0] predict_address_i, + input ariane_pkg::cf_t [ariane_pkg::INSTR_PER_FETCH-1:0] cf_type_i, + // replay instruction because one of the FIFO was already full + output logic replay_o, + output logic [63:0] replay_addr_o, // address at which to replay this instruction + // to processor backend + output ariane_pkg::fetch_entry_t fetch_entry_o, + output logic fetch_entry_valid_o, + input logic fetch_entry_ready_i +); + + typedef struct packed { + logic [31:0] instr; // instruction word + ariane_pkg::cf_t cf; // branch was taken + logic ex; // exception happened + } instr_data_t; + + logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] branch_index; + // instruction queues + logic [ariane_pkg::INSTR_PER_FETCH-1:0] + [$clog2(ariane_pkg::FETCH_FIFO_DEPTH)-1:0] instr_queue_usage; + instr_data_t [ariane_pkg::INSTR_PER_FETCH-1:0] instr_data_in, instr_data_out; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] push_instr, push_instr_fifo; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] pop_instr; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_queue_full; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_queue_empty; + logic instr_overflow; + // address queue + logic [$clog2(ariane_pkg::FETCH_FIFO_DEPTH)-1:0] address_queue_usage; + logic [63:0] address_out; + logic pop_address; + logic push_address; + logic full_address; + logic empty_address; + logic address_overflow; + // input stream counter + logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] idx_is_d, idx_is_q; + // Registers + // 
output FIFO select, one-hot + logic [ariane_pkg::INSTR_PER_FETCH-1:0] idx_ds_d, idx_ds_q; + logic [63:0] pc_d, pc_q; // current PC + logic reset_address_d, reset_address_q; // we need to re-set the address because of a flush + + logic [ariane_pkg::INSTR_PER_FETCH*2-2:0] branch_mask_extended; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] branch_mask; + logic branch_empty; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] taken; + // shift amount, e.g.: instructions we want to retire + logic [$clog2(ariane_pkg::INSTR_PER_FETCH):0] popcount; + logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] shamt; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] valid; + logic [ariane_pkg::INSTR_PER_FETCH*2-1:0] consumed_extended; + // FIFO mask + logic [ariane_pkg::INSTR_PER_FETCH*2-1:0] fifo_pos_extended; + logic [ariane_pkg::INSTR_PER_FETCH-1:0] fifo_pos; + logic [ariane_pkg::INSTR_PER_FETCH*2-1:0][31:0] instr; + ariane_pkg::cf_t [ariane_pkg::INSTR_PER_FETCH*2-1:0] cf; + // replay interface + logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_overflow_fifo; + + assign ready_o = ~(|instr_queue_full) & ~full_address; + + for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_unpack_taken + assign taken[i] = cf_type_i[i] != ariane_pkg::NoCF; + end + // calculate a branch mask, e.g.: get the first taken branch + lzc #( + .WIDTH ( ariane_pkg::INSTR_PER_FETCH ), + .MODE ( 0 ) // count trailing zeros + ) i_lzc_branch_index ( + .in_i ( taken ), // we want to count trailing zeros + .cnt_o ( branch_index ), // first branch on branch_index + .empty_o ( branch_empty ) + ); + // the first index is for sure valid + // for example (64 bit fetch): + // taken mask: 0 1 1 0 + // leading zero count = 1 + // 0 0 0 1, 1 1 1 << 1 = 0 0 1 1, 1 1 0 + // take the upper 4 bits: 0 0 1 1 + assign branch_mask_extended = {{{ariane_pkg::INSTR_PER_FETCH-1}{1'b0}}, {{ariane_pkg::INSTR_PER_FETCH}{1'b1}}} << branch_index; + assign branch_mask = branch_mask_extended[ariane_pkg::INSTR_PER_FETCH * 2 - 
2:ariane_pkg::INSTR_PER_FETCH - 1]; + + // mask with taken branches to get the actual amount of instructions we want to push + assign valid = valid_i & branch_mask; + // rotate right again + assign consumed_extended = {push_instr_fifo, push_instr_fifo} >> idx_is_q; + assign consumed_o = consumed_extended[ariane_pkg::INSTR_PER_FETCH-1:0]; + // count the numbers of valid instructions we've pushed from this package + popcount #( + .INPUT_WIDTH ( ariane_pkg::INSTR_PER_FETCH ) + ) i_popcount ( + .data_i ( push_instr_fifo ), + .popcount_o ( popcount ) + ); + assign shamt = popcount[$bits(shamt)-1:0]; + + // save the shift amount for next cycle + assign idx_is_d = idx_is_q + shamt; + + // ---------------------- + // Input interface + // ---------------------- + // rotate left by the current position + assign fifo_pos_extended = { valid, valid } << idx_is_q; + // we just care about the upper bits + assign fifo_pos = fifo_pos_extended[ariane_pkg::INSTR_PER_FETCH*2-1:ariane_pkg::INSTR_PER_FETCH]; + // the fifo_position signal can directly be used to guide the push signal of each FIFO + // make sure it is not full + assign push_instr = fifo_pos & ~instr_queue_full; + + // duplicate the entries for easier selection e.g.: 3 2 1 0 3 2 1 0 + for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_duplicate_instr_input + assign instr[i] = instr_i[i]; + assign instr[i + ariane_pkg::INSTR_PER_FETCH] = instr_i[i]; + assign cf[i] = cf_type_i[i]; + assign cf[i + ariane_pkg::INSTR_PER_FETCH] = cf_type_i[i]; + end + + // shift the inputs + for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_fifo_input_select + /* verilator lint_off WIDTH */ + assign instr_data_in[i].instr = instr[i + idx_is_q]; + assign instr_data_in[i].cf = cf[i + idx_is_q]; + assign instr_data_in[i].ex = exception_i; // exceptions hold for the whole fetch packet + /* verilator lint_on WIDTH */ + end + + // ---------------------- + // Replay Logic + // ---------------------- + // We need to 
replay an instruction fetch iff: + // 1. One of the instruction data FIFOs was full and we needed it + // (e.g.: we pushed and it was full) + // 2. The address/branch predict FIFO was full + // if one of the FIFOs was full we need to replay the faulting instruction + assign instr_overflow_fifo = instr_queue_full & fifo_pos; + assign instr_overflow = |instr_overflow_fifo; // at least one instruction overflowed + assign address_overflow = full_address & push_address; + assign replay_o = instr_overflow | address_overflow; + + // select the address, in the case of an address fifo overflow just + // use the base of this package + // if we successfully pushed some instructions we can output the next instruction + // which we didn't manage to push + assign replay_addr_o = (address_overflow) ? addr_i[0] : addr_i[shamt]; + + // ---------------------- + // Downstream interface + // ---------------------- + // as long as at least one queue is not empty we have a valid instruction + assign fetch_entry_valid_o = ~(&instr_queue_empty); + + always_comb begin + idx_ds_d = idx_ds_q; + + pop_instr = '0; + // assemble fetch entry + fetch_entry_o.instruction = '0; + fetch_entry_o.address = pc_q; + fetch_entry_o.ex.valid = 1'b0; + // This is the only exception which can occur up to this point. 
+ fetch_entry_o.ex.cause = riscv::INSTR_PAGE_FAULT; + fetch_entry_o.ex.tval = '0; + fetch_entry_o.branch_predict.predict_address = address_out; + // output mux select + for (int unsigned i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin + if (idx_ds_q[i]) begin + fetch_entry_o.instruction = instr_data_out[i].instr; + fetch_entry_o.ex.valid = instr_data_out[i].ex; + fetch_entry_o.ex.tval = pc_q; + fetch_entry_o.branch_predict.cf = instr_data_out[i].cf; + pop_instr[i] = fetch_entry_valid_o & fetch_entry_ready_i; + end + end + // rotate the pointer left + if (fetch_entry_ready_i) begin + idx_ds_d = {idx_ds_q[ariane_pkg::INSTR_PER_FETCH-2:0], idx_ds_q[ariane_pkg::INSTR_PER_FETCH-1]}; + end + end + + // TODO(zarubaf): This needs to change for dual-issue + // if the handshaking is successful and we had a prediction pop one address entry + assign pop_address = ((fetch_entry_o.branch_predict.cf != ariane_pkg::NoCF) & |pop_instr); + + // ---------------------- + // Calculate (Next) PC + // ---------------------- + always_comb begin + pc_d = pc_q; + reset_address_d = flush_i ? 1'b1 : reset_address_q; + + if (fetch_entry_ready_i) begin + // TODO(zarubaf): This needs to change for a dual issue implementation + // advance the PC + pc_d = pc_q + ((fetch_entry_o.instruction[1:0] != 2'b11) ? 
'd2 : 'd4); + end + + if (pop_address) pc_d = address_out; + + // we previously flushed so we need to reset the address + if (valid_i[0] && reset_address_q) begin + // this is the base of the first instruction + pc_d = addr_i[0]; + reset_address_d = 1'b0; + end + end + + // FIFOs + for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_instr_fifo + // Make sure we don't save any instructions if we couldn't save the address + assign push_instr_fifo[i] = push_instr[i] & ~address_overflow; + fifo_v3 #( + .DEPTH ( ariane_pkg::FETCH_FIFO_DEPTH ), + .dtype ( instr_data_t ) + ) i_fifo_instr_data ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .flush_i ( flush_i ), + .testmode_i ( 1'b0 ), + .full_o ( instr_queue_full[i] ), + .empty_o ( instr_queue_empty[i] ), + .usage_o ( instr_queue_usage[i] ), + .data_i ( instr_data_in[i] ), + .push_i ( push_instr_fifo[i] ), + .data_o ( instr_data_out[i] ), + .pop_i ( pop_instr[i] ) + ); + end + // or reduce and check whether we are retiring a taken branch (might be that the corresponding) + // fifo is full. 
+ always_comb begin + push_address = 1'b0; + // check if we are pushing a ctrl flow change, if so save the address + for (int i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin + push_address |= push_instr[i] & (instr_data_in[i].cf != ariane_pkg::NoCF); + end + end + + fifo_v3 #( + .DEPTH ( ariane_pkg::FETCH_FIFO_DEPTH ), // TODO(zarubaf): Fork out to separate param + .DATA_WIDTH ( 64 ) + ) i_fifo_address ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .flush_i ( flush_i ), + .testmode_i ( 1'b0 ), + .full_o ( full_address ), + .empty_o ( empty_address ), + .usage_o ( address_queue_usage ), + .data_i ( predict_address_i ), + .push_i ( push_address & ~full_address ), + .data_o ( address_out ), + .pop_i ( pop_address ) + ); + + unread i_unread_address_fifo (.d_i(|{empty_address, address_queue_usage})); + unread i_unread_branch_mask (.d_i(|branch_mask_extended)); + unread i_unread_lzc (.d_i(|{branch_empty})); + unread i_unread_fifo_pos (.d_i(|fifo_pos_extended)); // we don't care about the lower signals + unread i_unread_instr_fifo (.d_i(|instr_queue_usage)); + + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + idx_ds_q <= 'b1; + idx_is_q <= '0; + pc_q <= '0; + reset_address_q <= 1'b1; + end else begin + pc_q <= pc_d; + reset_address_q <= reset_address_d; + if (flush_i) begin + // one-hot encoded + idx_ds_q <= 'b1; + // binary encoded + idx_is_q <= '0; + reset_address_q <= 1'b1; + end else begin + idx_ds_q <= idx_ds_d; + idx_is_q <= idx_is_d; + end + end + end + + // pragma translate_off + `ifndef VERILATOR + replay_address_fifo: assert property ( + @(posedge clk_i) disable iff (!rst_ni) replay_o |-> !i_fifo_address.push_i + ) else $fatal(1,"[instr_queue] Pushing address although replay asserted"); + + output_select_onehot: assert property ( + @(posedge clk_i) $onehot0(idx_ds_q) + ) else begin $error("Output select should be one-hot encoded"); $stop(); end + `endif + // pragma translate_on +endmodule diff --git a/src/frontend/instr_scan.sv 
b/src/frontend/instr_scan.sv index 06fa3b92..ee345364 100644 --- a/src/frontend/instr_scan.sv +++ b/src/frontend/instr_scan.sv @@ -1,4 +1,4 @@ -//Copyright (C) 2018 to present, +// Copyright 2018 - 2019 ETH Zurich and University of Bologna. // Copyright and related rights are licensed under the Solderpad Hardware // License, Version 2.0 (the "License"); you may not use this file except in // compliance with the License. You may obtain a copy of the License at @@ -17,7 +17,6 @@ // ------------------------------ module instr_scan ( input logic [31:0] instr_i, // expect aligned instruction, compressed or not - output logic is_rvc_o, output logic rvi_return_o, output logic rvi_call_o, output logic rvi_branch_o, @@ -32,35 +31,39 @@ module instr_scan ( output logic rvc_call_o, output logic [63:0] rvc_imm_o ); - assign is_rvc_o = (instr_i[1:0] != 2'b11); - // check that rs1 is either x1 or x5 and that rs1 is not x1 or x5, TODO: check the fact about bit 7 - assign rvi_return_o = rvi_jalr_o & ~instr_i[7] & ~instr_i[19] & ~instr_i[18] & ~instr_i[16] & instr_i[15]; - assign rvi_call_o = (rvi_jalr_o | rvi_jump_o) & instr_i[7]; // TODO: check that this captures calls + logic is_rvc; + assign is_rvc = (instr_i[1:0] != 2'b11); + // return: JALR whose rs1 is a link register (x1 or x5) and whose rd differs from rs1 + assign rvi_return_o = rvi_jalr_o & ((instr_i[19:15] == 5'd1) | instr_i[19:15] == 5'd5) + & (instr_i[19:15] != instr_i[11:7]); + // Opcode is JAL[R] and destination register is either x1 or x5 + assign rvi_call_o = (rvi_jalr_o | rvi_jump_o) & ((instr_i[11:7] == 5'd1) | instr_i[11:7] == 5'd5); // differentiates between JAL and BRANCH opcode, JALR comes from BHT assign rvi_imm_o = (instr_i[3]) ? ariane_pkg::uj_imm(instr_i) : ariane_pkg::sb_imm(instr_i); - assign rvi_branch_o = (instr_i[6:0] == riscv::OpcodeBranch) ? 1'b1 : 1'b0; - assign rvi_jalr_o = (instr_i[6:0] == riscv::OpcodeJalr) ? 1'b1 : 1'b0; - assign rvi_jump_o = (instr_i[6:0] == riscv::OpcodeJal) ? 
1'b1 : 1'b0; + assign rvi_branch_o = (instr_i[6:0] == riscv::OpcodeBranch); + assign rvi_jalr_o = (instr_i[6:0] == riscv::OpcodeJalr); + assign rvi_jump_o = (instr_i[6:0] == riscv::OpcodeJal); + // opcode JAL - assign rvc_jump_o = (instr_i[15:13] == riscv::OpcodeC1J) & is_rvc_o & (instr_i[1:0] == riscv::OpcodeC1); + assign rvc_jump_o = (instr_i[15:13] == riscv::OpcodeC1J) & is_rvc & (instr_i[1:0] == riscv::OpcodeC1); // always links to register 0 - assign rvc_jr_o = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd) - & ~instr_i[12] + logic is_jal_r; + assign is_jal_r = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd) & & (instr_i[6:2] == 5'b00000) & (instr_i[1:0] == riscv::OpcodeC2) - & is_rvc_o; + & is_rvc; + assign rvc_jr_o = is_jal_r & ~instr_i[12]; + // always links to register 1 e.g.: it is a jump + assign rvc_jalr_o = is_jal_r & instr_i[12]; + assign rvc_call_o = rvc_jalr_o; + assign rvc_branch_o = ((instr_i[15:13] == riscv::OpcodeC1Beqz) | (instr_i[15:13] == riscv::OpcodeC1Bnez)) & (instr_i[1:0] == riscv::OpcodeC1) - & is_rvc_o; + & is_rvc; // check that rs1 is x1 or x5 - assign rvc_return_o = ~instr_i[11] & ~instr_i[10] & ~instr_i[8] & instr_i[7] & rvc_jr_o ; - // always links to register 1 e.g.: it is a jump - assign rvc_jalr_o = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd) - & instr_i[12] - & (instr_i[6:2] == 5'b00000) & is_rvc_o; - assign rvc_call_o = rvc_jalr_o; + assign rvc_return_o = ((instr_i[11:7] == 5'd1) | (instr_i[11:7] == 5'd5)) & rvc_jr_o ; - // // differentiates between JAL and BRANCH opcode, JALR comes from BHT + // differentiates between JAL and BRANCH opcode, JALR comes from BHT assign rvc_imm_o = (instr_i[14]) ? 
{{56{instr_i[12]}}, instr_i[6:5], instr_i[2], instr_i[11:10], instr_i[4:3], 1'b0} : {{53{instr_i[12]}}, instr_i[8], instr_i[10:9], instr_i[6], instr_i[7], instr_i[2], instr_i[11], instr_i[5:3], 1'b0}; endmodule diff --git a/src/id_stage.sv b/src/id_stage.sv index eb0132ec..9d661040 100644 --- a/src/id_stage.sv +++ b/src/id_stage.sv @@ -10,95 +10,81 @@ // // Author: Florian Zaruba, ETH Zurich // Date: 15.04.2017 -// Description: Description: Instruction decode, contains the logic for decode, +// Description: Instruction decode, contains the logic for decode, // issue and read operands. -import ariane_pkg::*; - module id_stage ( - input logic clk_i, // Clock - input logic rst_ni, // Asynchronous reset active low + input logic clk_i, + input logic rst_ni, - input logic flush_i, - input logic debug_req_i, + input logic flush_i, + input logic debug_req_i, // from IF - input frontend_fetch_t fetch_entry_i, - input logic fetch_entry_valid_i, - output logic decoded_instr_ack_o, // acknowledge the instruction (fetch entry) - + input ariane_pkg::fetch_entry_t fetch_entry_i, + input logic fetch_entry_valid_i, + output logic fetch_entry_ready_o, // acknowledge the instruction (fetch entry) // to ID - output scoreboard_entry_t issue_entry_o, // a decoded instruction - output logic issue_entry_valid_o, // issue entry is valid - output logic is_ctrl_flow_o, // the instruction we issue is a ctrl flow instructions - input logic issue_instr_ack_i, // issue stage acknowledged sampling of instructions + output ariane_pkg::scoreboard_entry_t issue_entry_o, // a decoded instruction + output logic issue_entry_valid_o, // issue entry is valid + output logic is_ctrl_flow_o, // the instruction we issue is a ctrl flow instructions + input logic issue_instr_ack_i, // issue stage acknowledged sampling of instructions // from CSR file - input riscv::priv_lvl_t priv_lvl_i, // current privilege level - input riscv::xs_t fs_i, // floating point extension status - input logic [2:0] frm_i, // 
floating-point dynamic rounding mode - input logic [1:0] irq_i, - input irq_ctrl_t irq_ctrl_i, - input logic debug_mode_i, // we are in debug mode - input logic tvm_i, - input logic tw_i, - input logic tsr_i + input riscv::priv_lvl_t priv_lvl_i, // current privilege level + input riscv::xs_t fs_i, // floating point extension status + input logic [2:0] frm_i, // floating-point dynamic rounding mode + input logic [1:0] irq_i, + input ariane_pkg::irq_ctrl_t irq_ctrl_i, + input logic debug_mode_i, // we are in debug mode + input logic tvm_i, + input logic tw_i, + input logic tsr_i ); - // register stage + // ID/ISSUE register stage struct packed { - logic valid; - scoreboard_entry_t sbe; - logic is_ctrl_flow; + logic valid; + ariane_pkg::scoreboard_entry_t sbe; + logic is_ctrl_flow; } issue_n, issue_q; - logic is_control_flow_instr; - scoreboard_entry_t decoded_instruction; + logic is_control_flow_instr; + ariane_pkg::scoreboard_entry_t decoded_instruction; - fetch_entry_t fetch_entry; logic is_illegal; logic [31:0] instruction; logic is_compressed; - logic fetch_ack_i; - logic fetch_entry_valid; - - // --------------------------------------------------------- - // 1. Re-align instructions - // --------------------------------------------------------- - instr_realigner instr_realigner_i ( - .fetch_entry_i ( fetch_entry_i ), - .fetch_entry_valid_i ( fetch_entry_valid_i ), - .fetch_ack_o ( decoded_instr_ack_o ), - .fetch_entry_o ( fetch_entry ), - .fetch_entry_valid_o ( fetch_entry_valid ), - .fetch_ack_i ( fetch_ack_i ), - .* - ); // --------------------------------------------------------- - // 2. Check if they are compressed and expand in case they are + // 1. 
Check if they are compressed and expand in case they are // --------------------------------------------------------- compressed_decoder compressed_decoder_i ( - .instr_i ( fetch_entry.instruction ), + .instr_i ( fetch_entry_i.instruction ), .instr_o ( instruction ), .illegal_instr_o ( is_illegal ), .is_compressed_o ( is_compressed ) - ); // --------------------------------------------------------- - // 3. Decode and emit instruction to issue stage + // 2. Decode and emit instruction to issue stage // --------------------------------------------------------- decoder decoder_i ( .debug_req_i, - .pc_i ( fetch_entry.address ), - .is_compressed_i ( is_compressed ), - .compressed_instr_i ( fetch_entry.instruction[15:0] ), - .instruction_i ( instruction ), - .branch_predict_i ( fetch_entry.branch_predict ), - .is_illegal_i ( is_illegal ), - .ex_i ( fetch_entry.ex ), - .instruction_o ( decoded_instruction ), - .is_control_flow_instr_o ( is_control_flow_instr ), + .irq_ctrl_i, + .irq_i, + .pc_i ( fetch_entry_i.address ), + .is_compressed_i ( is_compressed ), + .is_illegal_i ( is_illegal ), + .instruction_i ( instruction ), + .compressed_instr_i ( fetch_entry_i.instruction[15:0] ), + .branch_predict_i ( fetch_entry_i.branch_predict ), + .ex_i ( fetch_entry_i.ex ), + .priv_lvl_i ( priv_lvl_i ), + .debug_mode_i ( debug_mode_i ), .fs_i, .frm_i, - .* + .tvm_i, + .tw_i, + .tsr_i, + .instruction_o ( decoded_instruction ), + .is_control_flow_instr_o ( is_control_flow_instr ) ); // ------------------ @@ -110,7 +96,7 @@ module id_stage ( always_comb begin issue_n = issue_q; - fetch_ack_i = 1'b0; + fetch_entry_ready_o = 1'b0; // Clear the valid flag if issue has acknowledged the instruction if (issue_instr_ack_i) @@ -119,9 +105,9 @@ module id_stage ( // if we have a space in the register and the fetch is valid, go get it // or the issue stage is currently acknowledging an instruction, which means that we will have space // for a new instruction - if ((!issue_q.valid || 
issue_instr_ack_i) && fetch_entry_valid) begin - fetch_ack_i = 1'b1; - issue_n = {1'b1, decoded_instruction, is_control_flow_instr}; + if ((!issue_q.valid || issue_instr_ack_i) && fetch_entry_valid_i) begin + fetch_entry_ready_o = 1'b1; + issue_n = '{1'b1, decoded_instruction, is_control_flow_instr}; end // invalidate the pipeline register on a flush @@ -138,5 +124,4 @@ module id_stage ( issue_q <= issue_n; end end - endmodule diff --git a/src/instr_realign.sv b/src/instr_realign.sv new file mode 100644 index 00000000..dad93f19 --- /dev/null +++ b/src/instr_realign.sv @@ -0,0 +1,358 @@ +// Copyright 2018 - 2019 ETH Zurich and University of Bologna. +// Copyright and related rights are licensed under the Solderpad Hardware +// License, Version 0.51 (the "License"); you may not use this file except in +// compliance with the License. You may obtain a copy of the License at +// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +// +// Author: Florian Zaruba <zarubaf@iis.ee.ethz.ch> +// Description: Instruction Re-aligner +// +// This module takes 32-bit aligned cache blocks and extracts the instructions. +// As we are supporting the compressed instruction set extension in a 32 bit instruction word +// are up to 2 compressed instructions. +// Furthermore those instructions can be arbitrarily interleaved which makes it possible to fetch +// only the lower part of a 32 bit instruction. +// Furthermore we need to handle the case if we want to start fetching from an unaligned +// instruction e.g. a branch. 
+ +import ariane_pkg::*; + +module instr_realign ( + input logic clk_i, + input logic rst_ni, + input logic flush_i, + input logic valid_i, + output logic serving_unaligned_o, // we have an unaligned instruction in [0] + input logic [63:0] address_i, + input logic [FETCH_WIDTH-1:0] data_i, + output logic [INSTR_PER_FETCH-1:0] valid_o, + output logic [INSTR_PER_FETCH-1:0][63:0] addr_o, + output logic [INSTR_PER_FETCH-1:0][31:0] instr_o +); + // as a maximum we support a fetch width of 64-bit, hence there can be 4 compressed instructions + logic [3:0] instr_is_compressed; + + for (genvar i = 0; i < INSTR_PER_FETCH; i ++) begin + // LSB != 2'b11 + assign instr_is_compressed[i] = ~&data_i[i * 16 +: 2]; + end + + // save the unaligned part of the instruction to this ff + logic [15:0] unaligned_instr_d, unaligned_instr_q; + // the last instruction was unaligned + logic unaligned_d, unaligned_q; + // register to save the unaligned address + logic [63:0] unaligned_address_d, unaligned_address_q; + // we have an unaligned instruction + assign serving_unaligned_o = unaligned_q; + + // Instruction re-alignment + if (FETCH_WIDTH == 32) begin : realign_bp_32 + always_comb begin : re_align + unaligned_d = unaligned_q; + unaligned_address_d = {address_i[63:2], 2'b10}; + unaligned_instr_d = data_i[31:16]; + + valid_o[0] = valid_i; + instr_o[0] = (unaligned_q) ? {data_i[15:0], unaligned_instr_q} : data_i[31:0]; + addr_o[0] = (unaligned_q) ? 
unaligned_address_q : address_i; + + valid_o[1] = 1'b0; + instr_o[1] = '0; + addr_o[1] = {address_i[63:2], 2'b10}; + + // this instruction is compressed or the last instruction was unaligned + if (instr_is_compressed[0] || unaligned_q) begin + // check if this is instruction is still unaligned e.g.: it is not compressed + // if its compressed re-set unaligned flag + // for 32 bit we can simply check the next instruction and whether it is compressed or not + // if it is compressed the next fetch will contain an aligned instruction + // is instruction 1 also compressed + // yes? -> no problem, no -> we've got an unaligned instruction + if (instr_is_compressed[1]) begin + unaligned_d = 1'b0; + valid_o[1] = valid_i; + instr_o[1] = {16'b0, data_i[31:16]}; + end else begin + // save the upper bits for next cycle + unaligned_d = 1'b1; + unaligned_instr_d = data_i[31:16]; + unaligned_address_d = {address_i[63:2], 2'b10}; + end + end // else -> normal fetch + + // we started to fetch on a unaligned boundary with a whole instruction -> wait until we've + // received the next instruction + if (valid_i && address_i[1]) begin + // the instruction is not compressed so we can't do anything in this cycle + if (!instr_is_compressed[0]) begin + valid_o = '0; + unaligned_d = 1'b1; + unaligned_address_d = {address_i[63:2], 2'b10}; + unaligned_instr_d = data_i[15:0]; + // the instruction isn't compressed but only the lower is ready + end else begin + valid_o = 1'b1; + end + end + end + // TODO(zarubaf): Fix 64 bit FETCH_WIDTH, maybe generalize to arbitrary fetch width + end else if (FETCH_WIDTH == 64) begin : realign_bp_64 + initial begin + $error("Not propperly implemented"); + end + always_comb begin : re_align + unaligned_d = unaligned_q; + unaligned_address_d = unaligned_address_q; + unaligned_instr_d = unaligned_instr_q; + + valid_o = '0; + valid_o[0] = valid_i; + + instr_o[0] = data_i[31:0]; + addr_o[0] = address_i; + + instr_o[1] = '0; + addr_o[1] = {address_i[63:3], 3'b010}; + 
+ instr_o[2] = {16'b0, data_i[47:32]}; + addr_o[2] = {address_i[63:3], 3'b100}; + + instr_o[3] = {16'b0, data_i[63:48]}; + addr_o[3] = {address_i[63:3], 3'b110}; + + // last instruction was unaligned + if (unaligned_q) begin + instr_o[0] = {data_i[15:0], unaligned_instr_q}; + addr_o[0] = unaligned_address_q; + // for 64 bit there exist the following options: + // 64 32 0 + // | 3 | 2 | 1 | 0 | <- instruction slot + // | I | I | U | -> again unaligned + // | * | C | I | U | -> aligned + // | * | I | C | U | -> aligned + // | I | C | C | U | -> again unaligned + // | * | C | C | C | U | -> aligned + // Legend: C = compressed, I = 32 bit instruction, U = unaligned upper half + // * = don't care + if (instr_is_compressed[1]) begin + instr_o[1] = {16'b0, data_i[31:16]}; + valid_o[1] = valid_i; + + if (instr_is_compressed[2]) begin + if (instr_is_compressed[3]) begin + unaligned_d = 1'b0; + valid_o[3] = valid_i; + end else begin + // continues to be unaligned + end + end else begin + unaligned_d = 1'b0; + instr_o[2] = data_i[63:32]; + valid_o[2] = valid_i; + end + // instruction 1 is not compressed + end else begin + instr_o[1] = data_i[47:16]; + valid_o[1] = valid_i; + addr_o[2] = {address_i[63:3], 3'b110}; + if (instr_is_compressed[2]) begin + unaligned_d = 1'b0; + instr_o[2] = {16'b0, data_i[63:48]}; + valid_o[2] = valid_i; + end else begin + // continues to be unaligned + end + end + end else if (instr_is_compressed[0]) begin // instruction zero is RVC + // 64 32 0 + // | 3 | 2 | 1 | 0 | <- instruction slot + // | I | I | C | -> again unaligned + // | * | C | I | C | -> aligned + // | * | I | C | C | -> aligned + // | I | C | C | C | -> again unaligned + // | * | C | C | C | C | -> aligned + if (instr_is_compressed[1]) begin + instr_o[1] = {16'b0, data_i[31:16]}; + valid_o[1] = valid_i; + + if (instr_is_compressed[2]) begin + valid_o[2] = valid_i; + if (instr_is_compressed[3]) begin + valid_o[3] = valid_i; + end else begin + // this instruction is unaligned + 
unaligned_d = 1'b1; + unaligned_instr_d = data_i[63:48]; + unaligned_address_d = addr_o[3]; + end + end else begin + instr_o[2] = data_i[63:32]; + valid_o[2] = valid_i; + end + // instruction 1 is not compressed -> check slot 3 + end else begin + instr_o[1] = data_i[47:16]; + valid_o[1] = valid_i; + addr_o[2] = {address_i[63:3], 3'b110}; + if (instr_is_compressed[3]) begin + instr_o[2] = data_i[63:48]; + valid_o[2] = valid_i; + end else begin + unaligned_d = 1'b1; + unaligned_instr_d = data_i[63:48]; + unaligned_address_d = addr_o[2]; + end + end + + // Full instruction in slot zero + // 64 32 0 + // | 3 | 2 | 1 | 0 | <- instruction slot + // | I | C | I | + // | * | C | C | I | + // | * | I | I | + end else begin + addr_o[1] = {address_i[63:3], 3'b100}; + + if (instr_is_compressed[2]) begin + instr_o[1] = {16'b0, data_i[47:32]}; + valid_o[1] = valid_i; + addr_o[2] = {address_i[63:3], 3'b110}; + if (instr_is_compressed[3]) begin + // | * | C | C | I | + valid_o[2] = valid_i; + instr_o[2] = {16'b0, data_i[63:48]}; + end else begin + // this instruction is unaligned + unaligned_d = 1'b1; + unaligned_instr_d = data_i[63:48]; + unaligned_address_d = addr_o[2]; + end + end else begin + // two regular instructions back-to-back + instr_o[1] = data_i[63:32]; + valid_o[1] = valid_i; + end + end + + // -------------------------- + // Unaligned fetch + // -------------------------- + // Address was not 64 bit aligned + case (address_i[2:1]) + // this means the previous instruction was either compressed or unaligned + // in any case we don't care + 2'b01: begin + // 64 32 0 + // | 3 | 2 | 1 | 0 | <- instruction slot + // | I | I | x -> again unaligned + // | * | C | I | x -> aligned + // | * | I | C | x -> aligned + // | I | C | C | x -> again unaligned + // | * | C | C | C | x -> aligned + addr_o[0] = {address_i[63:3], 3'b010}; + + if (instr_is_compressed[1]) begin + instr_o[0] = {16'b0, data_i[31:16]}; + valid_o[0] = valid_i; + + if (instr_is_compressed[2]) begin + 
valid_o[1] = valid_i; + instr_o[1] = {16'b0, data_i[47:32]}; + addr_o[1] = {address_i[63:3], 3'b100}; + if (instr_is_compressed[3]) begin + instr_o[2] = {16'b0, data_i[63:48]}; + addr_o[2] = {address_i[63:3], 3'b110}; + valid_o[2] = valid_i; + end else begin + // this instruction is unaligned + unaligned_d = 1'b1; + unaligned_instr_d = data_i[63:48]; + unaligned_address_d = addr_o[3]; + end + end else begin + instr_o[1] = data_i[63:32]; + addr_o[1] = {address_i[63:3], 3'b100}; + valid_o[1] = valid_i; + end + // instruction 1 is not compressed -> check slot 3 + end else begin + instr_o[0] = data_i[47:16]; + valid_o[0] = valid_i; + addr_o[1] = {address_i[63:3], 3'b110}; + if (instr_is_compressed[3]) begin + instr_o[1] = data_i[63:48]; + valid_o[1] = valid_i; + end else begin + unaligned_d = 1'b1; + unaligned_instr_d = data_i[63:48]; + unaligned_address_d = addr_o[1]; + end + end + end + 2'b10: begin + valid_o = '0; + // 64 32 0 + // | 3 | 2 | 1 | 0 | <- instruction slot + // | I | C | * | <- unaligned + // | C | C | * | <- aligned + // | I | * | <- aligned + if (instr_is_compressed[2]) begin + valid_o[0] = valid_i; + instr_o[0] = data_i[47:32]; + // second instruction is also compressed + if (instr_is_compressed[3]) begin + valid_o[1] = valid_i; + instr_o[1] = data_i[63:48]; + // regular instruction -> unaligned + end else begin + unaligned_d = 1'b1; + unaligned_address_d = {address_i[63:3], 3'b110}; + unaligned_instr_d = data_i[63:48]; + end + // instruction is a regular instruction + end else begin + valid_o[0] = valid_i; + instr_o[0] = data_i[63:32]; + addr_o[0] = address_i; + end + end + // we started to fetch on a unaligned boundary with a whole instruction -> wait until we've + // received the next instruction + 2'b11: begin + valid_o = '0; + if (!instr_is_compressed[3]) begin + unaligned_d = 1'b1; + unaligned_address_d = {address_i[63:3], 3'b110}; + unaligned_instr_d = data_i[63:48]; + end else begin + valid_o[3] = valid_i; + end + end + endcase + end + end + 
+ always_ff @(posedge clk_i or negedge rst_ni) begin + if (~rst_ni) begin + unaligned_q <= 1'b0; + unaligned_address_q <= '0; + unaligned_instr_q <= '0; + end else begin + if (valid_i) begin + unaligned_address_q <= unaligned_address_d; + unaligned_instr_q <= unaligned_instr_d; + end + + if (flush_i) begin + unaligned_q <= 1'b0; + end else if (valid_i) begin + unaligned_q <= unaligned_d; + end + end + end +endmodule diff --git a/src/instr_realigner.sv b/src/instr_realigner.sv deleted file mode 100644 index 9b5557fd..00000000 --- a/src/instr_realigner.sv +++ /dev/null @@ -1,252 +0,0 @@ -// Copyright 2018 ETH Zurich and University of Bologna. -// Copyright and related rights are licensed under the Solderpad Hardware -// License, Version 0.51 (the "License"); you may not use this file except in -// compliance with the License. You may obtain a copy of the License at -// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law -// or agreed to in writing, software, hardware and materials distributed under -// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
-// -// Author: Florian Zaruba, ETH Zurich -// Date: 14.05.2017 -// Description: Emits and re-aligns compressed and unaligned instructions - -import ariane_pkg::*; - -module instr_realigner ( - input logic clk_i, // Clock - input logic rst_ni, // Asynchronous reset active low - // control signals - input logic flush_i, - - input frontend_fetch_t fetch_entry_i, - input logic fetch_entry_valid_i, - output logic fetch_ack_o, - - output fetch_entry_t fetch_entry_o, - output logic fetch_entry_valid_o, - input logic fetch_ack_i -); - // ---------- - // Registers - // ---------- - // the last instruction was unaligned - logic unaligned_n, unaligned_q; - // save the unaligned part of the instruction to this ff - logic [15:0] unaligned_instr_n, unaligned_instr_q; - // the previous instruction was compressed - logic compressed_n, compressed_q; - // register to save the unaligned address - logic [63:0] unaligned_address_n, unaligned_address_q; - // get the next instruction, needed on a unaligned access - logic jump_unaligned_half_word; - - // check if the lower compressed instruction was no branch otherwise we will need to squash this instruction - // but only if we predicted it to be taken, the predict was on the lower 16 bit compressed instruction - logic kill_upper_16_bit; - assign kill_upper_16_bit = fetch_entry_i.branch_predict.valid & - fetch_entry_i.branch_predict.predict_taken & - fetch_entry_i.bp_taken[0]; - // ---------- - // Registers - // ---------- - always_comb begin : realign_instr - - unaligned_n = unaligned_q; - unaligned_instr_n = unaligned_instr_q; - compressed_n = compressed_q; - unaligned_address_n = unaligned_address_q; - - // directly output this instruction. 
adoptions are made throughout the always comb block - fetch_entry_o.address = fetch_entry_i.address; - fetch_entry_o.instruction = fetch_entry_i.instruction; - fetch_entry_o.branch_predict = fetch_entry_i.branch_predict; - fetch_entry_o.ex.valid = fetch_entry_i.page_fault; - fetch_entry_o.ex.tval = (fetch_entry_i.page_fault) ? fetch_entry_i.address : '0; - fetch_entry_o.ex.cause = (fetch_entry_i.page_fault) ? riscv::INSTR_PAGE_FAULT : '0; - - fetch_entry_valid_o = fetch_entry_valid_i; - fetch_ack_o = fetch_ack_i; - // we just jumped to a half word and encountered an unaligned 32-bit instruction - jump_unaligned_half_word = 1'b0; - // --------------------------------- - // Input port & Instruction Aligner - // --------------------------------- - // check if the entry if the fetch FIFO is valid and if we are currently not serving the second part - // of a compressed instruction - if (fetch_entry_valid_i && !compressed_q) begin - // ------------------------ - // Access on Word Boundary - // ------------------------ - if (fetch_entry_i.address[1] == 1'b0) begin - // do we actually want the first instruction or was the address a half word access? - if (!unaligned_q) begin - // we got a valid instruction so we can satisfy the unaligned instruction - unaligned_n = 1'b0; - // check if the instruction is compressed - if (fetch_entry_i.instruction[1:0] != 2'b11) begin - // it is compressed - fetch_entry_o.instruction = {15'b0, fetch_entry_i.instruction[15:0]}; - // we need to kill the lower prediction - if (fetch_entry_i.branch_predict.valid && !fetch_entry_i.bp_taken[0]) - fetch_entry_o.branch_predict.valid = 1'b0; - - // should we even look at the upper instruction bits? - if (!kill_upper_16_bit) begin - // Yes, so... - // 1. 
Is the second instruction also compressed, like: - // _____________________________________________ - // | compressed 2 [31:16] | compressed 1[15:0] | - // |____________________________________________ - if (fetch_entry_i.instruction[17:16] != 2'b11) begin - // yes, this was a compressed instruction - compressed_n = 1'b1; - // do not advance the queue pointer - fetch_ack_o = 1'b0; - // 2. or is it an unaligned 32 bit instruction like - // ____________________________________________________ - // |instr [15:0] | instr [31:16] | compressed 1[15:0] | - // |____________________________________________________ - end else begin - // save the lower 16 bit - unaligned_instr_n = fetch_entry_i.instruction[31:16]; - // save the address - unaligned_address_n = {fetch_entry_i.address[63:2], 2'b10}; - // and that it was unaligned - unaligned_n = 1'b1; - // this does not consume space in the FIFO - end - end - end - end - // this is a full 32 bit instruction like - // _______________________ - // | instruction [31:0] | - // |______________________ - - // we have an outstanding unaligned instruction - else if (unaligned_q) begin - - - fetch_entry_o.address = unaligned_address_q; - fetch_entry_o.instruction = {fetch_entry_i.instruction[15:0], unaligned_instr_q}; - - // again should we look at the upper bits? 
- if (!kill_upper_16_bit) begin - // whats up with the other upper 16 bit of this instruction - // is the second instruction also compressed, like: - // _____________________________________________ - // | compressed 2 [31:16] | unaligned[31:16] | - // |____________________________________________ - // check if the lower compressed instruction was no branch otherwise we will need to squash this instruction - // but only if we predicted it to be taken, the predict was on the lower 16 bit compressed instruction - if (fetch_entry_i.instruction[17:16] != 2'b11) begin - // this was a compressed instruction - compressed_n = 1'b1; - // do not advance the queue pointer - fetch_ack_o = 1'b0; - // unaligned access served - unaligned_n = 1'b0; - // we need to kill the lower prediction - if (fetch_entry_i.branch_predict.valid && !fetch_entry_i.bp_taken[0]) - fetch_entry_o.branch_predict.valid = 1'b0; - // or is it an unaligned 32 bit instruction like - // ____________________________________________________ - // |instr [15:0] | instr [31:16] | compressed 1[15:0] | - // |____________________________________________________ - end else if (!kill_upper_16_bit) begin - // save the lower 16 bit - unaligned_instr_n = fetch_entry_i.instruction[31:16]; - // save the address - unaligned_address_n = {fetch_entry_i.address[63:2], 2'b10}; - // and that it was unaligned - unaligned_n = 1'b1; - end - end - // we've got a predicted taken branch we need to clear the unaligned flag if it was decoded as a lower 16 instruction - else if (fetch_entry_i.branch_predict.valid) begin - // the next fetch will start from a 4 byte boundary again - unaligned_n = 1'b0; - end - end - end - // ---------------------------- - // Access on half-Word Boundary - // ---------------------------- - else if (fetch_entry_i.address[1] == 1'b1) begin // address was a half word access - // reset the unaligned flag as this is a completely new fetch (because consecutive fetches only happen on a word basis) - unaligned_n = 
1'b0; - // this is a compressed instruction - if (fetch_entry_i.instruction[17:16] != 2'b11) begin - // it is compressed - fetch_entry_o.instruction = {15'b0, fetch_entry_i.instruction[31:16]}; - - // this is the first part of a 32 bit unaligned instruction - end else begin - // save the lower 16 bit - unaligned_instr_n = fetch_entry_i.instruction[31:16]; - // and that it was unaligned - unaligned_n = 1'b1; - // save the address - unaligned_address_n = {fetch_entry_i.address[63:2], 2'b10}; - // we need to wait for the second instruction - fetch_entry_valid_o = 1'b0; - // so get it by acknowledging this instruction - fetch_ack_o = 1'b1; - // we got to an unaligned instruction -> get the next entry to full-fill the need - jump_unaligned_half_word = 1'b1; - end - // there can never be a whole 32 bit instruction on a half word access - end - end - // ---------------------------- - // Next compressed instruction - // ---------------------------- - // we are serving the second part of an instruction which was also compressed - if (compressed_q) begin - fetch_ack_o = fetch_ack_i; - compressed_n = 1'b0; - fetch_entry_o.instruction = {16'b0, fetch_entry_i.instruction[31:16]}; - fetch_entry_o.address = {fetch_entry_i.address[63:2], 2'b10}; - fetch_entry_valid_o = 1'b1; - end - - // if we didn't get an acknowledge keep the registers stable - if (!fetch_ack_i && !jump_unaligned_half_word) begin - unaligned_n = unaligned_q; - unaligned_instr_n = unaligned_instr_q; - compressed_n = compressed_q; - unaligned_address_n = unaligned_address_q; - end - - if (flush_i) begin - // clear the unaligned and compressed instruction - unaligned_n = 1'b0; - compressed_n = 1'b0; - end - - // assign the correct address for a potentially faulting unaligned instruction - // we've already done the re-alignment for the instruction word so we - // can just assign it here to tval - fetch_entry_o.ex.tval = fetch_entry_o.address; - end - - // --------- - // Registers - // --------- - always_ff @(posedge 
clk_i or negedge rst_ni) begin - if (~rst_ni) begin - unaligned_q <= 1'b0; - unaligned_instr_q <= 16'b0; - unaligned_address_q <= 64'b0; - compressed_q <= 1'b0; - end else begin - unaligned_q <= unaligned_n; - unaligned_instr_q <= unaligned_instr_n; - unaligned_address_q <= unaligned_address_n; - compressed_q <= compressed_n; - end - end - -endmodule diff --git a/tb/ariane_soc_pkg.sv b/tb/ariane_soc_pkg.sv index bfe408f2..4e0ef3aa 100644 --- a/tb/ariane_soc_pkg.sv +++ b/tb/ariane_soc_pkg.sv @@ -67,6 +67,9 @@ package ariane_soc; localparam logic [NrRegion-1:0][NB_PERIPHERALS-1:0] ValidRule = {{NrRegion * NB_PERIPHERALS}{1'b1}}; localparam ariane_pkg::ariane_cfg_t ArianeSocCfg = '{ + RASDepth: 2, + BTBEntries: 32, + BHTEntries: 128, // idempotent region NrNonIdempotentRules: 0, NonIdempotentAddrBase: {64'b0}, -- GitLab