diff --git a/src/frontend/instr_fifo.sv b/src/frontend/instr_fifo.sv
new file mode 100644
index 0000000000000000000000000000000000000000..cd575c0312e3f63b0104021d3bd39cb5d179c624
--- /dev/null
+++ b/src/frontend/instr_fifo.sv
@@ -0,0 +1,166 @@
+// Copyright 2018 ETH Zurich and University of Bologna.
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License. You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// Author: Florian Zaruba <zarubaf@iis.ee.ethz.ch>
+
+module instr_fifo #(
+    parameter bit          FALL_THROUGH = 1'b0, // fifo is in fall-through mode
+    parameter int unsigned DATA_WIDTH   = 32,   // default data width if the fifo is of type logic
+    parameter int unsigned DEPTH        = 8,    // depth can be arbitrary from 0 to 2**32
+    parameter type dtype                = logic [DATA_WIDTH-1:0],
+    // DO NOT OVERWRITE THIS PARAMETER
+    parameter int unsigned ADDR_DEPTH   = (DEPTH > 1) ? $clog2(DEPTH) : 1
+)(
+    input  logic  clk_i,            // Clock
+    input  logic  rst_ni,           // Asynchronous reset active low
+    input  logic  flush_i,          // flush the queue
+    input  logic  testmode_i,       // test_mode to bypass clock gating
+    // status flags
+    output logic  full_o,           // queue is full
+    output logic  empty_o,          // queue is empty
+    output logic  [ADDR_DEPTH-1:0] usage_o,  // fill pointer
+    // as long as the queue is not full we can push new data
+    input  dtype  data_i,           // data to push into the queue
+    input  logic  push_i,           // data is valid and can be pushed to the queue
+    // as long as the queue is not empty we can pop new elements
+    output dtype  data_o,           // output data
+    input  logic  pop_i,            // pop head from queue
+
+    output dtype  [DEPTH-1:0] full_data_o,
+    output logic  [DEPTH-1:0] full_data_valid_o
+);
+    // local parameter
+    // FIFO depth - handle the case of pass-through, synthesizer will do constant propagation
+    localparam int unsigned FIFO_DEPTH = (DEPTH > 0) ? DEPTH : 1;
+    // clock gating control
+    logic gate_clock;
+    // pointer to the read and write section of the queue
+    logic [ADDR_DEPTH - 1:0] read_pointer_n, read_pointer_q, write_pointer_n, write_pointer_q;
+    // keep a counter to keep track of the current queue status
+    logic [ADDR_DEPTH:0] status_cnt_n, status_cnt_q; // this integer will be truncated by the synthesis tool
+    // actual memory
+    dtype [FIFO_DEPTH - 1:0] mem_n, mem_q;
+    logic [FIFO_DEPTH - 1:0] valid_n, valid_q;
+
+    assign usage_o = status_cnt_q[ADDR_DEPTH-1:0];
+
+    if (DEPTH == 0) begin
+        assign empty_o     = ~push_i;
+        assign full_o      = ~pop_i;
+    end else begin
+        assign full_o       = (status_cnt_q == FIFO_DEPTH[ADDR_DEPTH:0]);
+        assign empty_o      = (status_cnt_q == 0) & ~(FALL_THROUGH & push_i);
+    end
+    // status flags
+
+    // read and write queue logic
+    always_comb begin : read_write_comb
+        // default assignment
+        read_pointer_n  = read_pointer_q;
+        write_pointer_n = write_pointer_q;
+        status_cnt_n    = status_cnt_q;
+        data_o          = (DEPTH == 0) ? data_i : mem_q[read_pointer_q];
+        mem_n           = mem_q;
+        gate_clock      = 1'b1;
+        valid_n         = valid_q;
+
+        // push a new element to the queue
+        if (push_i && ~full_o) begin
+            // push the data onto the queue
+            mem_n[write_pointer_q] = data_i;
+            valid_n[write_pointer_q] = 1'b1;
+            // un-gate the clock, we want to write something
+            gate_clock             = 1'b0;
+            // increment the write counter
+            if (write_pointer_q == FIFO_DEPTH[ADDR_DEPTH-1:0] - 1)
+                write_pointer_n = '0;
+            else
+                write_pointer_n = write_pointer_q + 1;
+            // increment the overall counter
+            status_cnt_n    = status_cnt_q + 1;
+        end
+
+        if (pop_i && ~empty_o) begin
+            // read from the queue is a default assignment
+            // but increment the read pointer...
+            valid_n[read_pointer_q] = 1'b0;
+            if (read_pointer_n == FIFO_DEPTH[ADDR_DEPTH-1:0] - 1)
+                read_pointer_n = '0;
+            else
+                read_pointer_n = read_pointer_q + 1;
+            // ... and decrement the overall count
+            status_cnt_n   = status_cnt_q - 1;
+        end
+
+        // keep the count pointer stable if we push and pop at the same time
+        if (push_i && pop_i &&  ~full_o && ~empty_o)
+            status_cnt_n   = status_cnt_q;
+
+        // FIFO is in pass through mode -> do not change the pointers
+        if (FALL_THROUGH && (status_cnt_q == 0) && push_i) begin
+            data_o = data_i;
+            if (pop_i) begin
+                status_cnt_n = status_cnt_q;
+                read_pointer_n = read_pointer_q;
+                write_pointer_n = write_pointer_q;
+            end
+        end
+    end // block: read_write_comb
+
+    assign full_data_o = mem_q;
+    assign full_data_valid_o = valid_q;
+
+    // sequential process
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+        if(~rst_ni) begin
+            read_pointer_q  <= '0;
+            write_pointer_q <= '0;
+            status_cnt_q    <= '0;
+            valid_q <= '0;
+        end else begin
+            if (flush_i) begin
+                read_pointer_q  <= '0;
+                write_pointer_q <= '0;
+                status_cnt_q    <= '0;
+                valid_q <= '0;
+             end else begin
+                read_pointer_q  <= read_pointer_n;
+                write_pointer_q <= write_pointer_n;
+                status_cnt_q    <= status_cnt_n;
+                valid_q <= valid_n;
+            end
+        end
+    end
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+        if(~rst_ni) begin
+            mem_q   <= '0;
+        end else if (!gate_clock) begin
+            mem_q   <= mem_n;
+        end
+    end
+
+// pragma translate_off
+`ifndef VERILATOR
+    initial begin
+        assert (DEPTH > 0)             else $error("DEPTH must be greater than 0.");
+    end
+
+    full_write : assert property(
+        @(posedge clk_i) disable iff (~rst_ni) (full_o |-> ~push_i))
+        else $fatal (1, "Trying to push new data although the FIFO is full.");
+
+    empty_read : assert property(
+        @(posedge clk_i) disable iff (~rst_ni) (empty_o |-> ~pop_i))
+        else $fatal (1, "Trying to pop data although the FIFO is empty.");
+`endif
+// pragma translate_on
+
+endmodule // fifo_v3
diff --git a/src/frontend/instr_queue.sv b/src/frontend/instr_queue.sv
index 6e125aacccbd185b4248f2dc30fc5698ae163b07..6f8e59d076240ad92262854a220e40c4399542da 100644
--- a/src/frontend/instr_queue.sv
+++ b/src/frontend/instr_queue.sv
@@ -123,7 +123,9 @@ module instr_queue (
   logic [ariane_pkg::INSTR_PER_FETCH-1:0] input_is_mem;
   logic                                   output_is_mem;
 
-  logic [$clog2(ariane_pkg::FETCH_FIFO_DEPTH)*2-1:0] nr_mem_n, nr_mem_q;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0][ariane_pkg::FETCH_FIFO_DEPTH-1:0] fifo_instr_valid;
+  instr_data_t [ariane_pkg::INSTR_PER_FETCH-1:0][ariane_pkg::FETCH_FIFO_DEPTH-1:0] fifo_instr;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0][ariane_pkg::FETCH_FIFO_DEPTH-1:0] fifo_instr_is_mem;
 
   assign ready_o = ~(|instr_queue_full) & ~full_address;
 
@@ -231,6 +233,17 @@ module instr_queue (
                               (instr_i[i][6:0] == riscv::OpcodeAmo));
   end
 
+  for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
+    for (genvar j = 0; j < ariane_pkg::FETCH_FIFO_DEPTH; j++) begin
+      assign fifo_instr_is_mem[i][j] = fifo_instr_valid[i][j] &
+                                       ((fifo_instr[i][j].instr[6:0] == riscv::OpcodeLoad) |
+                                        (fifo_instr[i][j].instr[6:0] == riscv::OpcodeLoadFp) |
+                                        (fifo_instr[i][j].instr[6:0] == riscv::OpcodeStore) |
+                                        (fifo_instr[i][j].instr[6:0] == riscv::OpcodeStoreFp) |
+                                        (fifo_instr[i][j].instr[6:0] == riscv::OpcodeAmo));
+    end
+  end
+
   assign output_is_mem = fetch_entry_valid_o &
                          ((fetch_entry_o.instruction[6:0] == riscv::OpcodeLoad) |
                           (fetch_entry_o.instruction[6:0] == riscv::OpcodeLoadFp) |
@@ -238,21 +251,7 @@ module instr_queue (
                           (fetch_entry_o.instruction[6:0] == riscv::OpcodeStoreFp) |
                           (fetch_entry_o.instruction[6:0] == riscv::OpcodeAmo));
 
-  assign has_mem_access_o = (|nr_mem_n) | (|input_is_mem) | output_is_mem;
-
-  always_comb begin
-    nr_mem_n            = nr_mem_q;
-
-    for (int unsigned i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
-      if (push_instr_fifo[i]) begin
-        nr_mem_n = nr_mem_n + 1'b1;
-      end
-    end
-
-    if (fetch_entry_ready_i) begin
-      nr_mem_n = nr_mem_n - 1'b1;
-    end
-  end
+  assign has_mem_access_o = (|fifo_instr_is_mem) | (|input_is_mem) | output_is_mem;
 
   always_comb begin
     idx_ds_d = idx_ds_q;
@@ -318,7 +317,7 @@ module instr_queue (
   for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_instr_fifo
     // Make sure we don't save any instructions if we couldn't save the address
     assign push_instr_fifo[i] = push_instr[i] & ~address_overflow;
-    fifo_v3 #(
+    instr_fifo #(
       .DEPTH      ( ariane_pkg::FETCH_FIFO_DEPTH ),
       .dtype      ( instr_data_t                 )
     ) i_fifo_instr_data (
@@ -332,7 +331,9 @@ module instr_queue (
       .data_i     ( instr_data_in[i]     ),
       .push_i     ( push_instr_fifo[i]   ),
       .data_o     ( instr_data_out[i]    ),
-      .pop_i      ( pop_instr[i]         )
+      .pop_i      ( pop_instr[i]         ),
+      .full_data_o (fifo_instr[i]),
+      .full_data_valid_o (fifo_instr_valid[i])
     );
   end
   // or reduce and check whether we are retiring a taken branch (might be that the corresponding)
@@ -374,7 +375,6 @@ module instr_queue (
       idx_is_q        <= '0;
       pc_q            <= '0;
       reset_address_q <= 1'b1;
-      nr_mem_q        <= '0;
     end else begin
       pc_q            <= pc_d;
       reset_address_q <= reset_address_d;
@@ -384,11 +384,9 @@ module instr_queue (
           // binary encoded
           idx_is_q        <= '0;
           reset_address_q <= 1'b1;
-          nr_mem_q        <= '0;
       end else begin
         idx_ds_q   <= idx_ds_d;
         idx_is_q   <= idx_is_d;
-        nr_mem_q   <= nr_mem_n;
       end
     end
   end