Merge remote-tracking branch 'origin/dev' into ext-spad

ucb-bar · May 16, 2024 · 8a9c423 · 8a9c423
2 parents 5bcbead + f72dc8c
commit 8a9c423
Show file tree

Hide file tree

Showing 13 changed files with 78 additions and 46 deletions.
diff --git a/CHIPYARD.hash b/CHIPYARD.hash
@@ -1 +1 @@
-ef3409f87ff2988fa862ea48c995d2c27c93c7a2
+b4aae0ddfdc5aaced32e0df90b633eab5b8327ca
diff --git a/software/libgemmini b/software/libgemmini
diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala
@@ -401,7 +401,7 @@ object AccumulatorScale {
     val neg_q_iexp = neg(q)
     val z_iexp = (neg_q_iexp * qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive
     val z_iexp_saturated = Wire(z_iexp.cloneType)
-    z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S, z_iexp.asUInt.asSInt)
+    z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S.asTypeOf(z_iexp), z_iexp)
     val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q)
     val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q)
     // we dont want a rounding shift

diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala
@@ -109,8 +109,30 @@ object GemminiFPConfigs {
                                                mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
                                               )
 
-}
 
+  val chipFP32Config = FP32DefaultConfig.copy(sp_capacity=CapacityInKilobytes(32), acc_capacity=CapacityInKilobytes(8), dataflow=Dataflow.WS, // Built from FP32DefaultConfig via copy(); only the fields named in this call are overridden.
+    acc_scale_args = Some(ScaleArguments((t: Float, u: Float) => {t}, 1, Float(8, 24), -1, identity = "1.0", // Scale function returns t unchanged and ignores u. NOTE(review): the meaning of the positional args (1, Float(8, 24), -1) is defined by ScaleArguments, which is not visible here.
+      c_str = "((x))" // Expression string with no scale factor in it; presumably the C-side counterpart of the function above - confirm against ScaleArguments.
+    )),
+    mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 3, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), // Scale function multiplies t by u; c_str carries the matching "((x) * (scale))" expression.
+    mvin_scale_acc_args=None, // No ScaleArguments are supplied for this field.
+    acc_singleported=false,
+    acc_sub_banks = 1,
+    acc_banks = 2,
+    mesh_output_delay = 2,
+    tile_latency = 1,
+    acc_latency = 3,
+    ex_read_from_acc=false,
+    ex_write_to_spad=false,
+    has_training_convs = false,
+    hardcode_d_to_garbage_addr = true,
+    acc_read_full_width = false,
+    max_in_flight_mem_reqs = 16,
+    headerFileName = "gemmini_params_fp32.h", // Presumably the name of the generated C parameters header - confirm where headerFileName is consumed.
+    num_counter = 0,
+    clock_gate = true 
+  )
+}
 
 //===========FP32 Default Config=========
 class GemminiFP32DefaultConfig extends Config((site, here, up) => {
@@ -123,6 +145,18 @@ class GemminiFP32DefaultConfig extends Config((site, here, up) => {
   )
 })
 
+class ChipFP32GemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( // Config fragment that appends a Gemmini builder to the BuildRoCC entry.
+  gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiFPConfigs.chipFP32Config // Array configuration passed to Gemmini; defaults to GemminiFPConfigs.chipFP32Config.
+) extends Config((site, here, up) => {
+  case BuildRoCC => up(BuildRoCC) ++ Seq( // Keeps whatever up(BuildRoCC) already holds and appends one more builder function.
+    (p: Parameters) => {
+      implicit val q = p // Presumably supplies the implicit Parameters needed to construct Gemmini - confirm against Gemmini's constructor.
+      val gemmini = LazyModule(new Gemmini(gemminiConfig))
+      gemmini // The builder returns the LazyModule-wrapped Gemmini instance.
+    }
+  )
+})
+
 
 //===========FP16 Default Config=========
 class GemminiFP16DefaultConfig extends Config((site, here, up) => {

diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala
@@ -552,7 +552,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]
 
   // Debugging signals
   val pipeline_stall_counter = RegInit(0.U(32.W))
-  when (io.cmd.fire()) {
+  when (io.cmd.fire) {
     pipeline_stall_counter := 0.U
   }.elsewhen(io.busy) {
     pipeline_stall_counter := pipeline_stall_counter + 1.U

diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala
@@ -3,7 +3,6 @@ package gemmini
 
 import chisel3._
 import chisel3.util._
-import chisel3.experimental.DataMirror
 
 import org.chipsalliance.cde.config.Parameters
 import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp}
@@ -211,7 +210,7 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
     )._2
 
     class TLBundleAWithInfo extends Bundle {
-      val tl_a = DataMirror.internal.chiselTypeClone[TLBundleA](tl.a.bits)
+      val tl_a = tl.a.bits.cloneType
       val vaddr = Output(UInt(vaddrBits.W))
       val status = Output(new MStatus)
     }
@@ -503,7 +502,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
     )._2
 
     class TLBundleAWithInfo extends Bundle {
-      val tl_a = DataMirror.internal.chiselTypeClone[TLBundleA](tl.a.bits)
+      val tl_a = tl.a.bits.cloneType
       val vaddr = Output(UInt(vaddrBits.W))
       val status = Output(new MStatus)
       val passthrough = Output(Bool())

diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala
@@ -974,7 +974,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
   io.loop_id := req.loop_id
 
   command_p.io.in.valid := state =/= idle && !skip && io.ex_completed
-  command_p.io.in.bits.cmd := MuxLookup(state.asUInt, mvout_cmd, Seq(
+  command_p.io.in.bits.cmd := MuxLookup(state.asUInt, mvout_cmd)(Seq(
     pre_pool_config.asUInt -> pre_pool_config_cmd,
     pool.asUInt -> pool_cmd,
     post_pool_config.asUInt -> post_pool_config_cmd)

diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala
@@ -643,7 +643,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
 
   when (req.dram_addr === 0.U) {
     state := idle
-  }.elsewhen (io.cmd.fire() && state === st) {
+  }.elsewhen (io.cmd.fire && state === st) {
     // The order here is k, j, i
     val next_i = floorAdd(i, 1.U, req.max_i)
     val next_j = floorAdd(j, max_blocks, req.max_j, next_i === 0.U)
@@ -654,9 +654,9 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
     when (next_i === 0.U && next_j === 0.U) {
       state := idle
     }
-  }.elsewhen (io.cmd.fire() && state === ln_config) {
+  }.elsewhen (io.cmd.fire && state === ln_config) {
     state := ln_st
-  }.elsewhen (io.cmd.fire() && state === ln_st) {
+  }.elsewhen (io.cmd.fire && state === ln_st) {
     val next_j = floorAdd(j, max_blocks, req.max_j)
     val next_stat_id = floorAdd(ln_stat_id, 1.U, ln_stat_ids, next_j === 0.U)
     val next_cmd = floorAdd(ln_cmd, 1.U, ln_norm_cmds.size.U, next_j === 0.U && next_stat_id === 0.U)

diff --git a/src/main/scala/gemmini/NormCmd.scala b/src/main/scala/gemmini/NormCmd.scala
@@ -3,7 +3,6 @@ package gemmini
 
 import chisel3._
 import chisel3.util._
-import chisel3.experimental.ChiselEnum
 
 object NormCmd extends ChiselEnum {
   val RESET, SUM, MEAN, VARIANCE, INV_STDDEV, MAX, SUM_EXP, INV_SUM_EXP = Value

diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala
@@ -2,7 +2,6 @@
 package gemmini
 
 import chisel3._
-import chisel3.experimental.ChiselEnum
 import chisel3.util._
 import gemmini.AccumulatorScale.iexp
 import hardfloat.{DivSqrtRecFN_small, INToRecFN, MulRecFN, consts, fNFromRecFN, recFNFromFN}
@@ -348,7 +347,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
     lanes.io.ins.bits.stats_id := in_lanes_stats_id
     lanes.io.ins.bits.iexp_const := iexp_const
 
-    when (lanes.io.ins.fire()) {
+    when (lanes.io.ins.fire) {
       stat.elems_left := stat.elems_left - len
     }
   }
@@ -359,7 +358,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
 
     val stat = stats(out_lanes_stats_id)
 
-    when (lanes.io.out.fire()) {
+    when (lanes.io.out.fire) {
       stat.sum := stat.sum + lanes.io.out.bits.result
     }
   }
@@ -379,7 +378,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
     max_lanes.io.ins.bits.len := len
     max_lanes.io.ins.bits.stats_id := max_in_lanes_stats_id
 
-    when (max_lanes.io.ins.fire()) {
+    when (max_lanes.io.ins.fire) {
       stat.elems_left := stat.elems_left - len
     }
   }
@@ -390,7 +389,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
 
     val stat = stats(max_out_lanes_stats_id)
 
-    when (max_lanes.io.out.fire()) {
+    when (max_lanes.io.out.fire) {
       val new_max = Mux(max_lanes.io.out.bits.result > stat.running_max, max_lanes.io.out.bits.result, stat.running_max)
       stat.running_max := new_max
       stat.max := new_max
@@ -645,13 +644,13 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
       next_state := idle
       done := DontCare
     }.elsewhen(state === output) {
-      next_state := Mux(io.out.fire() && out_stats_id === id.U, idle, state)
-      done := io.out.fire() && out_stats_id === id.U
+      next_state := Mux(io.out.fire && out_stats_id === id.U, idle, state)
+      done := io.out.fire && out_stats_id === id.U
     }.elsewhen(state === get_max) {
       val is_last_lane_input = stat.vec_groups_left === 0.U ||
         (stat.vec_groups_left === 1.U &&
           max_lanes.io.ins.bits.stats_id === id.U &&
-          max_lanes.io.ins.fire())
+          max_lanes.io.ins.fire)
 
       next_state := Mux(
         is_last_lane_input,
@@ -667,7 +666,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
       val is_last_lane_input = stat.vec_groups_left === 0.U ||
         (stat.vec_groups_left === 1.U &&
           lanes.io.ins.bits.stats_id === id.U &&
-          lanes.io.ins.fire())
+          lanes.io.ins.fire)
 
       next_state := Mux(
         is_last_lane_input,
@@ -688,51 +687,51 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
 
       done := is_last_lane_input && cmd =/= NormCmd.MEAN && cmd =/= NormCmd.INV_STDDEV && cmd =/= NormCmd.INV_SUM_EXP
     }.elsewhen(state === get_mean || state === get_variance) {
-      next_state := Mux(divider_in.fire() && sum_to_divide_id === id.U, state.next, state)
+      next_state := Mux(divider_in.fire && sum_to_divide_id === id.U, state.next, state)
       done := false.B
     }.elsewhen(state === waiting_for_mean) {
-      next_state := Mux(divider_out.fire(), idle, state)
-      done := divider_out.fire()
+      next_state := Mux(divider_out.fire, idle, state)
+      done := divider_out.fire
     }.elsewhen(state === waiting_for_variance) {
-      next_state := Mux(divider_out.fire(), get_stddev, state)
+      next_state := Mux(divider_out.fire, get_stddev, state)
       done := false.B
     }.elsewhen(state === get_stddev) {
-      next_state := Mux(sqrt_in.fire() && variance_to_sqrt_id === id.U, state.next, state)
+      next_state := Mux(sqrt_in.fire && variance_to_sqrt_id === id.U, state.next, state)
       done := false.B
     }.elsewhen(state === waiting_for_stddev) {
-      next_state := Mux(sqrt_out.fire(), state.next, state)
+      next_state := Mux(sqrt_out.fire, state.next, state)
       done := false.B
     }.elsewhen(state === get_inv_stddev) {
-      next_state := Mux(reciprocal_in.fire() && stddev_to_inv_id === id.U, state.next, state)
+      next_state := Mux(reciprocal_in.fire && stddev_to_inv_id === id.U, state.next, state)
       done := false.B
     }.elsewhen(state === waiting_for_inv_stddev) {
-      next_state := Mux(reciprocal_out.fire(), state.next, state)
+      next_state := Mux(reciprocal_out.fire, state.next, state)
       done := false.B
     }.elsewhen(state === get_scaled_inv_stddev) {
-      next_state := Mux(inv_stddev_scale_mul_pipe.io.ins.fire() && inv_stddev_to_scale_id === id.U, state.next, state)
+      next_state := Mux(inv_stddev_scale_mul_pipe.io.ins.fire && inv_stddev_to_scale_id === id.U, state.next, state)
       done := false.B
     }.elsewhen(state === waiting_for_scaled_inv_stddev) {
-      next_state := Mux(inv_stddev_scale_mul_pipe.io.out.fire(), idle, state)
-      done := inv_stddev_scale_mul_pipe.io.out.fire()
+      next_state := Mux(inv_stddev_scale_mul_pipe.io.out.fire, idle, state)
+      done := inv_stddev_scale_mul_pipe.io.out.fire
     }.elsewhen(state === get_inv_sum_exp) {
-      next_state := Mux(exp_divider_in.fire() && sum_exp_to_inv_id === id.U, state.next, state)
+      next_state := Mux(exp_divider_in.fire && sum_exp_to_inv_id === id.U, state.next, state)
       done := false.B
     }.elsewhen(state === waiting_for_inv_sum_exp) {
-      next_state := Mux(exp_divider_out.fire(), state.next, state)
+      next_state := Mux(exp_divider_out.fire, state.next, state)
       done := false.B
     }.elsewhen(state === get_scaled_inv_sum_exp) {
-      next_state := Mux(inv_sum_exp_scale_mul_pipe.io.ins.fire() && inv_sum_exp_to_scale_id === id.U, state.next, state)
+      next_state := Mux(inv_sum_exp_scale_mul_pipe.io.ins.fire && inv_sum_exp_to_scale_id === id.U, state.next, state)
       done := false.B
     }.elsewhen(state === waiting_for_scaled_inv_sum_exp) {
-      next_state := Mux(inv_sum_exp_scale_mul_pipe.io.out.fire(), idle, state)
-      done := inv_sum_exp_scale_mul_pipe.io.out.fire()
+      next_state := Mux(inv_sum_exp_scale_mul_pipe.io.out.fire, idle, state)
+      done := inv_sum_exp_scale_mul_pipe.io.out.fire
     }.otherwise {
       assert(false.B, "invalid state in Normalizer")
       next_state := DontCare
       done := DontCare
     }
 
-    when (io.in.fire() && in_stats_id === id.U) {
+    when (io.in.fire && in_stats_id === id.U) {
       next_state := Mux(io.in.bits.cmd === NormCmd.RESET, output,
         Mux(io.in.bits.cmd === NormCmd.MAX, get_max, get_sum))
     }
@@ -747,7 +746,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
         (state === get_mean && next_state =/= get_mean) ||
         (state === get_variance && next_state =/= get_variance)
 
-    val is_input = io.in.fire() && in_stats_id === id.U
+    val is_input = io.in.fire && in_stats_id === id.U
 
     when (is_input) {
       stat.req := io.in.bits

diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala
@@ -19,7 +19,7 @@ class ReservationStationIssue[T <: Data](cmd_t: T, id_width: Int) extends Bundle
   val cmd = Output(cmd_t.cloneType)
   val rob_id = Output(UInt(id_width.W))
 
-  def fire(dummy: Int=0) = valid && ready
+  def fire = valid && ready
 }
 
 // TODO we don't need to store the full command in here. We should be able to release the command directly into the relevant controller and only store the associated metadata in the ROB. This would reduce the size considerably
@@ -183,7 +183,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
   val new_entry_oh = new_allocs_oh_ld ++ new_allocs_oh_ex ++ new_allocs_oh_st
   new_entry_oh.foreach(_ := false.B)
 
-  val alloc_fire = io.alloc.fire()
+  val alloc_fire = io.alloc.fire
 
   io.alloc.ready := false.B
   when (io.alloc.valid) {
@@ -452,7 +452,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
     val from_conv_fsm = entries_type(issue_id).bits.cmd.from_conv_fsm
     val from_matmul_fsm = entries_type(issue_id).bits.cmd.from_matmul_fsm
 
-    when (io.fire()) {
+    when (io.fire) {
       entries_type.zipWithIndex.foreach { case (e, i) =>
         when (issue_sel(i)) {
           e.bits.issued := true.B
@@ -560,7 +560,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
 
   val cycles_since_issue = RegInit(0.U(16.W))
 
-  when (io.issue.ld.fire() || io.issue.st.fire() || io.issue.ex.fire() || !io.busy || io.completed.fire) {
+  when (io.issue.ld.fire || io.issue.st.fire || io.issue.ex.fire || !io.busy || io.completed.fire) {
     cycles_since_issue := 0.U
   }.elsewhen(io.busy) {
     cycles_since_issue := cycles_since_issue + 1.U

diff --git a/src/main/scala/gemmini/TransposePreloadUnroller.scala b/src/main/scala/gemmini/TransposePreloadUnroller.scala
@@ -2,7 +2,6 @@ package gemmini
 
 import chisel3._
 import chisel3.util._
-import chisel3.experimental.ChiselEnum
 import org.chipsalliance.cde.config.Parameters
 import Util._
 import midas.targetutils.PerfCounter

diff --git a/src/main/scala/gemmini/VectorScalarMultiplier.scala b/src/main/scala/gemmini/VectorScalarMultiplier.scala
@@ -198,6 +198,8 @@ object VectorScalarMultiplier {
   ) = {
     assert(!is_acc || is_mvin)
     val vsm = Module(new VectorScalarMultiplier(scale_args, cols, t, tag_t))
-    (vsm.io.req, vsm.io.resp)
+    val vsm_in_q = Module(new Queue(chiselTypeOf(vsm.io.req.bits), 2))
+    vsm.io.req <> vsm_in_q.io.deq
+    (vsm_in_q.io.enq, vsm.io.resp) 
   }
 }