diff --git a/CHIPYARD.hash b/CHIPYARD.hash index 07c62705..eabe7a12 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -ef3409f87ff2988fa862ea48c995d2c27c93c7a2 +b4aae0ddfdc5aaced32e0df90b633eab5b8327ca diff --git a/software/libgemmini b/software/libgemmini index d873aa8b..2b0f1cf6 160000 --- a/software/libgemmini +++ b/software/libgemmini @@ -1 +1 @@ -Subproject commit d873aa8b8f39a01bca225044970745632816ce3d +Subproject commit 2b0f1cf61f9ffaa6fe3efdb58e56c31954b93d36 diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala index 5ca35b46..75b76369 100644 --- a/src/main/scala/gemmini/AccumulatorScale.scala +++ b/src/main/scala/gemmini/AccumulatorScale.scala @@ -401,7 +401,7 @@ object AccumulatorScale { val neg_q_iexp = neg(q) val z_iexp = (neg_q_iexp * qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive val z_iexp_saturated = Wire(z_iexp.cloneType) - z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S, z_iexp.asUInt.asSInt) + z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S.asTypeOf(z_iexp), z_iexp) val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q) val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q) // we dont want a rounding shift diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala index ceb9d2a0..8644f9db 100644 --- a/src/main/scala/gemmini/ConfigsFP.scala +++ b/src/main/scala/gemmini/ConfigsFP.scala @@ -109,8 +109,30 @@ object GemminiFPConfigs { mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) -} + val chipFP32Config = FP32DefaultConfig.copy(sp_capacity=CapacityInKilobytes(32), acc_capacity=CapacityInKilobytes(8), dataflow=Dataflow.WS, + acc_scale_args = Some(ScaleArguments((t: Float, u: Float) => {t}, 1, Float(8, 24), -1, identity = "1.0", + c_str = "((x))" + )), + mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 3, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), + mvin_scale_acc_args=None, + acc_singleported=false, + acc_sub_banks = 1, + acc_banks = 2, + mesh_output_delay = 2, + tile_latency = 1, + acc_latency = 3, + ex_read_from_acc=false, + ex_write_to_spad=false, + has_training_convs = false, + hardcode_d_to_garbage_addr = true, + acc_read_full_width = false, + max_in_flight_mem_reqs = 16, + headerFileName = "gemmini_params_fp32.h", + num_counter = 0, + clock_gate = true + ) +} //===========FP32 Default Config========= class GemminiFP32DefaultConfig extends Config((site, here, up) => { @@ -123,6 +145,18 @@ class GemminiFP32DefaultConfig extends Config((site, here, up) => { ) }) +class ChipFP32GemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( + gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiFPConfigs.chipFP32Config +) extends Config((site, here, up) => { + case BuildRoCC => up(BuildRoCC) ++ Seq( + (p: Parameters) => { + implicit val q = p + val gemmini = LazyModule(new Gemmini(gemminiConfig)) + gemmini + } + ) +}) + //===========FP16 Default Config========= class GemminiFP16DefaultConfig extends Config((site, here, up) => { diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index daae1230..54d88e6c 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -552,7 +552,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] // Debugging signals val pipeline_stall_counter = RegInit(0.U(32.W)) - when (io.cmd.fire()) { + when (io.cmd.fire) { pipeline_stall_counter := 0.U }.elsewhen(io.busy) { pipeline_stall_counter := pipeline_stall_counter + 1.U diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index 162b3226..5b911853 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -3,7 +3,6 @@ package gemmini import chisel3._ import chisel3.util._ -import chisel3.experimental.DataMirror import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp} @@ -211,7 +210,7 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf )._2 class TLBundleAWithInfo extends Bundle { - val tl_a = DataMirror.internal.chiselTypeClone[TLBundleA](tl.a.bits) + val tl_a = tl.a.bits.cloneType val vaddr = Output(UInt(vaddrBits.W)) val status = Output(new MStatus) } @@ -503,7 +502,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: )._2 class TLBundleAWithInfo extends Bundle { - val tl_a = DataMirror.internal.chiselTypeClone[TLBundleA](tl.a.bits) + val tl_a = tl.a.bits.cloneType val vaddr = Output(UInt(vaddrBits.W)) val status = Output(new MStatus) val passthrough = Output(Bool()) diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 07dfefdc..2db2b034 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -974,7 +974,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: io.loop_id := req.loop_id command_p.io.in.valid := state =/= idle && !skip && io.ex_completed - command_p.io.in.bits.cmd := MuxLookup(state.asUInt, mvout_cmd, Seq( + command_p.io.in.bits.cmd := MuxLookup(state.asUInt, mvout_cmd)(Seq( pre_pool_config.asUInt -> pre_pool_config_cmd, pool.asUInt -> pool_cmd, post_pool_config.asUInt -> post_pool_config_cmd) diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index 94228269..79ab9e8b 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -643,7 +643,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In when (req.dram_addr === 0.U) { state := idle - }.elsewhen (io.cmd.fire() && state === st) { + }.elsewhen (io.cmd.fire && state === st) { // The order here is k, j, i val next_i = floorAdd(i, 1.U, req.max_i) val next_j = floorAdd(j, max_blocks, req.max_j, next_i === 0.U) @@ -654,9 +654,9 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In when (next_i === 0.U && next_j === 0.U) { state := idle } - }.elsewhen (io.cmd.fire() && state === ln_config) { + }.elsewhen (io.cmd.fire && state === ln_config) { state := ln_st - }.elsewhen (io.cmd.fire() && state === ln_st) { + }.elsewhen (io.cmd.fire && state === ln_st) { val next_j = floorAdd(j, max_blocks, req.max_j) val next_stat_id = floorAdd(ln_stat_id, 1.U, ln_stat_ids, next_j === 0.U) val next_cmd = floorAdd(ln_cmd, 1.U, ln_norm_cmds.size.U, next_j === 0.U && next_stat_id === 0.U) diff --git a/src/main/scala/gemmini/NormCmd.scala b/src/main/scala/gemmini/NormCmd.scala index 515fabb0..52da8cde 100644 --- a/src/main/scala/gemmini/NormCmd.scala +++ b/src/main/scala/gemmini/NormCmd.scala @@ -3,7 +3,6 @@ package gemmini import chisel3._ import chisel3.util._ -import chisel3.experimental.ChiselEnum object NormCmd extends ChiselEnum { val RESET, SUM, MEAN, VARIANCE, INV_STDDEV, MAX, SUM_EXP, INV_SUM_EXP = Value diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala index de3f301b..c22e9af8 100644 --- a/src/main/scala/gemmini/Normalizer.scala +++ b/src/main/scala/gemmini/Normalizer.scala @@ -2,7 +2,6 @@ package gemmini import chisel3._ -import chisel3.experimental.ChiselEnum import chisel3.util._ import gemmini.AccumulatorScale.iexp import hardfloat.{DivSqrtRecFN_small, INToRecFN, MulRecFN, consts, fNFromRecFN, recFNFromFN} @@ -348,7 +347,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ lanes.io.ins.bits.stats_id := in_lanes_stats_id lanes.io.ins.bits.iexp_const := iexp_const - when (lanes.io.ins.fire()) { + when (lanes.io.ins.fire) { stat.elems_left := stat.elems_left - len } } @@ -359,7 +358,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val stat = stats(out_lanes_stats_id) - when (lanes.io.out.fire()) { + when (lanes.io.out.fire) { stat.sum := stat.sum + lanes.io.out.bits.result } } @@ -379,7 +378,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ max_lanes.io.ins.bits.len := len max_lanes.io.ins.bits.stats_id := max_in_lanes_stats_id - when (max_lanes.io.ins.fire()) { + when (max_lanes.io.ins.fire) { stat.elems_left := stat.elems_left - len } } @@ -390,7 +389,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val stat = stats(max_out_lanes_stats_id) - when (max_lanes.io.out.fire()) { + when (max_lanes.io.out.fire) { val new_max = Mux(max_lanes.io.out.bits.result > stat.running_max, max_lanes.io.out.bits.result, stat.running_max) stat.running_max := new_max stat.max := new_max @@ -645,13 +644,13 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ next_state := idle done := DontCare }.elsewhen(state === output) { - next_state := Mux(io.out.fire() && out_stats_id === id.U, idle, state) - done := io.out.fire() && out_stats_id === id.U + next_state := Mux(io.out.fire && out_stats_id === id.U, idle, state) + done := io.out.fire && out_stats_id === id.U }.elsewhen(state === get_max) { val is_last_lane_input = stat.vec_groups_left === 0.U || (stat.vec_groups_left === 1.U && max_lanes.io.ins.bits.stats_id === id.U && - max_lanes.io.ins.fire()) + max_lanes.io.ins.fire) next_state := Mux( is_last_lane_input, @@ -667,7 +666,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val is_last_lane_input = stat.vec_groups_left === 0.U || (stat.vec_groups_left === 1.U && lanes.io.ins.bits.stats_id === id.U && - lanes.io.ins.fire()) + lanes.io.ins.fire) next_state := Mux( is_last_lane_input, @@ -688,51 +687,51 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ done := is_last_lane_input && cmd =/= NormCmd.MEAN && cmd =/= NormCmd.INV_STDDEV && cmd =/= NormCmd.INV_SUM_EXP }.elsewhen(state === get_mean || state === get_variance) { - next_state := Mux(divider_in.fire() && sum_to_divide_id === id.U, state.next, state) + next_state := Mux(divider_in.fire && sum_to_divide_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_mean) { - next_state := Mux(divider_out.fire(), idle, state) - done := divider_out.fire() + next_state := Mux(divider_out.fire, idle, state) + done := divider_out.fire }.elsewhen(state === waiting_for_variance) { - next_state := Mux(divider_out.fire(), get_stddev, state) + next_state := Mux(divider_out.fire, get_stddev, state) done := false.B }.elsewhen(state === get_stddev) { - next_state := Mux(sqrt_in.fire() && variance_to_sqrt_id === id.U, state.next, state) + next_state := Mux(sqrt_in.fire && variance_to_sqrt_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_stddev) { - next_state := Mux(sqrt_out.fire(), state.next, state) + next_state := Mux(sqrt_out.fire, state.next, state) done := false.B }.elsewhen(state === get_inv_stddev) { - next_state := Mux(reciprocal_in.fire() && stddev_to_inv_id === id.U, state.next, state) + next_state := Mux(reciprocal_in.fire && stddev_to_inv_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_inv_stddev) { - next_state := Mux(reciprocal_out.fire(), state.next, state) + next_state := Mux(reciprocal_out.fire, state.next, state) done := false.B }.elsewhen(state === get_scaled_inv_stddev) { - next_state := Mux(inv_stddev_scale_mul_pipe.io.ins.fire() && inv_stddev_to_scale_id === id.U, state.next, state) + next_state := Mux(inv_stddev_scale_mul_pipe.io.ins.fire && inv_stddev_to_scale_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_scaled_inv_stddev) { - next_state := Mux(inv_stddev_scale_mul_pipe.io.out.fire(), idle, state) - done := inv_stddev_scale_mul_pipe.io.out.fire() + next_state := Mux(inv_stddev_scale_mul_pipe.io.out.fire, idle, state) + done := inv_stddev_scale_mul_pipe.io.out.fire }.elsewhen(state === get_inv_sum_exp) { - next_state := Mux(exp_divider_in.fire() && sum_exp_to_inv_id === id.U, state.next, state) + next_state := Mux(exp_divider_in.fire && sum_exp_to_inv_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_inv_sum_exp) { - next_state := Mux(exp_divider_out.fire(), state.next, state) + next_state := Mux(exp_divider_out.fire, state.next, state) done := false.B }.elsewhen(state === get_scaled_inv_sum_exp) { - next_state := Mux(inv_sum_exp_scale_mul_pipe.io.ins.fire() && inv_sum_exp_to_scale_id === id.U, state.next, state) + next_state := Mux(inv_sum_exp_scale_mul_pipe.io.ins.fire && inv_sum_exp_to_scale_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_scaled_inv_sum_exp) { - next_state := Mux(inv_sum_exp_scale_mul_pipe.io.out.fire(), idle, state) - done := inv_sum_exp_scale_mul_pipe.io.out.fire() + next_state := Mux(inv_sum_exp_scale_mul_pipe.io.out.fire, idle, state) + done := inv_sum_exp_scale_mul_pipe.io.out.fire }.otherwise { assert(false.B, "invalid state in Normalizer") next_state := DontCare done := DontCare } - when (io.in.fire() && in_stats_id === id.U) { + when (io.in.fire && in_stats_id === id.U) { next_state := Mux(io.in.bits.cmd === NormCmd.RESET, output, Mux(io.in.bits.cmd === NormCmd.MAX, get_max, get_sum)) } @@ -747,7 +746,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ (state === get_mean && next_state =/= get_mean) || (state === get_variance && next_state =/= get_variance) - val is_input = io.in.fire() && in_stats_id === id.U + val is_input = io.in.fire && in_stats_id === id.U when (is_input) { stat.req := io.in.bits diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala index f715b752..1ad02ee9 100644 --- a/src/main/scala/gemmini/ReservationStation.scala +++ b/src/main/scala/gemmini/ReservationStation.scala @@ -19,7 +19,7 @@ class ReservationStationIssue[T <: Data](cmd_t: T, id_width: Int) extends Bundle val cmd = Output(cmd_t.cloneType) val rob_id = Output(UInt(id_width.W)) - def fire(dummy: Int=0) = valid && ready + def fire = valid && ready } // TODO we don't need to store the full command in here. We should be able to release the command directly into the relevant controller and only store the associated metadata in the ROB. This would reduce the size considerably @@ -183,7 +183,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val new_entry_oh = new_allocs_oh_ld ++ new_allocs_oh_ex ++ new_allocs_oh_st new_entry_oh.foreach(_ := false.B) - val alloc_fire = io.alloc.fire() + val alloc_fire = io.alloc.fire io.alloc.ready := false.B when (io.alloc.valid) { @@ -452,7 +452,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val from_conv_fsm = entries_type(issue_id).bits.cmd.from_conv_fsm val from_matmul_fsm = entries_type(issue_id).bits.cmd.from_matmul_fsm - when (io.fire()) { + when (io.fire) { entries_type.zipWithIndex.foreach { case (e, i) => when (issue_sel(i)) { e.bits.issued := true.B @@ -560,7 +560,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val cycles_since_issue = RegInit(0.U(16.W)) - when (io.issue.ld.fire() || io.issue.st.fire() || io.issue.ex.fire() || !io.busy || io.completed.fire) { + when (io.issue.ld.fire || io.issue.st.fire || io.issue.ex.fire || !io.busy || io.completed.fire) { cycles_since_issue := 0.U }.elsewhen(io.busy) { cycles_since_issue := cycles_since_issue + 1.U diff --git a/src/main/scala/gemmini/TransposePreloadUnroller.scala b/src/main/scala/gemmini/TransposePreloadUnroller.scala index 68407344..878eaa1a 100644 --- a/src/main/scala/gemmini/TransposePreloadUnroller.scala +++ b/src/main/scala/gemmini/TransposePreloadUnroller.scala @@ -2,7 +2,6 @@ package gemmini import chisel3._ import chisel3.util._ -import chisel3.experimental.ChiselEnum import org.chipsalliance.cde.config.Parameters import Util._ import midas.targetutils.PerfCounter diff --git a/src/main/scala/gemmini/VectorScalarMultiplier.scala b/src/main/scala/gemmini/VectorScalarMultiplier.scala index 2311b381..153fd23c 100644 --- a/src/main/scala/gemmini/VectorScalarMultiplier.scala +++ b/src/main/scala/gemmini/VectorScalarMultiplier.scala @@ -198,6 +198,8 @@ object VectorScalarMultiplier { ) = { assert(!is_acc || is_mvin) val vsm = Module(new VectorScalarMultiplier(scale_args, cols, t, tag_t)) - (vsm.io.req, vsm.io.resp) + val vsm_in_q = Module(new Queue(chiselTypeOf(vsm.io.req.bits), 2)) + vsm.io.req <> vsm_in_q.io.deq + (vsm_in_q.io.enq, vsm.io.resp) } }