Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into ext-spad
Browse files Browse the repository at this point in the history
  • Loading branch information
richardyrh committed May 16, 2024
2 parents 5bcbead + f72dc8c commit 8a9c423
Show file tree
Hide file tree
Showing 13 changed files with 78 additions and 46 deletions.
2 changes: 1 addition & 1 deletion CHIPYARD.hash
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ef3409f87ff2988fa862ea48c995d2c27c93c7a2
b4aae0ddfdc5aaced32e0df90b633eab5b8327ca
2 changes: 1 addition & 1 deletion software/libgemmini
Submodule libgemmini updated 1 files
+1 −1 Makefile
2 changes: 1 addition & 1 deletion src/main/scala/gemmini/AccumulatorScale.scala
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ object AccumulatorScale {
val neg_q_iexp = neg(q)
val z_iexp = (neg_q_iexp * qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive
val z_iexp_saturated = Wire(z_iexp.cloneType)
z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S, z_iexp.asUInt.asSInt)
z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S.asTypeOf(z_iexp), z_iexp)
val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q)
val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q)
// we don't want a rounding shift
Expand Down
36 changes: 35 additions & 1 deletion src/main/scala/gemmini/ConfigsFP.scala
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,30 @@ object GemminiFPConfigs {
mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")),
)

}

// Chip-oriented FP32 configuration: FP32DefaultConfig with the overrides listed below.
val chipFP32Config = FP32DefaultConfig.copy(
  // Memory sizing and dataflow selection.
  sp_capacity = CapacityInKilobytes(32),
  acc_capacity = CapacityInKilobytes(8),
  dataflow = Dataflow.WS,

  // Accumulator scaling: the function returns its first operand unchanged and ignores
  // the second; the matching C expression "((x))" likewise leaves x as is.
  acc_scale_args = Some(
    ScaleArguments(
      (acc: Float, _: Float) => acc,
      1,
      Float(8, 24),
      -1,
      identity = "1.0",
      c_str = "((x))"
    )
  ),

  // Scaling applied on mvin: a plain product of the two operands.
  mvin_scale_args = Some(
    ScaleArguments(
      (x: Float, scale: Float) => x * scale,
      3,
      Float(8, 24),
      -1,
      identity = "1.0",
      c_str = "((x) * (scale))"
    )
  ),
  // No separate scale arguments for the accumulator mvin path.
  mvin_scale_acc_args = None,

  // Accumulator organisation.
  acc_singleported = false,
  acc_sub_banks = 1,
  acc_banks = 2,

  // Latency settings.
  mesh_output_delay = 2,
  tile_latency = 1,
  acc_latency = 3,

  // Feature switches.
  ex_read_from_acc = false,
  ex_write_to_spad = false,
  has_training_convs = false,
  hardcode_d_to_garbage_addr = true,
  acc_read_full_width = false,

  max_in_flight_mem_reqs = 16,
  headerFileName = "gemmini_params_fp32.h",
  num_counter = 0,
  clock_gate = true
)
}

//===========FP32 Default Config=========
class GemminiFP32DefaultConfig extends Config((site, here, up) => {
Expand All @@ -123,6 +145,18 @@ class GemminiFP32DefaultConfig extends Config((site, here, up) => {
)
})

/** Config fragment that adds one Gemmini generator to the `BuildRoCC` list.
  *
  * @param gemminiConfig array configuration passed to the `Gemmini` constructor;
  *                      defaults to `GemminiFPConfigs.chipFP32Config`
  */
class ChipFP32GemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data](
gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiFPConfigs.chipFP32Config
) extends Config((site, here, up) => {
// Keep whatever `up(BuildRoCC)` already returns and append one more generator function.
case BuildRoCC => up(BuildRoCC) ++ Seq(
(p: Parameters) => {
// Re-bind `p` as an implicit so the constructors below can pick it up implicitly.
implicit val q = p
// NOTE(review): the val name may feed LazyModule's instance naming; confirm before renaming or inlining.
val gemmini = LazyModule(new Gemmini(gemminiConfig))
gemmini
}
)
})


//===========FP16 Default Config=========
class GemminiFP16DefaultConfig extends Config((site, here, up) => {
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/gemmini/Controller.scala
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data]

// Debugging signals
val pipeline_stall_counter = RegInit(0.U(32.W))
when (io.cmd.fire()) {
when (io.cmd.fire) {
pipeline_stall_counter := 0.U
}.elsewhen(io.busy) {
pipeline_stall_counter := pipeline_stall_counter + 1.U
Expand Down
5 changes: 2 additions & 3 deletions src/main/scala/gemmini/DMA.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package gemmini

import chisel3._
import chisel3.util._
import chisel3.experimental.DataMirror

import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp}
Expand Down Expand Up @@ -211,7 +210,7 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf
)._2

class TLBundleAWithInfo extends Bundle {
val tl_a = DataMirror.internal.chiselTypeClone[TLBundleA](tl.a.bits)
val tl_a = tl.a.bits.cloneType
val vaddr = Output(UInt(vaddrBits.W))
val status = Output(new MStatus)
}
Expand Down Expand Up @@ -503,7 +502,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes:
)._2

class TLBundleAWithInfo extends Bundle {
val tl_a = DataMirror.internal.chiselTypeClone[TLBundleA](tl.a.bits)
val tl_a = tl.a.bits.cloneType
val vaddr = Output(UInt(vaddrBits.W))
val status = Output(new MStatus)
val passthrough = Output(Bool())
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/gemmini/LoopConv.scala
Original file line number Diff line number Diff line change
Expand Up @@ -974,7 +974,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth:
io.loop_id := req.loop_id

command_p.io.in.valid := state =/= idle && !skip && io.ex_completed
command_p.io.in.bits.cmd := MuxLookup(state.asUInt, mvout_cmd, Seq(
command_p.io.in.bits.cmd := MuxLookup(state.asUInt, mvout_cmd)(Seq(
pre_pool_config.asUInt -> pre_pool_config_cmd,
pool.asUInt -> pool_cmd,
post_pool_config.asUInt -> post_pool_config_cmd)
Expand Down
6 changes: 3 additions & 3 deletions src/main/scala/gemmini/LoopMatmul.scala
Original file line number Diff line number Diff line change
Expand Up @@ -643,7 +643,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In

when (req.dram_addr === 0.U) {
state := idle
}.elsewhen (io.cmd.fire() && state === st) {
}.elsewhen (io.cmd.fire && state === st) {
// The order here is k, j, i
val next_i = floorAdd(i, 1.U, req.max_i)
val next_j = floorAdd(j, max_blocks, req.max_j, next_i === 0.U)
Expand All @@ -654,9 +654,9 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In
when (next_i === 0.U && next_j === 0.U) {
state := idle
}
}.elsewhen (io.cmd.fire() && state === ln_config) {
}.elsewhen (io.cmd.fire && state === ln_config) {
state := ln_st
}.elsewhen (io.cmd.fire() && state === ln_st) {
}.elsewhen (io.cmd.fire && state === ln_st) {
val next_j = floorAdd(j, max_blocks, req.max_j)
val next_stat_id = floorAdd(ln_stat_id, 1.U, ln_stat_ids, next_j === 0.U)
val next_cmd = floorAdd(ln_cmd, 1.U, ln_norm_cmds.size.U, next_j === 0.U && next_stat_id === 0.U)
Expand Down
1 change: 0 additions & 1 deletion src/main/scala/gemmini/NormCmd.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package gemmini

import chisel3._
import chisel3.util._
import chisel3.experimental.ChiselEnum

object NormCmd extends ChiselEnum {
val RESET, SUM, MEAN, VARIANCE, INV_STDDEV, MAX, SUM_EXP, INV_SUM_EXP = Value
Expand Down
53 changes: 26 additions & 27 deletions src/main/scala/gemmini/Normalizer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
package gemmini

import chisel3._
import chisel3.experimental.ChiselEnum
import chisel3.util._
import gemmini.AccumulatorScale.iexp
import hardfloat.{DivSqrtRecFN_small, INToRecFN, MulRecFN, consts, fNFromRecFN, recFNFromFN}
Expand Down Expand Up @@ -348,7 +347,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
lanes.io.ins.bits.stats_id := in_lanes_stats_id
lanes.io.ins.bits.iexp_const := iexp_const

when (lanes.io.ins.fire()) {
when (lanes.io.ins.fire) {
stat.elems_left := stat.elems_left - len
}
}
Expand All @@ -359,7 +358,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_

val stat = stats(out_lanes_stats_id)

when (lanes.io.out.fire()) {
when (lanes.io.out.fire) {
stat.sum := stat.sum + lanes.io.out.bits.result
}
}
Expand All @@ -379,7 +378,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
max_lanes.io.ins.bits.len := len
max_lanes.io.ins.bits.stats_id := max_in_lanes_stats_id

when (max_lanes.io.ins.fire()) {
when (max_lanes.io.ins.fire) {
stat.elems_left := stat.elems_left - len
}
}
Expand All @@ -390,7 +389,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_

val stat = stats(max_out_lanes_stats_id)

when (max_lanes.io.out.fire()) {
when (max_lanes.io.out.fire) {
val new_max = Mux(max_lanes.io.out.bits.result > stat.running_max, max_lanes.io.out.bits.result, stat.running_max)
stat.running_max := new_max
stat.max := new_max
Expand Down Expand Up @@ -645,13 +644,13 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
next_state := idle
done := DontCare
}.elsewhen(state === output) {
next_state := Mux(io.out.fire() && out_stats_id === id.U, idle, state)
done := io.out.fire() && out_stats_id === id.U
next_state := Mux(io.out.fire && out_stats_id === id.U, idle, state)
done := io.out.fire && out_stats_id === id.U
}.elsewhen(state === get_max) {
val is_last_lane_input = stat.vec_groups_left === 0.U ||
(stat.vec_groups_left === 1.U &&
max_lanes.io.ins.bits.stats_id === id.U &&
max_lanes.io.ins.fire())
max_lanes.io.ins.fire)

next_state := Mux(
is_last_lane_input,
Expand All @@ -667,7 +666,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
val is_last_lane_input = stat.vec_groups_left === 0.U ||
(stat.vec_groups_left === 1.U &&
lanes.io.ins.bits.stats_id === id.U &&
lanes.io.ins.fire())
lanes.io.ins.fire)

next_state := Mux(
is_last_lane_input,
Expand All @@ -688,51 +687,51 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_

done := is_last_lane_input && cmd =/= NormCmd.MEAN && cmd =/= NormCmd.INV_STDDEV && cmd =/= NormCmd.INV_SUM_EXP
}.elsewhen(state === get_mean || state === get_variance) {
next_state := Mux(divider_in.fire() && sum_to_divide_id === id.U, state.next, state)
next_state := Mux(divider_in.fire && sum_to_divide_id === id.U, state.next, state)
done := false.B
}.elsewhen(state === waiting_for_mean) {
next_state := Mux(divider_out.fire(), idle, state)
done := divider_out.fire()
next_state := Mux(divider_out.fire, idle, state)
done := divider_out.fire
}.elsewhen(state === waiting_for_variance) {
next_state := Mux(divider_out.fire(), get_stddev, state)
next_state := Mux(divider_out.fire, get_stddev, state)
done := false.B
}.elsewhen(state === get_stddev) {
next_state := Mux(sqrt_in.fire() && variance_to_sqrt_id === id.U, state.next, state)
next_state := Mux(sqrt_in.fire && variance_to_sqrt_id === id.U, state.next, state)
done := false.B
}.elsewhen(state === waiting_for_stddev) {
next_state := Mux(sqrt_out.fire(), state.next, state)
next_state := Mux(sqrt_out.fire, state.next, state)
done := false.B
}.elsewhen(state === get_inv_stddev) {
next_state := Mux(reciprocal_in.fire() && stddev_to_inv_id === id.U, state.next, state)
next_state := Mux(reciprocal_in.fire && stddev_to_inv_id === id.U, state.next, state)
done := false.B
}.elsewhen(state === waiting_for_inv_stddev) {
next_state := Mux(reciprocal_out.fire(), state.next, state)
next_state := Mux(reciprocal_out.fire, state.next, state)
done := false.B
}.elsewhen(state === get_scaled_inv_stddev) {
next_state := Mux(inv_stddev_scale_mul_pipe.io.ins.fire() && inv_stddev_to_scale_id === id.U, state.next, state)
next_state := Mux(inv_stddev_scale_mul_pipe.io.ins.fire && inv_stddev_to_scale_id === id.U, state.next, state)
done := false.B
}.elsewhen(state === waiting_for_scaled_inv_stddev) {
next_state := Mux(inv_stddev_scale_mul_pipe.io.out.fire(), idle, state)
done := inv_stddev_scale_mul_pipe.io.out.fire()
next_state := Mux(inv_stddev_scale_mul_pipe.io.out.fire, idle, state)
done := inv_stddev_scale_mul_pipe.io.out.fire
}.elsewhen(state === get_inv_sum_exp) {
next_state := Mux(exp_divider_in.fire() && sum_exp_to_inv_id === id.U, state.next, state)
next_state := Mux(exp_divider_in.fire && sum_exp_to_inv_id === id.U, state.next, state)
done := false.B
}.elsewhen(state === waiting_for_inv_sum_exp) {
next_state := Mux(exp_divider_out.fire(), state.next, state)
next_state := Mux(exp_divider_out.fire, state.next, state)
done := false.B
}.elsewhen(state === get_scaled_inv_sum_exp) {
next_state := Mux(inv_sum_exp_scale_mul_pipe.io.ins.fire() && inv_sum_exp_to_scale_id === id.U, state.next, state)
next_state := Mux(inv_sum_exp_scale_mul_pipe.io.ins.fire && inv_sum_exp_to_scale_id === id.U, state.next, state)
done := false.B
}.elsewhen(state === waiting_for_scaled_inv_sum_exp) {
next_state := Mux(inv_sum_exp_scale_mul_pipe.io.out.fire(), idle, state)
done := inv_sum_exp_scale_mul_pipe.io.out.fire()
next_state := Mux(inv_sum_exp_scale_mul_pipe.io.out.fire, idle, state)
done := inv_sum_exp_scale_mul_pipe.io.out.fire
}.otherwise {
assert(false.B, "invalid state in Normalizer")
next_state := DontCare
done := DontCare
}

when (io.in.fire() && in_stats_id === id.U) {
when (io.in.fire && in_stats_id === id.U) {
next_state := Mux(io.in.bits.cmd === NormCmd.RESET, output,
Mux(io.in.bits.cmd === NormCmd.MAX, get_max, get_sum))
}
Expand All @@ -747,7 +746,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_
(state === get_mean && next_state =/= get_mean) ||
(state === get_variance && next_state =/= get_variance)

val is_input = io.in.fire() && in_stats_id === id.U
val is_input = io.in.fire && in_stats_id === id.U

when (is_input) {
stat.req := io.in.bits
Expand Down
8 changes: 4 additions & 4 deletions src/main/scala/gemmini/ReservationStation.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class ReservationStationIssue[T <: Data](cmd_t: T, id_width: Int) extends Bundle
val cmd = Output(cmd_t.cloneType)
val rob_id = Output(UInt(id_width.W))

def fire(dummy: Int=0) = valid && ready
def fire = valid && ready
}

// TODO we don't need to store the full command in here. We should be able to release the command directly into the relevant controller and only store the associated metadata in the ROB. This would reduce the size considerably
Expand Down Expand Up @@ -183,7 +183,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
val new_entry_oh = new_allocs_oh_ld ++ new_allocs_oh_ex ++ new_allocs_oh_st
new_entry_oh.foreach(_ := false.B)

val alloc_fire = io.alloc.fire()
val alloc_fire = io.alloc.fire

io.alloc.ready := false.B
when (io.alloc.valid) {
Expand Down Expand Up @@ -452,7 +452,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G
val from_conv_fsm = entries_type(issue_id).bits.cmd.from_conv_fsm
val from_matmul_fsm = entries_type(issue_id).bits.cmd.from_matmul_fsm

when (io.fire()) {
when (io.fire) {
entries_type.zipWithIndex.foreach { case (e, i) =>
when (issue_sel(i)) {
e.bits.issued := true.B
Expand Down Expand Up @@ -560,7 +560,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G

val cycles_since_issue = RegInit(0.U(16.W))

when (io.issue.ld.fire() || io.issue.st.fire() || io.issue.ex.fire() || !io.busy || io.completed.fire) {
when (io.issue.ld.fire || io.issue.st.fire || io.issue.ex.fire || !io.busy || io.completed.fire) {
cycles_since_issue := 0.U
}.elsewhen(io.busy) {
cycles_since_issue := cycles_since_issue + 1.U
Expand Down
1 change: 0 additions & 1 deletion src/main/scala/gemmini/TransposePreloadUnroller.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package gemmini

import chisel3._
import chisel3.util._
import chisel3.experimental.ChiselEnum
import org.chipsalliance.cde.config.Parameters
import Util._
import midas.targetutils.PerfCounter
Expand Down
4 changes: 3 additions & 1 deletion src/main/scala/gemmini/VectorScalarMultiplier.scala
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,8 @@ object VectorScalarMultiplier {
) = {
assert(!is_acc || is_mvin)
val vsm = Module(new VectorScalarMultiplier(scale_args, cols, t, tag_t))
(vsm.io.req, vsm.io.resp)
val vsm_in_q = Module(new Queue(chiselTypeOf(vsm.io.req.bits), 2))
vsm.io.req <> vsm_in_q.io.deq
(vsm_in_q.io.enq, vsm.io.resp)
}
}

0 comments on commit 8a9c423

Please sign in to comment.