Skip to content

Commit

Permalink
optimize by adding pipeline and multi-thread conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubuntu committed Jul 7, 2024
1 parent 0e81887 commit acd9a42
Show file tree
Hide file tree
Showing 11 changed files with 205 additions and 103 deletions.
32 changes: 15 additions & 17 deletions encoding/kzg/prover/gpu/ecntt.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,33 +12,31 @@ import (
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
)

func (c *GpuComputeDevice) ECNtt(batchPoints []bn254.G1Affine, isInverse bool) ([]bn254.G1Affine, error) {
totalNumSym := len(batchPoints)
func (c *GpuComputeDevice) ECNttToGnark(batchPoints core.HostOrDeviceSlice, isInverse bool, totalSize int) ([]bn254.G1Affine, error) {
output, err := c.ECNtt(batchPoints, isInverse, totalSize)
if err != nil {
return nil, err
}

// convert icicle projective to gnark affine
gpuFFTBatch := gpu_utils.HostSliceIcicleProjectiveToGnarkAffine(output, int(c.NumWorker))

// convert gnark affine to icicle projective on slice
pointsIcileProjective := gpu_utils.BatchConvertGnarkAffineToIcicleProjective(batchPoints)
pointsCopy := core.HostSliceFromElements[icicle_bn254.Projective](pointsIcileProjective)
return gpuFFTBatch, nil
}

output := make(core.HostSlice[icicle_bn254.Projective], int(totalNumSym))
func (c *GpuComputeDevice) ECNtt(batchPoints core.HostOrDeviceSlice, isInverse bool, totalSize int) (core.HostSlice[icicle_bn254.Projective], error) {
output := make(core.HostSlice[icicle_bn254.Projective], totalSize)

// compute
if isInverse {
err := ecntt.ECNtt(pointsCopy, core.KInverse, &c.NttCfg, output)
err := ecntt.ECNtt(batchPoints, core.KInverse, &c.NttCfg, output)
if err.CudaErrorCode != cr.CudaSuccess || err.IcicleErrorCode != core.IcicleSuccess {
return nil, fmt.Errorf("inverse ecntt failed")
}
} else {
err := ecntt.ECNtt(pointsCopy, core.KForward, &c.NttCfg, output)
err := ecntt.ECNtt(batchPoints, core.KForward, &c.NttCfg, output)
if err.CudaErrorCode != cr.CudaSuccess || err.IcicleErrorCode != core.IcicleSuccess {
return nil, fmt.Errorf("forward ecntt failed")
}
}

// convert icicle projective to gnark affine
gpuFFTBatch := make([]bn254.G1Affine, len(batchPoints))
for j := 0; j < totalNumSym; j++ {
gpuFFTBatch[j] = gpu_utils.IcicleProjectiveToGnarkAffine(output[j])
}

return gpuFFTBatch, nil
return output, nil
}
44 changes: 8 additions & 36 deletions encoding/kzg/prover/gpu/msm.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,58 +3,30 @@ package gpu
import (
"fmt"

"github.com/Layr-Labs/eigenda/encoding/utils/gpu_utils"
"github.com/consensys/gnark-crypto/ecc/bn254"
"github.com/consensys/gnark-crypto/ecc/bn254/fr"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
icicle_bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
icicle_bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
)

// MsmBatch function supports batch across blobs
func (c *GpuComputeDevice) MsmBatch(rowsFr [][]fr.Element, rowsG1 [][]bn254.G1Affine) ([]bn254.G1Affine, error) {
// MsmBatch function supports batch across blobs.
// totalSize is the number of output points, which equals numPoly * 2 * dimE, where dimE is the number of chunks
func (c *GpuComputeDevice) MsmBatch(rowsFrIcicleCopy core.HostOrDeviceSlice, rowsG1Icicle []icicle_bn254.Affine, totalSize int) (core.DeviceSlice, error) {
msmCfg := icicle_bn254_msm.GetDefaultMSMConfig()
rowsSfIcicle := make([]icicle_bn254.ScalarField, 0)
rowsAffineIcicle := make([]icicle_bn254.Affine, 0)
numBatchEle := len(rowsFr)

// Prepare scalar fields
for _, row := range rowsFr {
rowsSfIcicle = append(rowsSfIcicle, gpu_utils.ConvertFrToScalarFieldsBytes(row)...)
}
rowsFrIcicleCopy := core.HostSliceFromElements[icicle_bn254.ScalarField](rowsSfIcicle)

// Prepare icicle g1 affines
for _, row := range rowsG1 {
rowsAffineIcicle = append(rowsAffineIcicle, gpu_utils.BatchConvertGnarkAffineToIcicleAffine(row)...)
}
rowsG1IcicleCopy := core.HostSliceFromElements[icicle_bn254.Affine](rowsAffineIcicle)
rowsG1IcicleCopy := core.HostSliceFromElements[icicle_bn254.Affine](rowsG1Icicle)

var p icicle_bn254.Projective
var out core.DeviceSlice

// prepare output
_, err := out.Malloc(numBatchEle*p.Size(), p.Size())
_, err := out.Malloc(totalSize*p.Size(), p.Size())
if err != cr.CudaSuccess {
return nil, fmt.Errorf("allocating bytes on device for projective results failed")
return out, fmt.Errorf("%v", "Allocating bytes on device for Projective results failed")
}

err = icicle_bn254_msm.Msm(rowsFrIcicleCopy, rowsG1IcicleCopy, &msmCfg, out)
if err != cr.CudaSuccess {
return nil, fmt.Errorf("msm failed")
return out, fmt.Errorf("%v", "Msm failed")
}

// move output out of device
outHost := make(core.HostSlice[icicle_bn254.Projective], numBatchEle)
outHost.CopyFromDevice(&out)
out.Free()

// convert data back to gnark format
gnarkOuts := make([]bn254.G1Affine, numBatchEle)
for i := 0; i < numBatchEle; i++ {
gnarkOuts[i] = gpu_utils.IcicleProjectiveToGnarkAffine(outHost[i])
}

return gnarkOuts, nil
return out, nil
}
70 changes: 43 additions & 27 deletions encoding/kzg/prover/gpu/multiframe_proof.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ type WorkerResult struct {

type GpuComputeDevice struct {
*kzg.KzgConfig
Fs *fft.FFTSettings
FFTPointsT [][]bn254.G1Affine // transpose of FFTPoints
SFs *fft.FFTSettings
Srs *kzg.SRS
G2Trailing []bn254.G2Affine
NttCfg core.NTTConfig[[bn254_icicle.SCALAR_LIMBS]uint32]
GpuLock *sync.Mutex // lock whenever gpu is needed,
Fs *fft.FFTSettings
FlatFFTPointsT []bn254_icicle.Affine
SFs *fft.FFTSettings
Srs *kzg.SRS
G2Trailing []bn254.G2Affine
NttCfg core.NTTConfig[[bn254_icicle.SCALAR_LIMBS]uint32]
GpuLock *sync.Mutex // lock whenever gpu is needed,
}

// benchmarks show that a cpu commit on a 2MB blob only takes 24.165562ms. For now, use cpu
Expand Down Expand Up @@ -76,6 +76,7 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks
dimE := numChunks
l := chunkLen
numPoly := uint64(len(polyFr)) / dimE / chunkLen
fmt.Println("numPoly", numPoly)

begin := time.Now()

Expand Down Expand Up @@ -117,60 +118,74 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks
defer p.GpuLock.Unlock()

// Compute NTT on the coeff matrix
p.NttCfg.BatchSize = int32(l)
coeffStoreFFT, e := p.NTT(coeffStore)
p.NttCfg.BatchSize = int32(l * numPoly)
coeffStoreFft, e := p.NTT(coeffStore)
if e != nil {
return nil, e
}
nttDone := time.Now()

// transpose the FFT transformed matrix
coeffStoreFFTT := make([][]fr.Element, dimE*2*numPoly)
for i := range coeffStoreFFTT {
coeffStoreFFTT[i] = make([]fr.Element, l)
}

for k := uint64(0); k < numPoly; k++ {
step := int(k * dimE * 2)
for i := 0; i < int(l); i++ {
vec := coeffStoreFFT[i+int(k*l)]
for j := 0; j < int(dimE*2); j++ {
coeffStoreFFTT[j+step][i] = vec[j]
/*
fmt.Println("after fft")
vec := gpu_utils.ConvertScalarFieldsToFrBytes(coeffStoreFft)
for i := 0; i < int(l*numPoly); i++ {
length := int(dimE) * 2
for j := 0; j < length; j++ {
fmt.Printf("%v ", vec[i*length+j].String())
}
fmt.Println()
}
*/

// transpose the FFT transformed matrix
coeffStoreFftTranspose, err := Transpose(coeffStoreFft, int(l), int(numPoly), int(dimE)*2)
if err != nil {
// Return the Transpose error itself: `e` belongs to the earlier NTT call
// and is already known to be nil here, so returning it would hide the failure.
return nil, err
}
transposingDone := time.Now()

// compute msm on each rows of the transposed matrix
sumVec, err := p.MsmBatch(coeffStoreFFTT, p.FFTPointsT)
sumVec, err := p.MsmBatch(coeffStoreFftTranspose, p.FlatFFTPointsT, int(numPoly)*int(dimE)*2)
if err != nil {
return nil, err
}
msmDone := time.Now()

// compute the first ecntt, and set new batch size for ntt
p.NttCfg.BatchSize = int32(numPoly)
sumVecInv, err := p.ECNtt(sumVec, true)
sumVecInv, err := p.ECNtt(sumVec, true, int(dimE)*2*int(numPoly))
if err != nil {
return nil, err
}
firstECNttDone := time.Now()
sumVec.Free()

// remove half points per poly
batchInv := make([]bn254.G1Affine, len(sumVecInv)/2)
// extract proofs
prunedSumVecInv := core.HostSliceWithValue(bn254_icicle.Projective{}, len(sumVecInv)/2)
k := 0
for i := 0; i < int(numPoly); i++ {
for j := 0; j < int(dimE); j++ {
batchInv[k] = sumVecInv[i*int(dimE)*2+j]
prunedSumVecInv[k] = sumVecInv[i*int(dimE)*2+j]
k += 1
}
}

// compute the second ecntt on the reduced size array
flatProofsBatch, err := p.ECNtt(batchInv, false)
flatProofsBatch, err := p.ECNttToGnark(prunedSumVecInv, false, int(numPoly)*int(dimE))
if err != nil {
return nil, fmt.Errorf("second ECNtt error: %w", err)
}

/*
// debug
for i := 0; i < int(numPoly); i++ {
for j := 0; j < int(dimE); j++ {
fmt.Printf("%v ", flatProofsBatch[i*int(dimE)+j].String())
}
fmt.Println()
}
*/

secondECNttDone := time.Now()

fmt.Printf("Multiproof Time Decomp \n\t\ttotal %-20v \n\t\tpreproc %-20v \n\t\tntt %-20v \n\t\ttranspose %-20v \n\t\tmsm %-v \n\t\tfft1 %-v \n\t\tfft2 %-v,\n",
Expand All @@ -183,6 +198,7 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks
secondECNttDone.Sub(firstECNttDone),
)

// only takes the first half
return flatProofsBatch, nil
}

Expand Down
13 changes: 3 additions & 10 deletions encoding/kzg/prover/gpu/ntt.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (
bn254_icicle_ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/ntt"
)

func (c *GpuComputeDevice) NTT(batchFr [][]fr.Element) ([][]fr.Element, error) {
func (c *GpuComputeDevice) NTT(batchFr [][]fr.Element) (core.HostSlice[bn254_icicle.ScalarField], error) {
if len(batchFr) == 0 {
return nil, fmt.Errorf("input to NTT contains no blob")
}
Expand All @@ -25,19 +25,12 @@ func (c *GpuComputeDevice) NTT(batchFr [][]fr.Element) ([][]fr.Element, error) {
for i := 0; i < len(batchFr); i++ {
flattenBatchFr = append(flattenBatchFr, batchFr[i]...)
}
flattenBatchSf := gpu_utils.ConvertFrToScalarFieldsBytes(flattenBatchFr)
flattenBatchSf := gpu_utils.ConvertFrToScalarFieldsBytesThread(flattenBatchFr, int(c.NumWorker))
scalarsCopy := core.HostSliceFromElements[bn254_icicle.ScalarField](flattenBatchSf)

// run ntt
output := make(core.HostSlice[bn254_icicle.ScalarField], totalSize)
bn254_icicle_ntt.Ntt(scalarsCopy, core.KForward, &c.NttCfg, output)
flattenBatchFrOutput := gpu_utils.ConvertScalarFieldsToFrBytes(output)

// convert ntt output from icicle to gnark
nttOutput := make([][]fr.Element, len(batchFr))
for i := 0; i < len(batchFr); i++ {
nttOutput[i] = flattenBatchFrOutput[i*numSymbol : (i+1)*numSymbol]
}

return nttOutput, nil
return output, nil
}
27 changes: 27 additions & 0 deletions encoding/kzg/prover/gpu/transpose.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package gpu

import (
"fmt"

"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
bn254_icicle "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/vecOps"
)

// Transpose transposes, independently, each of the numPoly consecutive
// sub-matrices stored back to back in coeffStoreFFT and returns the results in
// a newly allocated host slice that keeps the same per-polynomial offsets.
//
// Each sub-matrix holds l*numCol scalars; its dimensions are forwarded to
// vecOps.TransposeMatrix as (l, numCol). Elements of coeffStoreFFT beyond the
// first l*numPoly*numCol are ignored.
//
// An error is returned when the input holds fewer than l*numPoly*numCol
// elements, when the default device context cannot be obtained, or when any
// of the per-polynomial transpose calls reports a failure.
func Transpose(coeffStoreFFT core.HostSlice[bn254_icicle.ScalarField], l, numPoly, numCol int) (core.HostSlice[bn254_icicle.ScalarField], error) {
	polySize := l * numCol
	totalSize := polySize * numPoly

	// Reject short input up front; slicing below would otherwise panic.
	if len(coeffStoreFFT) < totalSize {
		return nil, fmt.Errorf("transpose input has %d elements, need %d", len(coeffStoreFFT), totalSize)
	}

	ctx, cudaErr := cr.GetDefaultDeviceContext()
	if cudaErr != cr.CudaSuccess {
		return nil, fmt.Errorf("getting default device context failed")
	}

	transposedNTTOutput := make(core.HostSlice[bn254_icicle.ScalarField], totalSize)

	for i := 0; i < numPoly; i++ {
		begin, end := i*polySize, (i+1)*polySize
		ret := vecOps.TransposeMatrix(coeffStoreFFT[begin:end], transposedNTTOutput[begin:end], l, numCol, ctx, false, false)
		// Surface device failures instead of returning partially written output.
		if ret.CudaErrorCode != cr.CudaSuccess || ret.IcicleErrorCode != core.IcicleSuccess {
			return nil, fmt.Errorf("transposing matrix %d failed", i)
		}
	}

	return transposedNTTOutput, nil
}
12 changes: 12 additions & 0 deletions encoding/kzg/prover/gpu/transpose_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package gpu_test

import (
"testing"
)

// TestTranspose is a placeholder for a test of gpu.Transpose. It skips
// explicitly so the missing coverage is visible in test output instead of
// being reported as a pass.
func TestTranspose(t *testing.T) {
	t.Skip("TODO: exercise gpu.Transpose; no test has been written yet")
}
9 changes: 8 additions & 1 deletion encoding/kzg/prover/parametrized_prover.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,14 @@ func (g *ParametrizedProver) Encode(inputFr []fr.Element) (*bn254.G1Affine, *bn2
paddedCoeffs := make([]fr.Element, g.NumEvaluations())
// polyCoeffs has less points than paddedCoeffs in general due to erasure redundancy
copy(paddedCoeffs, inputFr)
proofs, err := g.Computer.ComputeMultiFrameProof(paddedCoeffs, g.NumChunks, g.ChunkLength, g.NumWorker)

numBlob := 1
flatpaddedCoeffs := make([]fr.Element, 0, numBlob*len(paddedCoeffs))
for i := 0; i < numBlob; i++ {
flatpaddedCoeffs = append(flatpaddedCoeffs, paddedCoeffs...)
}

proofs, err := g.Computer.ComputeMultiFrameProof(flatpaddedCoeffs, g.NumChunks, g.ChunkLength, g.NumWorker)
proofChan <- ProofsResult{
Proofs: proofs,
Err: err,
Expand Down
17 changes: 9 additions & 8 deletions encoding/kzg/prover/prover.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,16 +262,17 @@ func (g *Prover) newProver(params encoding.EncodingParams) (*ParametrizedProver,
}
} else {
nttCfg := gpu_utils.SetupNTT()
flatFftPointsT := gpu_utils.SetupMsm(fftPointsT)
GpuLock := sync.Mutex{}
computer = &gpu.GpuComputeDevice{
Fs: fs,
FFTPointsT: fftPointsT,
SFs: sfs,
Srs: g.Srs,
G2Trailing: g.G2Trailing,
KzgConfig: g.KzgConfig,
NttCfg: nttCfg,
GpuLock: &GpuLock,
Fs: fs,
FlatFFTPointsT: flatFftPointsT,
SFs: sfs,
Srs: g.Srs,
G2Trailing: g.G2Trailing,
KzgConfig: g.KzgConfig,
NttCfg: nttCfg,
GpuLock: &GpuLock,
}

RsComputeDevice = &rs_gpu.GpuComputeDevice{
Expand Down
Loading

0 comments on commit acd9a42

Please sign in to comment.