From acd9a42563dbe91038895facaecb9a173d394e95 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 7 Jul 2024 20:21:55 +0000 Subject: [PATCH] optimize by adding pipeline and multi-thread conversion --- encoding/kzg/prover/gpu/ecntt.go | 32 +++++----- encoding/kzg/prover/gpu/msm.go | 44 +++---------- encoding/kzg/prover/gpu/multiframe_proof.go | 70 +++++++++++++-------- encoding/kzg/prover/gpu/ntt.go | 13 +--- encoding/kzg/prover/gpu/transpose.go | 27 ++++++++ encoding/kzg/prover/gpu/transpose_test.go | 12 ++++ encoding/kzg/prover/parametrized_prover.go | 9 ++- encoding/kzg/prover/prover.go | 17 ++--- encoding/test/main.go | 13 ++-- encoding/utils/gpu_utils/msm_setup.go | 15 +++++ encoding/utils/gpu_utils/utils.go | 56 +++++++++++++++++ 11 files changed, 205 insertions(+), 103 deletions(-) create mode 100644 encoding/kzg/prover/gpu/transpose.go create mode 100644 encoding/kzg/prover/gpu/transpose_test.go create mode 100644 encoding/utils/gpu_utils/msm_setup.go diff --git a/encoding/kzg/prover/gpu/ecntt.go b/encoding/kzg/prover/gpu/ecntt.go index 668ffb65c..97f165ee6 100644 --- a/encoding/kzg/prover/gpu/ecntt.go +++ b/encoding/kzg/prover/gpu/ecntt.go @@ -12,33 +12,31 @@ import ( "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core" ) -func (c *GpuComputeDevice) ECNtt(batchPoints []bn254.G1Affine, isInverse bool) ([]bn254.G1Affine, error) { - totalNumSym := len(batchPoints) +func (c *GpuComputeDevice) ECNttToGnark(batchPoints core.HostOrDeviceSlice, isInverse bool, totalSize int) ([]bn254.G1Affine, error) { + output, err := c.ECNtt(batchPoints, isInverse, totalSize) + if err != nil { + return nil, err + } + + // convert icicle projective to gnark affine + gpuFFTBatch := gpu_utils.HostSliceIcicleProjectiveToGnarkAffine(output, int(c.NumWorker)) - // convert gnark affine to icicle projective on slice - pointsIcileProjective := gpu_utils.BatchConvertGnarkAffineToIcicleProjective(batchPoints) - pointsCopy := core.HostSliceFromElements[icicle_bn254.Projective](pointsIcileProjective) + return gpuFFTBatch, nil +} - output := make(core.HostSlice[icicle_bn254.Projective], int(totalNumSym)) +func (c *GpuComputeDevice) ECNtt(batchPoints core.HostOrDeviceSlice, isInverse bool, totalSize int) (core.HostSlice[icicle_bn254.Projective], error) { + output := make(core.HostSlice[icicle_bn254.Projective], totalSize) - // compute if isInverse { - err := ecntt.ECNtt(pointsCopy, core.KInverse, &c.NttCfg, output) + err := ecntt.ECNtt(batchPoints, core.KInverse, &c.NttCfg, output) if err.CudaErrorCode != cr.CudaSuccess || err.IcicleErrorCode != core.IcicleSuccess { return nil, fmt.Errorf("inverse ecntt failed") } } else { - err := ecntt.ECNtt(pointsCopy, core.KForward, &c.NttCfg, output) + err := ecntt.ECNtt(batchPoints, core.KForward, &c.NttCfg, output) if err.CudaErrorCode != cr.CudaSuccess || err.IcicleErrorCode != core.IcicleSuccess { return nil, fmt.Errorf("forward ecntt failed") } } - - // convert icicle projective to gnark affine - gpuFFTBatch := make([]bn254.G1Affine, len(batchPoints)) - for j := 0; j < totalNumSym; j++ { - gpuFFTBatch[j] = gpu_utils.IcicleProjectiveToGnarkAffine(output[j]) - } - - return gpuFFTBatch, nil + return output, nil } diff --git a/encoding/kzg/prover/gpu/msm.go b/encoding/kzg/prover/gpu/msm.go index cbb8c7bf1..9d36c92ee 100644 --- a/encoding/kzg/prover/gpu/msm.go +++ b/encoding/kzg/prover/gpu/msm.go @@ -3,58 +3,30 @@ package gpu import ( "fmt" - "github.com/Layr-Labs/eigenda/encoding/utils/gpu_utils" - "github.com/consensys/gnark-crypto/ecc/bn254" - "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core" cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime" icicle_bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254" icicle_bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm" ) -// MsmBatch function supports batch across blobs -func (c *GpuComputeDevice) MsmBatch(rowsFr [][]fr.Element, rowsG1 [][]bn254.G1Affine) ([]bn254.G1Affine, error) { +// MsmBatch function supports batch across blobs. +// totalSize is the number of output points, which equals to numPoly * 2 * dimE , dimE is number of chunks +func (c *GpuComputeDevice) MsmBatch(rowsFrIcicleCopy core.HostOrDeviceSlice, rowsG1Icicle []icicle_bn254.Affine, totalSize int) (core.DeviceSlice, error) { msmCfg := icicle_bn254_msm.GetDefaultMSMConfig() - rowsSfIcicle := make([]icicle_bn254.ScalarField, 0) - rowsAffineIcicle := make([]icicle_bn254.Affine, 0) - numBatchEle := len(rowsFr) - // Prepare scalar fields - for _, row := range rowsFr { - rowsSfIcicle = append(rowsSfIcicle, gpu_utils.ConvertFrToScalarFieldsBytes(row)...) - } - rowsFrIcicleCopy := core.HostSliceFromElements[icicle_bn254.ScalarField](rowsSfIcicle) - - // Prepare icicle g1 affines - for _, row := range rowsG1 { - rowsAffineIcicle = append(rowsAffineIcicle, gpu_utils.BatchConvertGnarkAffineToIcicleAffine(row)...) - } - rowsG1IcicleCopy := core.HostSliceFromElements[icicle_bn254.Affine](rowsAffineIcicle) + rowsG1IcicleCopy := core.HostSliceFromElements[icicle_bn254.Affine](rowsG1Icicle) var p icicle_bn254.Projective var out core.DeviceSlice - // prepare output - _, err := out.Malloc(numBatchEle*p.Size(), p.Size()) + _, err := out.Malloc(totalSize*p.Size(), p.Size()) if err != cr.CudaSuccess { - return nil, fmt.Errorf("allocating bytes on device for projective results failed") + return out, fmt.Errorf("%v", "Allocating bytes on device for Projective results failed") } err = icicle_bn254_msm.Msm(rowsFrIcicleCopy, rowsG1IcicleCopy, &msmCfg, out) if err != cr.CudaSuccess { - return nil, fmt.Errorf("msm failed") + return out, fmt.Errorf("%v", "Msm failed") } - - // move output out of device - outHost := make(core.HostSlice[icicle_bn254.Projective], numBatchEle) - outHost.CopyFromDevice(&out) - out.Free() - - // convert data back to gnark format - gnarkOuts := make([]bn254.G1Affine, numBatchEle) - for i := 0; i < numBatchEle; i++ { - gnarkOuts[i] = gpu_utils.IcicleProjectiveToGnarkAffine(outHost[i]) - } - - return gnarkOuts, nil + return out, nil } diff --git a/encoding/kzg/prover/gpu/multiframe_proof.go b/encoding/kzg/prover/gpu/multiframe_proof.go index 9069faaaf..a1e865332 100644 --- a/encoding/kzg/prover/gpu/multiframe_proof.go +++ b/encoding/kzg/prover/gpu/multiframe_proof.go @@ -22,13 +22,13 @@ type WorkerResult struct { type GpuComputeDevice struct { *kzg.KzgConfig - Fs *fft.FFTSettings - FFTPointsT [][]bn254.G1Affine // transpose of FFTPoints - SFs *fft.FFTSettings - Srs *kzg.SRS - G2Trailing []bn254.G2Affine - NttCfg core.NTTConfig[[bn254_icicle.SCALAR_LIMBS]uint32] - GpuLock *sync.Mutex // lock whenever gpu is needed, + Fs *fft.FFTSettings + FlatFFTPointsT []bn254_icicle.Affine + SFs *fft.FFTSettings + Srs *kzg.SRS + G2Trailing []bn254.G2Affine + NttCfg core.NTTConfig[[bn254_icicle.SCALAR_LIMBS]uint32] + GpuLock *sync.Mutex // lock whenever gpu is needed, } // benchmarks shows cpu commit on 2MB blob only takes 24.165562ms. For now, use cpu @@ -76,6 +76,7 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks dimE := numChunks l := chunkLen numPoly := uint64(len(polyFr)) / dimE / chunkLen + fmt.Println("numPoly", numPoly) begin := time.Now() @@ -117,32 +118,34 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks defer p.GpuLock.Unlock() // Compute NTT on the coeff matrix - p.NttCfg.BatchSize = int32(l) - coeffStoreFFT, e := p.NTT(coeffStore) + p.NttCfg.BatchSize = int32(l * numPoly) + coeffStoreFft, e := p.NTT(coeffStore) if e != nil { return nil, e } nttDone := time.Now() - // transpose the FFT tranformed matrix - coeffStoreFFTT := make([][]fr.Element, dimE*2*numPoly) - for i := range coeffStoreFFTT { - coeffStoreFFTT[i] = make([]fr.Element, l) - } - - for k := uint64(0); k < numPoly; k++ { - step := int(k * dimE * 2) - for i := 0; i < int(l); i++ { - vec := coeffStoreFFT[i+int(k*l)] - for j := 0; j < int(dimE*2); j++ { - coeffStoreFFTT[j+step][i] = vec[j] + /* + fmt.Println("after fft") + vec := gpu_utils.ConvertScalarFieldsToFrBytes(coeffStoreFft) + for i := 0; i < int(l*numPoly); i++ { + length := int(dimE) * 2 + for j := 0; j < length; j++ { + fmt.Printf("%v ", vec[i*length+j].String()) } + fmt.Println() } + */ + + // transpose the FFT tranformed matrix + coeffStoreFftTranspose, err := Transpose(coeffStoreFft, int(l), int(numPoly), int(dimE)*2) + if err != nil { + return nil, e } transposingDone := time.Now() // compute msm on each rows of the transposed matrix - sumVec, err := p.MsmBatch(coeffStoreFFTT, p.FFTPointsT) + sumVec, err := p.MsmBatch(coeffStoreFftTranspose, p.FlatFFTPointsT, int(numPoly)*int(dimE)*2) if err != nil { return nil, err } @@ -150,27 +153,39 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks // compute the first ecntt, and set new batch size for ntt p.NttCfg.BatchSize = int32(numPoly) - sumVecInv, err := p.ECNtt(sumVec, true) + sumVecInv, err := p.ECNtt(sumVec, true, int(dimE)*2*int(numPoly)) if err != nil { return nil, err } firstECNttDone := time.Now() + sumVec.Free() - // remove half points per poly - batchInv := make([]bn254.G1Affine, len(sumVecInv)/2) + // extract proofs + prunedSumVecInv := core.HostSliceWithValue(bn254_icicle.Projective{}, len(sumVecInv)/2) k := 0 for i := 0; i < int(numPoly); i++ { for j := 0; j < int(dimE); j++ { - batchInv[k] = sumVecInv[i*int(dimE)*2+j] + prunedSumVecInv[k] = sumVecInv[i*int(dimE)*2+j] k += 1 } } // compute the second ecntt on the reduced size array - flatProofsBatch, err := p.ECNtt(batchInv, false) + flatProofsBatch, err := p.ECNttToGnark(prunedSumVecInv, false, int(numPoly)*int(dimE)) if err != nil { return nil, fmt.Errorf("second ECNtt error: %w", err) } + + /* + // debug + for i := 0; i < int(numPoly); i++ { + for j := 0; j < int(dimE); j++ { + fmt.Printf("%v ", flatProofsBatch[i*int(dimE)+j].String()) + } + fmt.Println() + } + */ + secondECNttDone := time.Now() fmt.Printf("Multiproof Time Decomp \n\t\ttotal %-20v \n\t\tpreproc %-20v \n\t\tntt %-20v \n\t\ttranspose %-20v \n\t\tmsm %-v \n\t\tfft1 %-v \n\t\tfft2 %-v,\n", @@ -183,6 +198,7 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks secondECNttDone.Sub(firstECNttDone), ) + // only takes the first half return flatProofsBatch, nil } diff --git a/encoding/kzg/prover/gpu/ntt.go b/encoding/kzg/prover/gpu/ntt.go index d55474181..de44bbcf2 100644 --- a/encoding/kzg/prover/gpu/ntt.go +++ b/encoding/kzg/prover/gpu/ntt.go @@ -10,7 +10,7 @@ import ( bn254_icicle_ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/ntt" ) -func (c *GpuComputeDevice) NTT(batchFr [][]fr.Element) ([][]fr.Element, error) { +func (c *GpuComputeDevice) NTT(batchFr [][]fr.Element) (core.HostSlice[bn254_icicle.ScalarField], error) { if len(batchFr) == 0 { return nil, fmt.Errorf("input to NTT contains no blob") } @@ -25,19 +25,12 @@ func (c *GpuComputeDevice) NTT(batchFr [][]fr.Element) ([][]fr.Element, error) { for i := 0; i < len(batchFr); i++ { flattenBatchFr = append(flattenBatchFr, batchFr[i]...) } - flattenBatchSf := gpu_utils.ConvertFrToScalarFieldsBytes(flattenBatchFr) + flattenBatchSf := gpu_utils.ConvertFrToScalarFieldsBytesThread(flattenBatchFr, int(c.NumWorker)) scalarsCopy := core.HostSliceFromElements[bn254_icicle.ScalarField](flattenBatchSf) // run ntt output := make(core.HostSlice[bn254_icicle.ScalarField], totalSize) bn254_icicle_ntt.Ntt(scalarsCopy, core.KForward, &c.NttCfg, output) - flattenBatchFrOutput := gpu_utils.ConvertScalarFieldsToFrBytes(output) - // convert ntt output from icicle to gnark - nttOutput := make([][]fr.Element, len(batchFr)) - for i := 0; i < len(batchFr); i++ { - nttOutput[i] = flattenBatchFrOutput[i*numSymbol : (i+1)*numSymbol] - } - - return nttOutput, nil + return output, nil } diff --git a/encoding/kzg/prover/gpu/transpose.go b/encoding/kzg/prover/gpu/transpose.go new file mode 100644 index 000000000..30d3ddae1 --- /dev/null +++ b/encoding/kzg/prover/gpu/transpose.go @@ -0,0 +1,27 @@ +package gpu + +import ( + "fmt" + + "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core" + cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime" + bn254_icicle "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254" + "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/vecOps" +) + +// numRow and numCol describes input dimension +func Transpose(coeffStoreFFT core.HostSlice[bn254_icicle.ScalarField], l, numPoly, numCol int) (core.HostSlice[bn254_icicle.ScalarField], error) { + totalSize := l * numPoly * numCol + ctx, err := cr.GetDefaultDeviceContext() + if err != cr.CudaSuccess { + return nil, fmt.Errorf("allocating bytes on device for projective results failed") + } + + transposedNTTOutput := make(core.HostSlice[bn254_icicle.ScalarField], totalSize) + + for i := 0; i < numPoly; i++ { + vecOps.TransposeMatrix(coeffStoreFFT[i*l*numCol:(i+1)*l*numCol], transposedNTTOutput[i*l*numCol:(i+1)*l*numCol], l, numCol, ctx, false, false) + } + + return transposedNTTOutput, nil +} diff --git a/encoding/kzg/prover/gpu/transpose_test.go b/encoding/kzg/prover/gpu/transpose_test.go new file mode 100644 index 000000000..87062b82c --- /dev/null +++ b/encoding/kzg/prover/gpu/transpose_test.go @@ -0,0 +1,12 @@ +package gpu_test + +import ( + "testing" +) + +// numRow and numCol describes input dimension +func TestTranspose(t *testing.T) { + + //gpu.Transpose() + +} diff --git a/encoding/kzg/prover/parametrized_prover.go b/encoding/kzg/prover/parametrized_prover.go index 9f3d61d62..e590780c9 100644 --- a/encoding/kzg/prover/parametrized_prover.go +++ b/encoding/kzg/prover/parametrized_prover.go @@ -131,7 +131,14 @@ func (g *ParametrizedProver) Encode(inputFr []fr.Element) (*bn254.G1Affine, *bn2 paddedCoeffs := make([]fr.Element, g.NumEvaluations()) // polyCoeffs has less points than paddedCoeffs in general due to erasure redundancy copy(paddedCoeffs, inputFr) - proofs, err := g.Computer.ComputeMultiFrameProof(paddedCoeffs, g.NumChunks, g.ChunkLength, g.NumWorker) + + numBlob := 1 + flatpaddedCoeffs := make([]fr.Element, 0, numBlob*len(paddedCoeffs)) + for i := 0; i < numBlob; i++ { + flatpaddedCoeffs = append(flatpaddedCoeffs, paddedCoeffs...) + } + + proofs, err := g.Computer.ComputeMultiFrameProof(flatpaddedCoeffs, g.NumChunks, g.ChunkLength, g.NumWorker) proofChan <- ProofsResult{ Proofs: proofs, Err: err, diff --git a/encoding/kzg/prover/prover.go b/encoding/kzg/prover/prover.go index 7c83725dc..0311bdad8 100644 --- a/encoding/kzg/prover/prover.go +++ b/encoding/kzg/prover/prover.go @@ -262,16 +262,17 @@ func (g *Prover) newProver(params encoding.EncodingParams) (*ParametrizedProver, } } else { nttCfg := gpu_utils.SetupNTT() + flatFftPointsT := gpu_utils.SetupMsm(fftPointsT) GpuLock := sync.Mutex{} computer = &gpu.GpuComputeDevice{ - Fs: fs, - FFTPointsT: fftPointsT, - SFs: sfs, - Srs: g.Srs, - G2Trailing: g.G2Trailing, - KzgConfig: g.KzgConfig, - NttCfg: nttCfg, - GpuLock: &GpuLock, + Fs: fs, + FlatFFTPointsT: flatFftPointsT, + SFs: sfs, + Srs: g.Srs, + G2Trailing: g.G2Trailing, + KzgConfig: g.KzgConfig, + NttCfg: nttCfg, + GpuLock: &GpuLock, } RsComputeDevice = &rs_gpu.GpuComputeDevice{ diff --git a/encoding/test/main.go b/encoding/test/main.go index 321dc0cba..f428adbeb 100644 --- a/encoding/test/main.go +++ b/encoding/test/main.go @@ -4,9 +4,9 @@ import ( "fmt" "log" "math/rand" + "time" "runtime" - "time" "github.com/Layr-Labs/eigenda/encoding" "github.com/Layr-Labs/eigenda/encoding/kzg" @@ -20,7 +20,7 @@ import ( func main() { TestKzgRs() - //err := kzg.WriteGeneratorPoints(30000) + //err := kzg.WriteGeneratorPoints(600000) //if err != nil { // log.Println("WriteGeneratorPoints failed:", err) //} @@ -62,8 +62,8 @@ func TestKzgRs() { fmt.Println("num device ", numDevices) kzgConfig := &kzg.KzgConfig{ - G1Path: "../../inabox/resources/kzg/g1.point.300000", - G2Path: "../../inabox/resources/kzg/g2.point.300000", + G1Path: "../../inabox/resources/kzg/g1.point.600000", + G2Path: "../../inabox/resources/kzg/g2.point.600000", CacheDir: "SRSTables", SRSOrder: 300000, SRSNumberToLoad: 300000, @@ -115,10 +115,14 @@ func TestKzgRs() { commit, lengthCommit, lengthProof, frames, fIndices, err := enc.Encode(inputFr) _ = lengthProof _ = lengthCommit + _ = commit + _ = frames + _ = fIndices if err != nil { log.Fatal(err) } // Optionally verify + startVerify := time.Now() //os.Exit(0) @@ -165,6 +169,7 @@ func TestKzgRs() { //log.Fatalf("%v", err) //} _ = dataFr + //fmt.Println(dataFr) // printFr(dataFr) //deData := kzg.ToByteArray(dataFr, inputByteSize) diff --git a/encoding/utils/gpu_utils/msm_setup.go b/encoding/utils/gpu_utils/msm_setup.go new file mode 100644 index 000000000..8a9d81488 --- /dev/null +++ b/encoding/utils/gpu_utils/msm_setup.go @@ -0,0 +1,15 @@ +package gpu_utils + +import ( + "github.com/consensys/gnark-crypto/ecc/bn254" + bn254_icicle "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254" +) + +func SetupMsm(rowsG1 [][]bn254.G1Affine) []bn254_icicle.Affine { + rowsG1Icicle := make([]bn254_icicle.Affine, 0) + + for _, row := range rowsG1 { + rowsG1Icicle = append(rowsG1Icicle, BatchConvertGnarkAffineToIcicleAffine(row)...) + } + return rowsG1Icicle +} diff --git a/encoding/utils/gpu_utils/utils.go b/encoding/utils/gpu_utils/utils.go index f20129cdc..5283a5d04 100644 --- a/encoding/utils/gpu_utils/utils.go +++ b/encoding/utils/gpu_utils/utils.go @@ -1,9 +1,13 @@ package gpu_utils import ( + "math" + "sync" + "github.com/consensys/gnark-crypto/ecc/bn254" "github.com/consensys/gnark-crypto/ecc/bn254/fp" "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/ingonyama-zk/icicle/v2/wrappers/golang/core" bn254_icicle "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254" ) @@ -77,3 +81,55 @@ func IcicleProjectiveToGnarkAffine(p bn254_icicle.Projective) bn254.G1Affine { return bn254.G1Affine{X: *x, Y: *y} } + +func HostSliceIcicleProjectiveToGnarkAffine(ps core.HostSlice[bn254_icicle.Projective], numWorker int) []bn254.G1Affine { + output := make([]bn254.G1Affine, len(ps)) + + var wg sync.WaitGroup + + interval := int(math.Ceil(float64(len(ps)) / float64(numWorker))) + + for w := 0; w < numWorker; w++ { + wg.Add(1) + start := w * interval + end := (w + 1) * interval + if len(ps) < end { + end = len(ps) + } + + go func(workerStart, workerEnd int) { + defer wg.Done() + for i := workerStart; i < workerEnd; i++ { + output[i] = IcicleProjectiveToGnarkAffine(ps[i]) + } + + }(start, end) + } + wg.Wait() + return output +} + +func ConvertFrToScalarFieldsBytesThread(data []fr.Element, numWorker int) []bn254_icicle.ScalarField { + scalars := make([]bn254_icicle.ScalarField, len(data)) + + var wg sync.WaitGroup + + interval := int(math.Ceil(float64(len(data)) / float64(numWorker))) + + for w := 0; w < numWorker; w++ { + wg.Add(1) + start := w * interval + end := (w + 1) * interval + if len(data) < end { + end = len(data) + } + + go func(workerStart, workerEnd int) { + defer wg.Done() + output := ConvertFrToScalarFieldsBytes(data[workerStart:workerEnd]) + copy(scalars[workerStart:workerEnd], output[:]) + }(start, end) + } + wg.Wait() + return scalars +}