optimize by adding pipeline and multi-thread conversion

Layr-Labs · Jul 7, 2024 · acd9a42 · acd9a42
1 parent 0e81887
commit acd9a42
Show file tree

Hide file tree

Showing 11 changed files with 205 additions and 103 deletions.
diff --git a/encoding/kzg/prover/gpu/ecntt.go b/encoding/kzg/prover/gpu/ecntt.go
@@ -12,33 +12,31 @@ import (
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 )
 
-func (c *GpuComputeDevice) ECNtt(batchPoints []bn254.G1Affine, isInverse bool) ([]bn254.G1Affine, error) {
-	totalNumSym := len(batchPoints)
+func (c *GpuComputeDevice) ECNttToGnark(batchPoints core.HostOrDeviceSlice, isInverse bool, totalSize int) ([]bn254.G1Affine, error) {
+	output, err := c.ECNtt(batchPoints, isInverse, totalSize)
+	if err != nil {
+		return nil, err
+	}
+
+	// convert icicle projective to gnark affine
+	gpuFFTBatch := gpu_utils.HostSliceIcicleProjectiveToGnarkAffine(output, int(c.NumWorker))
 
-	// convert gnark affine to icicle projective on slice
-	pointsIcileProjective := gpu_utils.BatchConvertGnarkAffineToIcicleProjective(batchPoints)
-	pointsCopy := core.HostSliceFromElements[icicle_bn254.Projective](pointsIcileProjective)
+	return gpuFFTBatch, nil
+}
 
-	output := make(core.HostSlice[icicle_bn254.Projective], int(totalNumSym))
+func (c *GpuComputeDevice) ECNtt(batchPoints core.HostOrDeviceSlice, isInverse bool, totalSize int) (core.HostSlice[icicle_bn254.Projective], error) {
+	output := make(core.HostSlice[icicle_bn254.Projective], totalSize)
 
-	// compute
 	if isInverse {
-		err := ecntt.ECNtt(pointsCopy, core.KInverse, &c.NttCfg, output)
+		err := ecntt.ECNtt(batchPoints, core.KInverse, &c.NttCfg, output)
 		if err.CudaErrorCode != cr.CudaSuccess || err.IcicleErrorCode != core.IcicleSuccess {
 			return nil, fmt.Errorf("inverse ecntt failed")
 		}
 	} else {
-		err := ecntt.ECNtt(pointsCopy, core.KForward, &c.NttCfg, output)
+		err := ecntt.ECNtt(batchPoints, core.KForward, &c.NttCfg, output)
 		if err.CudaErrorCode != cr.CudaSuccess || err.IcicleErrorCode != core.IcicleSuccess {
 			return nil, fmt.Errorf("forward ecntt failed")
 		}
 	}
-
-	// convert icicle projective to gnark affine
-	gpuFFTBatch := make([]bn254.G1Affine, len(batchPoints))
-	for j := 0; j < totalNumSym; j++ {
-		gpuFFTBatch[j] = gpu_utils.IcicleProjectiveToGnarkAffine(output[j])
-	}
-
-	return gpuFFTBatch, nil
+	return output, nil
 }
diff --git a/encoding/kzg/prover/gpu/msm.go b/encoding/kzg/prover/gpu/msm.go
@@ -3,58 +3,30 @@ package gpu
 import (
 	"fmt"
 
-	"github.com/Layr-Labs/eigenda/encoding/utils/gpu_utils"
-	"github.com/consensys/gnark-crypto/ecc/bn254"
-	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
 	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
 	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
 	icicle_bn254 "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
 	icicle_bn254_msm "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/msm"
 )
 
-// MsmBatch function supports batch across blobs
-func (c *GpuComputeDevice) MsmBatch(rowsFr [][]fr.Element, rowsG1 [][]bn254.G1Affine) ([]bn254.G1Affine, error) {
+// MsmBatch function supports batch across blobs.
+// totalSize is the number of output points, which equals numPoly * 2 * dimE, where dimE is the number of chunks.
+func (c *GpuComputeDevice) MsmBatch(rowsFrIcicleCopy core.HostOrDeviceSlice, rowsG1Icicle []icicle_bn254.Affine, totalSize int) (core.DeviceSlice, error) {
 	msmCfg := icicle_bn254_msm.GetDefaultMSMConfig()
-	rowsSfIcicle := make([]icicle_bn254.ScalarField, 0)
-	rowsAffineIcicle := make([]icicle_bn254.Affine, 0)
-	numBatchEle := len(rowsFr)
 
-	// Prepare scalar fields
-	for _, row := range rowsFr {
-		rowsSfIcicle = append(rowsSfIcicle, gpu_utils.ConvertFrToScalarFieldsBytes(row)...)
-	}
-	rowsFrIcicleCopy := core.HostSliceFromElements[icicle_bn254.ScalarField](rowsSfIcicle)
-
-	// Prepare icicle g1 affines
-	for _, row := range rowsG1 {
-		rowsAffineIcicle = append(rowsAffineIcicle, gpu_utils.BatchConvertGnarkAffineToIcicleAffine(row)...)
-	}
-	rowsG1IcicleCopy := core.HostSliceFromElements[icicle_bn254.Affine](rowsAffineIcicle)
+	rowsG1IcicleCopy := core.HostSliceFromElements[icicle_bn254.Affine](rowsG1Icicle)
 
 	var p icicle_bn254.Projective
 	var out core.DeviceSlice
 
-	// prepare output
-	_, err := out.Malloc(numBatchEle*p.Size(), p.Size())
+	_, err := out.Malloc(totalSize*p.Size(), p.Size())
 	if err != cr.CudaSuccess {
-		return nil, fmt.Errorf("allocating bytes on device for projective results failed")
+		return out, fmt.Errorf("%v", "Allocating bytes on device for Projective results failed")
 	}
 
 	err = icicle_bn254_msm.Msm(rowsFrIcicleCopy, rowsG1IcicleCopy, &msmCfg, out)
 	if err != cr.CudaSuccess {
-		return nil, fmt.Errorf("msm failed")
+		return out, fmt.Errorf("%v", "Msm failed")
 	}
-
-	// move output out of device
-	outHost := make(core.HostSlice[icicle_bn254.Projective], numBatchEle)
-	outHost.CopyFromDevice(&out)
-	out.Free()
-
-	// convert data back to gnark format
-	gnarkOuts := make([]bn254.G1Affine, numBatchEle)
-	for i := 0; i < numBatchEle; i++ {
-		gnarkOuts[i] = gpu_utils.IcicleProjectiveToGnarkAffine(outHost[i])
-	}
-
-	return gnarkOuts, nil
+	return out, nil
 }
diff --git a/encoding/kzg/prover/gpu/multiframe_proof.go b/encoding/kzg/prover/gpu/multiframe_proof.go
@@ -22,13 +22,13 @@ type WorkerResult struct {
 
 type GpuComputeDevice struct {
 	*kzg.KzgConfig
-	Fs         *fft.FFTSettings
-	FFTPointsT [][]bn254.G1Affine // transpose of FFTPoints
-	SFs        *fft.FFTSettings
-	Srs        *kzg.SRS
-	G2Trailing []bn254.G2Affine
-	NttCfg     core.NTTConfig[[bn254_icicle.SCALAR_LIMBS]uint32]
-	GpuLock    *sync.Mutex // lock whenever gpu is needed,
+	Fs             *fft.FFTSettings
+	FlatFFTPointsT []bn254_icicle.Affine
+	SFs            *fft.FFTSettings
+	Srs            *kzg.SRS
+	G2Trailing     []bn254.G2Affine
+	NttCfg         core.NTTConfig[[bn254_icicle.SCALAR_LIMBS]uint32]
+	GpuLock        *sync.Mutex // lock whenever gpu is needed,
 }
 
 // benchmarks shows cpu commit on 2MB blob only takes 24.165562ms. For now, use cpu
@@ -76,6 +76,7 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks
 	dimE := numChunks
 	l := chunkLen
 	numPoly := uint64(len(polyFr)) / dimE / chunkLen
+	fmt.Println("numPoly", numPoly)
 
 	begin := time.Now()
 
@@ -117,60 +118,74 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks
 	defer p.GpuLock.Unlock()
 
 	// Compute NTT on the coeff matrix
-	p.NttCfg.BatchSize = int32(l)
-	coeffStoreFFT, e := p.NTT(coeffStore)
+	p.NttCfg.BatchSize = int32(l * numPoly)
+	coeffStoreFft, e := p.NTT(coeffStore)
 	if e != nil {
 		return nil, e
 	}
 	nttDone := time.Now()
 
-	// transpose the FFT tranformed matrix
-	coeffStoreFFTT := make([][]fr.Element, dimE*2*numPoly)
-	for i := range coeffStoreFFTT {
-		coeffStoreFFTT[i] = make([]fr.Element, l)
-	}
-
-	for k := uint64(0); k < numPoly; k++ {
-		step := int(k * dimE * 2)
-		for i := 0; i < int(l); i++ {
-			vec := coeffStoreFFT[i+int(k*l)]
-			for j := 0; j < int(dimE*2); j++ {
-				coeffStoreFFTT[j+step][i] = vec[j]
+	/*
+		fmt.Println("after fft")
+		vec := gpu_utils.ConvertScalarFieldsToFrBytes(coeffStoreFft)
+		for i := 0; i < int(l*numPoly); i++ {
+			length := int(dimE) * 2
+			for j := 0; j < length; j++ {
+				fmt.Printf("%v ", vec[i*length+j].String())
 			}
+			fmt.Println()
 		}
+	*/
+
+	// transpose the FFT-transformed matrix; a failure must be reported via err (the NTT error e was already checked above and is nil here)
+	coeffStoreFftTranspose, err := Transpose(coeffStoreFft, int(l), int(numPoly), int(dimE)*2)
+	if err != nil {
+		return nil, err
 	}
 	transposingDone := time.Now()
 
 	// compute msm on each rows of the transposed matrix
-	sumVec, err := p.MsmBatch(coeffStoreFFTT, p.FFTPointsT)
+	sumVec, err := p.MsmBatch(coeffStoreFftTranspose, p.FlatFFTPointsT, int(numPoly)*int(dimE)*2)
 	if err != nil {
 		return nil, err
 	}
 	msmDone := time.Now()
 
 	// compute the first ecntt, and set new batch size for ntt
 	p.NttCfg.BatchSize = int32(numPoly)
-	sumVecInv, err := p.ECNtt(sumVec, true)
+	sumVecInv, err := p.ECNtt(sumVec, true, int(dimE)*2*int(numPoly))
+	sumVec.Free() // release the device-side MSM result before the error check so it is not leaked when ECNtt fails
 	if err != nil {
 		return nil, err
 	}
 	firstECNttDone := time.Now()
 
-	// remove half points per poly
-	batchInv := make([]bn254.G1Affine, len(sumVecInv)/2)
+	// extract proofs
+	prunedSumVecInv := core.HostSliceWithValue(bn254_icicle.Projective{}, len(sumVecInv)/2)
 	k := 0
 	for i := 0; i < int(numPoly); i++ {
 		for j := 0; j < int(dimE); j++ {
-			batchInv[k] = sumVecInv[i*int(dimE)*2+j]
+			prunedSumVecInv[k] = sumVecInv[i*int(dimE)*2+j]
 			k += 1
 		}
 	}
 
 	// compute the second ecntt on the reduced size array
-	flatProofsBatch, err := p.ECNtt(batchInv, false)
+	flatProofsBatch, err := p.ECNttToGnark(prunedSumVecInv, false, int(numPoly)*int(dimE))
 	if err != nil {
 		return nil, fmt.Errorf("second ECNtt error: %w", err)
 	}
+
+	/*
+		// debug
+		for i := 0; i < int(numPoly); i++ {
+			for j := 0; j < int(dimE); j++ {
+				fmt.Printf("%v ", flatProofsBatch[i*int(dimE)+j].String())
+			}
+			fmt.Println()
+		}
+	*/
+
 	secondECNttDone := time.Now()
 
 	fmt.Printf("Multiproof Time Decomp \n\t\ttotal   %-20v \n\t\tpreproc %-20v \n\t\tntt     %-20v \n\t\ttranspose %-20v \n\t\tmsm     %-v \n\t\tfft1    %-v \n\t\tfft2    %-v,\n",
@@ -183,6 +198,7 @@ func (p *GpuComputeDevice) ComputeMultiFrameProof(polyFr []fr.Element, numChunks
 		secondECNttDone.Sub(firstECNttDone),
 	)
 
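+	// the returned proofs are derived from only the first half (dimE of the 2*dimE points) of each polynomial's inverse ECNTT output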
 	return flatProofsBatch, nil
 }
 

diff --git a/encoding/kzg/prover/gpu/ntt.go b/encoding/kzg/prover/gpu/ntt.go
@@ -10,7 +10,7 @@ import (
 	bn254_icicle_ntt "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/ntt"
 )
 
-func (c *GpuComputeDevice) NTT(batchFr [][]fr.Element) ([][]fr.Element, error) {
+func (c *GpuComputeDevice) NTT(batchFr [][]fr.Element) (core.HostSlice[bn254_icicle.ScalarField], error) {
 	if len(batchFr) == 0 {
 		return nil, fmt.Errorf("input to NTT contains no blob")
 	}
@@ -25,19 +25,12 @@ func (c *GpuComputeDevice) NTT(batchFr [][]fr.Element) ([][]fr.Element, error) {
 	for i := 0; i < len(batchFr); i++ {
 		flattenBatchFr = append(flattenBatchFr, batchFr[i]...)
 	}
-	flattenBatchSf := gpu_utils.ConvertFrToScalarFieldsBytes(flattenBatchFr)
+	flattenBatchSf := gpu_utils.ConvertFrToScalarFieldsBytesThread(flattenBatchFr, int(c.NumWorker))
 	scalarsCopy := core.HostSliceFromElements[bn254_icicle.ScalarField](flattenBatchSf)
 
 	// run ntt
 	output := make(core.HostSlice[bn254_icicle.ScalarField], totalSize)
 	bn254_icicle_ntt.Ntt(scalarsCopy, core.KForward, &c.NttCfg, output)
-	flattenBatchFrOutput := gpu_utils.ConvertScalarFieldsToFrBytes(output)
 
-	// convert ntt output from icicle to gnark
-	nttOutput := make([][]fr.Element, len(batchFr))
-	for i := 0; i < len(batchFr); i++ {
-		nttOutput[i] = flattenBatchFrOutput[i*numSymbol : (i+1)*numSymbol]
-	}
-
-	return nttOutput, nil
+	return output, nil
 }
diff --git a/encoding/kzg/prover/gpu/transpose.go b/encoding/kzg/prover/gpu/transpose.go
@@ -0,0 +1,27 @@
+package gpu
+
+import (
+	"fmt"
+
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core"
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	bn254_icicle "github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254"
+	"github.com/ingonyama-zk/icicle/v2/wrappers/golang/curves/bn254/vecOps"
+)
+
+// Transpose transposes each of the numPoly consecutive l*numCol-element segments of coeffStoreFFT into a new host slice of the same total length.
+// l is passed to vecOps.TransposeMatrix as the column size and numCol as the row size; an error is returned if the device context or any transpose fails.
+func Transpose(coeffStoreFFT core.HostSlice[bn254_icicle.ScalarField], l, numPoly, numCol int) (core.HostSlice[bn254_icicle.ScalarField], error) {
+	ctx, err := cr.GetDefaultDeviceContext()
+	if err != cr.CudaSuccess {
+		return nil, fmt.Errorf("getting default device context failed")
+	}
+	polySize := l * numCol // number of elements in one polynomial's matrix
+	transposedNTTOutput := make(core.HostSlice[bn254_icicle.ScalarField], polySize*numPoly)
+	for i := 0; i < numPoly; i++ {
+		if ret := vecOps.TransposeMatrix(coeffStoreFFT[i*polySize:(i+1)*polySize], transposedNTTOutput[i*polySize:(i+1)*polySize], l, numCol, ctx, false, false); ret.CudaErrorCode != cr.CudaSuccess || ret.IcicleErrorCode != core.IcicleSuccess {
+			return nil, fmt.Errorf("transposing matrix %d failed", i)
+		}
+	}
+	return transposedNTTOutput, nil
+}
diff --git a/encoding/kzg/prover/gpu/transpose_test.go b/encoding/kzg/prover/gpu/transpose_test.go
@@ -0,0 +1,12 @@
+package gpu_test
+
+import (
+	"testing"
+)
+
+// TestTranspose is a placeholder; it does not exercise gpu.Transpose yet.
+func TestTranspose(t *testing.T) {
+
+	//gpu.Transpose()
+
+}
diff --git a/encoding/kzg/prover/parametrized_prover.go b/encoding/kzg/prover/parametrized_prover.go
@@ -131,7 +131,14 @@ func (g *ParametrizedProver) Encode(inputFr []fr.Element) (*bn254.G1Affine, *bn2
 		paddedCoeffs := make([]fr.Element, g.NumEvaluations())
 		// polyCoeffs has less points than paddedCoeffs in general due to erasure redundancy
 		copy(paddedCoeffs, inputFr)
-		proofs, err := g.Computer.ComputeMultiFrameProof(paddedCoeffs, g.NumChunks, g.ChunkLength, g.NumWorker)
+
+		numBlob := 1
+		flatpaddedCoeffs := make([]fr.Element, 0, numBlob*len(paddedCoeffs))
+		for i := 0; i < numBlob; i++ {
+			flatpaddedCoeffs = append(flatpaddedCoeffs, paddedCoeffs...)
+		}
+
+		proofs, err := g.Computer.ComputeMultiFrameProof(flatpaddedCoeffs, g.NumChunks, g.ChunkLength, g.NumWorker)
 		proofChan <- ProofsResult{
 			Proofs:   proofs,
 			Err:      err,

diff --git a/encoding/kzg/prover/prover.go b/encoding/kzg/prover/prover.go
@@ -262,16 +262,17 @@ func (g *Prover) newProver(params encoding.EncodingParams) (*ParametrizedProver,
 		}
 	} else {
 		nttCfg := gpu_utils.SetupNTT()
+		flatFftPointsT := gpu_utils.SetupMsm(fftPointsT)
 		GpuLock := sync.Mutex{}
 		computer = &gpu.GpuComputeDevice{
-			Fs:         fs,
-			FFTPointsT: fftPointsT,
-			SFs:        sfs,
-			Srs:        g.Srs,
-			G2Trailing: g.G2Trailing,
-			KzgConfig:  g.KzgConfig,
-			NttCfg:     nttCfg,
-			GpuLock:    &GpuLock,
+			Fs:             fs,
+			FlatFFTPointsT: flatFftPointsT,
+			SFs:            sfs,
+			Srs:            g.Srs,
+			G2Trailing:     g.G2Trailing,
+			KzgConfig:      g.KzgConfig,
+			NttCfg:         nttCfg,
+			GpuLock:        &GpuLock,
 		}
 
 		RsComputeDevice = &rs_gpu.GpuComputeDevice{