Rework the package, and add POCL-based CPU CI (#210)

JuliaGPU · Aug 9, 2024 · 30113b6 · 30113b6
1 parent 488704f
commit 30113b6
Show file tree

Hide file tree

Showing 68 changed files with 4,171 additions and 3,010 deletions.
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -1,14 +1,16 @@
 steps:
-  - label: "Julia 1.6 - CUDA"
+  - label: "CUDA"
     plugins:
       - JuliaCI/julia#v1:
-          version: "1.6"
+          version: "1.10"
       - JuliaCI/julia-test#v1: ~
       - JuliaCI/julia-coverage#v1:
           codecov: true
     agents:
       queue: "juliagpu"
       cuda: "*"
     if: build.message !~ /\[skip tests\]/
-    command: "mkdir -p /etc/OpenCL/vendors && echo libnvidia-opencl.so.1 > /etc/OpenCL/vendors/nvidia.icd"
+    env:
+      JULIA_OPENCL_BACKEND: "CUDA"
+      OCL_ICD_FILENAMES: "libnvidia-opencl.so.1"
     timeout_in_minutes: 60
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -0,0 +1,37 @@
+name: CI
+
+on:
+  push:
+    branches: [master]
+    tags: ["*"]
+  pull_request:
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        version: ['1.10', 'pre']
+        os: ['ubuntu-latest', 'macOS-latest']
+        arch: [x64]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.arch }}
+      - uses: julia-actions/cache@v2
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+      - uses: julia-actions/julia-processcoverage@v1
+      - uses: codecov/codecov-action@v4
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: lcov.info
diff --git a/Project.toml b/Project.toml
@@ -1,18 +1,12 @@
 name = "OpenCL"
 uuid = "08131aa3-fb12-5dee-8b74-c09406e224a2"
-version = "0.9.0"
+version = "0.10.0"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 
 [compat]
-OpenCL_jll = "2022.9.23"
-julia = "1.6"
-
-[extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[targets]
-test = ["Test"]
+OpenCL_jll = "2024.5.8"
+julia = "1.10"
diff --git a/README.md b/README.md
@@ -46,9 +46,9 @@ b = rand(Float32, 50_000)
 
 device, ctx, queue = cl.create_compute_context()
 
-a_buff = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=a)
-b_buff = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=b)
-c_buff = cl.Buffer(Float32, ctx, :w, length(a))
+a_buff = cl.Buffer(Float32, ctx, length(a), (:r, :copy), hostbuf=a)
+b_buff = cl.Buffer(Float32, ctx, length(b), (:r, :copy), hostbuf=b)
+c_buff = cl.Buffer(Float32, ctx, length(a), :w)
 
 p = cl.Program(ctx, source=sum_kernel) |> cl.build!
 k = cl.Kernel(p, "sum")
@@ -146,7 +146,7 @@ Here's a rough translation between the OpenCL API in C to this Julia version. Op
 | `clGetKernelInfo` | `cl.info(kernel, :symbol)` | Kernel info: `:name`, `:num_args`, `:reference_count`, `:program`, `:attributes` |
 | `clEnqueueNDRangeKernel` | `cl.enqueue_kernel(queue, kernel, global_work_size)`, `cl.enqueue_kernel(queue, kernel, global_work_size, local_work_size; global_work_offset, wait_on)` | |
 | `clSetKernelArg` | `cl.set_arg!(kernel, idx, arg)` | `idx` starts at 1 |
-| `clCreateUserEvent` | `cl.UserEvent(ctx; retain)`  | | 
+| `clCreateUserEvent` | `cl.UserEvent(ctx; retain)`  | |
 | `clGetEventInfo`    | `cl.info(event, :symbol)`    | Event info: `:context`, `:command_queue`, `:reference_count`, `:command_type`, `:status`, `:profile_start`, `:profile_end`, `:profile_queued`, `:profile_submit`, `:profile_duration`
 | `clWaitForEvents`   | `cl.wait(event)`, `cl.wait(events)` |
 | `clEnqueueMarkerWithWaitList` | `cl.enqueue_marker_with_wait_list(queue, wait_for)` | |

diff --git a/examples/demo.jl b/examples/demo.jl
@@ -16,9 +16,9 @@ device, ctx, queue = cl.create_compute_context()
 
 # create opencl buffer objects
 # copies to the device initiated when the kernel function is called
-a_buff = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=a)
-b_buff = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=b)
-c_buff = cl.Buffer(Float32, ctx, :w, length(a))
+a_buff = cl.Buffer(Float32, ctx, length(a), (:r, :copy); hostbuf=a)
+b_buff = cl.Buffer(Float32, ctx, length(b), (:r, :copy); hostbuf=b)
+c_buff = cl.Buffer(Float32, ctx, length(a), :w)
 
 # build the program and construct a kernel object
 p = cl.Program(ctx, source=sum_kernel_src) |> cl.build!

diff --git a/examples/hands_on_opencl/ex04/vadd_chain.jl b/examples/hands_on_opencl/ex04/vadd_chain.jl
@@ -67,14 +67,14 @@ h_g = rand(Float32, LENGTH)
 # {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}
 
 # Create the input (a, b, e, g) arrays in device memory and copy data from host
-d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
-d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
-d_e = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_e)
-d_g = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_g)
+d_a = cl.Buffer(Float32, ctx, length(h_a), (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, length(h_b), (:r, :copy), hostbuf=h_b)
+d_e = cl.Buffer(Float32, ctx, length(h_e), (:r, :copy), hostbuf=h_e)
+d_g = cl.Buffer(Float32, ctx, length(h_g), (:r, :copy), hostbuf=h_g)
 # Create the output (c, d, f) array in device memory
-d_c = cl.Buffer(Float32, ctx, :w, LENGTH)
-d_d = cl.Buffer(Float32, ctx, :w, LENGTH)
-d_f = cl.Buffer(Float32, ctx, :w, LENGTH)
+d_c = cl.Buffer(Float32, ctx, LENGTH, :w)
+d_d = cl.Buffer(Float32, ctx, LENGTH, :w)
+d_f = cl.Buffer(Float32, ctx, LENGTH, :w)
 
 # create the kernel
 vadd = cl.Kernel(program, "vadd")

diff --git a/examples/hands_on_opencl/ex05/vadd_abc.jl b/examples/hands_on_opencl/ex05/vadd_abc.jl
@@ -50,12 +50,12 @@ h_a = rand(Float32, LENGTH)
 h_b = rand(Float32, LENGTH)
 h_c = rand(Float32, LENGTH)
 
-d_a = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_a)
-d_b = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_b)
-d_c = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=h_c)
+d_a = cl.Buffer(Float32, ctx, length(h_a), (:r, :copy), hostbuf=h_a)
+d_b = cl.Buffer(Float32, ctx, length(h_b), (:r, :copy), hostbuf=h_b)
+d_c = cl.Buffer(Float32, ctx, length(h_c), (:r, :copy), hostbuf=h_c)
 
 # create the output (r) buffer in device memory
-d_r = cl.Buffer(Float32, ctx, :w, LENGTH)
+d_r = cl.Buffer(Float32, ctx, LENGTH, :w)
 
 # create the kernel
 vadd = cl.Kernel(program, "vadd")

diff --git a/examples/hands_on_opencl/ex06/matmul.jl b/examples/hands_on_opencl/ex06/matmul.jl
@@ -112,9 +112,9 @@ ctx = cl.create_some_context()
 queue = cl.CmdQueue(ctx, :profile)
 
 # create OpenCL Buffers
-d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
-d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
-d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, length(h_C), :w)
 
 prg  = cl.Program(ctx, source=kernel_source) |> cl.build!
 mmul = cl.Kernel(prg, "mmul")

diff --git a/examples/hands_on_opencl/ex07/matmul.jl b/examples/hands_on_opencl/ex07/matmul.jl
@@ -93,9 +93,9 @@ ctx = cl.create_some_context()
 queue = cl.CmdQueue(ctx, :profile)
 
 # create OpenCL Buffers
-d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
-d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
-d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, length(h_C), :w)
 
 #--------------------------------------------------------------------------------
 # OpenCL matrix multiplication ... Naive

diff --git a/examples/hands_on_opencl/ex08/matmul.jl b/examples/hands_on_opencl/ex08/matmul.jl
@@ -93,9 +93,9 @@ ctx = cl.create_some_context()
 queue = cl.CmdQueue(ctx, :profile)
 
 # create OpenCL Buffers
-d_a = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_A)
-d_b = cl.Buffer(Float32, ctx, (:r,:copy), hostbuf=h_B)
-d_c = cl.Buffer(Float32, ctx, :w, length(h_C))
+d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
+d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
+d_c = cl.Buffer(Float32, ctx, length(h_C), :w)
 
 #--------------------------------------------------------------------------------
 # OpenCL matrix multiplication ... Naive

diff --git a/examples/hands_on_opencl/ex09/pi_ocl.jl b/examples/hands_on_opencl/ex09/pi_ocl.jl
@@ -60,7 +60,7 @@ h_psum = Vector{Float32}(undef, nwork_groups)
 println("$nwork_groups work groups of size $work_group_size.")
 println("$nsteps integration steps")
 
-d_partial_sums = cl.Buffer(Float32, ctx, :w, length(h_psum))
+d_partial_sums = cl.Buffer(Float32, ctx, length(h_psum), :w)
 
 # start timer
 rtime = time()

diff --git a/examples/hands_on_opencl/exA/pi_vocl.jl b/examples/hands_on_opencl/exA/pi_vocl.jl
@@ -102,7 +102,7 @@ h_psum = Vector{Float32}(undef, nwork_groups)
 println("$nwork_groups work groups of size $work_group_size.")
 println("$nsteps integration steps")
 
-d_partial_sums = cl.Buffer(Float32, ctx, :w, length(h_psum))
+d_partial_sums = cl.Buffer(Float32, ctx, length(h_psum), :w)
 
 # start timer
 rtime = time()

diff --git a/examples/notebooks/Transpose.ipynb b/examples/notebooks/Transpose.ipynb
@@ -188,8 +188,8 @@
       "                                 (\"block\", enqueue_block_kernel, block_kernel))\n",
       "        for s in array_sizes  \n",
       "            src  = rand(Float32, (s, s))\n",
-      "            a_buf   = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=src)\n",
-      "            a_t_buf = cl.Buffer(Float32, ctx, :w, length(src))\n",
+      "            a_buf   = cl.Buffer(Float32, ctx, length(src), (:r, :copy), hostbuf=src)\n",
+      "            a_t_buf = cl.Buffer(Float32, ctx, length(src), :w)\n",
       "            \n",
       "            # warm up....\n",
       "            for i in 1:4\n",

diff --git a/examples/notebooks/julia_set_fractal.ipynb b/examples/notebooks/julia_set_fractal.ipynb
@@ -295,16 +295,16 @@
     "julia_source = \"\n",
     "\n",
     "__kernel void julia(__global float2 *q,\n",
-    "                    __global ushort *output, \n",
+    "                    __global ushort *output,\n",
     "                    ushort const maxiter)\n",
     "{\n",
     " int gid = get_global_id(0);\n",
     " float nreal = 0;\n",
     " float real  = q[gid].x;\n",
     " float imag  = q[gid].y;\n",
-    " \n",
+    "\n",
     " output[gid] = 0;\n",
-    " \n",
+    "\n",
     " for(int curiter = 0; curiter < maxiter; curiter++) {\n",
     "     if (real*real + imag*imag > 4.0f) {\n",
     "         output[gid] = curiter;\n",
@@ -341,15 +341,15 @@
     "\n",
     "    out = Array{UInt16}(size(q))\n",
     "\n",
-    "    q_buff = cl.Buffer(Complex64, ctx, (:r, :copy), hostbuf=q)\n",
-    "    o_buff = cl.Buffer(UInt16, ctx, :w, length(out))\n",
+    "    q_buff = cl.Buffer(Complex64, ctx, length(q), (:r, :copy), hostbuf=q)\n",
+    "    o_buff = cl.Buffer(UInt16, ctx, length(out), :w)\n",
     "\n",
     "    prg = cl.Program(ctx, source=julia_source) |> cl.build!\n",
     "    k = cl.Kernel(prg, \"julia\")\n",
-    "    \n",
+    "\n",
     "    queue(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n",
     "    cl.copy!(queue, out, o_buff)\n",
-    "    \n",
+    "\n",
     "    return out\n",
     "end\n"
    ]

diff --git a/examples/notebooks/mandelbrot_fractal.ipynb b/examples/notebooks/mandelbrot_fractal.ipynb
@@ -55,7 +55,7 @@
    "source": [
     "mandel_source = \"\n",
     "__kernel void mandelbrot(__global float2 *q,\n",
-    "                         __global ushort *output, \n",
+    "                         __global ushort *output,\n",
     "                         ushort const maxiter)\n",
     "{\n",
     " int gid = get_global_id(0);\n",
@@ -98,18 +98,18 @@
     "\n",
     "    out = Array{UInt16}(size(q))\n",
     "\n",
-    "    q_buff = cl.Buffer(Complex64, ctx, (:r, :copy), hostbuf=q)\n",
-    "    o_buff = cl.Buffer(UInt16, ctx, :w, length(out))\n",
+    "    q_buff = cl.Buffer(Complex64, ctx, length(q), (:r, :copy), hostbuf=q)\n",
+    "    o_buff = cl.Buffer(UInt16, ctx, length(out), :w)\n",
     "\n",
     "    prg = cl.Program(ctx, source=mandel_source) |> cl.build!\n",
-    "    \n",
+    "\n",
     "    k = cl.Kernel(prg, \"mandelbrot\")\n",
     "    #cl.call(queue, k, length(out), nothing, q_buff, o_buff, uint16(maxiter))\n",
     "    queue(k, length(q), nothing, q_buff, o_buff, UInt16(maxiter))\n",
     "\n",
     "\n",
     "    cl.copy!(queue, out, o_buff)\n",
-    "    \n",
+    "\n",
     "    return out\n",
     "end"
    ]
@@ -150,7 +150,7 @@
     "    y2 = 1.0\n",
     "    x1 = -1.5\n",
     "    x2 = 0.5\n",
-    "    \n",
+    "\n",
     "    q = Array{Complex64}(h, w)\n",
     "    for x in 1:w\n",
     "        for y in 1:h\n",

diff --git a/examples/performance.jl b/examples/performance.jl
@@ -79,9 +79,9 @@ function cl_performance(ndatapts::Integer, nworkers::Integer)
             ctx   = cl.Context(device)
             queue = cl.CmdQueue(ctx, :profile)
 
-            a_buf = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=a)
-            b_buf = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=b)
-            c_buf = cl.Buffer(Float32, ctx, :w, length(a))
+            a_buf = cl.Buffer(Float32, ctx, length(a), (:r, :copy), hostbuf=a)
+            b_buf = cl.Buffer(Float32, ctx, length(b), (:r, :copy), hostbuf=b)
+            c_buf = cl.Buffer(Float32, ctx, length(a), :w)
 
             prg  = cl.Program(ctx, source=bench_kernel) |> cl.build!
             kern = cl.Kernel(prg, "sum")