From 3c146d974b49f2a3dbab262ac1bca8aacecf892d Mon Sep 17 00:00:00 2001 From: Pavel Tisnovsky Date: Wed, 22 May 2024 10:41:34 +0200 Subject: [PATCH] Faster Python - 15th version --- docs/faster_python.md | 173 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/docs/faster_python.md b/docs/faster_python.md index 027f346..7af2fd7 100644 --- a/docs/faster_python.md +++ b/docs/faster_python.md @@ -90,6 +90,14 @@ threading.Thread(target=worker).start() --- +### GIL + +* Threads are only concurrent, not parallel + +![GIL](images/GIL.gif) + +--- + ### Goal --- @@ -131,6 +139,15 @@ threading.Thread(target=worker).start() --- +## Python w/o GIL + +* Recent CPython versions are slightly optimized +* CPython w/o GIL + - fork of standard CPython +* Most AOT/JIT compilers support "nogil" + +--- + ## Technologies available today * Cython @@ -551,8 +568,10 @@ def function(): ## Mypyc * AOT compiler + - a bit similar to Cython * Heavily based on type hints - part of `mypy` package + - might be problematic for a large code base --- @@ -564,7 +583,10 @@ def function(): * AOT compiler * Is able to compile into "one file" + - can be used for application distribution * Does not require type hints everywhere +* (Very) slow compilation process + - up to 30 minutes for a large code base! 
--- @@ -578,6 +600,157 @@ def function(): --- +![mandelbrot](images/mandelbrot.png) + +--- + +```python +def calc_mandelbrot(width, height, maxiter, palette): + print("P3") + print("{w} {h}".format(w=width, h=height)) + print("255") + + cy = -1.5 + for y in range(0, height): + cx = -2.0 + for x in range(0, width): + zx = 0.0 + zy = 0.0 + i = 0 + while i < maxiter: + zx2 = zx * zx + zy2 = zy * zy + if zx2 + zy2 > 4.0: + break + zy = 2.0 * zx * zy + cy + zx = zx2 - zy2 + cx + i += 1 + + r = palette[i % 256][0] + g = palette[i % 256][1] + b = palette[i % 256][2] + print("{r} {g} {b}".format(r=r, g=g, b=b)) + cx += 3.0/width + cy += 3.0/height +``` + +--- + +```python +@cython.cdivision(True) +@cython.nogil +@cython.cfunc +def calc_mandelbrot(width: cython.int, height: cython.int, maxiter: cython.int, palette: cython.p_uchar) -> cython.int: + zx: cython.double + zy: cython.double + zx2: cython.double + zy2: cython.double + cx: cython.double + cy: cython.double + r: cython.uchar + g: cython.uchar + b: cython.uchar + i: cython.int + index: cython.int + + printf("P3\n%d %d\n255\n", width, height) + + cy = -1.5 + for y in range(0, height): + cx = -2.0 + for x in range(0, width): + zx = 0.0 + zy = 0.0 + i = 0 + while i < maxiter: + zx2 = zx * zx + zy2 = zy * zy + if zx2 + zy2 > 4.0: + break + zy = 2.0 * zx * zy + cy + zx = zx2 - zy2 + cx + i += 1 + + index = i * 3 + r = palette[index] + g = palette[index+1] + b = palette[index+2] + printf("%d %d %d\n", r, g, b) + cx += 3.0/width + cy += 3.0/height +``` + +--- + +```C +static int __pyx_f_17mandelbrot_cython_calc_mandelbrot(int __pyx_v_width, int __pyx_v_height, int __pyx_v_maxiter, unsigned char *__pyx_v_palette) { + double __pyx_v_zx; + double __pyx_v_zy; + double __pyx_v_zx2; + double __pyx_v_zy2; + double __pyx_v_cx; + double __pyx_v_cy; + unsigned char __pyx_v_r; + unsigned char __pyx_v_g; + unsigned char __pyx_v_b; + int __pyx_v_i; + int __pyx_v_index; + CYTHON_UNUSED long __pyx_v_y; + CYTHON_UNUSED long __pyx_v_x; + 
int __pyx_r; + int __pyx_t_1; + int __pyx_t_2; + long __pyx_t_3; + int __pyx_t_4; + int __pyx_t_5; + long __pyx_t_6; + int __pyx_t_7; + + (void)(printf(((char const *)"P3\n%d %d\n255\n"), __pyx_v_width, __pyx_v_height)); + + __pyx_v_cy = -1.5; + __pyx_t_1 = __pyx_v_height; + __pyx_t_2 = __pyx_t_1; + for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) { + __pyx_v_y = __pyx_t_3; + __pyx_v_cx = -2.0; + __pyx_t_4 = __pyx_v_width; + __pyx_t_5 = __pyx_t_4; + for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) { + __pyx_v_x = __pyx_t_6; + __pyx_v_zx = 0.0; + __pyx_v_zy = 0.0; + __pyx_v_i = 0; + while (1) { + __pyx_t_7 = (__pyx_v_i < __pyx_v_maxiter); + if (!__pyx_t_7) break; + __pyx_v_zx2 = (__pyx_v_zx * __pyx_v_zx); + __pyx_v_zy2 = (__pyx_v_zy * __pyx_v_zy); + __pyx_t_7 = ((__pyx_v_zx2 + __pyx_v_zy2) > 4.0); + if (__pyx_t_7) { + goto __pyx_L8_break; + } + __pyx_v_zy = (((2.0 * __pyx_v_zx) * __pyx_v_zy) + __pyx_v_cy); + __pyx_v_zx = ((__pyx_v_zx2 - __pyx_v_zy2) + __pyx_v_cx); + __pyx_v_i = (__pyx_v_i + 1); + } + __pyx_L8_break:; + __pyx_v_index = (__pyx_v_i * 3); + __pyx_v_r = (__pyx_v_palette[__pyx_v_index]); + __pyx_v_g = (__pyx_v_palette[(__pyx_v_index + 1)]); + __pyx_v_b = (__pyx_v_palette[(__pyx_v_index + 2)]); + (void)(printf(((char const *)"%d %d %d\n"), __pyx_v_r, __pyx_v_g, __pyx_v_b)); + __pyx_v_cx = (__pyx_v_cx + (3.0 / ((double)__pyx_v_width))); + } + __pyx_v_cy = (__pyx_v_cy + (3.0 / ((double)__pyx_v_height))); + } + __pyx_r = 0; + return __pyx_r; +} +``` + +--- + ## Benchmarks (1/2) * native: ANSI C variant (no Python)