Exercises
ex-sp-ch14-01
Easy: Write a @numba.njit function that computes the element-wise
ReLU activation using an explicit loop. Benchmark it against
np.maximum(0, x) for an array of 10^6 elements.
Hint: Remember to call the function once for warm-up before timing.
Implementation
import numba
import numpy as np

@numba.njit
def relu_numba(x):
    out = np.empty_like(x)
    for i in range(len(x)):
        out[i] = max(0.0, x[i])
    return out

x = np.random.randn(1_000_000)
relu_numba(x)  # warm-up: first call triggers compilation
# Numba is comparable to NumPy here since np.maximum
# is already a single C call.
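A minimal timing harness for the comparison (a sketch using timeit; the warmed-up relu_numba would be timed the same way, and the 10-call batch size is an assumption, not from the exercise):

```python
import timeit
import numpy as np

x = np.random.randn(1_000_000)

# Best-of-several timing for the NumPy baseline; np.maximum(0, x) is a
# single vectorized C call, so it sets a strong reference time.
t_numpy = min(timeit.repeat(lambda: np.maximum(0, x), number=10, repeat=5)) / 10
print(f"np.maximum: {t_numpy * 1e3:.3f} ms per call")
```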
ex-sp-ch14-02
Easy: Use jax.grad to compute the derivative of f(x) = x^3 sin(x)
and verify it against the analytical derivative
f'(x) = 3x^2 sin(x) + x^3 cos(x) at x = 2.
Hint: Use jax.grad(f)(2.0) and compare with the analytical expression.
Implementation
import jax
import jax.numpy as jnp

def f(x):
    return x**3 * jnp.sin(x)

df = jax.grad(f)
x = 2.0
ad_result = df(x)
analytical = 3*x**2 * jnp.sin(x) + x**3 * jnp.cos(x)
print(f"AD:         {ad_result:.10f}")
print(f"Analytical: {analytical:.10f}")
# Should match to machine precision
ex-sp-ch14-03
Easy: Use multiprocessing.Pool to compute the sum of squares for
4 arrays of 10^6 elements each, in parallel. Verify the results
match the sequential computation.
Hint: Define a module-level function sum_sq(arr) and use pool.map().
Implementation
from multiprocessing import Pool
import numpy as np

def sum_sq(arr):
    return np.sum(arr ** 2)

rng = np.random.default_rng(42)
arrays = [rng.random(1_000_000) for _ in range(4)]

if __name__ == "__main__":  # required for spawn-based start methods
    with Pool(4) as pool:
        results_par = pool.map(sum_sq, arrays)
    results_seq = [sum_sq(a) for a in arrays]
    assert np.allclose(results_par, results_seq)
ex-sp-ch14-04
Easy: Load the standard C math library with ctypes and call cos(1.0)
from Python. Verify it matches math.cos(1.0).
Hint: On Linux: ctypes.CDLL("libm.so.6"). On macOS: ctypes.CDLL("libm.dylib").
Implementation
import ctypes, math, sys

if sys.platform == 'darwin':
    libm = ctypes.CDLL('libm.dylib')
else:
    libm = ctypes.CDLL('libm.so.6')

libm.cos.argtypes = [ctypes.c_double]
libm.cos.restype = ctypes.c_double
result = libm.cos(1.0)
print(f"ctypes: {result:.15f}")
print(f"math:   {math.cos(1.0):.15f}")
assert abs(result - math.cos(1.0)) < 1e-15
ex-sp-ch14-05
Medium: Create a @numba.vectorize ufunc that computes the soft-plus
function softplus(x) = log(1 + e^x) with numerical
stability (avoid overflow for large x). Test with target='parallel'.
Hint: For x > 20, softplus(x) ≈ x (the correction log(1 + e^-x) is below 1e-8), which avoids exp overflow.
Implementation
import numba
import numpy as np

@numba.vectorize(['float64(float64)'], target='parallel')
def softplus(x):
    if x > 20.0:
        return x               # log(1 + e^x) ≈ x; avoids exp overflow
    elif x < -20.0:
        return np.exp(x)       # log(1 + e^x) ≈ e^x for very negative x
    else:
        return np.log1p(np.exp(x))

x = np.linspace(-30, 30, 1_000_000)
y = softplus(x)
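A quick NumPy-only correctness check (a sketch): np.logaddexp(0, x) is a numerically stable way to evaluate log(1 + e^x) and serves as a reference for the branched approach; softplus_branched here is a NumPy mirror of the ufunc, introduced only for this comparison.

```python
import numpy as np

def softplus_ref(x):
    # Stable reference: log(1 + e^x) = logaddexp(0, x).
    return np.logaddexp(0.0, x)

def softplus_branched(x):
    # NumPy mirror of the branched ufunc; clip keeps np.exp from
    # overflowing in the branch that np.where still evaluates.
    return np.where(x > 20.0, x,
           np.where(x < -20.0, np.exp(x),
                    np.log1p(np.exp(np.clip(x, -20.0, 20.0)))))

x = np.linspace(-30, 30, 100_001)
assert np.allclose(softplus_branched(x), softplus_ref(x))
```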
ex-sp-ch14-06
Medium: Use jax.vmap to compute the Jacobian of the function
f: R^3 -> R^2 defined as
f(x1, x2, x3) = (sin(x1 x2), x2 exp(x3))
at the point x = (1, 2, 3). Verify against jax.jacobian.
Hint: The Jacobian of f: R^n -> R^m is an m x n matrix.
Implementation
import jax
import jax.numpy as jnp

def f(x):
    return jnp.array([jnp.sin(x[0] * x[1]), x[1] * jnp.exp(x[2])])

x = jnp.array([1.0, 2.0, 3.0])
J = jax.jacobian(f)(x)
print("Jacobian:\n", J)
# J[0,:] = [x2*cos(x1*x2), x1*cos(x1*x2), 0]
# J[1,:] = [0, exp(x3), x2*exp(x3)]
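The solution above calls jax.jacobian directly, but the exercise asks for jax.vmap. A sketch of the vmap construction (essentially how reverse-mode jax.jacrev is built internally): one VJP per output component, mapped over the rows of an identity matrix.

```python
import jax
import jax.numpy as jnp

def f(x):
    return jnp.array([jnp.sin(x[0] * x[1]), x[1] * jnp.exp(x[2])])

x = jnp.array([1.0, 2.0, 3.0])

# vjp_fn(e_i) returns e_i^T J, i.e. row i of the Jacobian;
# vmap over the standard basis of R^2 stacks all rows at once.
_, vjp_fn = jax.vjp(f, x)
J_vmap = jax.vmap(lambda e: vjp_fn(e)[0])(jnp.eye(2))

assert jnp.allclose(J_vmap, jax.jacobian(f)(x))
```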
ex-sp-ch14-07
Medium: Compare ProcessPoolExecutor and ThreadPoolExecutor for a
CPU-bound task (computing eigenvalues of random matrices) and
an I/O-bound task (reading files). Measure speedup for each.
Implementation
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import numpy as np
import time

def cpu_task(seed):
    rng = np.random.default_rng(seed)
    A = rng.random((200, 200))
    return np.linalg.eigvals(A @ A.T).sum()

seeds = list(range(20))

if __name__ == "__main__":  # required for process-based executors
    for Executor in [ProcessPoolExecutor, ThreadPoolExecutor]:
        t0 = time.perf_counter()
        with Executor(max_workers=4) as exe:
            list(exe.map(cpu_task, seeds))
        print(f"{Executor.__name__}: {time.perf_counter()-t0:.3f}s")
# ProcessPool wins for CPU-bound; Thread wins for I/O-bound
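The snippet above covers only the CPU-bound half. A minimal I/O-bound counterpart (a sketch; the throwaway files, their 10^6-byte size, and the 4-worker count are assumptions, not from the exercise) where ThreadPoolExecutor should come out ahead, since blocking reads release the GIL and overlap across threads:

```python
from concurrent.futures import ThreadPoolExecutor
import os, shutil, tempfile, time

# Set up some throwaway files to read.
tmpdir = tempfile.mkdtemp()
paths = []
for i in range(20):
    p = os.path.join(tmpdir, f"data_{i}.bin")
    with open(p, "wb") as fh:
        fh.write(os.urandom(1_000_000))
    paths.append(p)

def io_task(path):
    # Blocking read: releases the GIL, so threads overlap on I/O.
    with open(path, "rb") as fh:
        return len(fh.read())

t0 = time.perf_counter()
with ThreadPoolExecutor(max_workers=4) as exe:
    sizes = list(exe.map(io_task, paths))
print(f"threads: {time.perf_counter() - t0:.3f}s, bytes read: {sum(sizes)}")
shutil.rmtree(tmpdir)  # clean up the scratch files
```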
ex-sp-ch14-08
Medium: Use joblib.Parallel with backend='loky' to parallelize
a grid search over 100 hyperparameter combinations. Compare
wall-clock time with sequential execution.
Implementation
from joblib import Parallel, delayed
import numpy as np
import time

def evaluate(alpha, beta, seed=42):
    rng = np.random.default_rng(seed)
    X = rng.random((500, 50))
    y = rng.random(500)
    # Ridge regression via the normal equations, plus an L2 penalty term
    w = np.linalg.lstsq(X.T @ X + alpha * np.eye(50), X.T @ y, rcond=None)[0]
    return np.mean((X @ w - y)**2) + beta * np.sum(w**2)

params = [(a, b) for a in np.logspace(-3, 1, 10) for b in np.logspace(-3, 1, 10)]

t0 = time.perf_counter()
results = Parallel(n_jobs=4, backend='loky')(delayed(evaluate)(a, b) for a, b in params)
t_par = time.perf_counter() - t0

t0 = time.perf_counter()
results_seq = [evaluate(a, b) for a, b in params]
t_seq = time.perf_counter() - t0
print(f"Sequential: {t_seq:.2f}s, Parallel: {t_par:.2f}s")
ex-sp-ch14-09
Hard: Implement a Numba CUDA kernel for matrix multiplication
C = A B using shared memory tiling. Compare against
CuPy's cupy.matmul on large square matrices.
Hint: Use cuda.shared.array() for tile storage and synchronize with cuda.syncthreads().
Implementation sketch
import numba
from numba import cuda
import numpy as np

TILE = 16

@cuda.jit
def matmul_shared(A, B, C):
    # Per-block tiles staged in fast shared memory
    sA = cuda.shared.array((TILE, TILE), dtype=numba.float64)
    sB = cuda.shared.array((TILE, TILE), dtype=numba.float64)
    tx, ty = cuda.threadIdx.x, cuda.threadIdx.y
    row = cuda.blockIdx.y * TILE + ty
    col = cuda.blockIdx.x * TILE + tx
    tmp = 0.0
    for t in range((A.shape[1] + TILE - 1) // TILE):
        # Load one tile of A and B, zero-padding out-of-bounds entries
        # (otherwise stale values from the previous iteration leak in)
        if row < A.shape[0] and t*TILE + tx < A.shape[1]:
            sA[ty, tx] = A[row, t*TILE + tx]
        else:
            sA[ty, tx] = 0.0
        if t*TILE + ty < B.shape[0] and col < B.shape[1]:
            sB[ty, tx] = B[t*TILE + ty, col]
        else:
            sB[ty, tx] = 0.0
        cuda.syncthreads()
        for k in range(TILE):
            tmp += sA[ty, k] * sB[k, tx]
        cuda.syncthreads()
    if row < C.shape[0] and col < C.shape[1]:
        C[row, col] = tmp
ex-sp-ch14-10
Hard: Implement Newton's method for finding roots using JAX's automatic
differentiation. Use jax.jit to compile the iteration step and
find the root of f(x) = x^3 - 2x - 5 starting from x0 = 2.
Implementation
import jax
import jax.numpy as jnp

jax.config.update("jax_enable_x64", True)  # 64-bit floats; the 1e-12 tolerance is unreachable in float32

def f(x):
    return x**3 - 2*x - 5

df = jax.grad(f)

@jax.jit
def newton_step(x):
    return x - f(x) / df(x)

x = 2.0
for i in range(20):
    x_new = newton_step(x)
    if abs(x_new - x) < 1e-12:
        break
    x = x_new
print(f"Root: {x:.15f}")  # 2.094551481542327
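The loop above jits only the step; a fully-jitted variant (a sketch) uses jax.lax.while_loop so the entire iteration, convergence test included, compiles into one function:

```python
import jax
import jax.numpy as jnp

jax.config.update("jax_enable_x64", True)

def f(x):
    return x**3 - 2*x - 5

df = jax.grad(f)

@jax.jit
def newton(x0, tol=1e-12):
    # State carries (current, previous) so the loop condition can
    # test the step size without a Python-level break.
    def cond(state):
        x, x_prev = state
        return jnp.abs(x - x_prev) > tol
    def body(state):
        x, _ = state
        return x - f(x) / df(x), x
    x, _ = jax.lax.while_loop(cond, body, (x0 - f(x0) / df(x0), x0))
    return x

root = newton(2.0)
print(root)
```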
ex-sp-ch14-11
Hard: Write a pybind11 extension that implements a moving average filter
on a NumPy array. Compare its performance against np.convolve
and scipy.ndimage.uniform_filter1d on a large input array.
C++ source
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

namespace py = pybind11;

// Causal-window moving average: out[i] = mean of x[i .. i+window-1],
// computed with an O(n) running sum.
py::array_t<double> moving_avg(py::array_t<double> input, int window) {
    auto buf = input.request();
    auto *x = static_cast<double*>(buf.ptr);
    int n = static_cast<int>(buf.size);
    auto result = py::array_t<double>(n);
    auto *out = static_cast<double*>(result.request().ptr);
    double sum = 0;
    for (int i = 0; i < window && i < n; i++) sum += x[i];
    out[0] = sum / window;
    for (int i = 1; i < n; i++) {
        if (i + window - 1 < n) sum += x[i + window - 1];  // slide right edge in
        sum -= x[i - 1];                                   // slide left edge out
        out[i] = sum / window;  // tail windows are implicitly zero-padded
    }
    return result;
}

PYBIND11_MODULE(filters, m) {
    m.def("moving_avg", &moving_avg, "Running-sum moving average filter");
}
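For reference, the two library baselines can be compared directly in NumPy/SciPy (a sketch; the compiled filters.moving_avg module would be benchmarked the same way once built, and the array size and window width here are assumptions). For an odd window, both compute a centered zero-padded average and should agree:

```python
import numpy as np
from scipy.ndimage import uniform_filter1d

rng = np.random.default_rng(0)
x = rng.random(1_000_000)
w = 11  # odd window so both filters are centered identically

# Flat kernel via convolution; mode='same' zero-pads and keeps length n.
avg_conv = np.convolve(x, np.ones(w) / w, mode='same')

# Running-sum implementation; mode='constant' zero-pads the boundary.
avg_unif = uniform_filter1d(x, size=w, mode='constant', cval=0.0)

assert np.allclose(avg_conv, avg_unif)
```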
ex-sp-ch14-12
Challenge: Build a complete acceleration benchmark suite that tests the same algorithm (e.g., pairwise Euclidean distances for N points in R^d) across all methods: pure Python, NumPy, Numba (CPU + GPU), JAX, multiprocessing, and a ctypes C implementation. Plot speedup vs. N for each method.
Hint: Sweep N over several orders of magnitude with d fixed.
Hint: For ctypes, compile with -O3 -march=native.
Framework
import time
import numpy as np

def benchmark(func, *args, repeats=5):
    times = []
    for _ in range(repeats):
        t0 = time.perf_counter()
        func(*args)
        times.append(time.perf_counter() - t0)
    return min(times)

# Implement each variant and collect timings
# Plot log-log: N vs time, one line per method
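As a starting point for the suite, here is a sketch of two of the variants, pure Python and NumPy (the vectorized one uses the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b to avoid an explicit double loop); the remaining variants plug into benchmark() the same way. The 50 x 3 test array is an assumption for the correctness check:

```python
import numpy as np

def pairwise_numpy(X):
    # D[i, j] = ||X[i] - X[j]|| via the expanded squared-norm identity.
    sq = np.sum(X**2, axis=1)
    D2 = sq[:, None] + sq[None, :] - 2.0 * (X @ X.T)
    np.maximum(D2, 0.0, out=D2)  # clip tiny negatives from round-off
    return np.sqrt(D2)

def pairwise_python(X):
    # Pure-Python baseline, used to verify the fast variants.
    n, d = X.shape
    D = [[sum((X[i, k] - X[j, k])**2 for k in range(d))**0.5
          for j in range(n)] for i in range(n)]
    return np.array(D)

rng = np.random.default_rng(0)
X = rng.random((50, 3))
assert np.allclose(pairwise_numpy(X), pairwise_python(X))
```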