C Extensions and Alternatives

Overcome Python's speed limits with Numba, Cython, ctypes, and cffi.

Installation

pip install cython numba cffi

Numba — JIT Compiler (Easiest Approach)

from numba import njit, jit, prange
import numpy as np
import time


# ── Basic @njit ───────────────────────────────────────────
@njit
def sum_squares(n: int) -> float:
    """Return the sum of ``i * i`` for every ``i`` in ``range(n)`` as a float.

    The accumulator starts at ``0.0``, so the result is a float even though
    the loop index is an integer. ``n <= 0`` yields ``0.0``.
    """
    acc = 0.0
    for k in range(n):
        square = k * k
        acc += square
    return acc


# Warm-up: the first call triggers compilation, so keep it out of the timing
sum_squares(100)

# Comparison: the same problem size for the interpreter and the compiled code
n_items = 10_000_000

start = time.perf_counter()
squares = (i * i for i in range(n_items))
result_py = sum(squares)
print(f"Python: {time.perf_counter() - start:.3f}s")

start = time.perf_counter()
result_nb = sum_squares(n_items)
print(f"Numba:  {time.perf_counter() - start:.3f}s")


# ── NumPy array processing ────────────────────────────────
@njit(parallel=True)  # parallel execution
def parallel_sum(arr: np.ndarray) -> float:
    """Add up the elements of ``arr`` with a parallel loop.

    Indexes ``arr`` along its first axis and accumulates into a float that
    starts at ``0.0``; an empty array therefore returns ``0.0``.
    """
    count = len(arr)
    acc = 0.0
    for idx in prange(count):  # prange = parallel range
        acc += arr[idx]
    return acc


# Ten million random floats as input for the parallel reduction
arr = np.random.random(size=10_000_000)
result = parallel_sum(arr)


# ── Vectorized ufunc ──────────────────────────────────────
from numba import vectorize, float64


@vectorize([float64(float64, float64)])
def clip_and_scale(x, threshold):
    """Element-wise kernel: cap values above ``threshold``, double the rest.

    Returns ``threshold`` when ``x > threshold``; otherwise returns
    ``x * 2.0`` (the doubled value is not compared against ``threshold``).
    """
    return threshold if x > threshold else x * 2.0


# The scalar threshold is broadcast against every element of `data`
threshold = 3.0
data = np.array([1.0, 3.0, 5.0, 2.0, 4.0])
result = clip_and_scale(data, threshold)  # works like a NumPy ufunc

Cython — Compile Python to C

# fibonacci.pyx — Cython source file
# cdef: C type declarations for speed

# --- fibonacci.pyx ---
# def fib_py(n):           # pure Python function
#     if n <= 1: return n
#     return fib_py(n-1) + fib_py(n-2)
#
# cpdef long fib_c(long n):   # C type declaration
#     if n <= 1: return n
#     return fib_c(n-1) + fib_c(n-2)
#
# cdef long fib_internal(long n):  # C-only (not callable from Python)
#     if n <= 1: return n
#     return fib_internal(n-1) + fib_internal(n-2)

# --- setup.py ---
# from setuptools import setup
# from Cython.Build import cythonize
#
# setup(ext_modules=cythonize("fibonacci.pyx"))

# Build:
# python setup.py build_ext --inplace

# Usage:
# import fibonacci
# print(fibonacci.fib_c(40))

# optimized_math.pyx — fully typed version
import numpy as np
cimport numpy as cnp

def moving_average(cnp.ndarray[cnp.double_t, ndim=1] arr, int window):
    """Moving average — Cython optimized

    Computes a trailing average over a 1-D array of doubles using a running
    sum, so each element is added once and removed once.

    For every index ``i >= window - 1``, ``result[i]`` is the mean of the
    ``window`` values ending at ``arr[i]``. Entries before that index are
    never assigned and keep the ``0.0`` written by ``np.zeros``.

    ``window`` is not validated here.
    NOTE(review): presumably callers pass ``1 <= window <= len(arr)`` —
    confirm, since zero or negative values reach the division/indexing below.
    """
    # C-typed locals so the loop below works on typed values
    cdef int n = len(arr)
    cdef cnp.ndarray[cnp.double_t, ndim=1] result = np.zeros(n)
    cdef double total = 0.0  # running sum of the current window
    cdef int i

    for i in range(n):
        total += arr[i]
        # Once more than `window` values were added, drop the oldest one
        if i >= window:
            total -= arr[i - window]
        # Only emit an average when a full window is available
        if i >= window - 1:
            result[i] = total / window

    return result

ctypes — Call C Libraries Directly

import ctypes
import ctypes.util
import os
import sys


# ── Standard C library functions ─────────────────────────
# Load the platform C runtime: msvcrt on Windows, the C library elsewhere
if sys.platform == "win32":
    libc = ctypes.CDLL("msvcrt.dll")
else:
    libc = ctypes.CDLL(ctypes.util.find_library("c"))

# printf
# printf is variadic. Declaring its fixed parameter (the format string) lets
# ctypes use the variadic calling convention where it differs from the normal
# one (e.g. ARM64 macOS); extra arguments are still accepted after it.
libc.printf.argtypes = [ctypes.c_char_p]
libc.printf(b"Hello from C: %d\n", 42)

# ── Custom C library ──────────────────────────────────────
# mathlib.c:
# double add(double a, double b) { return a + b; }
# int factorial(int n) { return n <= 1 ? 1 : n * factorial(n-1); }

# Build: gcc -shared -fPIC -o mathlib.so mathlib.c

# lib = ctypes.CDLL("./mathlib.so")
# lib.add.argtypes = [ctypes.c_double, ctypes.c_double]
# lib.add.restype = ctypes.c_double
# print(lib.add(3.14, 2.72))

# ── Struct definition ─────────────────────────────────────
class Point(ctypes.Structure):
    """C struct holding two doubles, ``x`` followed by ``y``."""

    # Both coordinates share the same C type, so build the field list from names
    _fields_ = [(axis, ctypes.c_double) for axis in ("x", "y")]


class Rectangle(ctypes.Structure):
    """C struct made of two nested ``Point`` structs: top-left, bottom-right."""

    # Field order fixes the memory layout: top_left first, then bottom_right
    _fields_ = [(corner, Point) for corner in ("top_left", "bottom_right")]


# Struct fields read back as ordinary Python floats
p = Point(1.0, 2.0)
print(f"Point({p.x}, {p.y})")

# Nested structs: the size is the difference between the two corners
rect = Rectangle(top_left=Point(0, 0), bottom_right=Point(10, 5))
corner_a, corner_b = rect.top_left, rect.bottom_right
width  = corner_b.x - corner_a.x
height = corner_b.y - corner_a.y
print(f"Rectangle {width} x {height}")

# ── Arrays ────────────────────────────────────────────────
# Multiplying a ctypes type by an int creates a fixed-length array type
IntArray5 = ctypes.c_int * 5
arr = IntArray5(10, 20, 30, 40, 50)
# ctypes arrays are iterable, so the length does not need to be repeated here
for value in arr:
    print(value)

cffi — Better C Interface

from cffi import FFI

import sys

ffi = FFI()

# Declare C function signatures
ffi.cdef("""
    double sqrt(double x);
    int    abs(int n);
""")

# Load library: msvcrt on Windows, the standard C library elsewhere
if sys.platform == "win32":
    lib = ffi.dlopen("msvcrt.dll")
else:
    lib = ffi.dlopen(None)  # None = standard C library

result = lib.sqrt(16.0)
print(result)  # 4.0

neg = lib.abs(-42)
print(neg)     # 42

# Inline C compilation
# A second FFI instance declares `add`; verify() receives the C source that
# implements it, and the returned object exposes `add` to Python.
# NOTE(review): cffi's documentation appears to mark verify() as deprecated in
# favour of set_source() + compile() — confirm before reusing this pattern.
ffi_inline = FFI()
ffi_inline.cdef("int add(int a, int b);")

lib_inline = ffi_inline.verify("""
    int add(int a, int b) {
        return a + b;
    }
""")

print(lib_inline.add(3, 4))  # 7

Performance Comparison

import timeit
import numpy as np

N = 1_000_000

# Pure Python
def py_sum_sq(n):
    """Return the sum of ``i * i`` for ``i`` in ``range(n)`` in pure Python."""
    total = 0
    for i in range(n):
        total += i * i
    return total

# NumPy
def np_sum_sq(n):
    """Return the sum of squares of ``0 .. n-1`` computed on a float64 array."""
    values = np.arange(n, dtype=np.float64)
    squares = np.square(values)
    return squares.sum()

# Time the same number of runs of each implementation on the same N
runs = 3
t_py = timeit.Timer(lambda: py_sum_sq(N)).timeit(number=runs)
t_np = timeit.Timer(lambda: np_sum_sq(N)).timeit(number=runs)

print(f"Python: {t_py:.3f}s")
print(f"NumPy:  {t_np:.3f}s  ({t_py/t_np:.0f}x faster)")
# Numba @njit is comparable to or faster than NumPy for heavy loops

Summary

Tool	Difficulty	Speedup	Best For
`Numba` @njit	Easy	10–100x	Numeric loops, NumPy arrays
`Cython`	Medium	10–200x	Complex logic, type declarations
`ctypes`	Hard	N/A	Calling existing C libraries
`cffi`	Medium	N/A	Safer C interface

Recommended order: NumPy/Pandas vectorization → Numba @njit → Cython → ctypes/cffi

Installation

Numba — JIT Compiler (Easiest Approach)

Cython — Compile Python to C

ctypes — Call C Libraries Directly

cffi — Better C Interface

Performance Comparison

Summary

Installation

Numba — JIT Compiler (Easiest Approach)

Cython — Compile Python to C

ctypes — Call C Libraries Directly

cffi — Better C Interface

Performance Comparison

Summary