pickle and Binary Files
pickle is Python's standard library for serializing Python objects into a byte stream and deserializing them back. It's used to save Python objects to files or transmit them over a network.
pickle Basics: dumps/loads, dump/load
import pickle

# --- In-memory serialization/deserialization ---
# A simple nested structure built only from picklable builtin types.
data = {
    "name": "Alice",
    "age": 30,
    "scores": [95, 87, 92],
    "metadata": {"active": True, "role": "admin"},
}
# Object → byte stream
serialized = pickle.dumps(data)
print(type(serialized))  # <class 'bytes'>
print(len(serialized))  # number of bytes
# Byte stream → object
restored = pickle.loads(serialized)
print(restored == data)  # True (structurally equal)
print(restored is data)  # False (a brand-new object is created)
# --- Saving to / loading from a file ---
# Save
with open("data.pkl", "wb") as f:  # Must use binary mode "wb" — pickle is a byte format
    pickle.dump(data, f)
# Load
with open("data.pkl", "rb") as f:  # Binary read "rb"
    loaded = pickle.load(f)
print(loaded)
# --- Saving multiple objects to one file ---
records = [
    {"id": 1, "value": "first"},
    {"id": 2, "value": "second"},
    {"id": 3, "value": "third"},
]

# Each pickle.dump call appends one complete, self-delimiting pickle frame,
# so several objects can live back-to-back in a single file.
with open("records.pkl", "wb") as out:
    for rec in records:
        pickle.dump(rec, out)

def _iter_pickles(fileobj):
    """Yield successive objects from an open pickle file until exhausted."""
    while True:
        try:
            yield pickle.load(fileobj)
        except EOFError:
            return  # Reached end of file

# Load consecutively
with open("records.pkl", "rb") as inp:
    loaded_records = list(_iter_pickles(inp))
print(loaded_records)
pickle Protocol Versions
import pickle
import sys

# NOTE(review): pickle.DEFAULT_PROTOCOL is 4 on Python 3.8+ even though
# protocol 5 exists — the original comment said "Usually 5"; confirm on
# your interpreter with the print below.
print(f"Default protocol: {pickle.DEFAULT_PROTOCOL}")
print(f"Highest protocol: {pickle.HIGHEST_PROTOCOL}")  # Varies by Python version (5 since 3.8)
data = list(range(10_000))
# Compare serialized sizes across every protocol this interpreter supports
for protocol in range(pickle.HIGHEST_PROTOCOL + 1):
    serialized = pickle.dumps(data, protocol=protocol)
    print(f"Protocol {protocol}: {len(serialized):,} bytes")
# Save with a specific protocol
with open("data_v5.pkl", "wb") as f:
    pickle.dump(data, f, protocol=5)
# pickle.HIGHEST_PROTOCOL is recommended when only recent Pythons must read the file
# Use a lower protocol if compatibility with older versions is needed
with open("data_compat.pkl", "wb") as f:
    pickle.dump(data, f, protocol=2)  # Protocol 2 is readable by Python 2.3+
Serializing User-Defined Classes
import pickle
from datetime import datetime
class Employee:
    """An ordinary class — pickle serializes its ``__dict__`` automatically.

    No special methods are needed: by default every instance attribute
    (including the private ``_cache``) ends up in the pickle stream.
    """

    def __init__(self, emp_id: str, name: str, salary: float, hire_date: datetime):
        self.emp_id = emp_id
        self.name = name
        self.salary = salary
        self.hire_date = hire_date
        # Transient cache — a candidate for exclusion from serialization.
        self._cache: dict = {}

    def __repr__(self) -> str:
        cls_name = type(self).__name__
        return f"{cls_name}(id={self.emp_id!r}, name={self.name!r})"
# Default serialization: pickle stores everything found in the instance's __dict__
emp = Employee("E001", "Alice", 80000.0, datetime(2020, 3, 15))
serialized = pickle.dumps(emp)
restored = pickle.loads(serialized)
print(restored)
print(restored.salary)  # 80000.0 — all attributes round-trip unchanged
# Control serialization with __getstate__ / __setstate__
class OptimizedEmployee:
    """Demonstrates customizing pickle via ``__getstate__``/``__setstate__``.

    The transient attributes ``_cache`` (cheap to rebuild) and
    ``_connection`` (a live resource that cannot be pickled) are dropped
    from the serialized state and recreated empty on load.
    """

    def __init__(self, emp_id: str, name: str, salary: float):
        self.emp_id = emp_id
        self.name = name
        self.salary = salary
        self._cache: dict = {}      # Cache (no need to serialize)
        self._connection = None     # Connection object (cannot serialize)

    def __getstate__(self) -> dict:
        """Return the picklable state, omitting the transient attributes."""
        return {
            key: value
            for key, value in self.__dict__.items()
            if key not in ("_cache", "_connection")
        }

    def __setstate__(self, state: dict) -> None:
        """Restore the pickled state and recreate the excluded attributes."""
        self.__dict__.update(state)
        self._cache = {}            # Re-initialize with empty cache
        self._connection = None     # Connection must be re-established separately

    def __repr__(self) -> str:
        return f"OptimizedEmployee({self.emp_id!r}, {self.name!r})"
oe = OptimizedEmployee("E002", "Bob", 90000.0)
oe._cache = {"key": "value"}  # Populate the cache before pickling
serialized = pickle.dumps(oe)
restored = pickle.loads(serialized)
print(restored)
print(restored._cache)  # {} (re-initialized by __setstate__)
print(restored._connection)  # None (caller must re-establish the connection)
Serializable vs Non-Serializable Objects
import pickle

# ✅ Objects that can be serialized — all builtin scalar and container types
serializable_objects = [
    None,
    True, False,
    42, 3.14, 1 + 2j,
    "string",
    b"bytes",
    [1, 2, 3],
    (1, "a"),
    {"key": "value"},
    {1, 2, 3},
    range(10),
]
for obj in serializable_objects:
    try:
        data = pickle.dumps(obj)
        restored = pickle.loads(data)
        print(f"✅ {type(obj).__name__}: {obj!r} → serialization succeeded")
    except Exception as e:
        print(f"❌ {type(obj).__name__}: {e}")

# ❌ Objects that cannot be serialized
import threading
import io

def my_function():
    pass  # Regular module-level functions (unlike lambdas) can be serialized

non_serializable = [
    lambda x: x * 2,  # Lambda functions cannot be serialized (no importable name)
    threading.Lock(),  # Thread locks wrap OS-level state
    io.StringIO("test"),  # Some IO objects
]
for obj in non_serializable:
    try:
        pickle.dumps(obj)
        print(f"✅ {type(obj).__name__} serialization succeeded")
    except (pickle.PicklingError, AttributeError, TypeError) as e:
        print(f"❌ {type(obj).__name__}: {e}")
# Functions and classes defined at module level can be serialized:
# pickle stores only the module + qualified name, not the code itself,
# so the same definition must be importable at load time.
import pickle

def add(a, b):
    """Return the sum of a and b."""
    return a + b

serialized_func = pickle.dumps(add)  # Module-level functions are OK
restored_func = pickle.loads(serialized_func)
print(restored_func(3, 4))  # 7
Security Warnings
import pickle

# ⚠️ WARNING: Never use pickle.loads() on untrusted data!
# Arbitrary code can be executed during pickle deserialization.
# Example of malicious pickle data (DO NOT RUN!):
# The following code could be executed during deserialization:
#
# class Exploit:
#     def __reduce__(self):
#         return (os.system, ("rm -rf /",))  # Executes system command!
#
# malicious = pickle.dumps(Exploit())
# pickle.loads(malicious)  # Executes rm -rf /!

# ✅ Safe usage guidelines:
# 1. Only load pickle files generated by your own code
# 2. Never load pickle data received from external sources
# 3. Do not use pickle for network communication
# 4. Do not use pickle data created from user input

# ✅ Alternative — JSON deserializes safely without executing code
import json

safe_data = {"name": "Alice", "age": 30, "scores": [95, 87]}
json_str = json.dumps(safe_data, ensure_ascii=False)
restored = json.loads(json_str)  # Safe: JSON does not execute code
print(restored)
# Using hmac to sign pickle data for integrity verification
import hmac
import hashlib

def secure_dumps(obj, secret_key: bytes) -> tuple[bytes, bytes]:
    """Pickle *obj* and return ``(payload, HMAC-SHA256 signature)``."""
    payload = pickle.dumps(obj)
    mac = hmac.new(secret_key, payload, hashlib.sha256)
    return payload, mac.digest()

def secure_loads(data: bytes, signature: bytes, secret_key: bytes):
    """Deserialize *data* only after its HMAC signature verifies.

    Raises:
        ValueError: if the signature does not match the payload.
    """
    mac = hmac.new(secret_key, data, hashlib.sha256)
    # compare_digest is constant-time, defeating timing attacks.
    if not hmac.compare_digest(signature, mac.digest()):
        raise ValueError("Data has been tampered with!")
    return pickle.loads(data)

secret = b"my-secret-key-32-bytes-long!!!!!"
original = {"user": "admin", "role": "superuser"}
data, sig = secure_dumps(original, secret)
restored = secure_loads(data, sig, secret)
print(f"Signature verified: {restored}")

# Tampering attempt: flip every bit of the payload's final byte
tampered_data = data[:-1] + bytes([data[-1] ^ 0xFF])
try:
    secure_loads(tampered_data, sig, secret)
except ValueError as e:
    print(f"Tampering detected: {e}")
Persistent Storage with the shelve Module
shelve is a persistent dictionary based on pickle. It lets you save and retrieve Python objects from a file using key-value pairs.
import shelve
from dataclasses import dataclass, field
from datetime import datetime
@dataclass
class UserProfile:
    """Value object stored under a string key in the shelve database."""

    user_id: str
    username: str
    email: str
    created_at: datetime = field(default_factory=datetime.now)
    preferences: dict = field(default_factory=dict)

    def __repr__(self) -> str:
        # Keep the repr short — the username alone identifies the profile.
        return "UserProfile(" + repr(self.username) + ")"
# Using shelve — works like a dictionary backed by a pickle file
with shelve.open("users_db") as db:
    # Save — assignment pickles the value under the string key
    user1 = UserProfile("U001", "alice", "alice@example.com")
    user2 = UserProfile("U002", "bob", "bob@example.com",
                        preferences={"theme": "dark", "lang": "en"})
    db["U001"] = user1
    db["U002"] = user2
    # Caution: in-place mutations of a fetched object are NOT persisted
    # unless the object is re-assigned. Safe way:
    u = db["U001"]
    u.preferences["notifications"] = True
    db["U001"] = u  # Explicit re-assignment writes the change back
    print(f"Users saved: {len(db)}")
    print(f"Keys: {list(db.keys())}")

# Read back from file later — the shelf persists across opens
with shelve.open("users_db") as db:
    for user_id, user in db.items():
        print(f" {user_id}: {user}")
    # Lookup a specific key
    if "U001" in db:
        alice = db["U001"]
        print(f"Alice preferences: {alice.preferences}")
    # Delete
    if "U002" in db:
        del db["U002"]
    print(f"After deleting U002: {list(db.keys())}")

# writeback=True option — auto-detects modifications (watch memory usage)
with shelve.open("users_db", writeback=True) as db:
    if "U001" in db:
        db["U001"].preferences["auto_save"] = True  # No re-assignment needed with writeback=True
    # But all accessed items are cached in memory and rewritten on close,
    # increasing memory usage
Binary File Handling with the struct Module
The struct module packs and unpacks binary data like C structs. It's used for file format parsing, network protocols, hardware communication, and more.
import struct

# struct format strings
# '>' big-endian, '<' little-endian, '=' native endian
# 'i' 4-byte int, 'f' 4-byte float, 'd' 8-byte double
# 'c' 1-byte char, 's' string, 'B' 1-byte unsigned int

# Pack (Python → bytes)
fmt = ">ifd"  # big-endian: int(4) + float(4) + double(8) = 16 bytes, no padding
packed = struct.pack(fmt, 42, 3.14, 2.718281828)
print(f"Packed: {packed.hex()}")
print(f"Size: {struct.calcsize(fmt)} bytes")

# Unpack (bytes → Python)
unpacked = struct.unpack(fmt, packed)
print(f"Unpacked: {unpacked}")  # (42, 3.14..., 2.718...)

# Writing/reading binary files with struct
RECORD_FORMAT = ">I10sf"  # unsigned int(4) + 10-byte name + float(4) = 18 bytes
RECORD_SIZE = struct.calcsize(RECORD_FORMAT)

# Record sensor data — names shorter than 10 bytes get null-padded by 's'
sensor_data = [
    (1001, b"Sensor_A ", 23.5),
    (1002, b"Sensor_B ", 18.2),
    (1003, b"Sensor_C ", 35.7),
]
with open("sensors.bin", "wb") as f:
    for sensor_id, name, temperature in sensor_data:
        record = struct.pack(RECORD_FORMAT, sensor_id, name, temperature)
        f.write(record)

# Read binary file one fixed-size record at a time
records_read = []
with open("sensors.bin", "rb") as f:
    while True:
        raw = f.read(RECORD_SIZE)
        if len(raw) < RECORD_SIZE:
            break  # EOF (or a trailing partial record)
        sensor_id, name, temperature = struct.unpack(RECORD_FORMAT, raw)
        records_read.append({
            "id": sensor_id,
            # Strip both the trailing space from the source data and the
            # null padding added by the 's' format code
            "name": name.rstrip(b"\x00 ").decode("ascii"),
            "temperature": round(temperature, 1),
        })
print("Sensor data read:")
for record in records_read:
    print(f" {record}")

# Random access to a specific position (possible since fixed-size records)
with open("sensors.bin", "rb") as f:
    # Directly seek to the second record (offset = index * record size)
    f.seek(RECORD_SIZE * 1)
    raw = f.read(RECORD_SIZE)
    sensor_id, name, temp = struct.unpack(RECORD_FORMAT, raw)
    print(f"\n2nd record: ID={sensor_id}, temp={temp:.1f}°C")
Alternative Libraries
joblib — Large Arrays / ML Models
# pip install joblib
import joblib
import numpy as np  # pip install numpy

# joblib serializes NumPy arrays efficiently (faster than pickle)
data = {
    "weights": np.random.randn(1000, 1000),  # Large NumPy array
    "labels": np.array([0, 1, 2, 3]),
    "config": {"learning_rate": 0.001},
}
# Save
joblib.dump(data, "model_data.joblib")
# Memory-mapped load (arrays are read lazily from disk, not loaded into RAM)
loaded = joblib.load("model_data.joblib", mmap_mode="r")
print(f"weights shape: {loaded['weights'].shape}")
# Specify compression level (0=none, 9=maximum; higher = smaller but slower)
joblib.dump(data, "model_compressed.joblib", compress=3)
msgpack — Fast Cross-Language Serialization
# pip install msgpack
import msgpack

# msgpack is similar to JSON but smaller and faster
# Ideal for exchanging data with other languages (JavaScript, Java, etc.)
data = {
    "user_id": 12345,
    "name": "Alice",
    "scores": [95, 87, 92, 88],
    "active": True,
}
# Serialize (use_bin_type=True distinguishes str from bytes on the wire)
packed = msgpack.packb(data, use_bin_type=True)
print(f"msgpack size: {len(packed)} bytes")
# Compare with JSON
import json
json_str = json.dumps(data, ensure_ascii=False).encode()
print(f"JSON size: {len(json_str)} bytes")
# Deserialize (raw=False decodes string payloads back to str)
unpacked = msgpack.unpackb(packed, raw=False)
print(f"Restored: {unpacked}")
dill — Serialize Lambdas and Closures Too
# pip install dill
import dill

# dill can serialize more Python objects than pickle:
# it supports lambdas, closures, nested functions, etc.
multiplier = lambda x: x * 3  # Regular pickle can't serialize lambdas

def make_adder(n: int):
    """Return a closure that adds n to its argument."""
    def adder(x: int) -> int:
        return x + n
    return adder

add5 = make_adder(5)  # Closure capturing n=5

# Serialize with dill
serialized_lambda = dill.dumps(multiplier)
serialized_closure = dill.dumps(add5)
# Deserialize
restored_lambda = dill.loads(serialized_lambda)
restored_closure = dill.loads(serialized_closure)
print(restored_lambda(7))  # 21
print(restored_closure(10))  # 15

# Useful in multiprocessing with lambdas:
# multiprocessing.Pool uses pickle by default,
# but with dill, lambdas can be passed to other processes
Real-World Example: Checkpoint System
import pickle
import os
import hashlib
from datetime import datetime
from pathlib import Path
class Checkpoint:
    """Checkpoint system that saves intermediate training/computation state.

    Each checkpoint is a pickle file plus a sidecar ``<name>.sha256`` file
    used to detect on-disk corruption. NOTE(review): the checksum is
    unsigned, so it detects corruption but not deliberate tampering — use
    an HMAC with a secret key if tampering is a concern.
    """

    def __init__(self, checkpoint_dir: str = "./checkpoints"):
        self.checkpoint_dir = Path(checkpoint_dir)
        # Create the directory tree up front; no-op if it already exists.
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)

    def save(self, name: str, state: dict, metadata: dict | None = None) -> Path:
        """Save *state* as a checkpoint and write its SHA-256 sidecar.

        Args:
            name: Checkpoint name; becomes the file stem on disk.
            state: Arbitrary picklable state to persist.
            metadata: Optional descriptive info stored alongside the state.

        Returns:
            Path to the written ``.pkl`` file.
        """
        checkpoint = {
            "name": name,
            "state": state,
            "metadata": metadata or {},
            "timestamp": datetime.now().isoformat(),
            "version": 1,  # envelope format version, for future migrations
        }
        filepath = self.checkpoint_dir / f"{name}.pkl"
        with open(filepath, "wb") as f:
            pickle.dump(checkpoint, f, protocol=pickle.HIGHEST_PROTOCOL)
        # Save checksum of the exact bytes just written
        checksum = self._compute_checksum(filepath)
        checksum_file = self.checkpoint_dir / f"{name}.sha256"
        checksum_file.write_text(checksum)
        print(f"Checkpoint saved: {filepath} (checksum: {checksum[:8]}...)")
        return filepath

    def load(self, name: str) -> dict:
        """Load a checkpoint's state, verifying file integrity first.

        Raises:
            FileNotFoundError: if no checkpoint with this name exists.
            ValueError: if the file's checksum does not match its sidecar.
        """
        filepath = self.checkpoint_dir / f"{name}.pkl"
        checksum_file = self.checkpoint_dir / f"{name}.sha256"
        if not filepath.exists():
            raise FileNotFoundError(f"Checkpoint not found: {name}")
        # Integrity check (skipped when the sidecar file is missing)
        if checksum_file.exists():
            expected = checksum_file.read_text().strip()
            actual = self._compute_checksum(filepath)
            if actual != expected:
                raise ValueError(f"Checkpoint is corrupted: {name}")
        with open(filepath, "rb") as f:
            checkpoint = pickle.load(f)
        print(f"Checkpoint loaded: {name} (saved at: {checkpoint['timestamp']})")
        return checkpoint["state"]

    def list_checkpoints(self) -> list[dict]:
        """Return summary info for every saved checkpoint.

        Note: each ``.pkl`` file is fully unpickled just to read its
        envelope fields, so this is O(total checkpoint size).
        """
        result = []
        for pkl_file in sorted(self.checkpoint_dir.glob("*.pkl")):
            with open(pkl_file, "rb") as f:
                cp = pickle.load(f)
            result.append({
                "name": cp["name"],
                "timestamp": cp["timestamp"],
                "size": pkl_file.stat().st_size,
                "metadata": cp.get("metadata", {}),
            })
        return result

    def delete(self, name: str) -> None:
        """Delete a checkpoint and its checksum sidecar, if present."""
        for ext in [".pkl", ".sha256"]:
            path = self.checkpoint_dir / f"{name}{ext}"
            if path.exists():
                path.unlink()
        print(f"Checkpoint deleted: {name}")

    @staticmethod
    def _compute_checksum(filepath: Path) -> str:
        """Return the hex SHA-256 of *filepath*, streamed in 8 KiB chunks."""
        h = hashlib.sha256()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(8192), b""):
                h.update(chunk)
        return h.hexdigest()
# Usage example
cp = Checkpoint("./checkpoints")
# Simulate saving training state
training_state = {
    "epoch": 50,
    "loss": 0.0234,
    "accuracy": 0.987,
    "model_weights": [0.1, 0.2, 0.3, 0.4, 0.5],  # Would be NumPy arrays in practice
    "optimizer_state": {"lr": 0.001, "momentum": 0.9},
}
cp.save("training_epoch50", training_state,
        metadata={"model": "ResNet50", "dataset": "ImageNet"})
# Restore later — returns only the "state" portion of the envelope
restored_state = cp.load("training_epoch50")
print(f"Restored epoch: {restored_state['epoch']}")
print(f"Restored accuracy: {restored_state['accuracy']}")
# List checkpoints
for info in cp.list_checkpoints():
    print(f" {info['name']}: {info['timestamp']} ({info['size']} bytes)")
Pro Tips
1. When to use pickle vs JSON
# Use pickle: Python-only, any Python object, fast
# - Machine learning models, numpy arrays, complex object structures
# - Saving data within the same codebase
# Use JSON: language-agnostic, human-readable, safe
# - API responses, config files, logs
# - Data exchange with other languages/systems
# Decision flow
def choose_serialization(obj, cross_language=False, human_readable=False):
    """Suggest a serialization format for *obj*.

    Args:
        obj: The object that needs to be serialized.
        cross_language: True if other languages must read the data.
        human_readable: True if humans must be able to inspect the data.

    Returns:
        The name of the recommended format/library as a string.
    """
    if cross_language or human_readable:
        return "JSON (or msgpack)"
    if hasattr(obj, "__array__"):  # duck-types numpy-like arrays
        return "joblib"
    if callable(obj):  # plain functions, lambdas, closures
        return "dill"
    return "pickle"
2. Include version info in pickle files
import pickle

CURRENT_VERSION = 2

def save_with_version(filepath: str, data: dict) -> None:
    """Wrap *data* in a version envelope and pickle it to *filepath*."""
    envelope = {"version": CURRENT_VERSION, "data": data}
    with open(filepath, "wb") as f:
        pickle.dump(envelope, f)

def load_with_migration(filepath: str) -> dict:
    """Load a versioned pickle, upgrading old formats on the fly."""
    with open(filepath, "rb") as f:
        envelope = pickle.load(f)
    data = envelope["data"]
    # Files written before versioning existed default to version 1.
    if envelope.get("version", 1) == 1:
        data = migrate_v1_to_v2(data)
    return data

def migrate_v1_to_v2(data: dict) -> dict:
    """v1 → v2 migration: add the field introduced in version 2."""
    data.setdefault("new_field", "default_value")
    return data
Summary
| Module/Library | Characteristics | Use Case |
|---|---|---|
| pickle | Python-only, fast, security risk | Internal data caching, checkpoints |
| shelve | pickle-based dictionary DB | Simple persistent storage |
| struct | C struct binary format | File formats, network protocols |
| joblib | Optimized for NumPy/ML models | Saving ML models |
| msgpack | Cross-language, fast | Multi-language data exchange |
| dill | Serializes lambdas/closures | Multiprocessing, complex functions |
Security rule: Never use pickle.loads() on untrusted external data. Arbitrary code can execute during deserialization.