pickle and Binary Files
pickle is Python's standard library for serializing Python objects into a byte stream and deserializing them back. It's used to save Python objects to files or transmit them over a network.
pickle Basics: dumps/loads, dump/load
import pickle

# --- In-memory serialization/deserialization ---
# A simple nested structure built only from picklable builtin types.
data = {
    "name": "Alice",
    "age": 30,
    "scores": [95, 87, 92],
    "metadata": {"active": True, "role": "admin"},
}
# Object → byte stream
serialized = pickle.dumps(data)
print(type(serialized))  # <class 'bytes'>
print(len(serialized))  # number of bytes
# Byte stream → object
restored = pickle.loads(serialized)
print(restored == data)  # True (structurally equal)
print(restored is data)  # False (a brand-new object is created)
# --- Saving to / loading from a file ---
# Save
with open("data.pkl", "wb") as f:  # Must use binary mode "wb" — pickle is a byte format
    pickle.dump(data, f)
# Load
with open("data.pkl", "rb") as f:  # Binary read "rb"
    loaded = pickle.load(f)
print(loaded)
# --- Saving multiple objects to one file ---
records = [
    {"id": 1, "value": "first"},
    {"id": 2, "value": "second"},
    {"id": 3, "value": "third"},
]

# Each pickle.dump call appends one complete, self-delimiting pickle frame,
# so several objects can live back-to-back in a single file.
with open("records.pkl", "wb") as out:
    for rec in records:
        pickle.dump(rec, out)

def _iter_pickles(fileobj):
    """Yield successive objects from an open pickle file until exhausted."""
    while True:
        try:
            yield pickle.load(fileobj)
        except EOFError:
            return  # Reached end of file

# Load consecutively
with open("records.pkl", "rb") as inp:
    loaded_records = list(_iter_pickles(inp))
print(loaded_records)
pickle Protocol Versions
import pickle
import sys

# NOTE(review): pickle.DEFAULT_PROTOCOL is 4 on Python 3.8+ even though
# protocol 5 exists — the original comment said "Usually 5"; confirm on
# your interpreter with the print below.
print(f"Default protocol: {pickle.DEFAULT_PROTOCOL}")
print(f"Highest protocol: {pickle.HIGHEST_PROTOCOL}")  # Varies by Python version (5 since 3.8)
data = list(range(10_000))
# Compare serialized sizes across every protocol this interpreter supports
for protocol in range(pickle.HIGHEST_PROTOCOL + 1):
    serialized = pickle.dumps(data, protocol=protocol)
    print(f"Protocol {protocol}: {len(serialized):,} bytes")
# Save with a specific protocol
with open("data_v5.pkl", "wb") as f:
    pickle.dump(data, f, protocol=5)
# pickle.HIGHEST_PROTOCOL is recommended when only recent Pythons must read the file
# Use a lower protocol if compatibility with older versions is needed
with open("data_compat.pkl", "wb") as f:
    pickle.dump(data, f, protocol=2)  # Protocol 2 is readable by Python 2.3+
Serializing User-Defined Classes
import pickle
from datetime import datetime
class Employee:
    """An ordinary class — pickle serializes its ``__dict__`` automatically.

    No special methods are needed: by default every instance attribute
    (including the private ``_cache``) ends up in the pickle stream.
    """

    def __init__(self, emp_id: str, name: str, salary: float, hire_date: datetime):
        self.emp_id = emp_id
        self.name = name
        self.salary = salary
        self.hire_date = hire_date
        # Transient cache — a candidate for exclusion from serialization.
        self._cache: dict = {}

    def __repr__(self) -> str:
        cls_name = type(self).__name__
        return f"{cls_name}(id={self.emp_id!r}, name={self.name!r})"
# Default serialization: pickle stores everything found in the instance's __dict__
emp = Employee("E001", "Alice", 80000.0, datetime(2020, 3, 15))
serialized = pickle.dumps(emp)
restored = pickle.loads(serialized)
print(restored)
print(restored.salary)  # 80000.0 — all attributes round-trip unchanged
# Control serialization with __getstate__ / __setstate__
class OptimizedEmployee:
    """Demonstrates customizing pickle via ``__getstate__``/``__setstate__``.

    The transient attributes ``_cache`` (cheap to rebuild) and
    ``_connection`` (a live resource that cannot be pickled) are dropped
    from the serialized state and recreated empty on load.
    """

    def __init__(self, emp_id: str, name: str, salary: float):
        self.emp_id = emp_id
        self.name = name
        self.salary = salary
        self._cache: dict = {}      # Cache (no need to serialize)
        self._connection = None     # Connection object (cannot serialize)

    def __getstate__(self) -> dict:
        """Return the picklable state, omitting the transient attributes."""
        return {
            key: value
            for key, value in self.__dict__.items()
            if key not in ("_cache", "_connection")
        }

    def __setstate__(self, state: dict) -> None:
        """Restore the pickled state and recreate the excluded attributes."""
        self.__dict__.update(state)
        self._cache = {}            # Re-initialize with empty cache
        self._connection = None     # Connection must be re-established separately

    def __repr__(self) -> str:
        return f"OptimizedEmployee({self.emp_id!r}, {self.name!r})"
oe = OptimizedEmployee("E002", "Bob", 90000.0)
oe._cache = {"key": "value"}  # Populate the cache before pickling
serialized = pickle.dumps(oe)
restored = pickle.loads(serialized)
print(restored)
print(restored._cache)  # {} (re-initialized by __setstate__)
print(restored._connection)  # None (caller must re-establish the connection)
Serializable vs Non-Serializable Objects
import pickle

# ✅ Objects that can be serialized — all builtin scalar and container types
serializable_objects = [
    None,
    True, False,
    42, 3.14, 1 + 2j,
    "string",
    b"bytes",
    [1, 2, 3],
    (1, "a"),
    {"key": "value"},
    {1, 2, 3},
    range(10),
]
for obj in serializable_objects:
    try:
        data = pickle.dumps(obj)
        restored = pickle.loads(data)
        print(f"✅ {type(obj).__name__}: {obj!r} → serialization succeeded")
    except Exception as e:
        print(f"❌ {type(obj).__name__}: {e}")

# ❌ Objects that cannot be serialized
import threading
import io

def my_function():
    pass  # Regular module-level functions (unlike lambdas) can be serialized

non_serializable = [
    lambda x: x * 2,  # Lambda functions cannot be serialized (no importable name)
    threading.Lock(),  # Thread locks wrap OS-level state
    io.StringIO("test"),  # Some IO objects
]
for obj in non_serializable:
    try:
        pickle.dumps(obj)
        print(f"✅ {type(obj).__name__} serialization succeeded")
    except (pickle.PicklingError, AttributeError, TypeError) as e:
        print(f"❌ {type(obj).__name__}: {e}")
# Functions and classes defined at module level can be serialized:
# pickle stores only the module + qualified name, not the code itself,
# so the same definition must be importable at load time.
import pickle

def add(a, b):
    """Return the sum of a and b."""
    return a + b

serialized_func = pickle.dumps(add)  # Module-level functions are OK
restored_func = pickle.loads(serialized_func)
print(restored_func(3, 4))  # 7
Security Warnings
import pickle

# ⚠️ WARNING: Never use pickle.loads() on untrusted data!
# Arbitrary code can be executed during pickle deserialization.
# Example of malicious pickle data (DO NOT RUN!):
# The following code could be executed during deserialization:
#
# class Exploit:
#     def __reduce__(self):
#         return (os.system, ("rm -rf /",))  # Executes system command!
#
# malicious = pickle.dumps(Exploit())
# pickle.loads(malicious)  # Executes rm -rf /!

# ✅ Safe usage guidelines:
# 1. Only load pickle files generated by your own code
# 2. Never load pickle data received from external sources
# 3. Do not use pickle for network communication
# 4. Do not use pickle data created from user input

# ✅ Alternative — JSON deserializes safely without executing code
import json

safe_data = {"name": "Alice", "age": 30, "scores": [95, 87]}
json_str = json.dumps(safe_data, ensure_ascii=False)
restored = json.loads(json_str)  # Safe: JSON does not execute code
print(restored)
# Using hmac to sign pickle data for integrity verification
import hmac
import hashlib

def secure_dumps(obj, secret_key: bytes) -> tuple[bytes, bytes]:
    """Pickle *obj* and return ``(payload, HMAC-SHA256 signature)``."""
    payload = pickle.dumps(obj)
    mac = hmac.new(secret_key, payload, hashlib.sha256)
    return payload, mac.digest()

def secure_loads(data: bytes, signature: bytes, secret_key: bytes):
    """Deserialize *data* only after its HMAC signature verifies.

    Raises:
        ValueError: if the signature does not match the payload.
    """
    mac = hmac.new(secret_key, data, hashlib.sha256)
    # compare_digest is constant-time, defeating timing attacks.
    if not hmac.compare_digest(signature, mac.digest()):
        raise ValueError("Data has been tampered with!")
    return pickle.loads(data)

secret = b"my-secret-key-32-bytes-long!!!!!"
original = {"user": "admin", "role": "superuser"}
data, sig = secure_dumps(original, secret)
restored = secure_loads(data, sig, secret)
print(f"Signature verified: {restored}")

# Tampering attempt: flip every bit of the payload's final byte
tampered_data = data[:-1] + bytes([data[-1] ^ 0xFF])
try:
    secure_loads(tampered_data, sig, secret)
except ValueError as e:
    print(f"Tampering detected: {e}")
Persistent Storage with the shelve Module
shelve is a persistent dictionary based on pickle. It lets you save and retrieve Python objects from a file using key-value pairs.
import shelve
from dataclasses import dataclass, field
from datetime import datetime
@dataclass
class UserProfile:
    """Value object stored under a string key in the shelve database."""

    user_id: str
    username: str
    email: str
    created_at: datetime = field(default_factory=datetime.now)
    preferences: dict = field(default_factory=dict)

    def __repr__(self) -> str:
        # Keep the repr short — the username alone identifies the profile.
        return "UserProfile(" + repr(self.username) + ")"
# Using shelve — works like a dictionary backed by a pickle file
with shelve.open("users_db") as db:
    # Save — assignment pickles the value under the string key
    user1 = UserProfile("U001", "alice", "alice@example.com")
    user2 = UserProfile("U002", "bob", "bob@example.com",
                        preferences={"theme": "dark", "lang": "en"})
    db["U001"] = user1
    db["U002"] = user2
    # Caution: in-place mutations of a fetched object are NOT persisted
    # unless the object is re-assigned. Safe way:
    u = db["U001"]
    u.preferences["notifications"] = True
    db["U001"] = u  # Explicit re-assignment writes the change back
    print(f"Users saved: {len(db)}")
    print(f"Keys: {list(db.keys())}")

# Read back from file later — the shelf persists across opens
with shelve.open("users_db") as db:
    for user_id, user in db.items():
        print(f" {user_id}: {user}")
    # Lookup a specific key
    if "U001" in db:
        alice = db["U001"]
        print(f"Alice preferences: {alice.preferences}")
    # Delete
    if "U002" in db:
        del db["U002"]
    print(f"After deleting U002: {list(db.keys())}")

# writeback=True option — auto-detects modifications (watch memory usage)
with shelve.open("users_db", writeback=True) as db:
    if "U001" in db:
        db["U001"].preferences["auto_save"] = True  # No re-assignment needed with writeback=True
    # But all accessed items are cached in memory and rewritten on close,
    # increasing memory usage
Binary File Handling with the struct Module
The struct module packs and unpacks binary data like C structs. It's used for file format parsing, network protocols, hardware communication, and more.
import struct

# struct format strings
# '>' big-endian, '<' little-endian, '=' native endian
# 'i' 4-byte int, 'f' 4-byte float, 'd' 8-byte double
# 'c' 1-byte char, 's' string, 'B' 1-byte unsigned int

# Pack (Python → bytes)
fmt = ">ifd"  # big-endian: int(4) + float(4) + double(8) = 16 bytes, no padding
packed = struct.pack(fmt, 42, 3.14, 2.718281828)
print(f"Packed: {packed.hex()}")
print(f"Size: {struct.calcsize(fmt)} bytes")

# Unpack (bytes → Python)
unpacked = struct.unpack(fmt, packed)
print(f"Unpacked: {unpacked}")  # (42, 3.14..., 2.718...)

# Writing/reading binary files with struct
RECORD_FORMAT = ">I10sf"  # unsigned int(4) + 10-byte name + float(4) = 18 bytes
RECORD_SIZE = struct.calcsize(RECORD_FORMAT)

# Record sensor data — names shorter than 10 bytes get null-padded by 's'
sensor_data = [
    (1001, b"Sensor_A ", 23.5),
    (1002, b"Sensor_B ", 18.2),
    (1003, b"Sensor_C ", 35.7),
]
with open("sensors.bin", "wb") as f:
    for sensor_id, name, temperature in sensor_data:
        record = struct.pack(RECORD_FORMAT, sensor_id, name, temperature)
        f.write(record)

# Read binary file one fixed-size record at a time
records_read = []
with open("sensors.bin", "rb") as f:
    while True:
        raw = f.read(RECORD_SIZE)
        if len(raw) < RECORD_SIZE:
            break  # EOF (or a trailing partial record)
        sensor_id, name, temperature = struct.unpack(RECORD_FORMAT, raw)
        records_read.append({
            "id": sensor_id,
            # Strip both the trailing space from the source data and the
            # null padding added by the 's' format code
            "name": name.rstrip(b"\x00 ").decode("ascii"),
            "temperature": round(temperature, 1),
        })
print("Sensor data read:")
for record in records_read:
    print(f" {record}")

# Random access to a specific position (possible since fixed-size records)
with open("sensors.bin", "rb") as f:
    # Directly seek to the second record (offset = index * record size)
    f.seek(RECORD_SIZE * 1)
    raw = f.read(RECORD_SIZE)
    sensor_id, name, temp = struct.unpack(RECORD_FORMAT, raw)
    print(f"\n2nd record: ID={sensor_id}, temp={temp:.1f}°C")
Alternative Libraries
joblib — Large Arrays / ML Models
# pip install joblib
import joblib
import numpy as np  # pip install numpy

# joblib serializes NumPy arrays efficiently (faster than pickle)
data = {
    "weights": np.random.randn(1000, 1000),  # Large NumPy array
    "labels": np.array([0, 1, 2, 3]),
    "config": {"learning_rate": 0.001},
}
# Save
joblib.dump(data, "model_data.joblib")
# Memory-mapped load (arrays are read lazily from disk, not loaded into RAM)
loaded = joblib.load("model_data.joblib", mmap_mode="r")
print(f"weights shape: {loaded['weights'].shape}")
# Specify compression level (0=none, 9=maximum; higher = smaller but slower)
joblib.dump(data, "model_compressed.joblib", compress=3)
msgpack — Fast Cross-Language Serialization
# pip install msgpack
import msgpack

# msgpack is similar to JSON but smaller and faster
# Ideal for exchanging data with other languages (JavaScript, Java, etc.)
data = {
    "user_id": 12345,
    "name": "Alice",
    "scores": [95, 87, 92, 88],
    "active": True,
}
# Serialize (use_bin_type=True distinguishes str from bytes on the wire)
packed = msgpack.packb(data, use_bin_type=True)
print(f"msgpack size: {len(packed)} bytes")
# Compare with JSON
import json
json_str = json.dumps(data, ensure_ascii=False).encode()
print(f"JSON size: {len(json_str)} bytes")
# Deserialize (raw=False decodes string payloads back to str)
unpacked = msgpack.unpackb(packed, raw=False)
print(f"Restored: {unpacked}")
dill — Serialize Lambdas and Closures Too
# pip install dill
import dill

# dill can serialize more Python objects than pickle:
# it supports lambdas, closures, nested functions, etc.
multiplier = lambda x: x * 3  # Regular pickle can't serialize lambdas

def make_adder(n: int):
    """Return a closure that adds n to its argument."""
    def adder(x: int) -> int:
        return x + n
    return adder

add5 = make_adder(5)  # Closure capturing n=5

# Serialize with dill
serialized_lambda = dill.dumps(multiplier)
serialized_closure = dill.dumps(add5)
# Deserialize
restored_lambda = dill.loads(serialized_lambda)
restored_closure = dill.loads(serialized_closure)
print(restored_lambda(7))  # 21
print(restored_closure(10))  # 15

# Useful in multiprocessing with lambdas:
# multiprocessing.Pool uses pickle by default,
# but with dill, lambdas can be passed to other processes
Real-World Example: Checkpoint System
import pickle
import os
import hashlib
from datetime import datetime
from pathlib import Path
class Checkpoint:
    """Checkpoint system that saves intermediate training/computation state.

    Each checkpoint is a pickle file plus a sidecar ``<name>.sha256`` file
    used to detect on-disk corruption. NOTE(review): the checksum is
    unsigned, so it detects corruption but not deliberate tampering — use
    an HMAC with a secret key if tampering is a concern.
    """

    def __init__(self, checkpoint_dir: str = "./checkpoints"):
        self.checkpoint_dir = Path(checkpoint_dir)
        # Create the directory tree up front; no-op if it already exists.
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)

    def save(self, name: str, state: dict, metadata: dict | None = None) -> Path:
        """Save *state* as a checkpoint and write its SHA-256 sidecar.

        Args:
            name: Checkpoint name; becomes the file stem on disk.
            state: Arbitrary picklable state to persist.
            metadata: Optional descriptive info stored alongside the state.

        Returns:
            Path to the written ``.pkl`` file.
        """
        checkpoint = {
            "name": name,
            "state": state,
            "metadata": metadata or {},
            "timestamp": datetime.now().isoformat(),
            "version": 1,  # envelope format version, for future migrations
        }
        filepath = self.checkpoint_dir / f"{name}.pkl"
        with open(filepath, "wb") as f:
            pickle.dump(checkpoint, f, protocol=pickle.HIGHEST_PROTOCOL)
        # Save checksum of the exact bytes just written
        checksum = self._compute_checksum(filepath)
        checksum_file = self.checkpoint_dir / f"{name}.sha256"
        checksum_file.write_text(checksum)
        print(f"Checkpoint saved: {filepath} (checksum: {checksum[:8]}...)")
        return filepath

    def load(self, name: str) -> dict:
        """Load a checkpoint's state, verifying file integrity first.

        Raises:
            FileNotFoundError: if no checkpoint with this name exists.
            ValueError: if the file's checksum does not match its sidecar.
        """
        filepath = self.checkpoint_dir / f"{name}.pkl"
        checksum_file = self.checkpoint_dir / f"{name}.sha256"
        if not filepath.exists():
            raise FileNotFoundError(f"Checkpoint not found: {name}")
        # Integrity check (skipped when the sidecar file is missing)
        if checksum_file.exists():
            expected = checksum_file.read_text().strip()
            actual = self._compute_checksum(filepath)
            if actual != expected:
                raise ValueError(f"Checkpoint is corrupted: {name}")
        with open(filepath, "rb") as f:
            checkpoint = pickle.load(f)
        print(f"Checkpoint loaded: {name} (saved at: {checkpoint['timestamp']})")
        return checkpoint["state"]

    def list_checkpoints(self) -> list[dict]:
        """Return summary info for every saved checkpoint.

        Note: each ``.pkl`` file is fully unpickled just to read its
        envelope fields, so this is O(total checkpoint size).
        """
        result = []
        for pkl_file in sorted(self.checkpoint_dir.glob("*.pkl")):
            with open(pkl_file, "rb") as f:
                cp = pickle.load(f)
            result.append({
                "name": cp["name"],
                "timestamp": cp["timestamp"],
                "size": pkl_file.stat().st_size,
                "metadata": cp.get("metadata", {}),
            })
        return result

    def delete(self, name: str) -> None:
        """Delete a checkpoint and its checksum sidecar, if present."""
        for ext in [".pkl", ".sha256"]:
            path = self.checkpoint_dir / f"{name}{ext}"
            if path.exists():
                path.unlink()
        print(f"Checkpoint deleted: {name}")

    @staticmethod
    def _compute_checksum(filepath: Path) -> str:
        """Return the hex SHA-256 of *filepath*, streamed in 8 KiB chunks."""
        h = hashlib.sha256()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(8192), b""):
                h.update(chunk)
        return h.hexdigest()
# Usage example
cp = Checkpoint("./checkpoints")
# Simulate saving training state
training_state = {
    "epoch": 50,
    "loss": 0.0234,
    "accuracy": 0.987,
    "model_weights": [0.1, 0.2, 0.3, 0.4, 0.5],  # Would be NumPy arrays in practice
    "optimizer_state": {"lr": 0.001, "momentum": 0.9},
}
cp.save("training_epoch50", training_state,
        metadata={"model": "ResNet50", "dataset": "ImageNet"})
# Restore later — returns only the "state" portion of the envelope
restored_state = cp.load("training_epoch50")
print(f"Restored epoch: {restored_state['epoch']}")
print(f"Restored accuracy: {restored_state['accuracy']}")
# List checkpoints
for info in cp.list_checkpoints():
    print(f" {info['name']}: {info['timestamp']} ({info['size']} bytes)")
Pro Tips
1. When to use pickle vs JSON
# Use pickle: Python-only, any Python object, fast
# - Machine learning models, numpy arrays, complex object structures
# - Saving data within the same codebase
# Use JSON: language-agnostic, human-readable, safe
# - API responses, config files, logs
# - Data exchange with other languages/systems
# Decision flow
def choose_serialization(obj, cross_language=False, human_readable=False):
    """Suggest a serialization format for *obj*.

    Args:
        obj: The object that needs to be serialized.
        cross_language: True if other languages must read the data.
        human_readable: True if humans must be able to inspect the data.

    Returns:
        The name of the recommended format/library as a string.
    """
    if cross_language or human_readable:
        return "JSON (or msgpack)"
    if hasattr(obj, "__array__"):  # duck-types numpy-like arrays
        return "joblib"
    if callable(obj):  # plain functions, lambdas, closures
        return "dill"
    return "pickle"
2. Include version info in pickle files
import pickle

CURRENT_VERSION = 2

def save_with_version(filepath: str, data: dict) -> None:
    """Wrap *data* in a version envelope and pickle it to *filepath*."""
    envelope = {"version": CURRENT_VERSION, "data": data}
    with open(filepath, "wb") as f:
        pickle.dump(envelope, f)

def load_with_migration(filepath: str) -> dict:
    """Load a versioned pickle, upgrading old formats on the fly."""
    with open(filepath, "rb") as f:
        envelope = pickle.load(f)
    data = envelope["data"]
    # Files written before versioning existed default to version 1.
    if envelope.get("version", 1) == 1:
        data = migrate_v1_to_v2(data)
    return data

def migrate_v1_to_v2(data: dict) -> dict:
    """v1 → v2 migration: add the field introduced in version 2."""
    data.setdefault("new_field", "default_value")
    return data
Summary
| Module/Library | Characteristics | Use Case |
|---|---|---|
| pickle | Python-only, fast, security risk | Internal data caching, checkpoints |
| shelve | pickle-based dictionary DB | Simple persistent storage |
| struct | C struct binary format | File formats, network protocols |
| joblib | Optimized for NumPy/ML models | Saving ML models |
| msgpack | Cross-language, fast | Multi-language data exchange |
| dill | Serializes lambdas/closures | Multiprocessing, complex functions |
Security rule: Never use pickle.loads() on untrusted external data. Arbitrary code can execute during deserialization.