Deep Learning Introduction — PyTorch Basics
PyTorch is a deep learning framework developed by Meta (Facebook). Its dynamic computation graphs and intuitive Python style make it widely used in both research and production.
Installation
# CPU version
pip install torch torchvision torchaudio
# GPU version (CUDA 12.1)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
Tensors
import torch
import numpy as np

# --- Creating tensors ---------------------------------------------------
t1 = torch.tensor([1, 2, 3, 4, 5])
t2 = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
print(t2.shape)   # torch.Size([2, 2])
print(t2.dtype)   # torch.float32
print(t2.device)  # cpu

# --- Factory functions for special tensors ------------------------------
torch.zeros(3, 4)       # all zeros
torch.ones(2, 3)        # all ones
torch.eye(4)            # 4x4 identity matrix
torch.rand(3, 3)        # uniform samples in [0, 1)
torch.randn(3, 3)       # samples from the standard normal
torch.arange(0, 10, 2)  # [0, 2, 4, 6, 8]

# --- NumPy <-> Tensor interop -------------------------------------------
arr = np.array([1, 2, 3])
t = torch.from_numpy(arr)  # zero-copy: shares the underlying buffer
arr2 = t.numpy()           # view back as a NumPy array

# --- Device placement ---------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
t_gpu = t2.to(device)  # returns the same tensor if already on target device
Tensor Operations
a = torch.tensor([[1., 2.], [3., 4.]])
b = torch.tensor([[5., 6.], [7., 8.]])

# Element-wise operations
print(a + b)
print(a * b)

# Matrix multiplication
print(a @ b)           # matmul operator
print(torch.mm(a, b))  # same result for 2-D tensors

# Statistics
print(a.sum())     # 10.0
print(a.mean())    # 2.5
print(a.max())     # 4.0
print(a.argmax())  # 3 (index into the flattened tensor)

# Shape transformation
c = torch.arange(12).reshape(3, 4)
print(c.shape)           # [3, 4]
print(c.T.shape)         # [4, 3]
print(c.view(-1).shape)  # [12] — always a view; requires contiguous memory
# NOTE: flatten() returns a view when the input is contiguous and only
# copies when it has to — it does NOT always copy.
print(c.flatten().shape)  # [12]

# Dimension manipulation
x = torch.randn(3, 4)
print(x.unsqueeze(0).shape)  # [1, 3, 4] — insert a new leading dim
print(x.unsqueeze(1).shape)  # [3, 1, 4]
y = torch.randn(1, 3, 4)
print(y.squeeze(0).shape)    # [3, 4] — drop a size-1 dim
Automatic Differentiation (Autograd)
# requires_grad=True: track gradients
x = torch.tensor(2.0, requires_grad=True)
y = x ** 3 + 2 * x + 1 # y = x³ + 2x + 1
y.backward() # backpropagation
print(x.grad) # dy/dx = 3x² + 2 = 14
# Reset gradients (required every iteration)
x.grad.zero_()
# Disable gradient tracking (for inference)
with torch.no_grad():
pred = model(x) # memory efficient
Neural Network — nn.Module
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
# Define model
class MLPClassifier(nn.Module):
    """Two-hidden-layer MLP for classification.

    Architecture: Linear -> BatchNorm1d -> ReLU -> Dropout
                  -> Linear -> ReLU -> Dropout -> Linear (raw logits out).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.3):
        super().__init__()
        # Assemble the layer stack first, then wrap it in one Sequential.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, output_dim),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim)."""
        return self.network(x)
# Instantiate the model and print its layer structure
model = MLPClassifier(input_dim=30, hidden_dim=128, output_dim=2)
print(model)

# Parameter counts — one pass over parameters, reused for both totals
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(n for n, _ in param_info)
trainable_params = sum(n for n, trainable in param_info if trainable)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
Training Loop
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Data: binary breast-cancer classification (30 features) ------------
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Standardize features; fit the statistics on the training split ONLY
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# --- Convert to tensors (float features, long class labels) -------------
X_train_t = torch.FloatTensor(X_train_s)
y_train_t = torch.LongTensor(y_train)
X_test_t = torch.FloatTensor(X_test_s)
y_test_t = torch.LongTensor(y_test)

# Shuffled mini-batch iterator over the training set
train_ds = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

# --- Model, loss, optimizer, LR schedule ---------------------------------
model = MLPClassifier(input_dim=30, hidden_dim=64, output_dim=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)
def train_epoch(model, loader, criterion, optimizer):
    """Run one optimization pass over `loader`.

    Returns (average loss per sample, accuracy) over the whole dataset.
    """
    model.train()  # enable Dropout; BatchNorm updates running statistics
    running_loss = 0.0
    n_correct = 0
    for X_batch, y_batch in loader:
        optimizer.zero_grad()          # gradients accumulate — clear first
        logits = model(X_batch)
        batch_loss = criterion(logits, y_batch)
        batch_loss.backward()          # compute gradients
        optimizer.step()               # apply parameter update
        # criterion averages over the batch; re-weight by batch size so the
        # final division yields a true per-sample mean.
        running_loss += batch_loss.item() * len(X_batch)
        n_correct += (logits.argmax(1) == y_batch).sum().item()
    n_samples = len(loader.dataset)
    return running_loss / n_samples, n_correct / n_samples
def evaluate(model, X, y, criterion):
    """Score `model` on a full tensor batch; return (loss, accuracy)."""
    model.eval()  # inference mode: Dropout off, BatchNorm uses running stats
    with torch.no_grad():  # no autograd graph needed for evaluation
        logits = model(X)
        loss_value = criterion(logits, y).item()
        accuracy = (logits.argmax(1) == y).float().mean().item()
    return loss_value, accuracy
# --- Training driver -----------------------------------------------------
best_val_acc = 0
for epoch in range(100):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, X_test_t, y_test_t, criterion)
    scheduler.step(val_loss)  # ReduceLROnPlateau watches the validation loss

    # Checkpoint whenever validation accuracy improves
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")

    # Periodic progress report (every 20 epochs)
    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1:3d} | Train loss: {train_loss:.4f} acc: {train_acc:.4f} | "
              f"Val loss: {val_loss:.4f} acc: {val_acc:.4f}")

print(f"\nBest validation accuracy: {best_val_acc:.4f}")
Saving and Loading Models
# --- Persisting models ---------------------------------------------------
torch.save(model.state_dict(), "model_weights.pt")  # weights only (recommended)
torch.save(model, "model_full.pt")                  # whole pickled module

# Loading: rebuild the architecture, then restore the saved weights
model_loaded = MLPClassifier(input_dim=30, hidden_dim=64, output_dim=2)
model_loaded.load_state_dict(torch.load("model_weights.pt", map_location=device))
model_loaded.eval()  # switch to inference behavior (Dropout/BatchNorm)

# Inference on the held-out set — no gradients needed
with torch.no_grad():
    predictions = model_loaded(X_test_t)
    probs = torch.softmax(predictions, dim=1)  # logits -> class probabilities
    predicted_classes = predictions.argmax(1)  # hard label per sample
Activation Functions and Loss Functions
# Activation functions (constructed here for illustration only; the objects
# are discarded immediately)
nn.ReLU() # max(0, x) — standard for hidden layers
nn.GELU() # smooth ReLU variant; used in Transformers
nn.Sigmoid() # binary classification output in [0, 1]
nn.Softmax(dim=1) # multi-class output probabilities along dim 1
# Loss functions
nn.CrossEntropyLoss() # multi-class classification (Softmax built-in — pass raw logits)
nn.BCEWithLogitsLoss() # binary classification (Sigmoid built-in — pass raw logits)
nn.MSELoss() # regression (mean squared error)
nn.L1Loss() # MAE regression
# Optimizers — NOTE: `params` below is a placeholder for model.parameters();
# these lines will not run as-is.
optim.Adam(params, lr=0.001) # default for most cases
optim.AdamW(params, lr=0.001, weight_decay=0.01) # decoupled weight decay; for Transformers
optim.SGD(params, lr=0.01, momentum=0.9)
Summary
| Concept | Description |
|---|---|
| Tensor | GPU-accelerated array (requires_grad for autograd) |
| nn.Module | Base class for neural networks |
| nn.Sequential | Sequential layer composition |
| forward() | Define forward pass |
| loss.backward() | Backpropagation (compute gradients) |
| optimizer.step() | Update parameters |
| model.eval() + no_grad() | Inference mode (disable Dropout/BN) |
The core deep learning flow: Forward pass → Compute loss → Backpropagation → Update parameters