Deep Learning Introduction — PyTorch Basics
PyTorch is a deep learning framework developed by Meta (Facebook). Its dynamic computation graphs and intuitive Python style make it widely used in both research and production.
Installation
# CPU version
pip install torch torchvision torchaudio
# GPU version (CUDA 12.1)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
Tensors
import torch
import numpy as np

# --- Creating tensors ---------------------------------------------------
t1 = torch.tensor([1, 2, 3, 4, 5])
t2 = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
print(t2.shape)   # torch.Size([2, 2])
print(t2.dtype)   # torch.float32
print(t2.device)  # cpu

# --- Factory functions for special tensors ------------------------------
torch.zeros(3, 4)       # all zeros
torch.ones(2, 3)        # all ones
torch.eye(4)            # 4x4 identity matrix
torch.rand(3, 3)        # uniform samples in [0, 1)
torch.randn(3, 3)       # samples from the standard normal
torch.arange(0, 10, 2)  # [0, 2, 4, 6, 8]

# --- NumPy <-> Tensor interop -------------------------------------------
arr = np.array([1, 2, 3])
t = torch.from_numpy(arr)  # zero-copy: shares the underlying buffer
arr2 = t.numpy()           # view back as a NumPy array

# --- Device placement ---------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
t_gpu = t2.to(device)  # returns the same tensor if already on target device
Tensor Operations
a = torch.tensor([[1., 2.], [3., 4.]])
b = torch.tensor([[5., 6.], [7., 8.]])

# Element-wise operations
print(a + b)
print(a * b)

# Matrix multiplication
print(a @ b)           # matmul operator
print(torch.mm(a, b))  # same result for 2-D tensors

# Statistics
print(a.sum())     # 10.0
print(a.mean())    # 2.5
print(a.max())     # 4.0
print(a.argmax())  # 3 (index into the flattened tensor)

# Shape transformation
c = torch.arange(12).reshape(3, 4)
print(c.shape)           # [3, 4]
print(c.T.shape)         # [4, 3]
print(c.view(-1).shape)  # [12] — always a view; requires contiguous memory
# NOTE: flatten() returns a view when the input is contiguous and only
# copies when it has to — it does NOT always copy.
print(c.flatten().shape)  # [12]

# Dimension manipulation
x = torch.randn(3, 4)
print(x.unsqueeze(0).shape)  # [1, 3, 4] — insert a new leading dim
print(x.unsqueeze(1).shape)  # [3, 1, 4]
y = torch.randn(1, 3, 4)
print(y.squeeze(0).shape)    # [3, 4] — drop a size-1 dim
Automatic Differentiation (Autograd)
# requires_grad=True: track gradients
x = torch.tensor(2.0, requires_grad=True)
y = x ** 3 + 2 * x + 1 # y = x³ + 2x + 1
y.backward() # backpropagation
print(x.grad) # dy/dx = 3x² + 2 = 14
# Reset gradients (required every iteration)
x.grad.zero_()
# Disable gradient tracking (for inference)
with torch.no_grad():
pred = model(x) # memory efficient
Neural Network — nn.Module
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
# Define model
class MLPClassifier(nn.Module):
    """Two-hidden-layer MLP for classification.

    Architecture: Linear -> BatchNorm1d -> ReLU -> Dropout
                  -> Linear -> ReLU -> Dropout -> Linear (raw logits out).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.3):
        super().__init__()
        # Assemble the layer stack first, then wrap it in one Sequential.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, output_dim),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim)."""
        return self.network(x)
# Instantiate the model and print its layer structure
model = MLPClassifier(input_dim=30, hidden_dim=128, output_dim=2)
print(model)

# Parameter counts — one pass over parameters, reused for both totals
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(n for n, _ in param_info)
trainable_params = sum(n for n, trainable in param_info if trainable)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
Training Loop
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Data: binary breast-cancer classification (30 features) ------------
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Standardize features; fit the statistics on the training split ONLY
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# --- Convert to tensors (float features, long class labels) -------------
X_train_t = torch.FloatTensor(X_train_s)
y_train_t = torch.LongTensor(y_train)
X_test_t = torch.FloatTensor(X_test_s)
y_test_t = torch.LongTensor(y_test)

# Shuffled mini-batch iterator over the training set
train_ds = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

# --- Model, loss, optimizer, LR schedule ---------------------------------
model = MLPClassifier(input_dim=30, hidden_dim=64, output_dim=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)
def train_epoch(model, loader, criterion, optimizer):
    """Run one optimization pass over `loader`.

    Returns (average loss per sample, accuracy) over the whole dataset.
    """
    model.train()  # enable Dropout; BatchNorm updates running statistics
    running_loss = 0.0
    n_correct = 0
    for X_batch, y_batch in loader:
        optimizer.zero_grad()          # gradients accumulate — clear first
        logits = model(X_batch)
        batch_loss = criterion(logits, y_batch)
        batch_loss.backward()          # compute gradients
        optimizer.step()               # apply parameter update
        # criterion averages over the batch; re-weight by batch size so the
        # final division yields a true per-sample mean.
        running_loss += batch_loss.item() * len(X_batch)
        n_correct += (logits.argmax(1) == y_batch).sum().item()
    n_samples = len(loader.dataset)
    return running_loss / n_samples, n_correct / n_samples
def evaluate(model, X, y, criterion):
    """Score `model` on a full tensor batch; return (loss, accuracy)."""
    model.eval()  # inference mode: Dropout off, BatchNorm uses running stats
    with torch.no_grad():  # no autograd graph needed for evaluation
        logits = model(X)
        loss_value = criterion(logits, y).item()
        accuracy = (logits.argmax(1) == y).float().mean().item()
    return loss_value, accuracy
# --- Training driver -----------------------------------------------------
best_val_acc = 0
for epoch in range(100):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = evaluate(model, X_test_t, y_test_t, criterion)
    scheduler.step(val_loss)  # ReduceLROnPlateau watches the validation loss

    # Checkpoint whenever validation accuracy improves
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")

    # Periodic progress report (every 20 epochs)
    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1:3d} | Train loss: {train_loss:.4f} acc: {train_acc:.4f} | "
              f"Val loss: {val_loss:.4f} acc: {val_acc:.4f}")

print(f"\nBest validation accuracy: {best_val_acc:.4f}")
Saving and Loading Models
# --- Persisting models ---------------------------------------------------
torch.save(model.state_dict(), "model_weights.pt")  # weights only (recommended)
torch.save(model, "model_full.pt")                  # whole pickled module

# Loading: rebuild the architecture, then restore the saved weights
model_loaded = MLPClassifier(input_dim=30, hidden_dim=64, output_dim=2)
model_loaded.load_state_dict(torch.load("model_weights.pt", map_location=device))
model_loaded.eval()  # switch to inference behavior (Dropout/BatchNorm)

# Inference on the held-out set — no gradients needed
with torch.no_grad():
    predictions = model_loaded(X_test_t)
    probs = torch.softmax(predictions, dim=1)  # logits -> class probabilities
    predicted_classes = predictions.argmax(1)  # hard label per sample
Activation Functions and Loss Functions
# Activation functions (constructed here for illustration only; the objects
# are discarded immediately)
nn.ReLU() # max(0, x) — standard for hidden layers
nn.GELU() # smooth ReLU variant; used in Transformers
nn.Sigmoid() # binary classification output in [0, 1]
nn.Softmax(dim=1) # multi-class output probabilities along dim 1
# Loss functions
nn.CrossEntropyLoss() # multi-class classification (Softmax built-in — pass raw logits)
nn.BCEWithLogitsLoss() # binary classification (Sigmoid built-in — pass raw logits)
nn.MSELoss() # regression (mean squared error)
nn.L1Loss() # MAE regression
# Optimizers — NOTE: `params` below is a placeholder for model.parameters();
# these lines will not run as-is.
optim.Adam(params, lr=0.001) # default for most cases
optim.AdamW(params, lr=0.001, weight_decay=0.01) # decoupled weight decay; for Transformers
optim.SGD(params, lr=0.01, momentum=0.9)
Summary
| Concept | Description |
|---|---|
| Tensor | GPU-accelerated array (requires_grad for autograd) |
| nn.Module | Base class for neural networks |
| nn.Sequential | Sequential layer composition |
| forward() | Define forward pass |
| loss.backward() | Backpropagation (compute gradients) |
| optimizer.step() | Update parameters |
| model.eval() + no_grad() | Inference mode (disable Dropout/BN) |
The core deep learning flow: Forward pass → Compute loss → Backpropagation → Update parameters