Standard Library Essentials — os, sys, re, datetime, collections, itertools

Python's standard library (stdlib) is a powerful set of modules that ships with the interpreter, so no additional installation is required. This guide covers the core modules used most frequently in real-world development.

os — Operating System Interface

The os module is used to work with the file system, environment variables, and processes.

import os

# Environment variables
db_url = os.environ.get("DATABASE_URL", "sqlite:///dev.db")
os.environ["MY_VAR"] = "value"

# Current directory
print(os.getcwd())
os.chdir("/tmp")

# Path manipulation
path = os.path.join("data", "2024", "report.csv")
print(os.path.exists(path))
print(os.path.dirname(path))   # data/2024
print(os.path.basename(path))  # report.csv
print(os.path.splitext(path))  # ('data/2024/report', '.csv')

# Creating and deleting files/directories
os.makedirs("logs/2024/03", exist_ok=True)
os.remove("old_file.txt")
os.rmdir("empty_dir")

# Traverse directory contents
for dirpath, dirnames, filenames in os.walk("project/"):
    for fname in filenames:
        full_path = os.path.join(dirpath, fname)
        print(full_path)

# Process information
print(os.getpid())    # Current process ID
print(os.cpu_count()) # Number of CPU cores

# Execute a command (subprocess preferred)
exit_code = os.system("ls -la")

sys — Interpreter Information

The sys module gives access to the Python interpreter itself: its version, platform, command-line arguments, standard streams, and runtime limits.

import sys

# Python version
print(sys.version)        # '3.12.0 (main, ...'
print(sys.version_info)   # sys.version_info(major=3, minor=12, ...)

# Platform
print(sys.platform)  # 'linux', 'darwin', 'win32'

# Command-line arguments
# python script.py arg1 arg2 --flag
print(sys.argv)       # ['script.py', 'arg1', 'arg2', '--flag']

# Standard streams
sys.stdout.write("Standard output\n")
sys.stderr.write("Error output\n")

# Module search path
sys.path.insert(0, "/custom/lib")

# Maximum recursion depth
print(sys.getrecursionlimit())  # 1000 (default)
sys.setrecursionlimit(5000)

# Terminate interpreter
# sys.exit(0)   # Normal exit
# sys.exit(1)   # Error exit

# Object size (bytes)
print(sys.getsizeof([1, 2, 3]))    # 80
print(sys.getsizeof("hello"))      # 54

# Version check pattern
if sys.version_info < (3, 12):
    raise RuntimeError("Python 3.12 or higher required.")

re — Regular Expressions Mastery

import re

text = "Meeting on March 15, 2024. Contact: 555-1234-5678, Email: user@example.com"

# Basic functions
# re.search(): find first match
match = re.search(r"\d{4}", text)
if match:
    print(match.group())   # 2024
    print(match.start())   # position
    print(match.end())     # end position

# re.findall(): all matches as a list
phones = re.findall(r"\d{3}-\d{4}-\d{4}", text)
print(phones)  # ['555-1234-5678']

# re.finditer(): all matches as an iterator
for m in re.finditer(r"\d+", text):
    print(f"{m.group()} at {m.start()}")

# re.sub(): substitution
cleaned = re.sub(r"\d{3}-\d{4}-\d{4}", "***-****-****", text)

# re.split(): split
parts = re.split(r"[,\s]+", "apple, banana  cherry")
print(parts)  # ['apple', 'banana', 'cherry']

# Group capture
date_pattern = re.compile(r"(\w+)\s+(\d{1,2}),\s+(\d{4})")
m = date_pattern.search(text)
if m:
    month, day, year = m.groups()
    print(f"{year}-{month}-{day}")

# Named groups
email_pattern = re.compile(
    r"(?P<user>[\w.]+)@(?P<domain>[\w.]+)\.(?P<tld>[a-z]{2,})"
)
m = email_pattern.search(text)
if m:
    print(m.group("user"))    # user
    print(m.group("domain"))  # example
    print(m.groupdict())      # {'user': 'user', 'domain': 'example', 'tld': 'com'}

# Compiled patterns for better performance
EMAIL_RE = re.compile(
    r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    flags=re.IGNORECASE,
)

def extract_emails(text: str) -> list[str]:
    """Return every email-like substring of *text*, in order of appearance.

    The module-level ``EMAIL_RE`` is compiled once and reused on each call.
    An empty list is returned when nothing matches.
    """
    # The pattern has no capture groups, so group(0) is the whole match.
    return [hit.group(0) for hit in EMAIL_RE.finditer(text)]

# Readable pattern with re.VERBOSE
url_pattern = re.compile(r"""
    https?://          # scheme
    (?:www\.)?         # optional www
    ([\w-]+)           # domain
    \.([a-z]{2,})      # TLD
    (/[\w./%-]*)?      # path (optional)
""", re.VERBOSE)

datetime — Date and Time Processing

from datetime import date, time, datetime, timedelta, timezone
import datetime as dt

# date: date only
today = date.today()
print(today)                  # 2024-03-15
print(today.year, today.month, today.day)
birthday = date(1990, 5, 20)
print((today - birthday).days)  # age in days

# time: time only
t = time(14, 30, 0)
print(t.isoformat())  # '14:30:00'

# datetime: date + time
now = datetime.now()
utc_now = datetime.now(timezone.utc)

print(now.isoformat())      # '2024-03-15T14:30:00.123456'
print(now.strftime("%Y-%m-%d %H:%M"))  # '2024-03-15 14:30'

# Parse string
dt_obj = datetime.strptime("2024-03-15 14:30", "%Y-%m-%d %H:%M")

# timedelta: duration
delta = timedelta(days=30, hours=2, minutes=15)
future = now + delta
past = now - timedelta(weeks=2)

print(f"30 days from now: {future.date()}")

# Timezone handling (use the stdlib zoneinfo module, available since Python 3.9)
from zoneinfo import ZoneInfo

seoul = ZoneInfo("Asia/Seoul")
ny = ZoneInfo("America/New_York")

now_seoul = datetime.now(seoul)
now_ny = now_seoul.astimezone(ny)
print(f"Seoul: {now_seoul.strftime('%H:%M %Z')}")
print(f"New York: {now_ny.strftime('%H:%M %Z')}")

# ISO format parsing (Python 3.7+)
dt_obj = datetime.fromisoformat("2024-03-15T14:30:00+09:00")

collections — Advanced Containers

Counter — Counting Items

from collections import Counter

words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
c = Counter(words)
print(c)                     # Counter({'apple': 3, 'banana': 2, 'cherry': 1})
print(c.most_common(2))      # [('apple', 3), ('banana', 2)]
print(c["apple"])            # 3

# Character frequency
char_count = Counter("mississippi")
print(char_count)  # Counter({'i': 4, 's': 4, 'p': 2, 'm': 1})

# Counter arithmetic
c1 = Counter(a=3, b=2)
c2 = Counter(a=1, b=4, c=1)
print(c1 + c2)  # Counter({'b': 6, 'a': 4, 'c': 1})
print(c1 - c2)  # Counter({'a': 2})

defaultdict — dict with Default Values

from collections import defaultdict

# Regular dict raises KeyError for missing keys
# regular_dict = {}
# regular_dict["missing"]  # KeyError!

# defaultdict auto-creates default values
word_positions = defaultdict(list)
text = "the quick brown fox jumps over the lazy dog"
for i, word in enumerate(text.split()):
    word_positions[word].append(i)

print(word_positions["the"])  # [0, 6]

# Nested defaultdict
nested = defaultdict(lambda: defaultdict(int))
nested["group1"]["count"] += 1

OrderedDict

from collections import OrderedDict

# Regular dicts in Python 3.7+ maintain insertion order
# OrderedDict's special feature: move_to_end
od = OrderedDict([("a", 1), ("b", 2), ("c", 3)])
od.move_to_end("a")              # Move to end
od.move_to_end("c", last=False)  # Move to front
print(list(od.keys()))  # ['c', 'b', 'a']

# Useful for LRU cache implementation

deque — Double-Ended Queue

from collections import deque

d = deque([1, 2, 3], maxlen=5)
d.appendleft(0)   # Add to left
d.append(4)       # Add to right
d.popleft()       # Remove from left — O(1)!
d.pop()           # Remove from right

# Sliding window pattern
def sliding_window_avg(data: list[float], size: int) -> list[float]:
    """Return the averages of every full sliding window over *data*.

    Args:
        data: Values to average, in order.
        size: Number of consecutive values per window; must be positive.

    Returns:
        One average per complete window, i.e. ``len(data) - size + 1``
        values, or an empty list when *data* has fewer than *size* items.

    Raises:
        ValueError: If *size* is not a positive integer.
    """
    # Without this guard, size == 0 creates an always-empty deque whose
    # length equals size, and the division below fails with an unhelpful
    # ZeroDivisionError.
    if size <= 0:
        raise ValueError("size must be a positive integer")

    # maxlen makes the deque drop the oldest value automatically on append.
    window = deque(maxlen=size)
    averages = []
    for val in data:
        window.append(val)
        # Only emit once the window has filled up for the first time.
        if len(window) == size:
            averages.append(sum(window) / size)
    return averages

namedtuple — Named Tuple

from collections import namedtuple
from typing import NamedTuple

# Old-style
Point = namedtuple("Point", ["x", "y"])
p = Point(3, 4)
print(p.x, p.y)        # 3 4
print(p._asdict())     # {'x': 3, 'y': 4}

# Modern approach (with type hints)
class Coordinate(NamedTuple):
    """Typed named tuple holding a latitude, a longitude and an optional altitude.

    Fields can be read by name or unpacked positionally in declaration order.
    """
    latitude: float
    longitude: float
    altitude: float = 0.0  # may be omitted at construction; defaults to 0.0

coord = Coordinate(37.5665, 126.9780)
print(coord.latitude)  # 37.5665
lat, lon, alt = coord  # Unpacking

itertools — Iterator Tools

import itertools

# Infinite iterators
for i, val in enumerate(itertools.count(10, 2)):
    print(val)   # 10, 12, 14, ...
    if i >= 4:
        break

colors = itertools.cycle(["red", "green", "blue"])
for _ in range(6):
    print(next(colors))  # red, green, blue, red, ...

# chain: combine multiple iterables
result = list(itertools.chain([1, 2], [3, 4], [5, 6]))
print(result)  # [1, 2, 3, 4, 5, 6]

# islice: slice an iterator
first_5 = list(itertools.islice(itertools.count(), 5))
print(first_5)  # [0, 1, 2, 3, 4]

# product: Cartesian product
for combo in itertools.product("AB", repeat=2):
    print(combo)  # ('A', 'A'), ('A', 'B'), ('B', 'A'), ('B', 'B')

# combinations, permutations
print(list(itertools.combinations([1, 2, 3], 2)))
# [(1, 2), (1, 3), (2, 3)]

print(list(itertools.permutations([1, 2, 3], 2)))
# [(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]

# groupby: group by consecutive key
data = [("A", 1), ("A", 2), ("B", 3), ("B", 4), ("A", 5)]
for key, group in itertools.groupby(data, key=lambda x: x[0]):
    print(key, list(group))
# A [('A', 1), ('A', 2)]
# B [('B', 3), ('B', 4)]
# A [('A', 5)]

Practical Example: Log Analyzer

import re
import sys
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from itertools import groupby

LOG_PATTERN = re.compile(
    r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) "
    r"(?P<level>INFO|WARNING|ERROR|DEBUG) "
    r"(?P<message>.+)"
)

def parse_log_line(line: str) -> dict | None:
    m = LOG_PATTERN.match(line)
    if not m:
        return None
    return {
        "timestamp": datetime.strptime(m.group("timestamp"), "%Y-%m-%d %H:%M:%S"),
        "level": m.group("level"),
        "message": m.group("message"),
    }

def analyze_logs(log_file: str) -> None:
    """Print a per-level line count and an hourly ERROR breakdown for a log file.

    Lines that ``parse_log_line`` cannot parse are skipped.

    Args:
        log_file: Path to the log file; it is read as UTF-8 text.

    Side effects:
        Writes the report to stdout. If the file does not exist, writes
        "File not found: <log_file>" to stderr and exits with status 1.
    """
    path = Path(log_file)

    # Open directly rather than checking exists() first: the file could
    # disappear between the check and the open, producing a raw traceback.
    try:
        f = open(path, encoding="utf-8")
    except FileNotFoundError:
        print(f"File not found: {log_file}", file=sys.stderr)
        sys.exit(1)

    # Tally while streaming so the whole log never has to sit in memory.
    level_counts = Counter()
    hour_errors = Counter()
    with f:
        for line in f:
            entry = parse_log_line(line.strip())
            if not entry:
                continue
            level_counts[entry["level"]] += 1
            if entry["level"] == "ERROR":
                hour_errors[entry["timestamp"].hour] += 1

    # Count by level, most frequent first
    print("=== Log Count by Level ===")
    for level, count in level_counts.most_common():
        print(f"  {level}: {count}")

    # Errors aggregated by hour of day, in chronological order
    print("\n=== Errors by Hour ===")
    for hour in sorted(hour_errors):
        print(f"  {hour:02d}:00 — {hour_errors[hour]}")

if __name__ == "__main__":
    # Use the first command-line argument as the log path, else "app.log".
    cli_args = sys.argv[1:]
    analyze_logs(cli_args[0] if cli_args else "app.log")

Expert Tips

Tip 1: os.scandir() is faster than os.listdir()

import os

# Slow way
files = [f for f in os.listdir(".") if os.path.isfile(f)]

# Fast way (caches stat information)
files = [e.name for e in os.scandir(".") if e.is_file()]

Tip 2: Updating Counter

from collections import Counter

# Example input: an iterable of batches, each batch an iterable of hashable items.
data_batches = [["a", "b", "a"], ["b", "c"], ["a"]]

total = Counter()
for batch in data_batches:
    # Counter.update() counts the elements of an iterable directly, so
    # wrapping each batch in a temporary Counter first is unnecessary.
    total.update(batch)

Tip 3: Flatten nested lists with itertools.chain.from_iterable

import itertools

nested = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
flat = list(itertools.chain.from_iterable(nested))
print(flat)  # [1, 2, 3, 4, 5, 6, 7, 8, 9]

Tip 4: datetime.timezone.utc vs pytz

from datetime import datetime, timezone

# Python 3.9+ — no additional packages needed
from zoneinfo import ZoneInfo
now_utc = datetime.now(timezone.utc)
now_kst = now_utc.astimezone(ZoneInfo("Asia/Seoul"))

os — Operating System Interface

sys — Interpreter Information

re — Regular Expressions Mastery

datetime — Date and Time Processing

collections — Advanced Containers

Counter — Counting Items

defaultdict — dict with Default Values

OrderedDict

deque — Double-Ended Queue

namedtuple — Named Tuple

itertools — Iterator Tools

Practical Example: Log Analyzer

Expert Tips