Standard Library Essentials — os, sys, re, datetime, collections, itertools
Python's standard library (stdlib) is a powerful set of modules available without any additional installation. Master the most frequently used core modules in real-world development.
os — Operating System Interface
Used for working with the file system, environment variables, and processes.
import os
# Environment variables
# .get() returns the fallback instead of raising KeyError when the variable is unset.
db_url = os.environ.get("DATABASE_URL", "sqlite:///dev.db")
# Assigning to os.environ changes the environment of the running process.
os.environ["MY_VAR"] = "value"
# Current directory
print(os.getcwd())
# After this call every relative path below is resolved against /tmp.
os.chdir("/tmp")
# Path manipulation
# os.path.join() uses the platform separator; the outputs shown in the
# comments below assume a POSIX "/" separator.
path = os.path.join("data", "2024", "report.csv")
print(os.path.exists(path))  # True only if the path exists on disk
print(os.path.dirname(path))  # data/2024
print(os.path.basename(path))  # report.csv
print(os.path.splitext(path))  # ('data/2024/report', '.csv')
# Creating and deleting files/directories
# makedirs() creates missing parent directories; exist_ok=True makes it a
# no-op when the directory is already there.
os.makedirs("logs/2024/03", exist_ok=True)
# remove() raises FileNotFoundError when the file does not exist.
os.remove("old_file.txt")
# rmdir() only removes empty directories and raises OSError otherwise.
os.rmdir("empty_dir")
# Traverse directory contents
# os.walk() yields one (dirpath, dirnames, filenames) triple for the root and
# for every directory below it.
for dirpath, dirnames, filenames in os.walk("project/"):
    for fname in filenames:
        # filenames holds bare names; join with dirpath to get a usable path.
        full_path = os.path.join(dirpath, fname)
        print(full_path)
# Process information
print(os.getpid())  # Current process ID
print(os.cpu_count())  # Number of logical CPUs, or None if it cannot be determined
# Execute a command (subprocess preferred)
# os.system() hands the string to the system shell. The return value is
# platform-dependent (an encoded wait status on Unix), and "ls" itself is a
# Unix command.
exit_code = os.system("ls -la")
sys — Interpreter Information
Module for accessing the Python interpreter itself.
import sys
# Python version
print(sys.version)  # e.g. '3.12.0 (main, ...'
print(sys.version_info)  # e.g. sys.version_info(major=3, minor=12, ...)
# Platform
print(sys.platform)  # e.g. 'linux', 'darwin', 'win32'
# Command-line arguments
# python script.py arg1 arg2 --flag
print(sys.argv)  # ['script.py', 'arg1', 'arg2', '--flag']
# Standard streams
# write() does not append a newline, hence the explicit "\n".
sys.stdout.write("Standard output\n")
sys.stderr.write("Error output\n")
# Module search path
# Index 0 puts the directory ahead of every other entry, so it is searched first.
sys.path.insert(0, "/custom/lib")
# Maximum recursion depth
print(sys.getrecursionlimit())  # 1000 (default)
sys.setrecursionlimit(5000)
# Terminate interpreter (sys.exit() raises SystemExit)
# sys.exit(0) # Normal exit
# sys.exit(1) # Error exit
# Object size (bytes)
# getsizeof() is shallow: it does not include the objects a container refers
# to. The exact numbers depend on the Python version and platform.
print(sys.getsizeof([1, 2, 3]))  # e.g. 80-88 on 64-bit CPython
print(sys.getsizeof("hello"))  # e.g. 54 on 64-bit CPython 3.11; differs on other versions
# Version check pattern
# sys.version_info compares like a tuple, so this is an element-wise
# (major, minor) comparison rather than a string comparison.
if sys.version_info < (3, 12):
    raise RuntimeError("Python 3.12 or higher required.")
re — Regular Expressions Mastery
import re
text = "Meeting on March 15, 2024. Contact: 555-1234-5678, Email: user@example.com"

# Basic functions
# re.search(): find first match (returns None when nothing matches)
match = re.search(r"\d{4}", text)
if match:
    print(match.group())  # 2024
    print(match.start())  # index where the match begins
    print(match.end())  # index just past the last matched character

# re.findall(): all matches as a list
phones = re.findall(r"\d{3}-\d{4}-\d{4}", text)
print(phones)  # ['555-1234-5678']

# re.finditer(): all matches as an iterator of match objects
for m in re.finditer(r"\d+", text):
    print(f"{m.group()} at {m.start()}")

# re.sub(): substitution
cleaned = re.sub(r"\d{3}-\d{4}-\d{4}", "***-****-****", text)

# re.split(): split on runs of commas and/or whitespace
parts = re.split(r"[,\s]+", "apple, banana cherry")
print(parts)  # ['apple', 'banana', 'cherry']

# Group capture: groups() returns the captured substrings in pattern order
date_pattern = re.compile(r"(\w+)\s+(\d{1,2}),\s+(\d{4})")
m = date_pattern.search(text)
if m:
    month, day, year = m.groups()
    print(f"{year}-{month}-{day}")  # 2024-March-15

# Named groups: (?P<name>...) makes a group addressable by name
email_pattern = re.compile(
    r"(?P<user>[\w.]+)@(?P<domain>[\w.]+)\.(?P<tld>[a-z]{2,})"
)
m = email_pattern.search(text)
if m:
    print(m.group("user"))  # user
    print(m.group("domain"))  # example
    print(m.groupdict())  # {'user': 'user', 'domain': 'example', 'tld': 'com'}
# Compiled patterns for better performance
# Compiled once at import time so every call reuses the same pattern object.
EMAIL_RE = re.compile(
    r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    re.IGNORECASE
)


def extract_emails(text: str) -> list[str]:
    """Return every substring of *text* that matches ``EMAIL_RE``.

    Matches are returned in order of appearance. The pattern has no capture
    groups, so each item is the full matched address.

    Args:
        text: The text to scan.

    Returns:
        A list of matched strings; empty when nothing matches.
    """
    return EMAIL_RE.findall(text)
# Readable pattern with re.VERBOSE
# In VERBOSE mode, unescaped whitespace outside character classes is ignored
# and an unescaped "#" starts a comment, so the pattern can be annotated inline.
# Capture groups: 1 = domain label, 2 = TLD, 3 = optional path.
url_pattern = re.compile(r"""
https?:// # scheme
(?:www\.)? # optional www
([\w-]+) # domain
\.([a-z]{2,}) # TLD
(/[\w./%-]*)? # path (optional)
""", re.VERBOSE)
datetime — Date and Time Processing
from datetime import date, time, datetime, timedelta, timezone
import datetime as dt
# date: date only
today = date.today()
print(today)  # e.g. 2024-03-15
print(today.year, today.month, today.day)
birthday = date(1990, 5, 20)
# Subtracting two dates yields a timedelta; .days is its whole-day count.
print((today - birthday).days)  # age in days
# time: time only
t = time(14, 30, 0)
print(t.isoformat())  # '14:30:00'
# datetime: date + time
now = datetime.now()  # naive local time (tzinfo is None)
utc_now = datetime.now(timezone.utc)  # timezone-aware UTC time
print(now.isoformat())  # e.g. '2024-03-15T14:30:00.123456'
print(now.strftime("%Y-%m-%d %H:%M"))  # e.g. '2024-03-15 14:30'
# Parse string (the format must match the input exactly, else ValueError)
dt_obj = datetime.strptime("2024-03-15 14:30", "%Y-%m-%d %H:%M")
# timedelta: duration
delta = timedelta(days=30, hours=2, minutes=15)
future = now + delta
past = now - timedelta(weeks=2)
# NOTE: delta also carries 2 h 15 min, so when run late in the evening the
# printed date is 31 days ahead, not 30.
print(f"30 days from now: {future.date()}")
# Timezone handling (Python 3.9+ recommends zoneinfo)
from zoneinfo import ZoneInfo
# ZoneInfo objects are tzinfo implementations looked up by IANA zone name.
seoul = ZoneInfo("Asia/Seoul")
ny = ZoneInfo("America/New_York")
now_seoul = datetime.now(seoul)  # timezone-aware "now" in Seoul
# astimezone() keeps the same instant and re-expresses it in the target zone.
now_ny = now_seoul.astimezone(ny)
print(f"Seoul: {now_seoul.strftime('%H:%M %Z')}")
print(f"New York: {now_ny.strftime('%H:%M %Z')}")
# ISO format parsing (Python 3.7+)
# The "+09:00" suffix makes the result timezone-aware with a fixed UTC offset.
dt_obj = datetime.fromisoformat("2024-03-15T14:30:00+09:00")
collections — Advanced Containers
Counter — Counting Items
from collections import Counter
words = "apple banana apple cherry banana apple".split()
# Start empty and feed the iterable in; each element's tally goes up by one.
c = Counter()
c.update(words)
print(c)  # Counter({'apple': 3, 'banana': 2, 'cherry': 1})
print(c.most_common(2))  # [('apple', 3), ('banana', 2)]
print(c["apple"])  # 3
# Character frequency: a string is just an iterable of characters
char_count = Counter(list("mississippi"))
print(char_count)  # Counter({'i': 4, 's': 4, 'p': 2, 'm': 1})
# Counter arithmetic: "+" sums tallies, "-" subtracts and drops results <= 0
c1 = Counter({"a": 3, "b": 2})
c2 = Counter({"a": 1, "b": 4, "c": 1})
print(c1 + c2)  # Counter({'b': 6, 'a': 4, 'c': 1})
print(c1 - c2)  # Counter({'a': 2})
defaultdict — dict with Default Values
from collections import defaultdict
# Regular dict raises KeyError for missing keys
# regular_dict = {}
# regular_dict["missing"] # KeyError!
# defaultdict auto-creates default values: on a missing key it calls the
# factory (here list) and stores the result under that key.
word_positions = defaultdict(list)
text = "the quick brown fox jumps over the lazy dog"
for i, word in enumerate(text.split()):
    word_positions[word].append(i)
print(word_positions["the"])  # [0, 6]
# Nested defaultdict: the outer factory builds an inner defaultdict(int),
# so a two-level counter needs no explicit initialisation.
nested = defaultdict(lambda: defaultdict(int))
nested["group1"]["count"] += 1
OrderedDict
from collections import OrderedDict
# Plain dicts already keep insertion order (Python 3.7+); what OrderedDict
# adds on top is repositioning an existing key with move_to_end().
od = OrderedDict(zip("abc", (1, 2, 3)))
od.move_to_end("a")  # "a" becomes the last key
od.move_to_end("c", last=False)  # "c" becomes the first key
print(list(od))  # ['c', 'b', 'a']
# Handy building block for an LRU cache
deque — Double-Ended Queue
from collections import deque
# Bounded deque: at most 5 items are kept.
d = deque(maxlen=5)
d.extend((1, 2, 3))
d.appendleft(0)  # push onto the left end
d.append(4)  # push onto the right end
d.popleft()  # take from the left end, O(1)
d.pop()  # take from the right end
# Sliding window pattern
def sliding_window_avg(data: list[float], size: int) -> list[float]:
    """Return the mean of every full window of ``size`` consecutive values.

    Args:
        data: The values to average, in order.
        size: Window length; must be at least 1.

    Returns:
        One average per full window, i.e. ``len(data) - size + 1`` items, or
        an empty list when ``data`` is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is less than 1.
    """
    # Without this guard size == 0 would fail with ZeroDivisionError below.
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    # maxlen makes the deque discard its oldest item on each append once full.
    window = deque(maxlen=size)
    result = []
    for val in data:
        window.append(val)
        # Emit only once the window has filled up for the first time.
        if len(window) == size:
            result.append(sum(window) / size)
    return result
namedtuple — Named Tuple
from collections import namedtuple
from typing import NamedTuple
# Classic factory form; field names may be given as one space-separated string
Point = namedtuple("Point", "x y")
p = Point(x=3, y=4)
print(*p)  # 3 4
print(p._asdict())  # {'x': 3, 'y': 4}
# Modern approach (with type hints)
class Coordinate(NamedTuple):
    """Immutable (latitude, longitude, altitude) record.

    Behaves like a plain tuple (indexing, unpacking, equality) while also
    exposing each field by name. ``altitude`` may be omitted.
    """

    latitude: float
    longitude: float
    altitude: float = 0.0  # default used when only two values are supplied


coord = Coordinate(37.5665, 126.9780)
print(coord.latitude)  # 37.5665
lat, lon, alt = coord  # Unpacking works because a NamedTuple is a tuple
itertools — Iterator Tools
import itertools
# Infinite iterators
# count(10, 2) never stops on its own, so the loop needs an explicit break.
for i, val in enumerate(itertools.count(10, 2)):
    print(val)  # 10, 12, 14, ...
    if i >= 4:
        break

# cycle() repeats the sequence forever; pull a bounded number with next().
colors = itertools.cycle(["red", "green", "blue"])
for _ in range(6):
    print(next(colors))  # red, green, blue, red, ...
# chain: walk several iterables back to back as if they were one
result = [*itertools.chain([1, 2], [3, 4], [5, 6])]
print(result)  # [1, 2, 3, 4, 5, 6]
# islice: take a bounded prefix of an iterator (here an infinite one)
first_5 = [*itertools.islice(itertools.count(), 5)]
print(first_5)  # [0, 1, 2, 3, 4]
# product: Cartesian product (repeat=2 pairs the input with itself)
for combo in itertools.product("AB", repeat=2):
    print(combo)  # ('A', 'A'), ('A', 'B'), ('B', 'A'), ('B', 'B')

# combinations, permutations
# combinations ignores order within a pair; permutations does not.
print(list(itertools.combinations([1, 2, 3], 2)))
# [(1, 2), (1, 3), (2, 3)]
print(list(itertools.permutations([1, 2, 3], 2)))
# [(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]
# groupby: group by consecutive key
# Only adjacent items with equal keys are merged, so the trailing "A" forms a
# separate group. Sort by the same key first if one group per key is wanted.
data = [("A", 1), ("A", 2), ("B", 3), ("B", 4), ("A", 5)]
for key, group in itertools.groupby(data, key=lambda x: x[0]):
    # group is a one-shot iterator; materialise it before moving on.
    print(key, list(group))
# A [('A', 1), ('A', 2)]
# B [('B', 3), ('B', 4)]
# A [('A', 5)]
Practical Example: Log Analyzer
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from itertools import groupby
LOG_PATTERN = re.compile(
r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) "
r"(?P<level>INFO|WARNING|ERROR|DEBUG) "
r"(?P<message>.+)"
)
def parse_log_line(line: str) -> dict | None:
m = LOG_PATTERN.match(line)
if not m:
return None
return {
"timestamp": datetime.strptime(m.group("timestamp"), "%Y-%m-%d %H:%M:%S"),
"level": m.group("level"),
"message": m.group("message"),
}
def analyze_logs(log_file: str) -> None:
    """Print a per-level count and an hourly ERROR breakdown for a log file.

    Lines that ``parse_log_line`` cannot parse are skipped.

    Args:
        log_file: Path of the log file to read (decoded as UTF-8).

    Raises:
        SystemExit: With status 1 when ``log_file`` does not exist; a message
            is written to stderr first.
    """
    path = Path(log_file)
    if not path.exists():
        print(f"File not found: {log_file}", file=sys.stderr)
        sys.exit(1)

    entries = []
    with path.open(encoding="utf-8") as f:
        for line in f:
            entry = parse_log_line(line.strip())
            if entry:
                entries.append(entry)

    # Count by level
    level_counts = Counter(e["level"] for e in entries)
    print("=== Log Count by Level ===")
    for level, count in level_counts.most_common():
        print(f" {level}: {count}")

    # Aggregate errors by hour of day (0-23), pooling all dates together
    hour_errors = Counter(
        e["timestamp"].hour for e in entries if e["level"] == "ERROR"
    )
    print("\n=== Errors by Hour ===")
    for hour in sorted(hour_errors):
        print(f" {hour:02d}:00 — {hour_errors[hour]}")
if __name__ == "__main__":
    # Use the first command-line argument as the log path; fall back to "app.log".
    analyze_logs(sys.argv[1] if len(sys.argv) > 1 else "app.log")
Expert Tips
Tip 1: os.scandir() is faster than os.listdir() when you also need file-type information
import os
# Slow way
# listdir() returns bare names, so isfile() has to query the file system
# again for every entry.
files = [f for f in os.listdir(".") if os.path.isfile(f)]
# Fast way (caches stat information)
# scandir() yields DirEntry objects that carry the file-type information
# obtained while reading the directory, so is_file() usually needs no extra
# system call.
files = [e.name for e in os.scandir(".") if e.is_file()]
Tip 2: Accumulating counts across batches with Counter.update()
from collections import Counter
total = Counter()
for batch in data_batches:
    # Counter.update() adds to the existing tallies (unlike dict.update(),
    # which replaces values) and accepts a plain iterable or a mapping, so no
    # intermediate Counter is needed.
    total.update(batch)
Tip 3: Flatten nested lists with itertools.chain.from_iterable
import itertools
# chain.from_iterable() takes ONE iterable of iterables and flattens it by
# exactly one level, lazily.
nested = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
flat = [*itertools.chain.from_iterable(nested)]
print(flat)  # [1, 2, 3, 4, 5, 6, 7, 8, 9]
Tip 4: Use datetime.timezone.utc and zoneinfo instead of pytz
from datetime import datetime, timezone
# Python 3.9+ — zoneinfo is in the standard library (platforms without a system time zone database, e.g. Windows, also need the tzdata package)
from zoneinfo import ZoneInfo
# Passing timezone.utc yields a timezone-aware datetime rather than a naive one.
now_utc = datetime.now(timezone.utc)
# astimezone() keeps the same instant and re-expresses it in Seoul local time.
now_kst = now_utc.astimezone(ZoneInfo("Asia/Seoul"))