Ch 2.3 Advanced Collection Types
Python provides powerful built-in collection types. Choosing the right data structure determines the performance and readability of your code.
1. list — Mutable Ordered Collection
A list is an ordered, mutable collection. It is the most frequently used data structure.
# Creating lists
fruits = ["apple", "banana", "cherry"]
numbers = [1, 2, 3, 4, 5]
mixed = [1, "hello", True, 3.14, None] # multiple types allowed
empty = []
from_range = list(range(5)) # [0, 1, 2, 3, 4]
# Indexing
print(fruits[0]) # apple (first element)
print(fruits[-1]) # cherry (last element)
# Slicing [start:stop:step]
print(numbers[1:4]) # [2, 3, 4]
print(numbers[::2]) # [1, 3, 5] (every other element)
print(numbers[::-1]) # [5, 4, 3, 2, 1] (reversed)
Key Methods
fruits = ["apple", "banana"]
# Adding
fruits.append("cherry") # add to end
fruits.insert(1, "blueberry") # insert at index
fruits.extend(["date", "elderberry"]) # add multiple elements
# Removing
fruits.remove("banana") # remove by value (first match)
popped = fruits.pop() # remove and return last element
popped_at = fruits.pop(0) # remove and return element at index
# Searching
print("apple" in fruits) # True
print(fruits.index("cherry")) # index of cherry
print(fruits.count("apple")) # count of apple
# Sorting
nums = [3, 1, 4, 1, 5, 9, 2, 6]
nums.sort() # modify in place (ascending)
nums.sort(reverse=True) # descending
sorted_nums = sorted(nums) # return new list (original preserved)
# Custom sort
words = ["banana", "apple", "cherry", "date"]
words.sort(key=len) # sort by length
print(words) # ['date', 'apple', 'banana', 'cherry']
# Miscellaneous
fruits.reverse() # reverse in place
print(len(fruits)) # length
fruits.clear() # remove all elements
copy = fruits.copy() # shallow copy
List Comprehensions
# Basic comprehension
squares = [x**2 for x in range(10)]
print(squares) # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
# Conditional comprehension
evens = [x for x in range(20) if x % 2 == 0]
print(evens) # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
# Nested comprehension
matrix = [[i * j for j in range(1, 4)] for i in range(1, 4)]
print(matrix) # [[1, 2, 3], [2, 4, 6], [3, 6, 9]]
# Real-world example: filter words by length
words = ["python", "is", "a", "great", "language"]
long_words = [w.upper() for w in words if len(w) > 3]
print(long_words) # ['PYTHON', 'GREAT', 'LANGUAGE']
2. tuple — Immutable Ordered Collection
A tuple is an ordered, immutable collection. Once created, it cannot be modified.
# Creating tuples
point = (3, 5)
rgb = (255, 128, 0)
single = (42,) # single-element tuple — trailing comma required!
empty = ()
# Parentheses optional
coordinates = 10, 20 # automatically packed as (10, 20)
# Confirming immutability
# point[0] = 10 # TypeError: 'tuple' object does not support item assignment
# Unpacking
x, y = point
print(f"x={x}, y={y}") # x=3, y=5
# Commonly used when returning multiple values from a function
def get_min_max(numbers):
return min(numbers), max(numbers)
minimum, maximum = get_min_max([3, 1, 4, 1, 5, 9])
print(f"min: {minimum}, max: {maximum}")
Named Tuples (namedtuple)
from collections import namedtuple
# Define a named tuple
Point = namedtuple("Point", ["x", "y"])
Color = namedtuple("Color", ["red", "green", "blue"])
# Create instances
p = Point(3, 5)
orange = Color(255, 165, 0)
# Access by name (better readability)
print(p.x, p.y) # 3 5
print(orange.red, orange.green) # 255 165
# Access by index also works
print(p[0], p[1]) # 3 5
# Convert to dict
print(p._asdict()) # {'x': 3, 'y': 5}
# Python 3.6+ typing.NamedTuple (more modern)
from typing import NamedTuple
class Employee(NamedTuple):
name: str
department: str
salary: float = 50000.0
emp = Employee("Alice", "Engineering", 80000)
print(emp.name) # Alice
print(emp.salary) # 80000
3. set — Collection Without Duplicates
A set is an unordered collection with no duplicate elements.
# Creating sets
fruits = {"apple", "banana", "cherry"}
numbers = {1, 2, 3, 3, 3} # duplicates are removed automatically
print(numbers) # {1, 2, 3}
# Commonly used to remove duplicates from a list
data = [1, 2, 2, 3, 3, 3, 4]
unique = list(set(data))
print(sorted(unique)) # [1, 2, 3, 4]
# Empty set — {} creates a dict! Use set() instead
empty_set = set()
print(type(empty_set)) # <class 'set'>
print(type({})) # <class 'dict'>
Set Operations
a = {1, 2, 3, 4, 5}
b = {4, 5, 6, 7, 8}
# Union
print(a | b) # {1, 2, 3, 4, 5, 6, 7, 8}
print(a.union(b)) # same result
# Intersection
print(a & b) # {4, 5}
print(a.intersection(b))
# Difference
print(a - b) # {1, 2, 3}
print(a.difference(b))
# Symmetric difference (union minus intersection)
print(a ^ b) # {1, 2, 3, 6, 7, 8}
print(a.symmetric_difference(b))
# Subset check
print({1, 2}.issubset(a)) # True
print(a.issuperset({1, 2})) # True
frozenset — Immutable Set
# frozenset: an immutable set that can be used as a dictionary key
fs = frozenset([1, 2, 3])
# fs.add(4) # AttributeError: cannot modify
# Use as a dictionary key
cache = {frozenset([1, 2]): "pair_12"}
print(cache[frozenset([1, 2])]) # pair_12
4. dict — Key-Value Pairs
A dictionary stores data as key-value pairs. Since Python 3.7+, insertion order is guaranteed.
# Creating dictionaries
person = {"name": "Alice", "age": 30, "city": "Seoul"}
empty = {}
from_keys = dict.fromkeys(["a", "b", "c"], 0) # {'a': 0, 'b': 0, 'c': 0}
# Accessing values
print(person["name"]) # Alice
print(person.get("email")) # None (no KeyError)
print(person.get("email", "N/A")) # N/A (with default value)
# Modifying / Adding
person["age"] = 31 # modify existing key
person["email"] = "alice@example.com" # add new key
# Removing
del person["city"]
removed = person.pop("email") # remove and return value
Key Dictionary Methods
config = {"host": "localhost", "port": 5432, "db": "mydb"}
# View keys, values, and items
print(list(config.keys())) # ['host', 'port', 'db']
print(list(config.values())) # ['localhost', 5432, 'mydb']
print(list(config.items())) # [('host', 'localhost'), ...]
# Iteration
for key, value in config.items():
print(f"{key}: {value}")
# update — update from another dictionary
config.update({"port": 3306, "charset": "utf8"})
# setdefault — set default value only when key is missing
config.setdefault("timeout", 30)
print(config["timeout"]) # 30
config.setdefault("timeout", 60) # ignored since key already exists
print(config["timeout"]) # 30 (unchanged)
# Python 3.9+ dictionary merge
defaults = {"timeout": 30, "retry": 3}
custom = {"timeout": 60, "host": "example.com"}
merged = defaults | custom # custom takes precedence
print(merged)
# {'timeout': 60, 'retry': 3, 'host': 'example.com'}
Dictionary Comprehensions
# Basic dictionary comprehension
squares = {x: x**2 for x in range(1, 6)}
print(squares) # {1: 1, 2: 4, 3: 9, 4: 16, 5: 25}
# Conditional
even_squares = {x: x**2 for x in range(10) if x % 2 == 0}
# Inverting a dictionary (swap keys and values)
original = {"a": 1, "b": 2, "c": 3}
inverted = {v: k for k, v in original.items()}
print(inverted) # {1: 'a', 2: 'b', 3: 'c'}
# Real-world example: word frequency count
words = ["apple", "banana", "apple", "cherry", "banana", "apple"]
frequency = {word: words.count(word) for word in set(words)}
print(frequency) # {'apple': 3, 'banana': 2, 'cherry': 1}
5. Collection Selection Guide
| Situation | Recommended Data Structure |
|---|---|
| Ordered, mutable data | list |
| Fixed data, dictionary key | tuple |
| Remove duplicates, membership testing | set |
| Fast lookup by key | dict |
| Fast insert/delete at both ends | collections.deque |
| Frequency counting | collections.Counter |
| Default value when key is missing | collections.defaultdict |
6. The collections Module
from collections import Counter, defaultdict, OrderedDict, deque
# Counter — count element frequencies
scores = [85, 92, 85, 78, 92, 85, 90]
counter = Counter(scores)
print(counter) # Counter({85: 3, 92: 2, 78: 1, 90: 1})
print(counter.most_common(2)) # [(85, 3), (92, 2)]
text = "hello world"
char_count = Counter(text)
print(char_count['l']) # 3
# defaultdict — auto-creates a default value when a missing key is accessed
word_list = ["apple", "banana", "apple", "cherry", "banana", "apple"]
# A regular dict raises KeyError on missing keys
# but defaultdict creates a default value automatically
word_freq = defaultdict(int)
for word in word_list:
word_freq[word] += 1
print(dict(word_freq)) # {'apple': 3, 'banana': 2, 'cherry': 1}
graph = defaultdict(list)
graph["A"].append("B")
graph["A"].append("C")
print(dict(graph)) # {'A': ['B', 'C']}
# deque — double-ended queue (O(1) insert/delete at both ends)
dq = deque([1, 2, 3])
dq.appendleft(0) # add to front
dq.append(4) # add to back
print(dq) # deque([0, 1, 2, 3, 4])
dq.popleft() # remove from front
dq.rotate(1) # rotate one position to the right
print(dq) # deque([4, 1, 2, 3])
# Use with maxlen as a sliding window
recent = deque(maxlen=3)
for x in range(7):
recent.append(x)
print(list(recent))
# [0]
# [0, 1]
# [0, 1, 2]
# [1, 2, 3] ← oldest element removed when maxlen is exceeded
# ...
Pro Tips: Data Structure Performance Comparison
Performance difference of the in operator:
import time
large_list = list(range(1_000_000))
large_set = set(range(1_000_000))
large_dict = {x: x for x in range(1_000_000)}
# List search O(n) — slow
start = time.perf_counter()
999_999 in large_list
print(f"list: {time.perf_counter() - start:.6f}s") # ~0.01s
# Set search O(1) — fast (hash-based)
start = time.perf_counter()
999_999 in large_set
print(f"set: {time.perf_counter() - start:.6f}s") # ~0.000001s
# Dict key search O(1) — fast
start = time.perf_counter()
999_999 in large_dict
print(f"dict: {time.perf_counter() - start:.6f}s") # ~0.000001s
List vs Tuple memory:
import sys
lst = [1, 2, 3, 4, 5]
tpl = (1, 2, 3, 4, 5)
print(sys.getsizeof(lst)) # 104 bytes
print(sys.getsizeof(tpl)) # 80 bytes — tuples are smaller
# Tuple creation is also faster
import timeit
print(timeit.timeit("[1, 2, 3]", number=10_000_000)) # ~0.8s
print(timeit.timeit("(1, 2, 3)", number=10_000_000)) # ~0.2s
Conclusion: Use tuples for data that doesn't need to change, and sets when you need frequent membership testing.
You have mastered the collection types. The next chapter covers type casting between types.