Pandas in Practice
Pandas is the core library for data analysis in Python. It efficiently processes tabular data through its DataFrame structure.
Installation and Basic Structure
pip install pandas
import pandas as pd
import numpy as np
# Series — 1D labeled array
s = pd.Series([10, 20, 30, 40], index=["a", "b", "c", "d"])
print(s["b"]) # 20 — label-based lookup
print(s[s > 15]) # b: 20, c: 30, d: 40 — boolean mask keeps the matching labels
# DataFrame — 2D table structure (each dict key becomes a column)
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie", "Diana"],
"age": [28, 34, 22, 45],
"salary": [75000, 82000, 55000, 95000],
"department": ["Engineering", "Marketing", "Engineering", "HR"],
})
Loading Data
# CSV
df = pd.read_csv("data.csv", encoding="utf-8")
df = pd.read_csv("data.csv", sep="\t", parse_dates=["date"]) # tab-separated; parse "date" column as datetime
# Excel
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
# JSON
df = pd.read_json("data.json")
# DB (SQLAlchemy) — read_sql accepts an SQLAlchemy engine/connection
from sqlalchemy import create_engine
engine = create_engine("postgresql://user:pass@localhost/db")
df = pd.read_sql("SELECT * FROM products WHERE is_active = true", engine)
# Save — index=False keeps the row index out of the output file
df.to_csv("output.csv", index=False)
df.to_excel("output.xlsx", index=False)
Basic Exploration
df.shape # (4, 4) — (rows, columns) for the example DataFrame above
df.dtypes # data types per column
df.info() # non-null counts, memory usage
df.describe() # statistical summary of numeric columns
df.head(3) # top 3 rows
df.tail(3) # bottom 3 rows
# Column selection
df["name"] # single column -> Series
df[["name", "salary"]] # list of columns -> DataFrame
# Row selection
df.loc[0] # select by index label
df.iloc[0] # select by integer position
df.loc[df["age"] > 30] # conditional filtering (boolean mask)
df.iloc[1:3, 0:2] # positional slicing: rows 1-2, columns 0-1 (end-exclusive)
Data Cleaning
# Handling missing values
df.isnull().sum() # missing values per column
df.dropna() # drop rows with missing values
df.dropna(subset=["salary"]) # specific column only
df.fillna(0) # fill with 0
# Fill with mean: assign back instead of inplace=True on a column selection.
# `df["salary"].fillna(..., inplace=True)` is chained assignment — it raises a
# FutureWarning in pandas 2.x and is a silent no-op under Copy-on-Write (pandas 3.0).
df["salary"] = df["salary"].fillna(df["salary"].mean())
# Handling duplicates
df.duplicated().sum() # count duplicate rows
df.drop_duplicates(inplace=True)
df.drop_duplicates(subset=["name"])
# Type conversion
df["age"] = df["age"].astype(int)
df["date"] = pd.to_datetime(df["date"])
df["salary"] = pd.to_numeric(df["salary"], errors="coerce") # invalid → NaN
# String cleaning (.str accessor applies element-wise)
df["name"] = df["name"].str.strip() # remove leading/trailing spaces
df["name"] = df["name"].str.lower() # lowercase
df["department"] = df["department"].str.replace(" ", "_")
# Rename columns
df.rename(columns={"name": "employee_name", "age": "employee_age"}, inplace=True)
# Add/remove columns
df["annual_bonus"] = df["salary"] * 0.1
df.drop(columns=["annual_bonus"], inplace=True)
groupby — Group Aggregation
# Basic groupby: mean salary per department -> Series indexed by department
dept_stats = df.groupby("department")["salary"].mean()
# Multiple aggregations via named aggregation (column, func) pairs
result = df.groupby("department").agg(
avg_salary=("salary", "mean"),
max_salary=("salary", "max"),
count=("name", "count"),
).reset_index()
# Group by multiple columns (assumes a "gender" column exists — not in the sample df above)
df.groupby(["department", "gender"])["salary"].sum()
# transform — add group statistics to original rows (result is aligned to df's index)
df["dept_avg_salary"] = df.groupby("department")["salary"].transform("mean")
df["salary_ratio"] = df["salary"] / df["dept_avg_salary"]
# filter — keep only rows belonging to groups that meet a condition
large_depts = df.groupby("department").filter(lambda x: len(x) >= 2)
merge / join
# Merge two DataFrames
employees = pd.DataFrame({
"emp_id": [1, 2, 3, 4],
"name": ["Alice", "Bob", "Charlie", "Diana"],
"dept_id": [10, 20, 10, 30],
})
departments = pd.DataFrame({
"dept_id": [10, 20, 30],
"dept_name": ["Engineering", "Marketing", "HR"],
})
# INNER JOIN (default how="inner")
merged = pd.merge(employees, departments, on="dept_id")
# LEFT JOIN — keep all employee rows, NaN where no department matches
merged = pd.merge(employees, departments, on="dept_id", how="left")
# Join on different column names (df1/df2 are illustrative placeholders)
pd.merge(df1, df2, left_on="emp_id", right_on="id")
# Vertical concatenation (stack rows); ignore_index renumbers the index
all_data = pd.concat([df1, df2, df3], ignore_index=True)
pd.concat([df1, df2], axis=1) # horizontal concatenation (side by side, aligned on index)
pivot_table
# Pivot table: departments as rows, gender as columns, mean salary in cells
# (assumes "gender" exists — not in the sample df above)
pivot = df.pivot_table(
values="salary",
index="department",
columns="gender",
aggfunc="mean",
fill_value=0,
)
# melt — reverse pivot (wide → long format); assumes a "bonus" column
melted = pd.melt(
df,
id_vars=["name", "department"],
value_vars=["salary", "bonus"],
var_name="type",
value_name="amount",
)
Date Handling
df["date"] = pd.to_datetime(df["date"])
# Extract date components via the .dt accessor
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day_of_week"] = df["date"].dt.dayofweek # 0=Monday
# Date filtering — comparison against ISO date strings works on datetime columns
mask = (df["date"] >= "2024-01-01") & (df["date"] <= "2024-12-31")
df_2024 = df[mask]
# Time series resampling (monthly aggregation); needs a DatetimeIndex
df.set_index("date", inplace=True)
monthly = df["sales"].resample("ME").sum() # Month End — "ME" alias needs pandas >= 2.2; use "M" on older versions
Summary
| Feature | Method |
|---|---|
| Load file | read_csv(), read_excel(), read_sql() |
| Missing values | dropna(), fillna() |
| Conditional filter | df[df["col"] > value] |
| Group aggregation | groupby().agg() |
| Add group value | groupby().transform() |
| Table merge | pd.merge() (SQL JOIN) |
| Shape transform | pivot_table(), melt() |
| Date handling | .dt.year, resample() |
The core Pandas pattern is the groupby → agg → merge → pivot pipeline.