Skip to main content
Advertisement

Pandas in Practice

Pandas is the core library for data analysis in Python. It efficiently processes tabular data using its DataFrame structure.


Installation and Basic Structure

pip install pandas
import pandas as pd
import numpy as np

# Series — 1D labeled array
s = pd.Series([10, 20, 30, 40], index=["a", "b", "c", "d"])
print(s["b"]) # 20
print(s[s > 15]) # b:20, c:30, d:40

# DataFrame — 2D table structure
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie", "Diana"],
"age": [28, 34, 22, 45],
"salary": [75000, 82000, 55000, 95000],
"department": ["Engineering", "Marketing", "Engineering", "HR"],
})

Loading Data

# CSV
df = pd.read_csv("data.csv", encoding="utf-8")
df = pd.read_csv("data.csv", sep="\t", parse_dates=["date"])

# Excel
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")

# JSON
df = pd.read_json("data.json")

# DB (SQLAlchemy)
from sqlalchemy import create_engine
engine = create_engine("postgresql://user:pass@localhost/db")
df = pd.read_sql("SELECT * FROM products WHERE is_active = true", engine)

# Save
df.to_csv("output.csv", index=False)
df.to_excel("output.xlsx", index=False)

Basic Exploration

df.shape          # (4, 4)
df.dtypes # data types per column
df.info() # non-null counts, memory usage
df.describe() # statistical summary of numeric columns

df.head(3) # top 3 rows
df.tail(3) # bottom 3 rows

# Column selection
df["name"] # Series
df[["name", "salary"]] # DataFrame

# Row selection
df.loc[0] # select by index label
df.iloc[0] # select by integer position
df.loc[df["age"] > 30] # conditional filtering
df.iloc[1:3, 0:2] # slicing

Data Cleaning

# Handling missing values
df.isnull().sum() # missing values per column
df.dropna() # drop rows with missing values
df.dropna(subset=["salary"]) # specific column only
df.fillna(0) # fill with 0
df["salary"] = df["salary"].fillna(df["salary"].mean()) # fill with mean (avoid inplace=True on a column — it is deprecated and has no effect under copy-on-write)

# Handling duplicates
df.duplicated().sum() # count duplicate rows
df.drop_duplicates(inplace=True)
df.drop_duplicates(subset=["name"])

# Type conversion
df["age"] = df["age"].astype(int)
df["date"] = pd.to_datetime(df["date"])
df["salary"] = pd.to_numeric(df["salary"], errors="coerce") # invalid → NaN

# String cleaning
df["name"] = df["name"].str.strip() # remove leading/trailing spaces
df["name"] = df["name"].str.lower() # lowercase
df["department"] = df["department"].str.replace(" ", "_")

# Rename columns
df.rename(columns={"name": "employee_name", "age": "employee_age"}, inplace=True)

# Add/remove columns
df["annual_bonus"] = df["salary"] * 0.1
df.drop(columns=["annual_bonus"], inplace=True)

groupby — Group Aggregation

# Basic groupby
dept_stats = df.groupby("department")["salary"].mean()

# Multiple aggregations
result = df.groupby("department").agg(
avg_salary=("salary", "mean"),
max_salary=("salary", "max"),
count=("name", "count"),
).reset_index()

# Group by multiple columns
df.groupby(["department", "gender"])["salary"].sum()

# transform — add group statistics to original rows
df["dept_avg_salary"] = df.groupby("department")["salary"].transform("mean")
df["salary_ratio"] = df["salary"] / df["dept_avg_salary"]

# filter — keep only groups that meet a condition
large_depts = df.groupby("department").filter(lambda x: len(x) >= 2)

merge / join

# Merge two DataFrames
employees = pd.DataFrame({
"emp_id": [1, 2, 3, 4],
"name": ["Alice", "Bob", "Charlie", "Diana"],
"dept_id": [10, 20, 10, 30],
})

departments = pd.DataFrame({
"dept_id": [10, 20, 30],
"dept_name": ["Engineering", "Marketing", "HR"],
})

# INNER JOIN (default)
merged = pd.merge(employees, departments, on="dept_id")

# LEFT JOIN
merged = pd.merge(employees, departments, on="dept_id", how="left")

# Join on different column names
pd.merge(df1, df2, left_on="emp_id", right_on="id")

# Vertical concatenation
all_data = pd.concat([df1, df2, df3], ignore_index=True)
pd.concat([df1, df2], axis=1) # horizontal concatenation

pivot_table

# Pivot table
pivot = df.pivot_table(
values="salary",
index="department",
columns="gender",
aggfunc="mean",
fill_value=0,
)

# melt — reverse pivot (wide → long format)
melted = pd.melt(
df,
id_vars=["name", "department"],
value_vars=["salary", "bonus"],
var_name="type",
value_name="amount",
)

Date Handling

df["date"] = pd.to_datetime(df["date"])

# Extract date components
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day_of_week"] = df["date"].dt.dayofweek # 0=Monday

# Date filtering
mask = (df["date"] >= "2024-01-01") & (df["date"] <= "2024-12-31")
df_2024 = df[mask]

# Time series resampling (monthly aggregation)
df.set_index("date", inplace=True)
monthly = df["sales"].resample("ME").sum() # Month End

Summary

| Feature | Method |
| --- | --- |
| Load file | read_csv(), read_excel(), read_sql() |
| Missing values | dropna(), fillna() |
| Conditional filter | df[df["col"] > value] |
| Group aggregation | groupby().agg() |
| Add group value | groupby().transform() |
| Table merge | pd.merge() (SQL JOIN) |
| Shape transform | pivot_table(), melt() |
| Date handling | .dt.year, resample() |

The core Pandas pattern is the groupby → agg → merge → pivot pipeline.

Advertisement