Pandas provides high-level data structures and tools for cleaning, transforming, and analyzing structured data.

Installation

  pip install pandas
  
  import pandas as pd
import numpy as np
  

Series — 1D Labeled Array

  s = pd.Series([10, 20, 30, 40], index=["a", "b", "c", "d"])
s["b"]           # 20
s[s > 15]        # filter
s.mean()         # 25.0
  

DataFrame — 2D Labeled Table

  data = {
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [25, 30, 35, 28],
    "city": ["NYC", "LA", "Chicago", "NYC"],
    "salary": [70000, 85000, 90000, 72000],
}
df = pd.DataFrame(data)
print(df)
  

Reading and Writing Data

  df = pd.read_csv("employees.csv")
df = pd.read_excel("report.xlsx")
df = pd.read_json("data.json")

df.to_csv("output.csv", index=False)
df.to_excel("output.xlsx", index=False)
  

Exploring Data

  df.head()          # first 5 rows
df.tail(3)         # last 3 rows
df.info()          # column types and non-null counts
df.describe()      # statistics for numeric columns
df.shape           # (rows, columns)
df.columns.tolist()
df["city"].value_counts()
  

Selecting Data

  df["name"]                    # single column (Series)
df[["name", "salary"]]        # multiple columns
df.loc[0]                     # row by label
df.loc[0:2, "name":"city"]    # slice rows and columns
df.iloc[0, 1]                 # row 0, column 1 by position
df[df["age"] > 28]            # filter rows
df.query("age > 28 and city == 'NYC'")
  

Handling Missing Data

  df.isnull().sum()             # count nulls per column
df.dropna()                   # drop rows with any null
df.fillna(0)                  # fill nulls with 0
df["salary"].fillna(df["salary"].mean())
  

Adding and Modifying Columns

  df["bonus"] = df["salary"] * 0.1
df["senior"] = df["age"] > 30
# rename() returns a renamed copy and leaves df untouched, so the "name"
# column is still available to the grouping and merging examples.
df.rename(columns={"name": "full_name"})
df.drop(columns=["senior"], inplace=True)
  

Grouping and Aggregation

  df.groupby("city")["salary"].mean()

df.groupby("city").agg(
    avg_salary=("salary", "mean"),
    count=("name", "count"),
    max_age=("age", "max"),
)
  

Merging DataFrames

  departments = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "dept": ["Engineering", "Sales", "Engineering"],
})

merged = df.merge(departments, on="name", how="left")

# Concatenate vertically (axis=0, the default); pass axis=1 to join side by side
df2 = df.head(2)  # stand-in for a second DataFrame with the same columns
combined = pd.concat([df, df2], ignore_index=True)
  

Sorting

  df.sort_values("salary", ascending=False)
df.sort_values(["city", "age"])
  

Apply Custom Functions

  df["salary_k"] = df["salary"].apply(lambda x: f"${x/1000:.0f}K")

def categorize(age):
    """Label an age as "Junior" (under 30) or "Senior" (30 and over)."""
    return "Junior" if age < 30 else "Senior"

df["level"] = df["age"].apply(categorize)
  

Pivot Tables

  pivot = df.pivot_table(
    values="salary",
    index="city",
    columns="level",
    aggfunc="mean",
)
  

Real-World Example

  # Load, clean, analyze
df = pd.read_csv("sales.csv")
df["date"] = pd.to_datetime(df["date"])
df = df.dropna(subset=["amount"])
df = df[df["amount"] > 0]

# Total amount per calendar month, labelled by month end. "ME" is the
# month-end alias from pandas 2.2 onward; the older "M" spelling is
# deprecated for resampling (use "M" only on pandas < 2.2).
monthly = (
    df.set_index("date")
    .resample("ME")["amount"]
    .sum()
    .reset_index()
)
print(monthly)
  

Pandas is the most important tool for data wrangling in Python. Combined with NumPy and Matplotlib, it covers the full data analysis workflow.