1. Introduction to Pandas
Pandas is a powerful data manipulation and analysis library built on top of NumPy. It provides data structures like DataFrames and Series for working with structured data, making it ideal for data cleaning, transformation, and analysis.
Why Pandas?
- DataFrames: Tabular data structure similar to Excel spreadsheets or SQL tables.
- Series: 1D labeled array for single columns.
- Convenience: Easy-to-use methods for common data operations.
- Integration: Works seamlessly with NumPy, Matplotlib, and Scikit-learn.
- Performance: Efficient for large datasets (millions of rows).
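The NumPy integration mentioned above can be seen in a minimal sketch: a DataFrame wraps NumPy arrays, NumPy ufuncs apply to it directly, and to_numpy() hands the data back as a plain ndarray.

```python
import numpy as np
import pandas as pd

# A DataFrame built from a NumPy array; columns are labeled views of the data
arr = np.array([[1.0, 2.0], [3.0, 4.0]])
df = pd.DataFrame(arr, columns=["x", "y"])

# NumPy ufuncs apply elementwise and preserve the labels
sq = np.sqrt(df)
print(sq["y"].tolist())  # [1.4142135623730951, 2.0]

# to_numpy() returns the underlying values as a plain ndarray
back = df.to_numpy()
print(back.shape)  # (2, 2)
```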
Installation
pip install pandas
2. Series: 1D Data Structure
Creating Series
import pandas as pd
import numpy as np
# From list
s1 = pd.Series([10, 20, 30, 40, 50])
print(s1)
# 0 10
# 1 20
# 2 30
# 3 40
# 4 50
# From dictionary (keys become indices)
s2 = pd.Series({"apple": 5, "banana": 3, "cherry": 8})
print(s2)
# apple 5
# banana 3
# cherry 8
# With custom index
s3 = pd.Series([10, 20, 30], index=["a", "b", "c"])
print(s3)
# a 10
# b 20
# c 30
# From NumPy array
s4 = pd.Series(np.arange(5))
Series Operations
s = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
# Accessing elements
print(s["a"]) # 1 (label-based access)
print(s.iloc[0]) # 1 (position-based; plain s[0] on a labeled Series is deprecated)
print(s.iloc[1:3]) # Elements at positions 1, 2
# b 2
# c 3
# Properties
print(s.values) # [1 2 3 4 5] (NumPy array)
print(s.index) # Index(['a', 'b', 'c', 'd', 'e'])
print(s.dtype) # int64
print(s.size) # 5
# Statistical operations
print(s.sum()) # 15
print(s.mean()) # 3.0
print(s.min()) # 1
print(s.max()) # 5
print(s.describe()) # Summary statistics
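One Series behavior worth calling out before moving to DataFrames: arithmetic between two Series aligns on index labels, not position, and labels that appear in only one operand produce NaN. A small sketch:

```python
import pandas as pd

a = pd.Series([1, 2, 3], index=["x", "y", "z"])
b = pd.Series([10, 20, 30], index=["y", "z", "w"])

# Alignment is by label: "x" and "w" have no partner, so they become NaN
total = a + b
print(total)
# w     NaN
# x     NaN
# y    12.0
# z    23.0

# fill_value treats a missing label as a default instead of NaN
total = a.add(b, fill_value=0)
print(total["x"])  # 1.0
```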
3. DataFrames: 2D Data Structure
Creating DataFrames
# From dictionary of lists
df1 = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie", "David"],
"age": [25, 30, 35, 28],
"salary": [50000, 60000, 75000, 55000],
"department": ["IT", "HR", "Finance", "IT"]
})
print(df1)
# name age salary department
# 0 Alice 25 50000 IT
# 1 Bob 30 60000 HR
# 2 Charlie 35 75000 Finance
# 3 David 28 55000 IT
# From list of dictionaries
df2 = pd.DataFrame([
{"name": "Alice", "age": 25, "city": "NYC"},
{"name": "Bob", "age": 30, "city": "LA"},
{"name": "Charlie", "age": 35, "city": "Chicago"}
])
# From NumPy array
df3 = pd.DataFrame(
np.random.randn(3, 4),
columns=["A", "B", "C", "D"]
)
# From CSV file
df = pd.read_csv("data.csv")
# From Excel file
df = pd.read_excel("data.xlsx")
DataFrame Properties
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie"],
"age": [25, 30, 35],
"salary": [50000, 60000, 75000]
})
# Basic info
print(df.shape) # (3, 3) - rows, columns
print(df.columns) # Index(['name', 'age', 'salary'])
print(df.index) # RangeIndex(start=0, stop=3, step=1)
print(df.dtypes) # Data type of each column
df.info() # Summary of DataFrame (info() prints directly and returns None)
print(df.describe()) # Statistical summary
# First and last rows
print(df.head(2)) # First 2 rows
print(df.tail(1)) # Last 1 row
4. Accessing Data
Column Access
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie"],
"age": [25, 30, 35],
"salary": [50000, 60000, 75000]
})
# Access single column (returns Series)
names = df["name"]
# or attribute access (only works when the column name is a valid identifier
# and doesn't clash with an existing DataFrame attribute)
names = df.name
# Access multiple columns (returns DataFrame)
subset = df[["name", "salary"]]
# Add new column
df["bonus"] = df["salary"] * 0.1
# Modify column
df["age"] = df["age"] + 1
Row Access
# iloc: integer-location based indexing
print(df.iloc[0]) # First row
print(df.iloc[1:3]) # Rows 1-2
print(df.iloc[0, 1]) # Row 0, Column 1
# loc: label-based indexing (if index has custom labels)
df_indexed = df.set_index("name")
print(df_indexed.loc["Alice"])
# at/iat: access single values
print(df.at[0, "name"]) # loc version
print(df.iat[0, 0]) # iloc version
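loc also accepts a boolean mask together with a column selection, which combines row filtering and column access in one step; a sketch:

```python
import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [50000.0, 60000.0, 75000.0],
})

# Rows where age > 26, restricted to two columns
subset = df.loc[df["age"] > 26, ["name", "salary"]]
print(subset["name"].tolist())  # ['Bob', 'Charlie']

# The same mask form also works for in-place assignment:
# give everyone over 26 a 10% raise
df.loc[df["age"] > 26, "salary"] *= 1.1
```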
5. Data Filtering and Selection
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie", "David"],
"age": [25, 30, 35, 28],
"salary": [50000, 60000, 75000, 55000],
"department": ["IT", "HR", "Finance", "IT"]
})
# Filter rows by condition
high_earners = df[df["salary"] > 60000]
# Returns only Charlie (salary 75000)
# Multiple conditions
it_staff = df[(df["department"] == "IT") & (df["age"] > 26)]
# Filter with isin
depts = df[df["department"].isin(["IT", "Finance"])]
# Filter with string methods
names_with_a = df[df["name"].str.contains("a", case=False)]
# Filter out null values
df_no_null = df.dropna()
# Filter with query method
high_earners = df.query("salary > 60000")
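query can also reference local Python variables with the @ prefix, which keeps thresholds out of the query string; a sketch:

```python
import pandas as pd

df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "David"],
    "salary": [50000, 60000, 75000, 55000],
    "department": ["IT", "HR", "Finance", "IT"],
})

threshold = 52000
# @threshold pulls the local variable into the query expression
result = df.query("salary > @threshold and department == 'IT'")
print(result["name"].tolist())  # ['David']
```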
6. Data Cleaning and Transformation
Handling Missing Values
df = pd.DataFrame({
"A": [1, 2, np.nan, 4],
"B": [5, np.nan, np.nan, 8],
"C": [9, 10, 11, 12]
})
# Check for null values
print(df.isnull())
print(df.notnull())
print(df.isnull().sum()) # Count nulls per column
# Drop rows with any null
df_dropped = df.dropna()
# Drop columns with nulls
df_dropped = df.dropna(axis=1)
# Fill null values
df_filled = df.fillna(0)
df_filled = df.ffill() # Forward fill (fillna(method="ffill") is deprecated)
df_filled = df.bfill() # Backward fill (fillna(method="bfill") is deprecated)
# Interpolate
df_interpolated = df.interpolate()
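fillna also accepts a dict mapping column names to fill values, so each column gets its own default; a sketch:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "A": [1, 2, np.nan, 4],
    "B": [5, np.nan, np.nan, 8],
})

# Fill A with 0, but fill B with its own mean
filled = df.fillna({"A": 0, "B": df["B"].mean()})
print(filled["A"].tolist())  # [1.0, 2.0, 0.0, 4.0]
print(filled["B"].tolist())  # [5.0, 6.5, 6.5, 8.0]
```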
Removing Duplicates
df = pd.DataFrame({
"name": ["Alice", "Bob", "Alice", "Charlie"],
"age": [25, 30, 25, 35]
})
# Check for duplicates
print(df.duplicated())
# Drop duplicates
df_unique = df.drop_duplicates()
# Drop duplicates based on specific columns
df_unique = df.drop_duplicates(subset=["name"])
Data Type Conversion
df = pd.DataFrame({
"age": ["25", "30", "35"],
"salary": ["50000", "60000", "75000"]
})
# Convert to numeric
df["age"] = pd.to_numeric(df["age"])
df["salary"] = pd.to_numeric(df["salary"])
# Convert to string
df["age_str"] = df["age"].astype(str)
# Convert to datetime
df["date"] = pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03"])
# Convert to category (memory-efficient for columns with few repeated values;
# reusing the age_str column from above, since this df has no "category" column)
df["age_cat"] = df["age_str"].astype("category")
7. Sorting and Ranking
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie"],
"age": [25, 30, 35],
"salary": [50000, 60000, 75000]
})
# Sort by single column
df_sorted = df.sort_values("age")
# Sort by multiple columns
df_sorted = df.sort_values(["age", "salary"], ascending=[True, False])
# Sort by index
df_sorted = df.sort_index()
# Rank
df["age_rank"] = df["age"].rank()
df["salary_rank"] = df["salary"].rank(method="dense")
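The difference between rank methods only shows up with ties; a sketch comparing the default "average" with "min" and "dense":

```python
import pandas as pd

s = pd.Series([10, 20, 20, 30])

print(s.rank().tolist())               # [1.0, 2.5, 2.5, 4.0]  (ties get the average of their ranks)
print(s.rank(method="min").tolist())   # [1.0, 2.0, 2.0, 4.0]  (ties share the lowest rank, leaving a gap)
print(s.rank(method="dense").tolist()) # [1.0, 2.0, 2.0, 3.0]  (like "min" but with no gaps)
```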
8. Aggregation and GroupBy
GroupBy Operations
df = pd.DataFrame({
"department": ["IT", "HR", "IT", "Finance", "HR", "Finance"],
"name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank"],
"salary": [50000, 45000, 60000, 55000, 48000, 70000],
"age": [25, 30, 35, 28, 26, 40]
})
# Group by department
grouped = df.groupby("department")
# Sum
print(grouped["salary"].sum())
# department
# Finance 125000
# HR 93000
# IT 110000
# Mean
print(grouped["salary"].mean())
# department
# Finance 62500.0
# HR 46500.0
# IT 55000.0
# Count
print(grouped.size())
# department
# Finance 2
# HR 2
# IT 2
# Multiple aggregations
agg_result = grouped.agg({
"salary": ["sum", "mean", "count"],
"age": "mean"
})
# Custom aggregation
def salary_range(x):
    return x.max() - x.min()
agg_result = grouped["salary"].agg(salary_range)
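An alternative to the dict form above is named aggregation, which produces flat, readable output column names instead of a multi-level header; a sketch:

```python
import pandas as pd

df = pd.DataFrame({
    "department": ["IT", "HR", "IT"],
    "salary": [50000, 45000, 60000],
})

# Each keyword becomes an output column: keyword=(input column, aggregation)
summary = df.groupby("department").agg(
    total_salary=("salary", "sum"),
    avg_salary=("salary", "mean"),
    headcount=("salary", "count"),
)
print(summary.loc["IT", "total_salary"])  # 110000
print(summary.loc["HR", "avg_salary"])    # 45000.0
```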
Pivot Tables
df = pd.DataFrame({
"date": ["2023-01", "2023-01", "2023-02", "2023-02"],
"region": ["North", "South", "North", "South"],
"sales": [100, 150, 120, 180]
})
# Create pivot table
pivot = df.pivot_table(
values="sales",
index="date",
columns="region",
aggfunc="sum"
)
print(pivot)
# region North South
# date
# 2023-01 100 150
# 2023-02 120 180
9. Merging and Joining
Merge (SQL-like join)
df1 = pd.DataFrame({
"employee_id": [1, 2, 3],
"name": ["Alice", "Bob", "Charlie"],
"dept_id": [10, 20, 10]
})
df2 = pd.DataFrame({
"dept_id": [10, 20],
"department": ["IT", "HR"]
})
# Inner join (default)
merged = pd.merge(df1, df2, on="dept_id")
# Left join
merged = pd.merge(df1, df2, on="dept_id", how="left")
# Right join
merged = pd.merge(df1, df2, on="dept_id", how="right")
# Outer join
merged = pd.merge(df1, df2, on="dept_id", how="outer")
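Two merge parameters worth knowing alongside how: suffixes disambiguates overlapping column names, and indicator=True adds a _merge column recording where each row came from; a sketch:

```python
import pandas as pd

df1 = pd.DataFrame({"dept_id": [10, 20, 30], "name": ["Alice", "Bob", "Eve"]})
df2 = pd.DataFrame({"dept_id": [10, 20], "name": ["IT", "HR"]})

# Both frames have a "name" column; suffixes keep them apart.
# indicator=True tags each row as "both", "left_only", or "right_only".
merged = pd.merge(
    df1, df2, on="dept_id", how="outer",
    suffixes=("_emp", "_dept"), indicator=True,
)
print(merged.columns.tolist())
# ['dept_id', 'name_emp', 'name_dept', '_merge']
print((merged["_merge"] == "left_only").sum())  # 1  (dept_id 30 has no match)
```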
Concatenate
df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df2 = pd.DataFrame({"A": [5, 6], "B": [7, 8]})
# Concatenate rows (vertical)
concat_rows = pd.concat([df1, df2], ignore_index=True)
# Concatenate columns (horizontal)
concat_cols = pd.concat([df1, df2], axis=1)
10. String Operations
df = pd.DataFrame({
"name": ["alice", "bob", "charlie"],
"email": ["alice@example.com", "bob@example.com", "charlie@example.com"]
})
# String methods (accessed via .str)
df["name_upper"] = df["name"].str.upper()
df["name_length"] = df["name"].str.len()
# Contains
has_a = df["name"].str.contains("a")
# Replace
df["email_masked"] = df["email"].str.replace("example.com", "***")
# Split
df[["first", "domain"]] = df["email"].str.split("@", expand=True)
# Extract
df["domain"] = df["email"].str.extract(r"@(\w+\.\w+)")
11. Working with Dates
df = pd.DataFrame({
"date": ["2023-01-01", "2023-01-02", "2023-01-03"],
"sales": [100, 150, 120]
})
# Convert to datetime
df["date"] = pd.to_datetime(df["date"])
# Extract components
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
df["day_of_week"] = df["date"].dt.day_name()
# Set date as index
df_indexed = df.set_index("date")
# Resample (time-based aggregation)
daily_sales = df_indexed["sales"].resample("D").sum()
monthly_sales = df_indexed["sales"].resample("ME").sum() # "ME" = month end ("M" is deprecated in pandas >= 2.2)
# Filter by date range
start = "2023-01-01"
end = "2023-01-02"
filtered = df[(df["date"] >= start) & (df["date"] <= end)]
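With a DatetimeIndex, rows can also be selected by partial date strings, which is often shorter than the boolean comparison above; a sketch:

```python
import pandas as pd

df = pd.DataFrame({
    "date": pd.to_datetime(["2023-01-05", "2023-01-20", "2023-02-10"]),
    "sales": [100, 150, 120],
})
df_indexed = df.set_index("date")

# Partial string indexing: all rows in January 2023
jan = df_indexed.loc["2023-01"]
print(jan["sales"].tolist())  # [100, 150]

# Slices work too, and include both endpoints
window = df_indexed.loc["2023-01-10":"2023-02-28"]
print(len(window))  # 2
```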
12. Input/Output Operations
# Read CSV
df = pd.read_csv("data.csv")
# Read Excel
df = pd.read_excel("data.xlsx", sheet_name=0)
# Read SQL database
from sqlalchemy import create_engine
engine = create_engine("sqlite:///data.db")
df = pd.read_sql("SELECT * FROM users", con=engine)
# Write CSV
df.to_csv("output.csv", index=False)
# Write Excel
df.to_excel("output.xlsx", index=False)
# Write to SQL
df.to_sql("users", con=engine, if_exists="append")
# Write to JSON
df.to_json("output.json")
# Read JSON
df = pd.read_json("output.json")
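A few read_csv parameters come up constantly in practice: parse_dates, dtype, and usecols. A sketch using an in-memory file so it runs without anything on disk (a real file path works the same way):

```python
from io import StringIO

import pandas as pd

csv_text = """date,name,salary,notes
2023-01-01,Alice,50000,hired
2023-01-02,Bob,60000,promoted
"""

df = pd.read_csv(
    StringIO(csv_text),
    parse_dates=["date"],                # parse the column into datetime64
    dtype={"salary": "int64"},           # pin a dtype instead of letting pandas infer it
    usecols=["date", "name", "salary"],  # skip columns you don't need
)
print(df.dtypes["date"])  # datetime64[ns]
print(df.shape)           # (2, 3)
```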
13. Practical Examples
Example 1: Sales Analysis
# Sample sales data
df = pd.DataFrame({
"date": pd.date_range("2023-01-01", periods=10),
"product": ["A", "B", "A", "C", "B", "A", "C", "B", "A", "C"],
"quantity": [5, 10, 3, 8, 12, 7, 4, 9, 6, 5],
"price": [100, 50, 100, 75, 50, 100, 75, 50, 100, 75]
})
# Calculate revenue
df["revenue"] = df["quantity"] * df["price"]
# Total revenue by product
revenue_by_product = df.groupby("product")["revenue"].sum()
# Average price by product
avg_price = df.groupby("product")["price"].mean()
# Top selling product by quantity
top_product = df.groupby("product")["quantity"].sum().idxmax()
# Daily revenue
daily_revenue = df.groupby("date")["revenue"].sum()
print(f"Total Revenue: ${df['revenue'].sum()}")
print(f"Top Product: {top_product}")
Example 2: Data Cleaning
# Messy data
df = pd.DataFrame({
"name": ["Alice", " Bob ", "CHARLIE", "david", ""],
"age": ["25", "30", "35", "N/A", "28"],
"salary": ["50000", "60000", None, "55000", "48000"],
"dept": ["IT", "HR", "IT", "IT", "HR"]
})
# Clean name column
df["name"] = df["name"].str.strip().str.title()
# Convert age to numeric
df["age"] = pd.to_numeric(df["age"], errors="coerce")
# Fill missing salary
df["salary"] = pd.to_numeric(df["salary"], errors="coerce")
df["salary"] = df["salary"].fillna(df["salary"].mean())
# Remove empty rows
df = df[df["name"] != ""]
print(df)
Example 3: Customer Segmentation
df = pd.DataFrame({
"customer_id": [1, 2, 3, 4, 5],
"total_purchases": [100, 250, 50, 500, 150],
"avg_order_value": [20, 50, 10, 100, 30]
})
# Segment customers by purchase value
df["segment"] = pd.cut(
df["total_purchases"],
bins=[0, 100, 300, 1000],
labels=["Bronze", "Silver", "Gold"]
)
print(df)
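pd.cut above uses fixed bin edges; its counterpart pd.qcut splits on quantiles instead, so each segment ends up with roughly the same number of customers; a sketch:

```python
import pandas as pd

df = pd.DataFrame({
    "customer_id": [1, 2, 3, 4, 5, 6],
    "total_purchases": [100, 250, 50, 500, 150, 300],
})

# Three quantile-based segments: each tier holds about a third of the customers
df["tier"] = pd.qcut(df["total_purchases"], q=3, labels=["Low", "Mid", "High"])
print(df["tier"].value_counts().tolist())  # [2, 2, 2]
```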
Summary: Pandas vs Java (Apache Spark)

Feature              | Pandas                 | Java (Apache Spark)
---------------------|------------------------|------------------------
Data Structure       | DataFrame/Series       | DataFrame
Ease of Use          | Very easy              | More verbose
Performance          | Good for <1GB          | Better for large data
Groupby/Aggregation  | Simple and intuitive   | Requires SQL/DSL
CSV I/O              | Built-in               | Requires configuration
Null Handling        | Easy (fillna, dropna)  | More manual
String Operations    | Rich API               | Limited
Date/Time            | Excellent              | Good