Introduction to NumPy | Java to Python Journey

1. Introduction to NumPy

NumPy (Numerical Python) is the foundational library for numerical computing in Python. It provides support for large, multi-dimensional arrays and matrices, along with a collection of mathematical functions to operate on these arrays efficiently.

Why NumPy?

Performance: 10-100x faster than native Python lists for numerical operations.
Memory Efficient: Uses contiguous memory blocks, reducing overhead.
Convenient: Provides high-level mathematical functions and operations.
Foundational: Base for Pandas, Scikit-learn, TensorFlow, and other data science libraries.

Installation

pip install numpy

2. Creating Arrays

Basic Array Creation

import numpy as np

# From Python list
arr1 = np.array([1, 2, 3, 4, 5])
print(arr1)  # [1 2 3 4 5]

# Specify data type
arr2 = np.array([1, 2, 3, 4, 5], dtype=float)
print(arr2)  # [1. 2. 3. 4. 5.]

# 2D array (matrix)
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(arr2d)
# [[1 2 3]
#  [4 5 6]
#  [7 8 9]]

# 3D array
arr3d = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])

Special Array Creation

# Zeros
zeros = np.zeros((3, 4))  # 3x4 matrix of zeros
# [[0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]]

# Ones
ones = np.ones((2, 3))
# [[1. 1. 1.]
#  [1. 1. 1.]]

# Identity matrix
identity = np.eye(3)
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]

# Range (similar to Python range)
arr_range = np.arange(0, 10, 2)  # [0 2 4 6 8]

# Linspace (evenly spaced numbers)
linspace = np.linspace(0, 1, 5)  # [0.   0.25 0.5  0.75 1.  ]

# Random numbers
random_arr = np.random.rand(3, 3)  # 3x3 random floats, uniformly distributed in [0, 1)
random_int = np.random.randint(0, 10, size=(2, 3))  # 2x3 random integers in [0, 10) (upper bound excluded)

3. Array Properties

arr = np.array([[1, 2, 3], [4, 5, 6]])

# Shape: dimensions of array
print(arr.shape)  # (2, 3) - 2 rows, 3 columns

# Size: total number of elements
print(arr.size)  # 6

# Dtype: data type of elements
print(arr.dtype)  # int64 (platform default integer; int32 on Windows with NumPy < 2.0)

# Ndim: number of dimensions
print(arr.ndim)  # 2

# Itemsize: size of each element in bytes
print(arr.itemsize)  # 8

# Data: memory buffer
print(arr.data)  # <memory at 0x...>

4. Indexing and Slicing

1D Array Indexing

arr = np.array([10, 20, 30, 40, 50])

# Positive indexing
print(arr[0])   # 10
print(arr[2])   # 30

# Negative indexing
print(arr[-1])  # 50 (last element)
print(arr[-2])  # 40 (second to last)

# Slicing
print(arr[1:4])    # [20 30 40] (indices 1, 2, 3)
print(arr[:3])     # [10 20 30] (from start to index 2)
print(arr[2:])     # [30 40 50] (from index 2 to end)
print(arr[::2])    # [10 30 50] (every 2nd element)
print(arr[::-1])   # [50 40 30 20 10] (reversed)

2D Array Indexing

arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Access element at row 1, column 2
print(arr2d[1, 2])  # 6

# Access entire row
print(arr2d[0])     # [1 2 3]

# Access entire column
print(arr2d[:, 1])  # [2 5 8]

# Slicing
print(arr2d[0:2, 1:3])  # First 2 rows, columns 1-2
# [[2 3]
#  [5 6]]

# Boolean indexing
print(arr2d[arr2d > 5])  # [6 7 8 9] (elements > 5)

5. Array Operations

Element-wise Operations

a = np.array([1, 2, 3, 4, 5])
b = np.array([2, 3, 4, 5, 6])

# Arithmetic operations
print(a + b)      # [3 5 7 9 11]
print(a - b)      # [-1 -1 -1 -1 -1]
print(a * b)      # [2 6 12 20 30] (element-wise multiplication)
print(a / b)      # [0.5 0.67 0.75 0.8 0.83]
print(a ** 2)     # [1 4 9 16 25] (element-wise power)
print(np.sqrt(a)) # [1. 1.41 1.73 2. 2.24]

# Broadcasting (operating with scalar)
print(a + 10)     # [11 12 13 14 15]
print(a * 2)      # [2 4 6 8 10]

Matrix Operations

A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Matrix multiplication (dot product)
print(np.dot(A, B))
# [[19 22]
#  [43 50]]

# Element-wise multiplication
print(A * B)
# [[ 5 12]
#  [21 32]]

# Transpose
print(A.T)
# [[1 3]
#  [2 4]]

# Determinant
print(np.linalg.det(A))  # approximately -2.0 (may print -2.0000000000000004 due to floating-point round-off)

# Inverse
print(np.linalg.inv(A))
# [[-2.   1. ]
#  [ 1.5 -0.5]]

6. Aggregation and Statistical Functions

arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Sum
print(np.sum(arr))        # 55
print(arr.sum())          # 55 (method on array)

# Mean (average)
print(np.mean(arr))       # 5.5

# Median
print(np.median(arr))     # 5.5

# Standard deviation
print(np.std(arr))        # 2.872

# Variance
print(np.var(arr))        # 8.25

# Min and Max
print(np.min(arr))        # 1
print(np.max(arr))        # 10

# Argmin and Argmax (indices)
print(np.argmin(arr))     # 0
print(np.argmax(arr))     # 9

# Percentile
print(np.percentile(arr, 25))  # 3.25 (25th percentile)
print(np.percentile(arr, 75))  # 7.75 (75th percentile)

# Unique values
arr_dup = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
print(np.unique(arr_dup))  # [1 2 3 4]

2D Aggregation

arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Sum all elements
print(np.sum(arr2d))       # 45

# Sum of each row (axis=1 adds across the columns)
print(np.sum(arr2d, axis=1))  # [6 15 24]

# Sum of each column (axis=0 adds down the rows)
print(np.sum(arr2d, axis=0))  # [12 15 18]

# Mean along rows
print(np.mean(arr2d, axis=1))  # [2. 5. 8.]

7. Array Reshaping and Manipulation

arr = np.arange(12)  # [0 1 2 3 4 5 6 7 8 9 10 11]

# Reshape
reshaped = arr.reshape(3, 4)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

# Flatten (convert multi-dimensional to 1D)
flattened = reshaped.flatten()  # [0 1 2 3 4 5 6 7 8 9 10 11]

# Ravel (similar to flatten, but returns a view when possible instead of always copying)
raveled = reshaped.ravel()

# Transpose
transposed = reshaped.T
# [[ 0  4  8]
#  [ 1  5  9]
#  [ 2  6 10]
#  [ 3  7 11]]

# Stack arrays
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
stacked = np.vstack([a, b])  # Vertical stack
# [[1 2 3]
#  [4 5 6]]

hstacked = np.hstack([a, b])  # Horizontal stack
# [1 2 3 4 5 6]

# Concatenate
concat = np.concatenate([a, b])  # [1 2 3 4 5 6]

8. Array Comparison and Filtering

arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Comparison
print(arr > 5)           # [False False False False False  True  True  True  True  True]
print(arr == 5)          # [False False False False  True False False False False False]
print(arr <= 3)          # [ True  True  True False False False False False False False]

# Filtering (boolean indexing)
filtered = arr[arr > 5]  # [6 7 8 9 10]
filtered2 = arr[(arr > 3) & (arr < 8)]  # [4 5 6 7]

# Count elements meeting condition
count = np.sum(arr > 5)  # 5

# Find indices where condition is true
indices = np.where(arr > 5)  # (array([5, 6, 7, 8, 9]),)

9. Linear Algebra

# Dot product
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
dot_product = np.dot(a, b)  # 1*4 + 2*5 + 3*6 = 32

# Cross product
cross = np.cross(a, b)  # [-3 6 -3]

# Norm (magnitude)
norm = np.linalg.norm(a)  # sqrt(1^2 + 2^2 + 3^2) = 3.74

# Eigenvalues and eigenvectors
A = np.array([[4, 2], [1, 3]])
eigenvalues, eigenvectors = np.linalg.eig(A)

# Solve linear system (Ax = b)
A = np.array([[3, 1], [1, 2]])
b = np.array([9, 8])
x = np.linalg.solve(A, b)  # [2. 3.]

10. Working with CSV and Text Files

# Save to CSV
arr = np.array([[1, 2, 3], [4, 5, 6]])
np.savetxt("array.csv", arr, delimiter=",")

# Load from CSV
loaded = np.loadtxt("array.csv", delimiter=",")

# Save binary format (faster)
np.save("array.npy", arr)
loaded = np.load("array.npy")

11. Sorting and Searching

arr = np.array([3, 1, 4, 1, 5, 9, 2, 6])

# Sort
sorted_arr = np.sort(arr)  # [1 1 2 3 4 5 6 9]

# Argsort (indices that would sort array)
indices = np.argsort(arr)  # [1 3 6 0 2 4 7 5]

# Search sorted
index = np.searchsorted(sorted_arr, 5)  # 5 (leftmost index where 5 can be inserted to keep the array sorted)

# Find the indices of matching elements (using np.where)
found_indices = np.where(arr == 1)  # (array([1, 3]),)

12. Practical Examples

Example 1: Calculate Statistics

# Test scores for multiple students
scores = np.array([
    [85, 90, 78],  # Student 1: Math, English, Science
    [92, 88, 91],  # Student 2
    [78, 85, 82]   # Student 3
])

# Average score per student
avg_per_student = np.mean(scores, axis=1)
print(avg_per_student)  # [84.33 90.33 81.67]

# Average score per subject
avg_per_subject = np.mean(scores, axis=0)
print(avg_per_subject)  # [85. 87.67 83.67]

# Highest score
print(np.max(scores))  # 92

# Lowest score
print(np.min(scores))  # 78

# Students with average > 85
high_performers = np.where(avg_per_student > 85)[0]
print(high_performers)  # [1] (Student 2)

Example 2: Generate Random Data

# Generate 1000 random data points from normal distribution
data = np.random.normal(loc=100, scale=15, size=1000)

# Statistics
print(f"Mean: {np.mean(data):.2f}")
print(f"Std Dev: {np.std(data):.2f}")
print(f"Min: {np.min(data):.2f}")
print(f"Max: {np.max(data):.2f}")

# Histogram
bins = np.histogram(data, bins=10)
print(bins)  # Returns (frequencies, bin_edges)

Example 3: Data Normalization

# Normalize data to 0-1 range
data = np.array([10, 20, 30, 40, 50])

min_val = np.min(data)
max_val = np.max(data)

normalized = (data - min_val) / (max_val - min_val)
print(normalized)  # [0.   0.25 0.5  0.75 1.  ]

# Standardize (zero mean, unit variance)
mean = np.mean(data)
std = np.std(data)
standardized = (data - mean) / std
print(standardized)  # [-1.41 -0.71  0.   0.71  1.41]

Example 4: Matrix Operations

# Solve system of equations: 3x + 2y = 8, 2x + 5y = 11
A = np.array([[3, 2], [2, 5]])
b = np.array([8, 11])

solution = np.linalg.solve(A, b)
print(solution)  # [1.63636364 1.54545455] meaning x = 18/11 ≈ 1.64, y = 17/11 ≈ 1.55

# Verify
print(np.dot(A, solution))  # [8. 11.] ✓

Summary: NumPy vs Java Arrays

Type
- NumPy: Homogeneous (all same type)
- Java Array: Homogeneous
Dimension
- NumPy: N-dimensional
- Java Array: Arrays of arrays; the number of dimensions is fixed by the declared type
Performance
- NumPy: Very fast (C backend)
- Java Array: Moderate
Math Functions
- NumPy: Extensive built-in
- Java Array: Requires libraries
Broadcasting
- NumPy: Yes
- Java Array: Manual
Slicing
- NumPy: Simple and powerful
- Java Array: Manual indexing
Memory
- NumPy: Efficient
- Java Array: Standard allocation
Use Case
- NumPy: Numerical computing
- Java Array: General purpose