1. Introduction to NumPy
NumPy (Numerical Python) is the foundational library for numerical computing in Python. It provides support for large, multi-dimensional arrays and matrices, along with a collection of mathematical functions to operate on these arrays efficiently.
Why NumPy?
- Performance: 10-100x faster than native Python lists for numerical operations.
- Memory Efficient: Uses contiguous memory blocks, reducing overhead.
- Convenient: Provides high-level mathematical functions and operations.
- Foundational: Base for Pandas, Scikit-learn, TensorFlow, and other data science libraries.
Installation
pip install numpy
2. Creating Arrays
Basic Array Creation
import numpy as np
# From Python list
arr1 = np.array([1, 2, 3, 4, 5])
print(arr1) # [1 2 3 4 5]
# Specify data type
arr2 = np.array([1, 2, 3, 4, 5], dtype=float)
print(arr2) # [1. 2. 3. 4. 5.]
# 2D array (matrix)
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(arr2d)
# [[1 2 3]
# [4 5 6]
# [7 8 9]]
# 3D array
arr3d = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
Special Array Creation
# Zeros
zeros = np.zeros((3, 4)) # 3x4 matrix of zeros
# [[0. 0. 0. 0.]
# [0. 0. 0. 0.]
# [0. 0. 0. 0.]]
# Ones
ones = np.ones((2, 3))
# [[1. 1. 1.]
# [1. 1. 1.]]
# Identity matrix
identity = np.eye(3)
# [[1. 0. 0.]
# [0. 1. 0.]
# [0. 0. 1.]]
# Range (similar to Python range)
arr_range = np.arange(0, 10, 2) # [0 2 4 6 8]
# Linspace (evenly spaced numbers)
linspace = np.linspace(0, 1, 5) # [0. 0.25 0.5 0.75 1. ]
# Random numbers
random_arr = np.random.rand(3, 3) # 3x3 random values between 0-1
random_int = np.random.randint(0, 10, size=(2, 3)) # Random integers
3. Array Properties
arr = np.array([[1, 2, 3], [4, 5, 6]])
# Shape: dimensions of array
print(arr.shape) # (2, 3) - 2 rows, 3 columns
# Size: total number of elements
print(arr.size) # 6
# Dtype: data type of elements
print(arr.dtype) # int64 (platform default integer; int32 on Windows with NumPy < 2.0)
# Ndim: number of dimensions
print(arr.ndim) # 2
# Itemsize: size of each element in bytes
print(arr.itemsize) # 8
# Data: memory buffer
print(arr.data) # <memory at 0x...>
4. Indexing and Slicing
1D Array Indexing
arr = np.array([10, 20, 30, 40, 50])
# Positive indexing
print(arr[0]) # 10
print(arr[2]) # 30
# Negative indexing
print(arr[-1]) # 50 (last element)
print(arr[-2]) # 40 (second to last)
# Slicing
print(arr[1:4]) # [20 30 40] (indices 1, 2, 3)
print(arr[:3]) # [10 20 30] (from start to index 2)
print(arr[2:]) # [30 40 50] (from index 2 to end)
print(arr[::2]) # [10 30 50] (every 2nd element)
print(arr[::-1]) # [50 40 30 20 10] (reversed)
2D Array Indexing
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# Access element at row 1, column 2
print(arr2d[1, 2]) # 6
# Access entire row
print(arr2d[0]) # [1 2 3]
# Access entire column
print(arr2d[:, 1]) # [2 5 8]
# Slicing
print(arr2d[0:2, 1:3]) # First 2 rows, columns 1-2
# [[2 3]
# [5 6]]
# Boolean indexing
print(arr2d[arr2d > 5]) # [6 7 8 9] (elements > 5)
5. Array Operations
Element-wise Operations
a = np.array([1, 2, 3, 4, 5])
b = np.array([2, 3, 4, 5, 6])
# Arithmetic operations
print(a + b) # [3 5 7 9 11]
print(a - b) # [-1 -1 -1 -1 -1]
print(a * b) # [2 6 12 20 30] (element-wise multiplication)
print(a / b) # [0.5 0.67 0.75 0.8 0.83]
print(a ** 2) # [1 4 9 16 25] (element-wise power)
print(np.sqrt(a)) # [1. 1.41 1.73 2. 2.24]
# Broadcasting (operating with scalar)
print(a + 10) # [11 12 13 14 15]
print(a * 2) # [2 4 6 8 10]
Matrix Operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
# Matrix multiplication (dot product)
print(np.dot(A, B))
# [[19 22]
# [43 50]]
# Element-wise multiplication
print(A * B)
# [[ 5 12]
# [21 32]]
# Transpose
print(A.T)
# [[1 3]
# [2 4]]
# Determinant
print(np.linalg.det(A)) # approximately -2.0 (typically prints -2.0000000000000004 due to floating-point rounding)
# Inverse
print(np.linalg.inv(A))
# [[-2. 1. ]
# [ 1.5 -0.5]]
6. Aggregation and Statistical Functions
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# Sum
print(np.sum(arr)) # 55
print(arr.sum()) # 55 (method on array)
# Mean (average)
print(np.mean(arr)) # 5.5
# Median
print(np.median(arr)) # 5.5
# Standard deviation
print(np.std(arr)) # 2.872
# Variance
print(np.var(arr)) # 8.25
# Min and Max
print(np.min(arr)) # 1
print(np.max(arr)) # 10
# Argmin and Argmax (indices)
print(np.argmin(arr)) # 0
print(np.argmax(arr)) # 9
# Percentile
print(np.percentile(arr, 25)) # 3.25 (25th percentile)
print(np.percentile(arr, 75)) # 7.75 (75th percentile)
# Unique values
arr_dup = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
print(np.unique(arr_dup)) # [1 2 3 4]
2D Aggregation
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# Sum all elements
print(np.sum(arr2d)) # 45
# Sum along rows (axis=1)
print(np.sum(arr2d, axis=1)) # [6 15 24]
# Sum along columns (axis=0)
print(np.sum(arr2d, axis=0)) # [12 15 18]
# Mean along rows
print(np.mean(arr2d, axis=1)) # [2. 5. 8.]
7. Array Reshaping and Manipulation
arr = np.arange(12) # [0 1 2 3 4 5 6 7 8 9 10 11]
# Reshape
reshaped = arr.reshape(3, 4)
# [[ 0 1 2 3]
# [ 4 5 6 7]
# [ 8 9 10 11]]
# Flatten (convert multi-dimensional to 1D)
flattened = reshaped.flatten() # [0 1 2 3 4 5 6 7 8 9 10 11]
# Ravel (similar to flatten, but returns a view when possible instead of always copying)
raveled = reshaped.ravel()
# Transpose
transposed = reshaped.T
# [[ 0 4 8]
# [ 1 5 9]
# [ 2 6 10]
# [ 3 7 11]]
# Stack arrays
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
stacked = np.vstack([a, b]) # Vertical stack
# [[1 2 3]
# [4 5 6]]
hstacked = np.hstack([a, b]) # Horizontal stack
# [1 2 3 4 5 6]
# Concatenate
concat = np.concatenate([a, b]) # [1 2 3 4 5 6]
8. Array Comparison and Filtering
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# Comparison
print(arr > 5) # [False False False False False True True True True True]
print(arr == 5) # [False False False False True False False False False False]
print(arr <= 3) # [ True True True False False False False False False False]
# Filtering (boolean indexing)
filtered = arr[arr > 5] # [6 7 8 9 10]
filtered2 = arr[(arr > 3) & (arr < 8)] # [4 5 6 7]
# Count elements meeting condition
count = np.sum(arr > 5) # 5
# Find indices where condition is true
indices = np.where(arr > 5) # (array([5, 6, 7, 8, 9]),)
9. Linear Algebra
# Dot product
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
dot_product = np.dot(a, b) # 1*4 + 2*5 + 3*6 = 32
# Cross product
cross = np.cross(a, b) # [-3 6 -3]
# Norm (magnitude)
norm = np.linalg.norm(a) # sqrt(1^2 + 2^2 + 3^2) = 3.74
# Eigenvalues and eigenvectors
A = np.array([[4, 2], [1, 3]])
eigenvalues, eigenvectors = np.linalg.eig(A)
# Solve linear system (Ax = b)
A = np.array([[3, 1], [1, 2]])
b = np.array([9, 8])
x = np.linalg.solve(A, b) # [2. 3.]
10. Working with CSV and Text Files
# Save to CSV
arr = np.array([[1, 2, 3], [4, 5, 6]])
np.savetxt("array.csv", arr, delimiter=",")
# Load from CSV
loaded = np.loadtxt("array.csv", delimiter=",")
# Save binary format (faster)
np.save("array.npy", arr)
loaded = np.load("array.npy")
11. Sorting and Searching
arr = np.array([3, 1, 4, 1, 5, 9, 2, 6])
# Sort
sorted_arr = np.sort(arr) # [1 1 2 3 4 5 6 9]
# Argsort (indices that would sort array)
indices = np.argsort(arr) # [1 3 6 0 2 4 7 5]
# Search sorted
index = np.searchsorted(sorted_arr, 5) # 5 (leftmost index at which 5 can be inserted to keep the array sorted)
# Find (similar to where)
found_indices = np.where(arr == 1) # (array([1, 3]),)
12. Practical Examples
Example 1: Calculate Statistics
# Test scores for multiple students
scores = np.array([
[85, 90, 78], # Student 1: Math, English, Science
[92, 88, 91], # Student 2
[78, 85, 82] # Student 3
])
# Average score per student
avg_per_student = np.mean(scores, axis=1)
print(avg_per_student) # [84.33 90.33 81.67]
# Average score per subject
avg_per_subject = np.mean(scores, axis=0)
print(avg_per_subject) # [85. 87.67 83.67]
# Highest score
print(np.max(scores)) # 92
# Lowest score
print(np.min(scores)) # 78
# Students with average > 85
high_performers = np.where(avg_per_student > 85)[0]
print(high_performers) # [1] (Student 2)
Example 2: Generate Random Data
# Generate 1000 random data points from normal distribution
data = np.random.normal(loc=100, scale=15, size=1000)
# Statistics
print(f"Mean: {np.mean(data):.2f}")
print(f"Std Dev: {np.std(data):.2f}")
print(f"Min: {np.min(data):.2f}")
print(f"Max: {np.max(data):.2f}")
# Histogram
bins = np.histogram(data, bins=10)
print(bins) # Returns (frequencies, bin_edges)
Example 3: Data Normalization
# Normalize data to 0-1 range
data = np.array([10, 20, 30, 40, 50])
min_val = np.min(data)
max_val = np.max(data)
normalized = (data - min_val) / (max_val - min_val)
print(normalized) # [0. 0.25 0.5 0.75 1. ]
# Standardize (zero mean, unit variance)
mean = np.mean(data)
std = np.std(data)
standardized = (data - mean) / std
print(standardized) # [-1.41 -0.71 0. 0.71 1.41]
Example 4: Matrix Operations
# Solve system of equations: 3x + 2y = 8, 2x + 5y = 11
A = np.array([[3, 2], [2, 5]])
b = np.array([8, 11])
solution = np.linalg.solve(A, b)
print(solution) # [1.63636364 1.54545455] meaning x=18/11, y=17/11
# Verify
print(np.dot(A, solution)) # [8. 11.] ✓
Summary: NumPy vs Java Arrays
-
Type
- NumPy: Homogeneous (all same type)
- Java Array: Homogeneous
-
Dimension
- NumPy: N-dimensional
- Java Array: Multi-dimensional only as nested arrays of arrays (number of dimensions fixed by the declared type)
-
Performance
- NumPy: Very fast (C backend)
- Java Array: Moderate
-
Math Functions
- NumPy: Extensive built-in
- Java Array: Requires libraries
-
Broadcasting
- NumPy: Yes
- Java Array: Manual
-
Slicing
- NumPy: Simple and powerful
- Java Array: Manual indexing
-
Memory
- NumPy: Efficient
- Java Array: Standard allocation
-
Use Case
- NumPy: Numerical computing
- Java Array: General purpose