Master the essential Python libraries — NumPy, Pandas, Matplotlib, and Scikit-learn — that power every ML workflow.
Python is the #1 language for ML and Data Science. Here's why:
- NumPy: numerical computing (arrays & linear algebra)
- Pandas: data manipulation (DataFrames & Series)
- Matplotlib: data visualization (charts & plots)
- Scikit-learn: ML algorithms (training & evaluation)
NumPy is the foundation of scientific computing in Python. It provides high-performance multidimensional arrays and mathematical operations.
import numpy as np
# Creating arrays
arr = np.array([1, 2, 3, 4, 5])
matrix = np.array([[1, 2, 3], [4, 5, 6]])
zeros = np.zeros((3, 3)) # 3×3 matrix of zeros
ones = np.ones((2, 4)) # 2×4 matrix of ones
rand = np.random.rand(3, 3) # 3×3 random values [0,1)
# Array properties
print(arr.shape) # (5,)
print(matrix.ndim) # 2
print(matrix.dtype) # int64
# Arithmetic (vectorized — no loops needed!)
a = np.array([10, 20, 30])
b = np.array([1, 2, 3])
print(a + b) # [11, 22, 33]
print(a * b) # [10, 40, 90]
print(a ** 2) # [100, 400, 900]
# Statistical operations
data = np.array([85, 90, 78, 92, 88])
print(np.mean(data)) # 86.6
print(np.median(data)) # 88.0
print(np.std(data)) # ~4.88 (population std)
print(np.min(data)) # 78
print(np.max(data)) # 92
# Matrix multiplication
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
result = np.dot(A, B) # or A @ B
# [[19, 22], [43, 50]]
# Transpose
print(A.T)
# [[1, 3], [2, 4]]
# Reshaping
arr = np.arange(12) # [0, 1, 2, ..., 11]
reshaped = arr.reshape(3, 4) # 3 rows, 4 columns
# Indexing & Slicing
matrix = np.array([[1,2,3],[4,5,6],[7,8,9]])
print(matrix[0, :]) # First row: [1, 2, 3]
print(matrix[:, 1]) # Second column: [2, 5, 8]
print(matrix[1:, :2]) # Rows 1-2, Cols 0-1: [[4,5],[7,8]]
# Boolean indexing
scores = np.array([65, 85, 92, 45, 78])
print(scores[scores > 70]) # [85, 92, 78]
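Vectorized arithmetic also extends to arrays of different shapes via broadcasting, where NumPy stretches the smaller array across the larger one. A minimal sketch:
# Broadcasting — the row is applied to every row of the matrix
m = np.array([[1, 2, 3], [4, 5, 6]])
row = np.array([10, 20, 30])
print(m + row) # [[11, 22, 33], [14, 25, 36]]
print(m * 2) # [[2, 4, 6], [8, 10, 12]]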
Pandas provides two primary data structures: Series (1D) and DataFrame (2D table), making data manipulation intuitive and powerful.
import pandas as pd
# Series — 1D labeled array
grades = pd.Series([85, 90, 78, 92],
                   index=['Alice', 'Bob', 'Charlie', 'Diana'])
print(grades['Bob']) # 90
# DataFrame — 2D labeled table
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 30, 28, 22],
    'Score': [85, 92, 78, 95],
    'City': ['Mumbai', 'Delhi', 'Mumbai', 'Chennai']
}
df = pd.DataFrame(data)
print(df)
# Name Age Score City
# 0 Alice 25 85 Mumbai
# 1 Bob 30 92 Delhi
# 2 Charlie 28 78 Mumbai
# 3 Diana 22 95 Chennai
# Reading data from CSV
df = pd.read_csv('data.csv')
# Quick overview
print(df.head()) # First 5 rows
print(df.info()) # Data types, non-null counts
print(df.describe()) # Statistical summary
print(df.shape) # (rows, columns)
# Selecting columns
print(df['Age']) # Single column → Series
print(df[['Name', 'Score']]) # Multiple columns → DataFrame
# Filtering rows
high_scorers = df[df['Score'] > 80]
mumbai = df[df['City'] == 'Mumbai']
# Handling missing values
print(df.isnull().sum()) # Count NaN per column
df_clean = df.dropna() # Remove rows with any NaN
df_filled = df.fillna(0) # Replace NaN with 0
df['Age'] = df['Age'].fillna(df['Age'].mean()) # Mean imputation (chained inplace=True is deprecated)
# Sorting
df_sorted = df.sort_values('Score', ascending=False)
# Grouping and aggregation
city_avg = df.groupby('City')['Score'].mean()
# City
# Chennai 95.0
# Delhi 92.0
# Mumbai 81.5
# Adding a new column
df['Pass'] = df['Score'] >= 80
# Applying functions
df['Grade'] = df['Score'].apply(
    lambda x: 'A' if x >= 90 else 'B' if x >= 80 else 'C'
)
# Encoding categorical data
df['City_Encoded'] = df['City'].map({
    'Mumbai': 0, 'Delhi': 1, 'Chennai': 2
})
# One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=['City'])
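# → 'City' is replaced by indicator columns City_Chennai, City_Delhi, City_Mumbai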
# Exporting
df.to_csv('output.csv', index=False)
df.to_excel('output.xlsx', index=False) # requires an Excel engine such as openpyxl
Matplotlib is the foundational plotting library in Python. It provides both quick plots and fine-grained control over every visual element.
import matplotlib.pyplot as plt
import numpy as np
# ── Line Plot ──
x = np.linspace(0, 10, 100)
plt.plot(x, np.sin(x), label='sin(x)', color='blue')
plt.plot(x, np.cos(x), label='cos(x)', color='red')
plt.xlabel('x'); plt.ylabel('y')
plt.title('Trigonometric Functions')
plt.legend(); plt.grid(True)
plt.show()
# ── Scatter Plot ──
hours = [2, 3, 4, 5, 6, 7, 8]
scores = [50, 55, 65, 70, 80, 85, 90]
plt.scatter(hours, scores, c='purple', marker='o', s=80)
plt.xlabel('Study Hours'); plt.ylabel('Score')
plt.title('Study Hours vs Exam Score')
plt.show()
# ── Bar Chart ──
categories = ['KNN', 'SVM', 'Decision Tree', 'Logistic Reg']
accuracies = [85, 92, 88, 90]
plt.bar(categories, accuracies, color=['#6366f1','#10b981','#f97316','#ec4899'])
plt.ylabel('Accuracy (%)'); plt.title('Model Comparison')
plt.ylim(70, 100)
plt.show()
# ── Histogram ──
data = np.random.normal(70, 15, 1000)
plt.hist(data, bins=30, edgecolor='black', alpha=0.7)
plt.xlabel('Score'); plt.ylabel('Frequency')
plt.title('Score Distribution')
plt.show()
# Multiple subplots
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
axes[0].plot(x, np.sin(x)); axes[0].set_title('Line Plot')
axes[1].bar(['A','B','C'], [30,50,20]); axes[1].set_title('Bar Chart')
axes[2].pie([40,30,30], labels=['A','B','C'], autopct='%1.1f%%')
axes[2].set_title('Pie Chart')
plt.tight_layout()
plt.savefig('plots.png', dpi=150)
plt.show()
# Heatmap (Confusion Matrix)
import seaborn as sns
cm = np.array([[45, 5], [8, 42]])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Pos', 'Neg'], yticklabels=['Pos', 'Neg'])
plt.xlabel('Predicted'); plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
Scikit-learn provides a consistent API for implementing ML algorithms. Every model follows the same pattern: fit() → predict() → score().
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
# Step 1: Split data (assumes features X and labels y are already loaded)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Step 2: Scale features (KNN and SVM compare distances, so feature scale matters)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Step 3: Create model, train, predict
model = SomeAlgorithm(hyperparams) # placeholder: substitute any estimator, e.g. KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Step 4: Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
# Load dataset
iris = load_iris()
X, y = iris.data, iris.target
# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Train KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# Predict and evaluate
y_pred = knn.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
# Accuracy: 0.98
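Beyond a single accuracy number, the per-class breakdown is often more informative; classification_report (imported earlier) provides it:
print(classification_report(y_test, y_pred, target_names=iris.target_names))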
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Training data
X_train = np.array([[1],[2],[3],[4],[5],[6],[7],[8]])
y_train = np.array([15,22,30,35,42,48,55,62])
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Model parameters
print(f"Slope: {model.coef_[0]:.2f}") # ~6.79
print(f"Intercept: {model.intercept_:.2f}") # ~9.50
# Predict
y_pred = model.predict(X_train)
print(f"R² Score: {r2_score(y_train, y_pred):.4f}") # ~0.998
print(f"RMSE: {np.sqrt(mean_squared_error(y_train, y_pred)):.2f}")
from sklearn.cluster import KMeans
# Generate sample data
from sklearn.datasets import make_blobs
X, _ = make_blobs(n_samples=200, centers=3, random_state=42)
# Train K-Means
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(X)
# Results
print(f"Cluster centers:\n{kmeans.cluster_centers_}")
print(f"Labels: {kmeans.labels_[:10]}")
print(f"Inertia (WCSS): {kmeans.inertia_:.2f}")
# Visualize
plt.scatter(X[:,0], X[:,1], c=kmeans.labels_, cmap='viridis', s=30)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            c='red', marker='X', s=200, edgecolors='black')
plt.title('K-Means Clustering'); plt.show()
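In practice n_clusters is rarely known in advance. A common heuristic is the elbow method: plot the inertia for a range of k and pick the point where the curve bends. A minimal sketch, reusing X and KMeans from above:
# Elbow method: inertia falls sharply up to the "right" k, then flattens
inertias = []
ks = range(1, 8)
for k in ks:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    inertias.append(km.inertia_)
plt.plot(ks, inertias, marker='o')
plt.xlabel('k'); plt.ylabel('Inertia (WCSS)')
plt.title('Elbow Method'); plt.show()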
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(f"Fold scores: {scores}")
print(f"Mean accuracy: {scores.mean():.2f} ± {scores.std():.2f}")
Here's a complete end-to-end ML pipeline combining all four libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# ─── Step 1: Load Data ───
df = pd.read_csv('students.csv')
print(df.head())
print(df.info())
# ─── Step 2: Explore Data ───
print(df.describe())
print(df.isnull().sum())
# ─── Step 3: Preprocess ───
df.dropna(inplace=True) # Handle missing values
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender']) # Encode categoricals
# Features and target
X = df[['Age', 'Study_Hours', 'Attendance', 'Gender']]
y = df['Pass'] # 1 = Pass, 0 = Fail
# ─── Step 4: Split and Scale ───
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# ─── Step 5: Train Multiple Models ───
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='rbf'),
    'Logistic': LogisticRegression()
}
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name}: {acc:.2%}")
# ─── Step 6: Visualize Results ───
plt.bar(results.keys(), results.values(), color=['#6366f1','#10b981','#f97316'])
plt.ylabel('Accuracy'); plt.title('Model Comparison')
plt.ylim(0.5, 1.0); plt.show()
# ─── Step 7: Cross-Validate Best Model ───
best_model = SVC(kernel='rbf')
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
print(f"CV Mean Accuracy: {cv_scores.mean():.2%} ± {cv_scores.std():.2%}")
The scikit-learn pattern in a nutshell:
model = Algorithm(params)
model.fit(X_train, y_train) — learn from data
model.predict(X_test) — make predictions
model.score(X_test, y_test) — evaluate