Q: What is k-Means? What are its pros and cons? Please provide a minimal PyTorch implementation from scratch, with the necessary comments on the challenging parts.
A: Here’s a crisp overview plus a from-scratch PyTorch k-Means (with k-means++ init). I added comments exactly where people usually get stuck (distance calculation, assignments, centroid updates, empty clusters).
What is k-Means (in one breath)
Unsupervised clustering that partitions data into k clusters by iteratively:
1. assigning each point to its nearest centroid (Euclidean distance),
2. recomputing each centroid as the mean of its assigned points,
until assignments stabilize.
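For reference, the objective both steps are driving down is the inertia (within-cluster sum of squared distances); this is the standard k-means objective, stated here for completeness:

J(c_1, \dots, c_k) = \sum_{i=1}^{N} \min_{j} \| x_i - c_j \|^2

The assignment step and the mean-update step can each only decrease J, which is why the iteration converges, though only to a local minimum.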
Pros / Cons (fast soundbites)
Pros
· Simple, fast, scalable (vectorizes well, mini-batchable).
· Works nicely when clusters are roughly spherical and separated.
· Few parameters (k, max_iter).
Cons
· Must choose k.
· Sensitive to initialization; can get stuck in local minima.
· Assumes Euclidean geometry; struggles with non-spherical/imbalanced clusters.
· Sensitive to scale (standardize features! a minimal standardization sketch follows this list).
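Since feature scale matters so much, here is a minimal standardization sketch in PyTorch (my own illustrative helper, not part of the implementation below; it assumes X is an [N, D] float tensor):

import torch

def standardize(X):
    # Per-feature zero mean, unit variance; clamp guards against zero-variance columns
    mean = X.mean(dim=0, keepdim=True)                 # [1, D]
    std = X.std(dim=0, keepdim=True).clamp_min(1e-8)   # [1, D]
    return (X - mean) / std

# X = standardize(X)  # do this before calling kmeans(X, k)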
Minimal PyTorch Implementation (with comments)
import torch

def kmeans_plus_plus_init(X, k, rng):
    """
    k-means++ initialization:
    1) pick one center uniformly at random
    2) pick remaining centers with prob ∝ distance^2 to the nearest chosen center
    """
    N, D = X.shape
    centers = torch.empty(k, D, device=X.device, dtype=X.dtype)
    # Pick the first center uniformly at random.
    i0 = torch.randint(low=0, high=N, size=(1,), generator=rng, device=X.device)
    centers[0] = X[i0]
    # Squared distance of each point to its nearest chosen center so far.
    # Start at +inf; updated to the running minimum as centers are added.
    d2 = torch.full((N,), float("inf"), device=X.device, dtype=X.dtype)  # [N]
    for c in range(1, k):
        # Fold in the most recently chosen center:
        # ||X - centers[c-1]||^2 via broadcasting (N,D) - (1,D) -> (N,D)
        diff = X - centers[c - 1]                         # [N, D]
        d2 = torch.minimum(d2, (diff * diff).sum(dim=1))  # [N]
        # Sample the next center with probability proportional to distance^2.
        probs = d2 / (d2.sum() + 1e-12)
        next_idx = torch.multinomial(probs, num_samples=1, generator=rng)
        centers[c] = X[next_idx]
    return centers

def kmeans(X, k, max_iter=100, tol=1e-4, use_pp=True, seed=0):
    """
    X: [N, D] data tensor (float32/float64). Standardized features recommended.
    k: number of clusters
    Returns:
      centers: [k, D]
      labels:  [N] cluster assignments
    """
    assert X.ndim == 2
    device = X.device
    rng = torch.Generator(device=device).manual_seed(seed)

    # ----- init centers -----
    if use_pp:
        centers = kmeans_plus_plus_init(X, k, rng)
    else:
        # Random init from the data
        idx = torch.randint(0, X.size(0), (k,), generator=rng, device=device)  # [k]
        centers = X[idx].clone()  # [k, D]

    prev_inertia = float("inf")
    for it in range(max_iter):
        # ===== E-step: assign points to the nearest center =====
        # d2 = torch.cdist(X, centers, p=2) ** 2   # OK, but a bit slower than the expansion below
        # Pairwise squared distances via ||x||^2 + ||c||^2 - 2 x·c
        # X2: [N,1], C2: [k], X @ centers.T: [N,k]
        X2 = (X * X).sum(dim=1, keepdim=True)               # [N, 1]
        C2 = (centers * centers).sum(dim=1)                 # [k]
        d2 = X2 + C2.unsqueeze(0) - 2.0 * (X @ centers.T)   # [N, k]
        d2 = d2.clamp_min(0.0)  # guard against tiny negatives from floating-point cancellation
        # argmin along each row finds the index of the nearest center.
        labels = d2.argmin(dim=1)  # [N]
        # Inertia = sum of min squared distances (the objective)
        inertia = d2.gather(1, labels.unsqueeze(1)).sum().item()
        # equivalently: torch.gather(d2, 1, labels.unsqueeze(1)).sum().item()

        # ===== M-step: recompute centers as the mean of assigned points =====
        # Accumulate per-cluster sums and counts, then divide; no dense one-hot matrix needed.
        centers_sum = torch.zeros_like(centers)                # [k, D]
        counts = torch.zeros(k, device=device, dtype=X.dtype)  # [k]
        # index_add_ accumulates the rows of X into centers_sum[labels]
        centers_sum.index_add_(0, labels, X)                                   # sum per cluster
        counts.index_add_(0, labels, torch.ones_like(labels, dtype=X.dtype))   # Note3

        # Handle empty clusters: re-seed them to random points (rare but important)
        empty = counts == 0
        if empty.any():
            # Reinitialize empty centers from random data points
            # (equivalent to setting centers[empty] = X[repl_idx] after the division below)
            repl_idx = torch.randint(0, X.size(0), (int(empty.sum()),), generator=rng, device=device)
            counts[empty] = 1.0
            centers_sum[empty] = X[repl_idx]
        centers = centers_sum / counts.clamp_min(1.0).unsqueeze(1)

        # ===== stopping criterion =====
        # Relative improvement of inertia; simple and effective
        rel_impr = (prev_inertia - inertia) / (abs(prev_inertia) + 1e-12)
        if rel_impr >= 0 and rel_impr < tol:
            break
        prev_inertia = inertia
    return centers, labels
# -----------------------------
# Tiny demo
if __name__ == "__main__":
    torch.manual_seed(0)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Make three blobs in 2D
    N = 600
    X = torch.cat([
        torch.randn(N // 3, 2) * 0.5 + torch.tensor([0.0, 0.0]),
        torch.randn(N // 3, 2) * 0.6 + torch.tensor([4.0, 4.0]),
        torch.randn(N // 3, 2) * 0.4 + torch.tensor([-4.0, 3.5]),
    ], dim=0).to(device)
    centers, labels = kmeans(X, k=3, max_iter=100, tol=1e-4, use_pp=True, seed=42)
    print("Centers:\n", centers.cpu())
    print("Counts per cluster:", torch.bincount(labels.cpu(), minlength=3).tolist())
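If you later need to label new points with the learned centers, a minimal sketch follows (assign_to_centers is a hypothetical helper added here for illustration, not part of the code above; it reuses the same squared-distance expansion as the E-step):

def assign_to_centers(X_new, centers):
    # ||x||^2 + ||c||^2 - 2 x·c, then argmin over centers
    X2 = (X_new * X_new).sum(dim=1, keepdim=True)           # [M, 1]
    C2 = (centers * centers).sum(dim=1)                     # [k]
    d2 = X2 + C2.unsqueeze(0) - 2.0 * (X_new @ centers.T)   # [M, k]
    return d2.argmin(dim=1)                                 # [M] labels

# Example: labels_new = assign_to_centers(torch.randn(5, 2).to(centers.device), centers)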
Notes on the “tricky” parts
· Pairwise distances: we used the identity \|x-c\|^2 = \|x\|^2 + \|c\|^2 - 2x\cdot c to compute all distances in one shot (vectorized, GPU-friendly); a small sanity check against torch.cdist is sketched after these notes.
· k-means++ picks diverse initial centers using probabilities ∝ distance², which often dramatically improves convergence and final quality.
· Empty clusters can happen; we re-seed those centers to random points (simple, robust).
· Stopping: we check the relative improvement of inertia; tol=1e-4 is a good default.
· Scaling: always standardize features (mean 0, var 1) before k-Means in practice.
Note3: the counts update is equivalent to counts.index_add_(0, labels, torch.ones(X.size(0), device=X.device, dtype=X.dtype)); either way, the values added must be a float tensor of shape [N], one per point.
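As a quick sanity check of the pairwise-distance identity used in the E-step, the expansion can be compared against torch.cdist (a sketch assuming X and centers from the demo are in scope; the tolerance is arbitrary):

X2 = (X * X).sum(dim=1, keepdim=True)
C2 = (centers * centers).sum(dim=1)
d2_fast = (X2 + C2.unsqueeze(0) - 2.0 * (X @ centers.T)).clamp_min(0.0)
d2_ref = torch.cdist(X, centers, p=2) ** 2
print(torch.allclose(d2_fast, d2_ref, atol=1e-4))  # expect True up to float rounding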