Blog Archive

Wednesday, November 5, 2025

ML 101-Part1


def scaled_dot_attn(Q, K, V, mask=None):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q, K, V: tensors of shape [B, T, d_k].
        mask: optional tensor broadcastable to the score shape [B, T, T]
            (e.g. [B, 1, T] or [B, T, T]). Truthy = keep, falsy = block.
            Non-bool masks are converted with ``.to(torch.bool)``.

    Returns:
        Tensor of shape [B, T, d_k]. A query row whose keys are all blocked
        yields an all-zero output row instead of NaN.
    """
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(Q.size(-1))  # [B, T, T]

    if mask is None:
        return torch.matmul(torch.softmax(scores, dim=-1), V)

    # `~` is a logical NOT only on bool tensors, so normalise the mask first.
    keep = mask.to(torch.bool)
    scores = scores.masked_fill(~keep, float("-inf"))
    A = torch.softmax(scores, dim=-1)                      # [B, T, T]

    # softmax over a row of all -inf is NaN; give such rows zero attention
    # so the NaN does not propagate into the output.
    no_keys = ~keep.any(dim=-1, keepdim=True)
    A = A.masked_fill(no_keys, 0.0)

    return torch.matmul(A, V)                              # [B, T, d_k]

 

def kmeans(X, k, iterations=200, ratio=1e-4):
    """Lloyd's k-means with random initialisation from the data points.

    Args:
        X: [N, D] floating-point tensor.
        k: number of clusters.
        iterations: maximum number of E-step/M-step rounds.
        ratio: relative tolerance; the loop stops once the inertia improves
            by no more than ``ratio * previous_inertia`` between rounds.
            It must be a small fraction (default 1e-4): a value such as 1e4
            would make the test true on the first comparison.

    Returns:
        (centers, labels): centers is [k, D]; labels is [N] and holds the
        assignment computed in the last E-step.
    """
    assert X.ndim == 2
    N = X.size(0)
    device = X.device
    rng = torch.Generator(device=device).manual_seed(0)

    # The size must be the 1-D tuple (k,); (k, 0) would build an empty index tensor.
    centroids_idx = torch.randint(0, N, (k,), generator=rng, device=device)
    centers = X[centroids_idx].clone()  # [k, D]

    prev_inertia = None  # no previous round to compare against yet
    labels = None
    for _ in range(iterations):
        # E-step: assign every point to its nearest center
        d2 = torch.cdist(X, centers, p=2) ** 2                    # [N, k]
        labels = d2.argmin(dim=1)                                 # [N]
        inertia = d2.gather(1, labels.unsqueeze(1)).sum().item()  # objective value

        # M-step: per-cluster sums and counts, then divide
        centers_sum = torch.zeros_like(centers)                   # [k, D]
        counts = torch.zeros(k, device=device, dtype=X.dtype)     # [k]
        centers_sum.index_add_(0, labels, X)
        counts.index_add_(0, labels, torch.ones_like(labels, dtype=X.dtype))

        # Handle empty clusters: re-seed them to random points (rare but important)
        empty = counts == 0
        if empty.any():
            idxs = torch.randint(0, N, (int(empty.sum()),), generator=rng, device=device)
            counts[empty] = 1.0
            centers_sum[empty] = X[idxs]

        centers = centers_sum / counts.clamp_min(1.0).unsqueeze(1)

        # Stop when the relative improvement of the objective is negligible.
        if prev_inertia is not None:
            improvement = prev_inertia - inertia
            if 0 <= improvement <= ratio * prev_inertia:
                break
        prev_inertia = inertia

    if labels is None:
        # iterations == 0: no E-step ran, so assign against the initial centers
        labels = torch.cdist(X, centers, p=2).argmin(dim=1)

    return centers, labels

 

 

 

 

 

 

class KNNClassifier:
    """Brute-force k-nearest-neighbour classifier operating on torch tensors."""

    def __init__(self, k=3):
        # k: how many neighbours take part in the vote
        self.k = k
        self.x_train = self.y_train = None

    def fit(self, x_train, y_train):
        """Memorise the training set; nothing is computed here."""
        self.x_train, self.y_train = x_train, y_train

    def predict(self, x_test):
        """Return, per test row, the most frequent label among its k nearest training rows."""
        train = self.x_train

        # Squared Euclidean distance via ||a||^2 + ||b||^2 - 2 a.b
        test_sq = x_test.pow(2).sum(dim=1, keepdim=True)  # [N_test, 1]
        train_sq = train.pow(2).sum(dim=1).unsqueeze(0)   # [1, N_train]
        cross = 2 * x_test @ train.T                      # [N_test, N_train]
        dists = test_sq + train_sq - cross                # [N_test, N_train]

        # Indices of the k smallest distances in every row
        _, knn_idx = torch.topk(dists, self.k, largest=False)  # [N_test, k]
        knn_labels = self.y_train[knn_idx]                      # [N_test, k]

        # Majority vote across the k neighbours
        preds, _ = torch.mode(knn_labels, dim=1)                # [N_test]
        return preds

 

if __name__ == "__main__":
    # Toy problem: three 2-D training points per class, two test points
    class0 = [[1., 1.], [1., 2.], [2., 1.]]
    class1 = [[5., 5.], [5., 6.], [6., 5.]]
    x_train = torch.tensor(class0 + class1)
    y_train = torch.tensor([0] * len(class0) + [1] * len(class1))
    x_test = torch.tensor([[1.5, 1.5], [5.5, 5.5]])

    model = KNNClassifier(k=3)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    print("Predictions:", preds.tolist())


Monday, November 3, 2025

What is k-Means? What are its pros and cons?

 

Q: What is k-Means? What are its pros and cons? Please provide a minimal PyTorch implementation from scratch, with comments on the challenging parts.


A: Here’s a crisp overview + a from-scratch PyTorch k-Means (with k-means++ init). I added comments exactly where people usually get stuck (distance calc, assignments, centroid updates, empty clusters).


What is k-Means (in one breath)

Unsupervised clustering that partitions data into k clusters by iteratively:

1.       assigning each point to its nearest centroid (Euclidean distance),

2.       recomputing each centroid as the mean of its assigned points,
until assignments stabilize.


Pros / Cons (fast soundbites)

Pros

·       Simple, fast, scalable (vectorizes well, mini-batchable).

·       Works nicely when clusters are roughly spherical/separated.

·       Few parameters (k, max_iter).

Cons

·       Must choose k.

·       Sensitive to initialization; can get stuck in local minima.

·       Assumes Euclidean geometry; struggles with non-spherical/imbalanced clusters.

·       Sensitive to scale (standardize features!).


Minimal PyTorch Implementation (with comments)



import torch

 

def kmeans_plus_plus_init(X, k, rng):
    """
    k-means++ initialization:
    1) pick one center uniformly at random
    2) pick each remaining center with probability proportional to its
       squared distance to the nearest already-chosen center

    Args:
        X: [N, D] data tensor.
        k: number of centers to pick.
        rng: torch.Generator used for all sampling.

    Returns:
        centers: [k, D] tensor whose rows are copies of rows of X.
    """
    N, D = X.shape
    centers = torch.empty(k, D, device=X.device, dtype=X.dtype)

    # Start with one random center.
    i0 = torch.randint(low=0, high=N, size=(1,), generator=rng, device=X.device)
    centers[0] = X[i0].squeeze(0)

    # Squared distance from every point to its nearest chosen center.
    # Starts at +inf and is lowered as centers are added.
    d2 = torch.full((N,), float("inf"), device=X.device, dtype=X.dtype)  # [N]

    for c in range(1, k):
        # Only the most recently added center can lower d2.
        diff = X - centers[c - 1]                         # [N, D] via broadcasting
        d2 = torch.minimum(d2, (diff * diff).sum(dim=1))  # [N]

        total = d2.sum()
        if total > 0:
            # Sample the next center proportional to distance^2
            probs = d2 / total
        else:
            # Every point coincides with a chosen center, so the weights are
            # all zero and torch.multinomial would raise. Any point is an
            # equally good choice, so sample uniformly.
            probs = torch.full_like(d2, 1.0 / N)
        next_idx = torch.multinomial(probs, num_samples=1, generator=rng)
        centers[c] = X[next_idx].squeeze(0)

    return centers

 

 

def kmeans(X, k, max_iter=100, tol=1e-4, use_pp=True, seed=0):
    """
    Lloyd's k-means.

    X: [N, D] data tensor (float32/float64). Recommend standardized features.
    k: number of clusters
    max_iter: maximum number of E/M rounds; 0 returns the initial centers
              together with the assignment against them
    tol: stop once the relative inertia improvement is in [0, tol)
    use_pp: k-means++ initialization if True, else random data points
    seed: seed for the torch.Generator used for all sampling

    Returns:
      centers: [k, D]
      labels:  [N] cluster assignments from the last E-step
    """
    assert X.ndim == 2
    device = X.device
    rng = torch.Generator(device=device).manual_seed(seed)

    # ----- init centers -----
    if use_pp:
        centers = kmeans_plus_plus_init(X, k, rng)
    else:
        # random init from data
        idx = torch.randint(0, X.size(0), (k,), generator=rng, device=device)  # [k]
        centers = X[idx].clone()  # [k, D]

    # ||x||^2 does not depend on the centers, so compute it once.
    X2 = (X * X).sum(dim=1, keepdim=True)  # [N,1]

    def sq_dists(C):
        # Pairwise squared distances via (x^2 + c^2 - 2 x.c) -> [N,k]
        C2 = (C * C).sum(dim=1)  # [k]
        return X2 + C2.unsqueeze(0) - 2.0 * (X @ C.T)

    prev_inertia = float("inf")
    labels = None

    for _ in range(max_iter):
        # ===== E-step: assign points to nearest center =====
        d2 = sq_dists(centers)
        labels = d2.argmin(dim=1)  # [N]
        # Inertia = sum of min squared distances (objective). The expansion
        # above can round to tiny negatives, so clamp before summing.
        inertia = d2.gather(1, labels.unsqueeze(1)).clamp_min(0).sum().item()

        # ===== M-step: recompute centers as mean of assigned points =====
        centers_sum = torch.zeros_like(centers)                 # [k, D]
        counts = torch.zeros(k, device=device, dtype=X.dtype)   # [k]

        # index_add_ accumulates per-cluster sums without a dense one-hot matrix
        centers_sum.index_add_(0, labels, X)
        counts.index_add_(0, labels, torch.ones_like(labels, dtype=X.dtype))

        # Handle empty clusters: re-seed them to random points (rare but important)
        empty = counts == 0
        if empty.any():
            repl_idx = torch.randint(0, X.size(0), (int(empty.sum()),), generator=rng, device=device)
            counts[empty] = 1.0
            centers_sum[empty] = X[repl_idx]

        centers = centers_sum / counts.clamp_min(1.0).unsqueeze(1)

        # ===== stopping criterion =====
        # Relative improvement on inertia. On the first round this is
        # inf / inf = nan, which fails both comparisons, so the loop continues.
        rel_impr = (prev_inertia - inertia) / (abs(prev_inertia) + 1e-12)
        if 0 <= rel_impr < tol:
            break
        prev_inertia = inertia

    if labels is None:
        # max_iter == 0: no E-step ran, so assign against the initial centers
        labels = sq_dists(centers).argmin(dim=1)

    return centers, labels

 

 

# -----------------------------

# Tiny demo

if __name__ == "__main__":
    torch.manual_seed(0)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Three 2-D Gaussian blobs, each described as (std, mean)
    N = 600
    per_blob = N // 3
    blob_specs = [
        (0.5, [0.0, 0.0]),
        (0.6, [4.0, 4.0]),
        (0.4, [-4.0, 3.5]),
    ]
    blobs = [
        torch.randn(per_blob, 2) * std + torch.tensor(mean)
        for std, mean in blob_specs
    ]
    X = torch.cat(blobs, dim=0).to(device)

    centers, labels = kmeans(X, k=3, max_iter=100, tol=1e-4, use_pp=True, seed=42)
    print("Centers:\n", centers.cpu())
    print("Counts per cluster:", torch.bincount(labels.cpu(), minlength=3).tolist())

Notes on the “tricky” parts

·       Pairwise distances: we used the identity
\( \|x - c\|^2 = \|x\|^2 + \|c\|^2 - 2\, x \cdot c \)
to compute all distances in one shot (vectorized, GPU-friendly).

·       k-means++ picks diverse initial centers by sampling with probability proportional to distance² — this often dramatically improves convergence and final quality.

·       Empty clusters can happen; we re-seed those centers to random points (simple, robust).

·       Stopping: we check relative improvement of inertia; tol=1e-4 is a good default.

·       Scaling: always standardize features (mean 0, var 1) before k-Means in practice.

Note3: counts.index_add_(0, labels, torch.ones(X.size(0), device=X.device, dtype=X.dtype))

Sunday, September 14, 2025

My favorite separation line marker: # ▼▼▼ and # ▲▲▲

TL;DR:

# ▼▼▼ START: Some_documentation ▼▼▼

# ▲▲▲ END: Some_documentation ▲▲▲ 


# Example: 

# solution to https://leetcode.com/problems/rotting-oranges


from collections import deque
from typing import List

class Solution:
    def orangesRotting(self, grid: List[List[int]]) -> int:
        """Return the number of rounds until no fresh orange (1) is left, or -1.

        Cells: 0 = empty, 1 = fresh, 2 = rotten. Each round every rotten
        orange rots its 4-connected fresh neighbours. The grid is modified
        in place while the rot spreads.
        """
        rows, cols = len(grid), len(grid[0])
        frontier = deque()
        fresh = 0

        # One scan: count fresh oranges, collect rotten ones, and bail out
        # as soon as a fresh orange has nothing but empty cells around it.
        for r in range(rows):
            for c in range(cols):
                cell = grid[r][c]
                if cell == 1:
                    fresh += 1
                    # ▼▼▼ START: NEW LOGIC FOR EARLY EXIT ▼▼▼
                    # Out-of-bounds neighbours are skipped; any in-bounds
                    # neighbour that is not empty means "not isolated".
                    touches_something = any(
                        0 <= r + dr < rows
                        and 0 <= c + dc < cols
                        and grid[r + dr][c + dc] != 0
                        for dr, dc in ((0, 1), (0, -1), (1, 0), (-1, 0))
                    )
                    if not touches_something:
                        return -1  # Early exit: this orange can never rot
                    # ▲▲▲ END: NEW LOGIC FOR EARLY EXIT ▲▲▲
                elif cell == 2:
                    frontier.append((r, c))

        # Level-by-level BFS: one level == one round
        elapsed = 0
        while frontier and fresh > 0:
            upcoming = deque()
            for r, c in frontier:
                for nr, nc in ((r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)):
                    if not (0 <= nr < rows and 0 <= nc < cols):
                        continue
                    if grid[nr][nc] != 1:
                        continue
                    grid[nr][nc] = 2
                    fresh -= 1
                    upcoming.append((nr, nc))
            frontier = upcoming
            elapsed += 1

        if fresh == 0:
            return elapsed
        return -1

Monday, February 17, 2025

How to set up remote ssh without password from VSCode

 You can set up passwordless SSH access from VSCode by using SSH keys. Here’s how:

  1. Generate an SSH key pair (if you don’t have one):
    Open your terminal and run:

    ssh-keygen -t rsa -b 4096

    Follow the prompts. If you want a completely passwordless experience, leave the passphrase empty.

  2. Copy your public key to the remote server:
    Use the ssh-copy-id command (replace the username and server address with your own):

    ssh-copy-id username@remote_server_address

    This adds your public key (typically ~/.ssh/id_rsa.pub) to the ~/.ssh/authorized_keys file on the remote machine.

  3. Verify remote SSH settings:
    Ensure the remote server’s SSH configuration (usually in /etc/ssh/sshd_config) has PubkeyAuthentication yes enabled, and that the ~/.ssh directory and authorized_keys file have the correct permissions.

  4. Configure your SSH client (optional but useful):
    Edit (or create) the ~/.ssh/config file on your local machine (on Windows it lives in C:\Users\<YourUsername>\.ssh) with an entry like:

    Host my-remote
        HostName remote_server_address
        User username
        IdentityFile ~/.ssh/id_rsa

    This simplifies the connection process.

  5. Connect using VSCode’s Remote - SSH extension:

    • Install the Remote - SSH extension in VSCode if you haven’t already.
    • Open the Command Palette (Ctrl+Shift+P or Cmd+Shift+P) and type “Remote-SSH: Connect to Host…”.
    • Select the host (either from your SSH config or enter it manually).

VSCode will use your SSH key for authentication, allowing you to log in without entering a password.


FAQ:

Q1) Where to find ~/.ssh in windows OS?

A1) On Windows, the equivalent of the ~/.ssh folder is located in your user profile directory, typically at:    

C:\Users\<YourUsername>\.ssh

Q2) How to solve?

ssh-copy-id username@remote_server_address ssh-copy-id : The term 'ssh-copy-id' is not recognized as the name of a cmdlet, function, script file, or operable program. Check the spelling of the name, or if a path was included, verify that the path is correct and try again. At line:1 char:1 + ssh-copy-id username@remote_server_address + ~~~~~~~~~~~ + CategoryInfo : ObjectNotFound: (ssh-copy-id:String) [], CommandNotFoundException + FullyQualifiedErrorId : CommandNotFoundException

A2: (replace username and remote_server_address with your own case)
type $env:USERPROFILE\.ssh\id_rsa.pub | ssh username@remote_server_address "mkdir -p ~/.ssh && cat >> ~/.ssh/authorized_keys"

Saturday, February 1, 2025

Tutorial: load(s) and dump(s) for Json

 

>>> help(json.load)

load(fp, *, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw)

    Deserialize ``fp`` (a ``.read()``-supporting file-like object containing a JSON document) to a Python object.                

 

help(json.loads)

loads(s, *, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw)

    Deserialize ``s`` (a ``str``, ``bytes`` or ``bytearray`` instance containing a JSON document) to a Python object.

 


Example using json.load

Assume you have a file named data.json with the following content:

{

  "name": "Alice",

  "age": 25,

  "city": "NY"

}

You can load this JSON data into a Python dictionary as follows:

import json

# Open the JSON file and parse its content.
# Pass the encoding explicitly: without it open() uses the platform's
# locale encoding, which can mis-decode non-ASCII text in a UTF-8 JSON file.
with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

print(data)

# Output:{'name': 'Alice', 'age': 25, 'city': 'NY'}

 

Example using json.loads

Suppose you have a JSON string:

import json

# A JSON document held in memory as text
json_string = '{"name":"Bob", "age":30,"city": "NY"}'

# json.loads ("load string") turns that text into a Python dictionary
data = json.loads(json_string)
print(data)
# Output: {'name': 'Bob', 'age': 30, 'city': 'NY'}

 

Example:

1. Saving a Dictionary to a JSON File

You can use the json.dump function to write a dictionary to a file in JSON format. This method writes the data directly to the file in a structured JSON format, which is easy to read back later using json.load.

 

import json

# The record to persist
data = dict(name="Alice", age=25, city="Wonderland")

# json.dump serialises straight into the open file handle;
# indent=4 pretty-prints the output
with open('data.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

 

 

2. Saving a Dictionary to a Text File as a String

There are two common approaches depending on what string format you need:

 

Method A: Using str()

This method writes the Python dictionary's string representation to the file. Note that this representation is not valid JSON but rather the standard Python dictionary format.

 

data = {
    "name": "Alice",
    "age": 25,
    "city": "Wonderland"
}

# Open a text file in write mode and write the dictionary as a string.
# str(data) keeps non-ASCII characters as-is, so fix the encoding explicitly
# instead of relying on the platform's locale default.
with open('data.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(str(data))

 

The content of data.txt will look like:

{'name': 'Alice', 'age': 25, 'city': 'Wonderland'}

 

 

Method B: Using json.dumps()

If you prefer to store the dictionary as a JSON-formatted string in a text file (which is both human-readable and can be easily parsed later), you can convert it to a string first with json.dumps and then write it.

import json

# The record to persist
data = dict(name="Alice", age=25, city="Wonderland")

# Step 1: json.dumps ("dump string") renders the dictionary as JSON text
json_string = json.dumps(data, indent=4)

# Step 2: write that text to the file like any other string
with open('data.txt', 'w') as text_file:
    text_file.write(json_string)

The resulting data.txt will contain:

 

{

    "name": "Alice",

    "age": 25,

    "city": "Wonderland"

}

Summary

json.dump: Directly writes a dictionary to a file as JSON.

str(): Converts the dictionary to its Python string representation.

json.dumps: Converts the dictionary to a JSON formatted string, which can then be written to a text file.

Choose the method that best fits your needs for later reading or processing the file.

 


 

 

 

 


help me fix: >>> sx

"{'name': 'Bob', 'age': 30, 'city': 'Builderland'}"

>>> sd = json.loads(sx)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\json\__init__.py", line 346, in loads

    return _default_decoder.decode(s)

           ^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\json\decoder.py", line 338, in decode

    obj, end = self.raw_decode(s, idx=_w(s, 0).end())

               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\json\decoder.py", line 354, in raw_decode

    obj, end = self.scan_once(s, idx)

               ^^^^^^^^^^^^^^^^^^^^^^

json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

 

GPT:

The error occurs because JSON requires double quotes (") for both keys and string values, but your string uses single quotes ('). Here are two ways to fix this:

 

1. Convert the String to Valid JSON

If you can change the string to proper JSON format, use double quotes. For example:

import json

# Keys and string values use double quotes, as JSON requires
sx = '{"name": "Bob", "age": 30, "city": "Builderland"}'

# Now the text parses cleanly into a dictionary
sd = json.loads(sx)
print(sd)

 

2. Use ast.literal_eval for a Python Literal

If you have a string in Python's dictionary format (using single quotes) and you want to convert it into a dictionary, you can use the ast.literal_eval function:

import ast

# A Python dict literal as text (single quotes), not JSON
sx = "{'name': 'Bob', 'age': 30, 'city': 'Builderland'}"

# literal_eval parses Python literals only; the result here is a dict
sd = ast.literal_eval(sx)
print(sd)

This will also output:

#{'name': 'Bob', 'age': 30, 'city': 'Builderland'}

 

str(sd) == sx # return True

 

 

Please comment on the following two strings from the perspective of JSON reading and writing. Which one is preferred?

string1='{"name":"Bob", "age":30,"city": "NY"}'  

string2="{'name': 'Bob', 'age': 30, 'city': 'NY'}"

 

Answer: string1 is preferred because it is valid JSON. string2 is a Python dict representation (single quotes), which json.loads rejects.

 

Best Practice:

Always use json.dumps() to generate JSON strings from Python dictionaries, as it will ensure the proper formatting (i.e., using double quotes).