# Source code for gower_metric.distances.categorical_ordinal

from typing import Any

import numpy as np

from gower_metric.utils.missing import apply_missing_strategy, is_missing


[docs] def categorical_ordinal_component( X: np.ndarray, Y: np.ndarray, ordinal_indices: list[int], metadata: dict[int | str, dict[str, Any]], missing_strategy: str = "ignore", calculation_type: str = "kaufman", weights: np.ndarray | None = None, ) -> tuple[np.ndarray, np.ndarray]: """ Compute the ordinal categorical component of Gower metric between rows of X and Y. Args: X (np.ndarray): First dataset, shape (n_x, n_features). Y (np.ndarray): Second dataset, shape (n_y, n_features). ordinal_indices (list[int]): Indices of ordinal features. metadata (dict[int, dict[str, Any]]): Metadata for ordinal features. Computed by Gower.fit() on whole data range and passed to the distance function. missing_strategy (str): Strategy for handling missing values, default is "ignore". calculation_type (str): Type of calculation for ordinal distance, available options are "kaufman" and "podani". Default is "kaufman". weights (Optional[np.ndarray]): Optional weight per ordinal feature. Returns: tuple[np.ndarray, np.ndarray]: - sum_diff: matrix (n_x, n_y) of weighted, normalized ordinal distances - count_present: matrix (n_x, n_y) of counts of present (non-missing) features """ n_x, n_y = X.shape[0], Y.shape[0] sum_diff = np.zeros((n_x, n_y), dtype=float) count_present = np.zeros((n_x, n_y), dtype=float) if not ordinal_indices: return sum_diff, count_present for pos, j in enumerate(ordinal_indices): col_x = X[:, j] col_y = Y[:, j] mask_x = np.array([not is_missing(v) for v in col_x], dtype=bool) mask_y = np.array([not is_missing(v) for v in col_y], dtype=bool) present = mask_x[:, None] & mask_y[None, :] if metadata and j in metadata: info = metadata[j] ranks_map = info["ranks"] min_rank = info["min"] max_rank = info["max"] counts_arr = info["counts"] else: raise ValueError(f"Missing metadata for ordinal feature at index {j}.") if min_rank is None: continue r_x = np.array([ranks_map.get(v, np.nan) for v in col_x], dtype=float) r_y = np.array([ranks_map.get(v, np.nan) for v in col_y], 
dtype=float) if calculation_type == "kaufman": denom = max_rank - min_rank if denom == 0: dist = np.zeros((n_x, n_y), dtype=float) else: dist = np.abs(r_x[:, None] - r_y[None, :]) / denom else: diff = np.abs(r_x[:, None] - r_y[None, :]) mid = (counts_arr - 1) / 2.0 mid_x = mid[r_x.astype(int)][:, None] mid_y = mid[r_y.astype(int)][None, :] podani_denom = max_rank - min_rank - mid[0] - mid[-1] if podani_denom <= 0: # fallback to kaufman if podani denominator is not valid base_denom = max_rank - min_rank if base_denom == 0: dist = np.zeros((n_x, n_y), dtype=float) else: dist = diff / base_denom else: dist = (diff - mid_x - mid_y) / podani_denom dist = np.clip(dist, 0.0, 1.0) dist, mask = apply_missing_strategy(dist, present, missing_strategy) w = weights[pos] if weights is not None else 1.0 sum_diff += dist * w count_present += mask.astype(float) * w return sum_diff, count_present