Source code for gower_metric.utils.missing

import math
from collections.abc import Sequence
from typing import Any

import numpy as np
import pandas as pd


[docs] def is_missing(value: Any) -> bool: """ Return True if the value is considered missing. Args: value (Any): The value to check. Returns: bool: True if the value is missing, False otherwise. """ return bool( value is None or (isinstance(value, float) and math.isnan(value)) or (hasattr(pd, "isna") and pd.isna(value)) or (isinstance(value, float) and np.isnan(value)) )
[docs] def first_not_missing(sequence: Sequence) -> Any | None: """ Return the first non-missing value from a sequence. Args: sequence (Sequence): A sequence of values. Returns: Optional[Any]: The first non-missing value, or None if all values are missing. """ for value in sequence: if not is_missing(value): return value return None
[docs] def apply_missing_strategy( diff: np.ndarray, present: np.ndarray, nan_method: str ) -> tuple[np.ndarray, np.ndarray]: """ Apply the chosen missing-values strategy to the raw diff matrix. Args: diff (np.ndarray): raw distance matrix for one feature, shape (n_x, n_y). present (np.ndarray): boolean mask where True means both values were non-missing. nan_method (str): one of "ignore", "max_dist", "raise_error". Returns: tuple[np.ndarray, np.ndarray]: Diff is adjusted distance matrix. Count_mask is int matrix of same shape, how much to add to count_present. Raises: ValueError: if nan_method is not recognized. """ if nan_method == "ignore": diff[~present] = 0.0 count_mask = present.astype(int) elif nan_method == "max_dist": diff[~present] = 1.0 count_mask = present.astype(int) elif nan_method == "raise_error": if not present.all(): raise ValueError count_mask = present.astype(int) else: raise ValueError(f"Unknown nan_method '{nan_method}'") return diff, count_mask