Source code for experiment_design.scorers

from typing import Callable, Iterable, Protocol

import numpy as np
from scipy.spatial.distance import pdist

from experiment_design.variable import ParameterSpace


class Scorer(Protocol):
    """
    Structural type of a scoring callable: anything that maps a |DoE| array to a single float.

    :meta private:
    """

    def __call__(self, doe: np.ndarray) -> float:
        """
        Rate the quality of a candidate design of experiments. Higher is better, i.e. the
        returned value is meant to be maximized.

        :param doe: |DoE| holding the candidate samples, with shape (n_sample, n_dim).
        :return: The score assigned to the DoE.
        """
        ...


class ScorerFactory(Protocol):
    """
    Structural type of a callable that builds a Scorer for a given parameter space and sample size.

    :meta private:
    """

    def __call__(
        self,
        space: ParameterSpace,
        sample_size: int,
        old_sample: np.ndarray | None = None,
    ) -> Scorer:
        """
        Build a scoring function for the given variables and sample size.

        :param space: ParameterSpace the resulting Scorer will be used for.
        :param sample_size: Number of samples in each DoE the scorer is going to evaluate.
        :param old_sample: Optional array with shape (n_old_sample, n_dim) representing the points that have
            already been observed.
        :return: A Scorer, i.e. a function that takes a DoE of shape (n_sample, n_dim) and returns its score.
        """
        ...



[docs]
class MaxCorrelationScorerFactory:
    """
    A scorer factory for the maximum absolute correlation error between sampling points.

    :param local: If True, any points in the old_sample will be ignored, that fall outside the finite bounds of the
        provided variables. Has no effect if old_sample is None.
    :param eps: Non-negative error threshold. Designs whose maximum absolute correlation error does not exceed
        this value are scored on a more favourable scale than designs whose error does.
    :raises ValueError: If eps is negative.
    """

    def __init__(
        self,
        local: bool = True,
        eps: float = 1e-2,
    ) -> None:
        self.local = local

        # Zero passes this check, so the requirement is "non-negative" rather than "positive".
        if eps < 0:
            raise ValueError("eps must be non-negative")
        self.eps = eps

    def __call__(
        self,
        space: ParameterSpace,
        sample_size: int,
        old_sample: np.ndarray | None = None,
    ) -> Scorer:
        """
        Creates a scorer, that computes the maximum absolute correlation error between the candidate samples
        and the target correlation given by space.correlation.

        :param space: Dimensions of the design space.
        :param sample_size: The number of candidate points to be scored. Not used by this scorer.
        :param old_sample: If passed, represents the matrix of points in an older design of experiments with shape
            (old_sample_size, space.dimensions). Depending on self.local, some or all of these will be combined with
            the candidate points before computing the correlation error.
        :return: a scorer that returns the negative exp(maximum absolute correlation error + 1). If the error does
            not exceed self.eps, it returns negative exp(maximum absolute correlation error - 1) instead.
        """

        handler = create_old_doe_handler(space, old_sample, local=self.local)

        def _scorer(doe: np.ndarray) -> float:
            # Columns are the variables (rowvar=False); the error is the largest element-wise deviation
            # between the empirical correlation and the target correlation.
            error = np.max(
                np.abs(np.corrcoef(handler(doe), rowvar=False) - space.correlation)
            )
            # Errors above the threshold are penalized on a harsher scale than those at or below it.
            if error > self.eps:
                return -np.exp(error + 1)
            return -np.exp(error - 1)

        return _scorer




[docs]
class PairwiseDistanceScorerFactory:
    """
    A scorer factory for the minimum pairwise distance between sampling points.

    .. warning:: All pair-wise distances are currently computed eagerly. The compiled implementation behind
        scipy.spatial.distance.pdist makes this fast for small sample sizes, but it may be memory-inefficient for
        large ones. Algorithms such as KDTrees could address this; they are deliberately left out to keep the
        number of dependencies low. Implement a custom ScorerFactory if this becomes a problem.

    :param local: If True, any points in the old_sample will be ignored, that fall outside the finite bounds of the
        provided variables. Has no effect if old_sample is None.
    """

    def __init__(self, local: bool = False) -> None:
        self.local = local

    def __call__(
        self,
        space: ParameterSpace,
        sample_size: int,
        old_sample: np.ndarray | None = None,
    ) -> Scorer:
        """
        Create a scorer that rewards a large minimum pairwise distance between sampling points.

        :param space: Dimensions of the design space.
        :param sample_size: The number of candidate points to be scored. Used to size the equidistant bins whose
            diagonal length normalizes the distance.
        :param old_sample: If passed, represents the matrix of points in an older design of experiments with shape
            (old_sample_size, len(variables)). Depending on self.local, some or all of these will be combined with
            the candidate points before computing the pairwise distances.
        :return: a scorer that returns exp(minimum pairwise distance / equidistant bin diagonal length).
        """

        with_old_points = create_old_doe_handler(space, old_sample, local=self.local)
        bin_diagonal = calculate_equidistant_bin_diagonal_length(space, sample_size)

        def _scorer(doe: np.ndarray) -> float:
            # Condensed vector of all pairwise distances among the (possibly extended) set of points.
            distances = pdist(with_old_points(doe))
            closest = np.min(distances)
            return np.exp(closest / bin_diagonal)

        return _scorer




[docs]
class WeightedSumScorerFactory:
    """
    A factory that creates a weighted sum of multiple scorers

    :param scorer_factories: These are combined by adding the scores their scorers provide.
    :param weights: Weights to use for combining the scorers, one per scorer factory. If omitted or None, every
        score is given the weight one, i.e. the scores are simply added up.
    :raises ValueError: If the number of weights differs from the number of scorer factories.
    """

    def __init__(
        self,
        scorer_factories: list[ScorerFactory],
        weights: Iterable[float] | None = None,
    ) -> None:
        self.scorer_factories = scorer_factories
        if weights is None:
            weights = np.ones(len(scorer_factories))
        elif not hasattr(weights, "__len__") and hasattr(weights, "__iter__"):
            # One-shot iterables (generators, map objects, ...) have to be materialized first: np.array would
            # otherwise wrap the iterator object itself in a 0-d object array instead of reading its values.
            weights = list(weights)
        self.weights = np.array(weights).ravel()
        if self.weights.size != len(self.scorer_factories):
            raise ValueError(
                f"Wrong number of scorer_factories ({len(scorer_factories)}) and weights ({self.weights.size})"
            )

    def __call__(
        self,
        space: ParameterSpace,
        sample_size: int,
        old_sample: np.ndarray | None = None,
    ) -> Scorer:
        """
        Create one scorer per wrapped factory and combine them into a single weighted-sum scorer.

        :param space: Dimensions of the design space, forwarded to every scorer factory.
        :param sample_size: The number of candidate points to be scored, forwarded to every scorer factory.
        :param old_sample: If passed, the matrix of points of an older design of experiments, forwarded to every
            scorer factory.
        :return: a scorer that returns the sum of the individual scores, each multiplied by its weight.
        """
        scorers = [
            factory(space, sample_size, old_sample=old_sample)
            for factory in self.scorer_factories
        ]

        def _scorer(doe: np.ndarray) -> float:
            return sum(
                scorer(doe) * weight for scorer, weight in zip(scorers, self.weights)
            )

        return _scorer




[docs]
def create_default_scorer_factory(
    distance_score_weight: float = 0.9,
    correlation_score_weight: float = 0.1,
    local_correlation: bool = True,
    local_pairwise_distance: bool = False,
) -> ScorerFactory:
    """
    Build the default scorer factory: a weighted sum of a maximum correlation error scorer and a
    minimum pairwise distance scorer.

    :param distance_score_weight: Weight applied to the minimum pairwise distance score.
    :param correlation_score_weight: Weight applied to the maximum correlation error score.
    :param local_correlation: Value of the local attribute of the MaxCorrelationScorerFactory.
    :param local_pairwise_distance: Value of the local attribute of the PairwiseDistanceScorerFactory.
    :return: WeightedSumScorerFactory instance.

    References
    ----------
    R.V. Joseph and Y. Hung (2008). "`Orthogonal-Maximin Latin Hypercube Designs
    <https://www3.stat.sinica.edu.tw/statistica/oldpdf/A18n17.pdf>`_"
    """
    # Keep each factory next to its weight so the two sequences cannot drift apart.
    weighted_factories = (
        (
            MaxCorrelationScorerFactory(local=local_correlation),
            correlation_score_weight,
        ),
        (
            PairwiseDistanceScorerFactory(local=local_pairwise_distance),
            distance_score_weight,
        ),
    )
    factories, weights = zip(*weighted_factories)
    return WeightedSumScorerFactory(
        scorer_factories=list(factories),
        weights=list(weights),
    )



def calculate_equidistant_bin_diagonal_length(
    space: ParameterSpace, sample_size: int
) -> float:
    """Calculate the length of the diagonal of an equidistant bin in the (Euclidean) design space.

    The range of every dimension (upper minus lower bound) is divided by sample_size; the result
    is the Euclidean norm of these per-dimension bin widths.

    :meta private:
    """
    ranges = np.array(space.upper_bound) - np.array(space.lower_bound)
    bin_widths = ranges / sample_size
    diagonal = np.linalg.norm(bin_widths)
    return float(diagonal)


def create_old_doe_handler(
    space: ParameterSpace,
    old_sample: np.ndarray | None = None,
    local: bool = False,
) -> Callable[[np.ndarray], np.ndarray]:
    """
    Build the function that merges points of an earlier design of experiments (DoE) into a new
    one, so that they take part in the scoring.

    :param space: Dimensions of the design space. Only relevant if local is set to True.
    :param old_sample: Matrix of points with shape=(sample_size, len(variables)) in the old DoE. If None, the
        returned function is a no-op.
    :param local: If True, only the points of old_sample selected by select_local for the given space are kept;
        otherwise every old point is kept.
    :return: A function that receives a new |DoE| and returns it with the kept old points placed in front of it
        along the first axis, or unchanged if there is no old_sample.

    :meta private:
    """
    if old_sample is None:
        # No earlier design: hand the candidate array back untouched.
        def _passthrough(doe: np.ndarray) -> np.ndarray:
            return doe

        return _passthrough

    # Decide once which old points take part; the returned closure reuses that selection on every call.
    kept_points = select_local(old_sample, space) if local else old_sample

    def _with_old_points(doe: np.ndarray) -> np.ndarray:
        return np.append(kept_points, doe, axis=0)

    return _with_old_points


def select_local(samples: np.ndarray, space: ParameterSpace) -> np.ndarray:
    """Return the rows of samples that lie within the bounds of space in every dimension.

    Both bounds are inclusive: a row is kept when each of its entries is at least the matching
    lower bound and at most the matching upper bound.

    :meta private:
    """
    not_below = np.all(samples >= space.lower_bound[None, :], axis=1)
    not_above = np.all(samples <= space.upper_bound[None, :], axis=1)
    return samples[not_below & not_above]