Source code for nni.algorithms.hpo.tpe_tuner

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""
Tree-structured Parzen Estimator (TPE) tuner for hyper-parameter optimization.

Paper: https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf
Official code: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/tpe.py

This is a slightly modified re-implementation of the algorithm.
"""

__all__ = ['TpeTuner', 'TpeArguments', 'suggest', 'suggest_parameter']

from collections import defaultdict
import logging
import math
from typing import NamedTuple, Optional, Union

import numpy as np
from scipy.special import erf  # pylint: disable=no-name-in-module

from nni.tuner import Tuner
from nni.common.hpo_utils import OptimizeMode, format_search_space, deformat_parameters, format_parameters
from nni.utils import extract_scalar_reward
from . import random_tuner

_logger = logging.getLogger('nni.tuner.tpe')

## Public API part ##

class TpeArguments(NamedTuple):
    """
    These are the hyper-parameters of the TPE algorithm itself.
    To avoid confusion with the trials' hyper-parameters, they are called "arguments" in this code.

    Parameters
    ==========
    constant_liar_type: 'best' | 'worst' | 'mean' | None (default: 'best')
        The TPE algorithm itself does not support parallel tuning.
        This parameter specifies how to optimize when trial_concurrency > 1.

        None (or "null" in YAML) means do not optimize; this was the default behavior in legacy versions.

        How each liar works is explained in the paper's section 6.1.
        In general, "best" suits a small trial number and "worst" suits a large trial number.

    n_startup_jobs: int (default: 20)
        The first N sets of hyper-parameters are generated purely at random for warm-up.
        If the search space is large, you can increase this value;
        if max_trial_number is small, you may want to decrease it.

    n_ei_candidates: int (default: 24)
        In each iteration TPE evaluates EI for N candidate sets of parameters and chooses the best one (loosely speaking).

    linear_forgetting: int (default: 25)
        TPE lowers the weights of old trials.
        This controls how many of the most recent trials keep full weight; the weights of older trials decay linearly.

    prior_weight: float (default: 1.0)
        TPE treats the user-provided search space as a prior.
        When generating new trials, it incorporates this prior into the trial history as one pseudo trial
        configuration (each parameter of this configuration takes the mean of its candidate range).
        prior_weight determines the weight of this pseudo configuration among the history configurations.

        With prior weight 1.0, the search space is treated as one good trial.
        For example, "normal(0, 1)" effectively equals a trial with x = 0 that yielded a good result.

    gamma: float (default: 0.25)
        Controls how many trials are considered "good".
        The number is calculated as "min(ceil(gamma * sqrt(N)), linear_forgetting)".
    """
    constant_liar_type: Optional[str] = 'best'
    n_startup_jobs: int = 20
    n_ei_candidates: int = 24
    linear_forgetting: int = 25
    prior_weight: float = 1.0
    gamma: float = 0.25
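
# Example (a minimal sketch of how ``gamma`` and ``linear_forgetting`` interact, using made-up numbers):
# after 100 finished trials, the "good" set used by the estimator contains
# min(ceil(0.25 * sqrt(100)), 25) = 3 trials.
#
#     args = TpeArguments()
#     n_below = min(math.ceil(args.gamma * math.sqrt(100)), args.linear_forgetting)
#     assert n_below == 3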

class TpeTuner(Tuner):
    """
    Parameters
    ==========
    optimize_mode: 'minimize' | 'maximize' (default: 'minimize')
        Whether to minimize or maximize the trial result.
    seed: int | None
        The random seed.
    tpe_args: dict[string, Any] | None
        Advanced users can use this to customize the TPE tuner.
        See `TpeArguments` for details.
    """

    def __init__(self, optimize_mode='minimize', seed=None, tpe_args=None):
        self.optimize_mode = OptimizeMode(optimize_mode)
        self.args = TpeArguments(**(tpe_args or {}))
        self.space = None
        # concurrent generate_parameters() calls are likely to yield similar results, because they use the same
        # history; the liar solves this problem by adding fake results to the history
        self.liar = create_liar(self.args.constant_liar_type)

        if seed is None:  # explicitly generate a seed to make the experiment reproducible
            seed = np.random.default_rng().integers(2 ** 31)
        self.rng = np.random.default_rng(seed)
        _logger.info(f'Using random seed {seed}')

        self._params = {}                  # parameter_id -> parameters (in internal format)
        self._running_params = {}          # subset of the above that has been submitted but has not yet received a loss
        self._history = defaultdict(list)  # parameter key -> list of Record
    def update_search_space(self, space):
        self.space = format_search_space(space)
    def generate_parameters(self, parameter_id, **kwargs):
        if self.liar and self._running_params:
            # give a fake loss to each concurrently running parameter set
            history = {key: records.copy() for key, records in self._history.items()}  # copy history
            lie = self.liar.lie()
            for param in self._running_params.values():
                for key, value in param.items():
                    history[key].append(Record(value, lie))
        else:
            history = self._history

        params = suggest(self.args, self.rng, self.space, history)

        self._params[parameter_id] = params
        self._running_params[parameter_id] = params
        return deformat_parameters(params, self.space)
    def receive_trial_result(self, parameter_id, _parameters, value, **kwargs):
        if self.optimize_mode is OptimizeMode.Minimize:
            loss = extract_scalar_reward(value)
        else:
            loss = -extract_scalar_reward(value)
        if self.liar:
            self.liar.update(loss)
        params = self._running_params.pop(parameter_id)
        for key, value in params.items():
            self._history[key].append(Record(value, loss))
    def trial_end(self, parameter_id, _success, **kwargs):
        self._running_params.pop(parameter_id, None)
    def import_data(self, data):  # for resuming experiment
        for trial in data:
            param = format_parameters(trial['parameter'], self.space)
            loss = trial['value']
            if self.optimize_mode is OptimizeMode.Maximize:
                loss = -trial['value']
            for key, value in param.items():
                self._history[key].append(Record(value, loss))
        _logger.info(f'Replayed {len(data)} trials')
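
# Example usage (a minimal sketch; assumes NNI's standard search space format,
# where a "uniform" parameter is given by its [low, high] range):
#
#     tuner = TpeTuner(optimize_mode='minimize', seed=0)
#     tuner.update_search_space({'x': {'_type': 'uniform', '_value': [0, 1]}})
#     params = tuner.generate_parameters(parameter_id=0)
#     tuner.receive_trial_result(parameter_id=0, _parameters=params, value=0.5)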
def suggest(args, rng, space, history):
    params = {}
    for key, spec in space.items():
        if spec.is_activated_in(params):  # nested search space is chosen
            params[key] = suggest_parameter(args, rng, spec, history[key])
    return params

def suggest_parameter(args, rng, spec, parameter_history):
    if len(parameter_history) < args.n_startup_jobs:  # not enough history, still warming up
        return random_tuner.suggest_parameter(rng, spec)

    if spec.categorical:
        return suggest_categorical(args, rng, parameter_history, spec.size)

    if spec.normal_distributed:
        mu = spec.mu
        sigma = spec.sigma
        clip = None
    else:
        # TPE does not support the uniform distribution natively;
        # it is converted to normal((low + high) / 2, high - low)
        mu = (spec.low + spec.high) * 0.5
        sigma = spec.high - spec.low
        clip = (spec.low, spec.high)

    return suggest_normal(args, rng, parameter_history, mu, sigma, clip)

## Public API part end ##

## Utilities part ##

class Record(NamedTuple):
    param: Union[int, float]
    loss: float

class BestLiar:  # assume running parameters have the best result; it accelerates "converging"
    def __init__(self):
        self._best = None

    def update(self, loss):
        if self._best is None or loss < self._best:
            self._best = loss

    def lie(self):
        # when there is no real result, all of the history is the same lie, so the value does not matter;
        # in this case, return 0 instead of infinity to prevent potential calculation errors
        return 0.0 if self._best is None else self._best

class WorstLiar:  # assume running parameters have the worst result; it helps to jump out of local minima
    def __init__(self):
        self._worst = None

    def update(self, loss):
        if self._worst is None or loss > self._worst:
            self._worst = loss

    def lie(self):
        return 0.0 if self._worst is None else self._worst

class MeanLiar:  # assume running parameters have the average result
    def __init__(self):
        self._sum = 0.0
        self._n = 0

    def update(self, loss):
        self._sum += loss
        self._n += 1

    def lie(self):
        return 0.0 if self._n == 0 else (self._sum / self._n)

def create_liar(liar_type):
    if liar_type is None or liar_type.lower() == 'none':
        return None
    liar_classes = {
        'best': BestLiar,
        'worst': WorstLiar,
        'mean': MeanLiar,
    }
    return liar_classes[liar_type.lower()]()

## Utilities part end ##

## Algorithm part ##

# the algorithm is implemented in a process-oriented style because I find it easier to understand this way;
# you know exactly what data each step is processing

def suggest_categorical(args, rng, param_history, size):
    """
    Suggest a categorical ("choice" or "randint") parameter.
    """
    below, above = split_history(args, param_history)  # split history into good ones and bad ones

    weights = linear_forgetting_weights(args, len(below))
    counts = np.bincount(below, weights, size)
    p = (counts + args.prior_weight) / sum(counts + args.prior_weight)  # calculate weight of good choices
    samples = rng.choice(size, args.n_ei_candidates, p=p)  # sample N EI candidates using the weights
    below_llik = np.log(p[samples])  # the probability of these samples being good ("llik" means log-likelihood)

    weights = linear_forgetting_weights(args, len(above))
    counts = np.bincount(above, weights, size)
    p = (counts + args.prior_weight) / sum(counts + args.prior_weight)  # calculate weight of bad choices
    above_llik = np.log(p[samples])  # the probability of the above samples being bad

    return samples[np.argmax(below_llik - above_llik)]  # the one with the best chance of being good
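
# Worked example for the smoothing above (a minimal sketch with made-up history):
# for a "choice" parameter with 3 options where the good trials chose [0, 0, 2],
# prior_weight=1.0 smooths the empirical counts into (counts + 1) / sum(counts + 1):
#
#     counts = np.bincount([0, 0, 2], np.ones(3), 3)  # -> [2., 0., 1.]
#     p = (counts + 1.0) / np.sum(counts + 1.0)       # -> [0.5, 0.1667, 0.3333]
#
# Option 1 was never chosen by a good trial, yet keeps non-zero probability thanks to the prior.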
def suggest_normal(args, rng, param_history, prior_mu, prior_sigma, clip):
    """
    Suggest a normally distributed parameter.

    Uniform distributions have been converted to normal in the caller function;
    "log" and "q" will be handled by "deformat_parameters".
    """
    below, above = split_history(args, param_history)  # split history into good ones and bad ones

    weights, mus, sigmas = adaptive_parzen_normal(args, below, prior_mu, prior_sigma)  # calculate weight of good segments
    samples = gmm1(args, rng, weights, mus, sigmas, clip)  # sample N EI candidates using the weights
    below_llik = gmm1_lpdf(args, samples, weights, mus, sigmas, clip)  # the probability of these samples being good

    weights, mus, sigmas = adaptive_parzen_normal(args, above, prior_mu, prior_sigma)  # calculate weight of bad segments
    above_llik = gmm1_lpdf(args, samples, weights, mus, sigmas, clip)  # the probability of the above samples being bad

    return samples[np.argmax(below_llik - above_llik)]  # the one with the best chance of being good

def split_history(args, param_history):
    """
    Divide trials into good ones ("below") and bad ones ("above").
    """
    n_below = math.ceil(args.gamma * math.sqrt(len(param_history)))
    n_below = min(n_below, args.linear_forgetting)
    order = sorted(range(len(param_history)), key=(lambda i: param_history[i].loss))  # argsort by loss
    below = [param_history[i].param for i in order[:n_below]]
    above = [param_history[i].param for i in order[n_below:]]
    return np.asarray(below), np.asarray(above)

def linear_forgetting_weights(args, n):
    """
    Calculate decayed weights of N trials.
    """
    lf = args.linear_forgetting
    if n < lf:
        return np.ones(n)
    else:
        ramp = np.linspace(1.0 / n, 1.0, n - lf)
        flat = np.ones(lf)
        return np.concatenate([ramp, flat])

def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
    """
    The "Adaptive Parzen Estimator" described in paper section 4.2, for normal distribution.

    Because TPE internally only supports categorical and normally distributed spaces (domains),
    this function is used for everything other than "choice" and "randint".

    Parameters
    ==========
    args: TpeArguments
        Algorithm arguments.
    history_mus: 1-d array of float
        Parameter values evaluated in history.
        These are the "observations" in paper section 4.2 ("placing density in the vicinity of K observations").
    prior_mu: float
        µ value of the normal search space.
    prior_sigma: float
        σ value of the normal search space.

    Returns
    =======
    Tuple of three 1-d float arrays: (weight, µ, σ).

    The tuple represents N+1 "vicinities of observations" and each one's weight,
    calculated from the N history values and the 1 user-provided prior.
    The result is sorted by µ.
    """
    mus = np.append(history_mus, prior_mu)
    order = np.argsort(mus)
    mus = mus[order]
    prior_index = np.searchsorted(mus, prior_mu)

    if len(mus) == 1:
        sigmas = np.asarray([prior_sigma])
    elif len(mus) == 2:
        sigmas = np.asarray([prior_sigma * 0.5, prior_sigma * 0.5])
        sigmas[prior_index] = prior_sigma
    else:
        l_delta = mus[1:-1] - mus[:-2]
        r_delta = mus[2:] - mus[1:-1]
        sigmas_mid = np.maximum(l_delta, r_delta)
        sigmas = np.concatenate([[mus[1] - mus[0]], sigmas_mid, [mus[-1] - mus[-2]]])
        sigmas[prior_index] = prior_sigma

    # "magic formula" in official implementation
    n = min(100, len(mus) + 1)
    sigmas = np.clip(sigmas, prior_sigma / n, prior_sigma)

    weights = np.append(linear_forgetting_weights(args, len(history_mus)), args.prior_weight)
    weights = weights[order]

    return weights / np.sum(weights), mus, sigmas
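
# Worked example (a minimal sketch with made-up observations): for history_mus=[0.2, 0.8],
# prior_mu=0.5 and prior_sigma=1.0, the sorted mixture means are [0.2, 0.5, 0.8]; each sigma
# is the larger gap to a neighbouring mean (0.3 here), except the prior's, which stays 1.0:
#
#     weights, mus, sigmas = adaptive_parzen_normal(TpeArguments(), [0.2, 0.8], 0.5, 1.0)
#     # weights -> [1/3, 1/3, 1/3], mus -> [0.2, 0.5, 0.8], sigmas -> [0.3, 1.0, 0.3]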
""" ret = np.asarray([]) while len(ret) < args.n_ei_candidates: n = args.n_ei_candidates - len(ret) active = np.argmax(rng.multinomial(1, weights, n), axis=1) samples = rng.normal(mus[active], sigmas[active]) if clip: samples = samples[(clip[0] <= samples) & (samples <= clip[1])] ret = np.concatenate([ret, samples]) return ret def gmm1_lpdf(_args, samples, weights, mus, sigmas, clip=None): """ Gaussian Mixture Model 1D's log probability distribution function. """ eps = 1e-12 if clip: normal_cdf_low = erf((clip[0] - mus) / np.maximum(np.sqrt(2) * sigmas, eps)) * 0.5 + 0.5 normal_cdf_high = erf((clip[1] - mus) / np.maximum(np.sqrt(2) * sigmas, eps)) * 0.5 + 0.5 p_accept = np.sum(weights * (normal_cdf_high - normal_cdf_low)) else: p_accept = 1 # normal lpdf dist = samples.reshape(-1, 1) - mus mahal = (dist / np.maximum(sigmas, eps)) ** 2 z = np.sqrt(2 * np.pi) * sigmas coef = weights / z / p_accept normal_lpdf = -0.5 * mahal + np.log(coef) # log sum rows m = normal_lpdf.max(axis=1) e = np.exp(normal_lpdf - m.reshape(-1, 1)) return np.log(e.sum(axis=1)) + m ## Algorithm part end ##