Source code for cde.density_estimator.BaseNNEstimator

import numpy as np
import tensorflow as tf
import sklearn
import os
import itertools
from multiprocessing import Manager

from cde.utils.tf_utils.layers_powered import LayersPowered
import cde.utils.tf_utils.layers as L
from cde.utils.serializable import Serializable
from cde.utils.async_executor import AsyncExecutor
from cde.density_estimator.BaseDensityEstimator import BaseDensityEstimator


class BaseNNEstimator(LayersPowered, Serializable, BaseDensityEstimator):
    """
    Base class for a density estimator using a neural network to parametrize the distribution p(y|x)
    To use this class, implement pdf_, cdf_ and log_pdf_ or overwrite the parent methods
    To use the hyperparameter search, also implement reset_fit and (optionally) _param_grid()
    """

    # input data can be normalized before training
    data_normalization = False

    # used for noise regularization of the data
    x_noise_std = False
    y_noise_std = False

    # was the model fitted to the data or not
    fitted = False

    # set to >0. to use dropout during training. Determines the probability of dropping the output of a node
    dropout = 0.0

    def reset_fit(self):
        """
        Reset all tensorflow objects to enable the model to be trained again
        :return:
        """
        raise NotImplementedError()

    def fit_by_cv(self, X, Y, n_folds=3, param_grid=None, random_state=None, verbose=True, n_jobs=-1):
        """ Fits the conditional density model with hyperparameter search and cross-validation.

        - Determines the best hyperparameter configuration from a pre-defined set using cross-validation. Thereby,
          the conditional log-likelihood is used for simulation_eval.
        - Fits the model with the previously selected hyperparameter configuration

        Args:
          X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
          Y: numpy array of y targets - shape: (n_samples, n_dim_y)
          n_folds: number of cross-validation folds (positive integer)
          param_grid: (optional) a dictionary with the hyperparameters of the model as key and and a list of respective
                      parametrizations as value. The hyperparameter search is performed over the cartesian product of
                      the provided lists.
                      Example::
                              {"n_centers": [20, 50, 100, 200],
                               "center_sampling_method": ["agglomerative", "k_means", "random"],
                               "keep_edges": [True, False]
                              }
          random_state: (int) seed used by the random number generator for shuffeling the data
        """
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        original_params = self.get_params()

        if param_grid is None:
            param_grid = self._param_grid()

        param_list = list(sklearn.model_selection.ParameterGrid(param_grid))
        train_splits, test_splits = list(zip(*list(sklearn.model_selection.KFold(n_splits=n_folds, shuffle=False,
                                                                                 random_state=random_state).split(X))))

        param_ids, fold_ids = list(zip(*itertools.product(range(len(param_list)), range(n_folds))))

        # multiprocessing setup
        manager = Manager()
        score_dict = manager.dict()

        def _fit_eval(param_idx, fold_idx, verbose=False, i_rand=-1):
            train_indices, test_indices = train_splits[fold_idx], test_splits[fold_idx]
            X_train, Y_train = X[train_indices], Y[train_indices]
            X_test, Y_test = X[test_indices], Y[test_indices]

            with tf.Session():
                kwargs_dict = {**original_params, **param_list[param_idx]}
                kwargs_dict['name'] = 'cv_%i_%i_%i_' % (param_idx, fold_idx, i_rand) + self.name
                model = self.__class__(**kwargs_dict)

                model.fit(X_train, Y_train, verbose=verbose)
                test_score = model.score(X_test, Y_test)
                assert not np.isnan(test_score)
                score_dict[(param_idx, fold_idx)] = test_score

        # run the prepared tasks in multiple processes
        executor = AsyncExecutor(n_jobs=n_jobs)
        executor.run(_fit_eval, param_ids, fold_ids, verbose=verbose)


        # check if all results are available and rerun failed fit_evals. Try three times
        for i in range(3):
            failed_runs = [x for x in zip(param_ids, fold_ids) if x not in score_dict]
            if not failed_runs:
                break
            if verbose:
                print("{} runs succeeded, {} runs failed. Rerunning failed runs".format(len(score_dict.keys()),
                                                                                        len(failed_runs)))
            for (p, f) in failed_runs:
                try:
                    _fit_eval(p, f, verbose=verbose, i_rand=i)
                except Exception as e:
                    print(e)

        # make sure we ultimately have an output for every parameter - fold - combination
        assert len(score_dict.keys()) == len(param_list) * len(train_splits)

        # Select the best parameter setting
        scores_array = np.zeros((len(param_list), len(train_splits)))
        for (i, j), score in score_dict.items():
            scores_array[i, j] = score
        avg_scores = np.mean(scores_array, axis=-1)
        best_idx = np.argmax(avg_scores)
        selected_params = param_list[best_idx]

        if verbose:
            print("Completed grid search - Selected params: {}".format(selected_params))
            print("Refitting model with selected params")

        # Refit with best parameter set
        self.set_params(**selected_params)
        self.reset_fit()
        self.fit(X, Y, verbose=False)
        return selected_params

    def pdf(self, X, Y):
        """ Predicts the conditional probability p(y|x). Requires the model to be fitted.

           Args:
             X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
             Y: numpy array of y targets - shape: (n_samples, n_dim_y)

           Returns:
              conditional probability p(y|x) - numpy array of shape (n_query_samples, )

         """
        assert self.fitted, "model must be fitted to compute likelihood score"
        X, Y = self._handle_input_dimensionality(X, Y, fitting=False)
        p = self.sess.run(self.pdf_, feed_dict={self.X_ph: X, self.Y_ph: Y})
        assert p.ndim == 1 and p.shape[0] == X.shape[0]
        return p

    def cdf(self, X, Y):
        """ Predicts the conditional cumulative probability p(Y<=y|X=x). Requires the model to be fitted.

           Args:
             X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
             Y: numpy array of y targets - shape: (n_samples, n_dim_y)

           Returns:
             conditional cumulative probability p(Y<=y|X=x) - numpy array of shape (n_query_samples, )

        """
        assert self.fitted, "model must be fitted to compute likelihood score"
        X, Y = self._handle_input_dimensionality(X, Y, fitting=False)
        p = self.sess.run(self.cdf_, feed_dict={self.X_ph: X, self.Y_ph: Y})
        assert p.ndim == 1 and p.shape[0] == X.shape[0]
        return p

    def log_pdf(self, X, Y):
        """ Predicts the conditional log-probability log p(y|x). Requires the model to be fitted.

           Args:
             X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
             Y: numpy array of y targets - shape: (n_samples, n_dim_y)

           Returns:
              onditional log-probability log p(y|x) - numpy array of shape (n_query_samples, )

         """
        assert self.fitted, "model must be fitted to compute likelihood score"
        X, Y = self._handle_input_dimensionality(X, Y, fitting=False)
        p = self.sess.run(self.log_pdf_, feed_dict={self.X_ph: X, self.Y_ph: Y})
        assert p.ndim == 1 and p.shape[0] == X.shape[0]
        return p

    def _compute_data_normalization(self, X, Y):
        # compute data statistics (mean & std)
        self.x_mean = np.mean(X, axis=0)
        self.x_std = np.std(X, axis=0)
        self.y_mean = np.mean(Y, axis=0)
        self.y_std = np.std(Y, axis=0)

        self.data_statistics = {
            'X_mean': self.x_mean,
            'X_std': self.x_std,
            'Y_mean': self.y_mean,
            'Y_std': self.y_std,
        }

        # assign them to tf variables
        sess = tf.get_default_session()
        sess.run([
            tf.assign(self.mean_x_sym, self.x_mean),
            tf.assign(self.std_x_sym, self.x_std),
            tf.assign(self.mean_y_sym, self.y_mean),
            tf.assign(self.std_y_sym, self.y_std)
        ])

    def _build_input_layers(self):
        # Input_Layers & placeholders
        self.X_ph = tf.placeholder(tf.float32, shape=(None, self.ndim_x))
        self.Y_ph = tf.placeholder(tf.float32, shape=(None, self.ndim_y))
        self.train_phase = tf.placeholder_with_default(False, None)

        layer_in_x = L.InputLayer(shape=(None, self.ndim_x), input_var=self.X_ph, name="input_x")
        layer_in_y = L.InputLayer(shape=(None, self.ndim_y), input_var=self.Y_ph, name="input_y")

        # add data normalization layer if desired
        if self.data_normalization:
            layer_in_x = L.NormalizationLayer(layer_in_x, self.ndim_x, name="data_norm_x")
            self.mean_x_sym, self.std_x_sym = layer_in_x.get_params()
            layer_in_y = L.NormalizationLayer(layer_in_y, self.ndim_y, name="data_norm_y")
            self.mean_y_sym, self.std_y_sym = layer_in_y.get_params()

        # add noise layer if desired
        if self.x_noise_std is not None:
            layer_in_x = L.GaussianNoiseLayer(layer_in_x, self.x_noise_std, noise_on_ph=self.train_phase)
        if self.y_noise_std is not None:
            layer_in_y = L.GaussianNoiseLayer(layer_in_y, self.y_noise_std, noise_on_ph=self.train_phase)

        # setup dropout. This placeholder will remain unused if dropout is not implemented by the MLP
        self.dropout_ph = tf.placeholder_with_default(0., shape=())

        return layer_in_x, layer_in_y

    def __getstate__(self):
        state = LayersPowered.__getstate__(self)
        state['fitted'] = self.fitted
        return state

    def __setstate__(self, state):
        LayersPowered.__setstate__(self, state)
        self.fitted = state['fitted']
        self.sess = tf.get_default_session()

    def _handle_input_dimensionality(self, X, Y=None, fitting=False):
        assert (self.ndim_x == 1 and X.ndim == 1) or (X.ndim == 2 and X.shape[1] == self.ndim_x), "expected X to have shape (?, %i) but received %s"%(self.ndim_x, str(X.shape))
        assert (Y is None) or (self.ndim_y == 1 and Y.ndim == 1) or (Y.ndim == 2 and Y.shape[1] == self.ndim_y), "expected Y to have shape (?, %i) but received %s"%(self.ndim_y, str(Y.shape))
        return BaseDensityEstimator._handle_input_dimensionality(self, X, Y, fitting=fitting)

    @staticmethod
    def _check_uniqueness_of_scope(name):
        current_scope = tf.get_variable_scope().name
        scopes = set([variable.name.split('/')[0] for variable in tf.global_variables(scope=current_scope)])
        assert name not in scopes, "%s is already in use for a tensorflow scope - please choose another estimator name"%name