# Source code for cde.density_estimator.NF

import numpy as np
import tensorflow as tf

import cde.utils.tf_utils.layers as L
from cde.utils.tf_utils.layers_powered import LayersPowered
from cde.utils.tf_utils.network import MLP
from cde.utils.tf_utils.adamW import AdamWOptimizer
from .BaseNNEstimator import BaseNNEstimator
from .normalizing_flows import FLOWS
from cde.utils.serializable import Serializable


class NormalizingFlowEstimator(BaseNNEstimator):
    """ Normalizing Flow Estimator

        Conditional density estimator in which a neural network maps x to the parameters of a chain of flows.
        The chain transforms a standard normal base distribution over y into the estimated density of y given x.

        Args:
            name: (str) name space of the network (should be unique in code, otherwise tensorflow namespace collisions may arise)
            ndim_x: (int) dimensionality of x variable
            ndim_y: (int) dimensionality of y variable
            flows_type: (tuple of strings) The chain of individual flows that together make up the full flow. The
                        individual flows can be any of: *affine*, *planar*, *radial*, *identity*. They will be applied in order
                        going from the base distribution to the transformed distribution.
            hidden_sizes: (tuple of int) sizes of the hidden layers of the neural network
            hidden_nonlinearity: (tf function) nonlinearity of the hidden layers
            n_training_epochs: (int) Number of epochs for training
            x_noise_std: (optional) standard deviation of Gaussian noise over the training data X -> regularization through noise
            y_noise_std: (optional) standard deviation of Gaussian noise over the training data Y -> regularization through noise
            weight_decay: (float) the amount of decoupled (http://arxiv.org/abs/1711.05101) weight decay to apply
            weight_normalization: (boolean) whether weight normalization shall be used for the neural network
            data_normalization: (boolean) whether to normalize the data (X and Y) to exhibit zero-mean and uniform-std
            dropout: (float) the probability of switching off nodes during training
            random_seed: (optional) seed (int) of the random number generators used
    """

    def __init__(self, name, ndim_x, ndim_y, flows_type=('affine', 'radial', 'radial', 'radial'), hidden_sizes=(16, 16),
                 hidden_nonlinearity=tf.tanh, n_training_epochs=1000, x_noise_std=None, y_noise_std=None,
                 weight_decay=0.0, weight_normalization=True, data_normalization=True, dropout=0.0,
                 random_seed=None):
        """
        Stores the configuration and builds the tensorflow graph of the model (see class docstring for the
        meaning of the arguments).
        """
        # keep this the first statement so that locals() holds nothing but the constructor arguments
        # (presumably what quick_init records for serialization -- confirm against Serializable)
        Serializable.quick_init(self, locals())
        self._check_uniqueness_of_scope(name)

        unknown_flows = [f for f in flows_type if f not in FLOWS.keys()]
        assert not unknown_flows, \
            "unknown flow type(s) {} - available flows: {}".format(unknown_flows, list(FLOWS.keys()))

        self.name = name
        self.ndim_x = ndim_x
        self.ndim_y = ndim_y

        self.random_seed = random_seed
        self.random_state = np.random.RandomState(seed=random_seed)
        tf.set_random_seed(random_seed)

        # characteristics of the flows to be used
        self.flows_type = flows_type

        # specification of the network
        self.hidden_sizes = hidden_sizes
        self.hidden_nonlinearity = hidden_nonlinearity

        self.n_training_epochs = n_training_epochs

        # regularization parameters
        self.x_noise_std = x_noise_std
        self.y_noise_std = y_noise_std

        # decoupled weight decay
        self.weight_decay = weight_decay

        # normalizing the network weights
        self.weight_normalization = weight_normalization

        # whether to normalize the data to zero mean, and uniform variance
        self.data_normalization = data_normalization

        # the prob of dropping a node
        self.dropout = dropout

        # gradients for planar flows tend to explode -> clip them by global norm
        self.gradient_clipping = 'planar' in flows_type

        # as we'll be using reversed flows, sampling is too slow to be useful
        self.can_sample = False
        self.has_pdf = True
        # tf has a cdf implementation only for 1-D Normal Distribution
        self.has_cdf = self.ndim_y == 1

        self.fitted = False

        # build tensorflow model
        self._build_model()

    def fit(self, X, Y, random_seed=None, verbose=True, eval_set=None, **kwargs):
        """
        Fit the model to the provided data

        :param X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
        :param Y: numpy array of y targets - shape: (n_samples, n_dim_y)
        :param random_seed: accepted for interface compatibility, currently not used by this method
        :param verbose: (boolean) controls the verbosity of console output
        :param eval_set: (tuple) eval/test dataset - tuple (X_test, Y_test)
        :param kwargs: additional keyword arguments are accepted and ignored
        """

        X, Y = self._handle_input_dimensionality(X, Y, fitting=True)

        if eval_set:
            # Pass the eval data as an (X, Y) pair, like the training data above. Feeding Y on its own
            # through the single-argument form would presumably have it treated as an X array
            # -- NOTE(review): confirm against the helper's implementation in the base class.
            X_eval, Y_eval = eval_set
            eval_set = tuple(self._handle_input_dimensionality(X_eval, Y_eval))

        # If no session has yet been created, create one and make it the default
        self.sess = tf.get_default_session() or tf.InteractiveSession()

        # (re-)initialize only the variables that live in the scope of this model
        var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
        tf.initializers.variables(var_list, name='init').run()

        if self.data_normalization:
            self._compute_data_normalization(X, Y)

        train_feed = {self.X_ph: X, self.Y_ph: Y, self.train_phase: True, self.dropout_ph: self.dropout}

        # runs n_training_epochs + 1 full-batch steps, i.e. steps 0 ... n_training_epochs inclusive
        for i in range(self.n_training_epochs + 1):
            self.sess.run(self.train_step, feed_dict=train_feed)

            if verbose and i % 100 == 0:
                # the loss is reported without feeding train_phase / dropout_ph
                log_loss = self.sess.run(self.log_loss, feed_dict={self.X_ph: X, self.Y_ph: Y})
                if not eval_set:
                    print('Step {:4}: train log-loss {: .4f}'.format(i, log_loss))
                else:
                    eval_ll = self.sess.run(self.log_loss, feed_dict={self.X_ph: eval_set[0], self.Y_ph: eval_set[1]})
                    print('Step {:4}: train log-loss {: .4f} eval log-loss {: .4f}'.format(i, log_loss, eval_ll))

        self.fitted = True

    def reset_fit(self):
        """
        Resets all tensorflow objects and enables this model to be fitted anew

        Note that this clears the whole tensorflow default graph, not only the part belonging to this model.
        """
        tf.reset_default_graph()
        self._build_model()
        self.fitted = False

    def _param_grid(self):
        """
        Returns the hyper-parameter grid of this estimator as dict: constructor argument name -> candidate values
        """
        return {
            'n_training_epochs': [500, 1000, 1500],
            'hidden_sizes': [(16, 16), (32, 32)],
            'flows_type': [
                # radial
                ('affine', 'radial', 'radial', 'radial'),
                ('affine', 'radial', 'radial', 'radial', 'radial'),
                ('affine', 'radial', 'radial', 'radial', 'radial', 'radial'),
                # planar
                ('planar', 'planar', 'planar'),
                ('affine', 'planar', 'planar', 'planar'),
                ('affine', 'planar', 'planar', 'planar', 'planar'),
                # mix
                ('affine', 'radial', 'planar', 'radial', 'planar',),
                ('affine', 'radial', 'planar', 'radial', 'planar', 'radial'),
            ],
            'x_noise_std': [0.1, 0.2, 0.4, None],
            'y_noise_std': [0.01, 0.02, 0.05, 0.1, 0.2, None],
            'weight_decay': [1e-5, 0.0]
        }

    def _build_model(self):
        """
        implementation of the flow model

        Builds the graph inside the variable scope `self.name` and sets the tensors `pdf_`, `log_pdf_`,
        `cdf_` (only for ndim_y == 1), `loss`, `log_loss` as well as the training op `train_step`.
        """
        with tf.variable_scope(self.name):
            # adds placeholders, data normalization and data noise to graph as desired. Also sets up a placeholder
            # for dropout
            self.layer_in_x, self.layer_in_y = self._build_input_layers()
            self.y_input = L.get_output(self.layer_in_y)

            flow_classes = [FLOWS[flow_name] for flow_name in self.flows_type]
            # get the individual parameter sizes for each flow
            param_split_sizes = [flow.get_param_size(self.ndim_y) for flow in flow_classes]
            # a single network emits the parameters of all flows as one vector, which is split up afterwards
            mlp_output_dim = sum(param_split_sizes)
            core_network = MLP(
                name="core_network",
                input_layer=self.layer_in_x,
                output_dim=mlp_output_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=None,
                weight_normalization=self.weight_normalization,
                dropout_ph=self.dropout_ph if self.dropout else None
            )
            outputs = L.get_output(core_network.output_layer)
            flow_params = tf.split(value=outputs, num_or_size_splits=param_split_sizes, axis=1)

            # instantiate the flows with their parameters
            flows = [flow(params, self.ndim_y) for flow, params in zip(flow_classes, flow_params)]

            # build up the base distribution that will be transformed by the flows
            if self.ndim_y == 1:
                # this is faster for 1-D than the multivariate version
                # it also supports a cdf, which isn't implemented for Multivariate
                base_dist = tf.distributions.Normal(loc=0., scale=1.)
            else:
                base_dist = tf.contrib.distributions.MultivariateNormalDiag(loc=[0.] * self.ndim_y,
                                                                            scale_diag=[1.] * self.ndim_y)

            # chain the flows together and build the transformed distribution using the base_dist + flows
            # Chaining applies the flows in reverse, Chain([a,b]).forward(x) being a.forward(b.forward(x))
            # We reverse them so the flows are stacked on top of the base distribution in the original order
            flows.reverse()
            chain = tf.contrib.distributions.bijectors.Chain(flows)
            target_dist = tf.contrib.distributions.TransformedDistribution(distribution=base_dist, bijector=chain)

            # since we operate with matrices not vectors, the output would have dimension (?,1)
            # and therefore has to be reduced first to have shape (?,)
            if self.ndim_y == 1:
                # for x shape (batch_size, 1) normal_distribution.pdf(x) outputs shape (batch_size, 1) -> squeeze
                self.pdf_ = tf.squeeze(target_dist.prob(self.y_input), axis=1)
                self.log_pdf_ = tf.squeeze(target_dist.log_prob(self.y_input), axis=1)
                self.cdf_ = tf.squeeze(target_dist.cdf(self.y_input), axis=1)
            else:
                # no squeezing necessary for multivariate_normal, but we don't have a cdf
                self.pdf_ = target_dist.prob(self.y_input)
                self.log_pdf_ = target_dist.log_prob(self.y_input)

            if self.data_normalization:
                # change of variables: the density of the normalized y has to be divided by the std of y to
                # obtain the density w.r.t. the original y
                self.pdf_ = self.pdf_ / tf.reduce_prod(self.std_y_sym)
                self.log_pdf_ = self.log_pdf_ - tf.reduce_sum(tf.log(self.std_y_sym))
                # The cdf is deliberately NOT rescaled: it is a probability rather than a density, hence
                # P(Y <= y) = P(Z <= (y - mean) / std) and no Jacobian term applies. Dividing it by the std
                # would make it approach 1 / std instead of 1 for large y.
                # (relies on y_input being the normalized y, as the pdf correction above already does)

            self.loss = -tf.reduce_prod(self.pdf_)
            self.log_loss = -tf.reduce_sum(self.log_pdf_)

            # a weight_decay of 0.0 falls back to the plain Adam optimizer
            optimizer = AdamWOptimizer(self.weight_decay) if self.weight_decay else tf.train.AdamOptimizer()

            if self.gradient_clipping:
                gradients, variables = zip(*optimizer.compute_gradients(self.log_loss))
                gradients, _ = tf.clip_by_global_norm(gradients, 3e5)
                self.train_step = optimizer.apply_gradients(zip(gradients, variables))
            else:
                self.train_step = optimizer.minimize(self.log_loss)

        # initialize LayersPowered -> provides functions for serializing tf models
        LayersPowered.__init__(self, [self.layer_in_y, core_network.output_layer])

    def __str__(self):
        """
        Returns a multi-line summary of the estimator type and its main configuration
        """
        return "\nEstimator type: {}" \
               "\n flows_type: {}" \
               "\n data_normalization: {}" \
               "\n weight_normalization: {}" \
               "\n n_training_epochs: {}" \
               "\n x_noise_std: {}" \
               "\n y_noise_std: {}" \
               "\n ".format(self.__class__.__name__, self.flows_type, self.data_normalization,
                            self.weight_normalization, self.n_training_epochs, self.x_noise_std, self.y_noise_std)