Source code for cde.density_estimator.LSCDE

import itertools
import numpy as np
import scipy.stats as stats
from scipy.special import logsumexp

from cde.utils.center_point_select import sample_center_points
from cde.utils.misc import norm_along_axis_1
from .BaseDensityEstimator import BaseDensityEstimator
from cde.utils.async_executor import execute_batch_async_pdf

MULTIPROC_THRESHOLD = 10**4

[docs]class LSConditionalDensityEstimation(BaseDensityEstimator):
  """ Least-Squares Density Ratio Estimator

  http://proceedings.mlr.press/v9/sugiyama10a.html

  Args:
      name: (str) name / identifier of estimator
      ndim_x: (int) dimensionality of x variable
      ndim_y: (int) dimensionality of y variable
      center_sampling_method: String that describes the method to use for finding kernel centers. Allowed values \
                            [all, random, distance, k_means, agglomerative]
      bandwidth: scale / bandwith of the gaussian kernels
      n_centers: Number of kernels to use in the output
      regularization: regularization / damping parameter for solving the least-squares problem
      keep_edges: if set to True, the extreme y values as centers are kept (for expressiveness)
      n_jobs: (int) number of jobs to launch for calls with large batch sizes
      random_seed: (optional) seed (int) of the random number generators used
    """

  def __init__(self, name='LSCDE', ndim_x=None, ndim_y=None, center_sampling_method='k_means',
               bandwidth=0.5, n_centers=500, regularization=1.0,
               keep_edges=True, n_jobs=-1, random_seed=None):

    self.name = name
    self.ndim_x = ndim_x
    self.ndim_y = ndim_y
    self.random_state = np.random.RandomState(seed=random_seed)
    self.random_seed = random_seed

    self.center_sampling_method = center_sampling_method
    self.n_centers = n_centers
    self.keep_edges = keep_edges
    self.bandwidth = bandwidth
    self.regularization = regularization
    self.n_jobs = n_jobs

    self.fitted = False
    self.can_sample = True
    self.has_pdf = True
    self.has_cdf = False

  def _build_model(self, X, Y):
    # save mean and variance of data for normalization
    self.x_mean, self.y_mean = np.mean(X, axis=0), np.mean(Y, axis=0)
    self.x_std, self.y_std = np.std(X, axis=0),  np.std(Y, axis=0)

    # get locations of the gaussian kernel centers
    if self.center_sampling_method == 'all':
      self.n_centers = X.shape[0]
    else:
      self.n_centers = min(self.n_centers, X.shape[0])

    n_locs = self.n_centers
    X_Y_normalized = np.concatenate(list(self._normalize(X, Y)), axis=1)
    centroids = sample_center_points(X_Y_normalized, method=self.center_sampling_method, k=n_locs,
                                     keep_edges=self.keep_edges, random_state=self.random_state)
    self.centr_x = centroids[:, 0:self.ndim_x]
    self.centr_y = centroids[:, self.ndim_x:]

    #prepare gaussians for sampling
    self.gaussians_y = [stats.multivariate_normal(mean=center, cov=self.bandwidth) for center in self.centr_y]

    assert self.centr_x.shape == (n_locs, self.ndim_x) and self.centr_y.shape == (n_locs, self.ndim_y)

[docs]  def fit(self, X, Y, **kwargs):
    """ Fits the conditional density model with provided data

      Args:
        X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
        Y: numpy array of y targets - shape: (n_samples, n_dim_y)
    """
    # assert that both X an Y are 2D arrays with shape (n_samples, n_dim)

    X, Y = self._handle_input_dimensionality(X, Y, fitting=True)
    self.ndim_y, self.ndim_x = Y.shape[1], X.shape[1]

    self._build_model(X, Y)

    X_normalized, Y_normalized = self._normalize(X, Y)

    # determine the kernel weights alpha
    self.h = np.mean(self._gaussian_kernel(X_normalized, Y_normalized), axis=0)

    a = np.mean(norm_along_axis_1(X_normalized,self.centr_x),axis=0)
    b = norm_along_axis_1(self.centr_y, self.centr_y)
    eta = 2 * np.add.outer(a,a) + b

    self.H = (np.sqrt(np.pi) * self.bandwidth) ** self.ndim_y * np.exp(- eta / (5 * self.bandwidth ** 2))

    self.alpha = np.linalg.solve(self.H + self.regularization * np.identity(self.n_centers), self.h)
    self.alpha[self.alpha <= 0] = 1e-10 # set to small value instead of 0 for numerical stability

    self.fitted = True

[docs]  def pdf(self, X, Y):
    """ Predicts the conditional density p(y|x). Requires the model to be fitted.

       Args:
         X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
         Y: numpy array of y targets - shape: (n_samples, n_dim_y)

       Returns:
          conditional probability density p(y|x) - numpy array of shape (n_query_samples, )

     """
    assert self.fitted, "model must be fitted for predictions"

    X, Y = self._handle_input_dimensionality(X, Y)

    n_samples = X.shape[0]
    if n_samples >= MULTIPROC_THRESHOLD:
      return execute_batch_async_pdf(self._pdf, X, Y, n_jobs=self.n_jobs)
    else:
      return self._pdf(X, Y)

[docs]  def log_pdf(self, X, Y):
    """ Predicts the conditional log-probability log p(y|x). Requires the model to be fitted.

          Args:
            X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
            Y: numpy array of y targets - shape: (n_samples, n_dim_y)

          Returns:
             conditional log-probability density log p(y|x) - numpy array of shape (n_query_samples, )

        """
    assert self.fitted, "model must be fitted for predictions"

    X, Y = self._handle_input_dimensionality(X, Y)

    n_samples = X.shape[0]
    if n_samples >= MULTIPROC_THRESHOLD:
      return execute_batch_async_pdf(self._log_pdf, X, Y, n_jobs=self.n_jobs)
    else:
      return self._log_pdf(X, Y)

[docs]  def sample(self, X):
    """ sample from the conditional mixture distributions - requires the model to be fitted

    Args:
      X: values to be conditioned on when sampling - numpy array of shape (n_instances, n_dim_x)

    Returns: tuple (X, Y)
      - X - the values to conditioned on that were provided as argument - numpy array of shape (n_samples, ndim_x)
      - Y - conditional samples from the model p(y|x) - numpy array of shape (n_samples, ndim_y)
    """
    assert self.fitted
    X = self._handle_input_dimensionality(X)

    weights = np.multiply(self.alpha, self._gaussian_kernel(X))
    weights = weights / np.sum(weights, axis=1)[:,None]

    Y = np.zeros(shape=(X.shape[0], self.ndim_y))
    for i in range(X.shape[0]):
      discrete_dist = stats.rv_discrete(values=(range(weights.shape[1]), weights[i, :]))
      idx = discrete_dist.rvs()
      Y[i, :] = self.gaussians_y[idx].rvs()

    return X, Y

  def _pdf(self, X, Y):
   return np.exp(self._log_pdf(X, Y))

  def _log_pdf(self, X, Y):
    X_normalized, Y_normalized = self._normalize(X, Y)
    log_p = logsumexp(np.log(self.alpha.T) + self._log_gaussian_kernel(X_normalized, Y_normalized), axis=1)
    log_normalization = (0.5 * np.log(2 *np.pi) + np.log(self.bandwidth)) * self.ndim_y + \
                        logsumexp(np.log(self.alpha.T) + self._log_gaussian_kernel(X_normalized), axis=1)

    return np.squeeze(log_p - log_normalization - np.sum(np.log(self.y_std)))

  def _normalize(self, X, Y):
    X_normalized = (X - self.x_mean) / self.x_std
    Y_normalized = (Y - self.y_mean) / self.y_std
    return X_normalized, Y_normalized

  def _gaussian_kernel(self, X, Y=None):
    """
    if Y is set returns the product of the gaussian kernels for X and Y, else only the gaussian kernel for X
    :param X: numpy array of size (n_samples, ndim_x)
    :param Y: numpy array of size (n_samples, ndim_y)
    :return: phi -  numpy array of size (n_samples, n_centers)
    """
    return np.exp(self._log_gaussian_kernel(X, Y))

  def _log_gaussian_kernel(self, X, Y=None):
    """
    if Y is set returns the sum of the gaussian log-kernels for X and Y, else only the gaussian log-kernel for X
    :param X: numpy array of size (n_samples, ndim_x)
    :param Y: numpy array of size (n_samples, ndim_y)
    :return: phi -  numpy array of size (n_samples, n_centers)
    """
    phi = np.zeros(shape=(X.shape[0], self.n_centers))

    if Y is not None:
      for i in range(phi.shape[1]):

        #suqared distances from center point i
        sq_d_x = np.sum(np.square(X - self.centr_x[i, :]), axis=1)
        sq_d_y = np.sum(np.square(Y - self.centr_y[i, :]), axis=1)

        phi[:, i] = - sq_d_x / (2 * self.bandwidth ** 2) - sq_d_y / (2 * self.bandwidth ** 2)
    else:
      for i in range(phi.shape[1]):
        # suqared distances from center point i
        sq_d_x = np.sum(np.square(X - self.centr_x[i, :]), axis=1)
        phi[:, i] = - sq_d_x / (2 * self.bandwidth ** 2)

    assert phi.shape == (X.shape[0], self.n_centers)
    return phi

  def _param_grid(self):
    param_grid = {
      "n_centers": np.asarray([100, 500, 1000]),
      "bandwidth": np.asarray([0.1, 0.2, 0.5, 0.7, 1.0]),
      "regularization": np.asarray([0.1, 0.5, 1.0, 4.0, 8.0])
    }
    return param_grid

  def __str__(self):
    return "\nEstimator type: {}\n center sampling method: {}\n n_centers: {}\n keep_edges: {}\n bandwidth: {}\n regularization: {}\n ".format(
      self.__class__.__name__, self.center_sampling_method, self.n_centers, self.keep_edges, self.bandwidth, self.regularization)

  def __unicode__(self):
    return self.__str__()