import numpy as np
import statsmodels.api as sm
from cde.utils.async_executor import execute_batch_async_pdf
from .BaseDensityEstimator import BaseDensityEstimator
# Batch-size threshold (number of query rows, 10**4) that pdf() / cdf() compare
# against to decide whether a query is evaluated through
# execute_batch_async_pdf (using the estimator's n_jobs) or by a direct call
# into the statsmodels KDE object.
MULTIPROC_THRESHOLD = 10**4
class ConditionalKernelDensityEstimation(BaseDensityEstimator):
    """ConditionalKernelDensityEstimation (CKDE): nonparametric conditional density estimator.

    Models the joint probability p(x,y) and marginal probability p(x) via kernel density
    estimation and computes the conditional density as p(y|x) = p(x, y) / p(x). This
    implementation wraps functionality of the statsmodels.nonparametric module.

    Args:
        name: (str) name / identifier of estimator
        ndim_x: (int) dimensionality of x variable
        ndim_y: (int) dimensionality of y variable
        bandwidth: (str) bandwidth selection method forwarded to statsmodels, one of:
            - normal_reference: normal reference rule of thumb
            - cv_ml: cross validation maximum likelihood (default)
            - cv_ls: cross validation least squares
        n_jobs: (int) number of jobs to launch for calls with large batch sizes
        random_seed: (optional) seed (int) of the random number generators used

    References:
        Racine, J., Li, Q. Nonparametric econometrics: theory and practice.
        Princeton University Press. (2007)
    """

    def __init__(self, name='CKDE', ndim_x=None, ndim_y=None, bandwidth='cv_ml', n_jobs=-1, random_seed=None):
        self.random_state = np.random.RandomState(seed=random_seed)
        self.random_seed = random_seed

        self.name = name
        self.ndim_x = ndim_x
        self.ndim_y = ndim_y
        self.n_jobs = n_jobs

        # NOTE(review): only the three string selectors pass this check, although
        # _param_grid() proposes numeric bandwidths and statsmodels itself accepts a
        # fixed array -- confirm whether numeric bandwidths are meant to be supported.
        assert bandwidth in ['normal_reference', 'cv_ml', 'cv_ls']
        self.bandwidth = bandwidth

        # capability flags of this estimator
        self.fitted = False
        self.can_sample = False
        self.has_pdf = True
        self.has_cdf = True

    def fit(self, X, Y, **kwargs):
        """Builds the statsmodels conditional KDE from the provided training data (X,Y).

        Args:
            X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
            Y: numpy array of y targets - shape: (n_samples, n_dim_y)
            **kwargs: accepted for interface compatibility, not used here
        """
        X, Y = self._handle_input_dimensionality(X, Y, fitting=True)

        # per-dimension statistics of the targets; y_std is read by _param_grid()
        self.y_mean, self.y_std = np.mean(Y, axis=0), np.std(Y, axis=0)

        # one 'c' (continuous) type code per dimension
        dep_type = 'c' * self.ndim_y
        indep_type = 'c' * self.ndim_x

        self.sm_kde = sm.nonparametric.KDEMultivariateConditional(
            endog=[Y], exog=[X], dep_type=dep_type, indep_type=indep_type, bw=self.bandwidth)

        self.fitted = True
        self.can_sample = False
        self.has_cdf = True

    def pdf(self, X, Y):
        """Predicts the conditional likelihood p(y|x). Requires the model to be fitted.

        Args:
            X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
            Y: numpy array of y targets - shape: (n_samples, n_dim_y)

        Returns:
            conditional likelihood p(y|x) - numpy array of shape (n_query_samples, )

        Raises:
            AssertionError: if the model has not been fitted
        """
        # same guard as cdf(): fail with a clear message instead of a missing-attribute error
        assert self.fitted, "model must be fitted to compute likelihood score"
        X, Y = self._handle_input_dimensionality(X, Y)

        n_samples = X.shape[0]
        if n_samples >= MULTIPROC_THRESHOLD:
            # large query batch: evaluate through the batch executor with n_jobs
            return execute_batch_async_pdf(self.sm_kde.pdf, Y, X, n_jobs=self.n_jobs)
        return self.sm_kde.pdf(endog_predict=Y, exog_predict=X)

    def cdf(self, X, Y):
        """Predicts the conditional cumulative probability p(Y<=y|X=x). Requires the model to be fitted.

        Args:
            X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
            Y: numpy array of y targets - shape: (n_samples, n_dim_y)

        Returns:
            conditional cumulative probability p(Y<=y|X=x) - numpy array of shape (n_query_samples, )

        Raises:
            AssertionError: if the model has not been fitted
        """
        assert self.fitted, "model must be fitted to compute likelihood score"
        X, Y = self._handle_input_dimensionality(X, Y)

        n_samples = X.shape[0]
        if n_samples >= MULTIPROC_THRESHOLD:
            # the batch result must be returned; it was previously computed and discarded,
            # so large batches yielded None
            return execute_batch_async_pdf(self.sm_kde.cdf, Y, X, n_jobs=self.n_jobs)
        return self.sm_kde.cdf(endog_predict=Y, exog_predict=X)

    def sample(self, X):
        """Sampling is not supported by this estimator.

        Raises:
            NotImplementedError: always
        """
        raise NotImplementedError("Conditional Kernel Density Estimation is a lazy learner and does not support sampling")

    def _param_grid(self):
        """Returns a dict with candidate bandwidths, scaled by the mean std of the fitted targets.

        Reads self.y_std, which is only set by fit().
        """
        mean_std_y = np.mean(self.y_std)
        bandwidths = np.asarray([0.01, 0.1, 0.5, 1, 2, 5]) * mean_std_y

        param_grid = {
            "bandwidth": bandwidths
        }
        return param_grid

    def __str__(self):
        return "\n Estimator type: {}\n ndim_x: {}\n ndim_y: {}\n bandwidth: {}\n".format(
            self.__class__.__name__, self.ndim_x, self.ndim_y, self.bandwidth)

    def __unicode__(self):
        return self.__str__()