Source code for cde.density_estimator.BaseDensityEstimator

from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.model_selection import cross_validate

from cde import ConditionalDensity

from cde.utils.center_point_select import *

class BaseDensityEstimator(ConditionalDensity):
    """Interface for conditional density estimation models.

    Subclasses are expected to override :meth:`fit`, :meth:`pdf`,
    :meth:`predict_density` and :meth:`_param_grid`, which raise
    ``NotImplementedError`` here. The remaining methods build on those
    primitives and on helpers/attributes this class does not define itself
    (``_handle_input_dimensionality``, ``_mean_pdf``, ``_std_pdf``,
    ``_quantile_cdf``, ``fitted``, ``has_pdf``, ``has_cdf``, ``can_sample``,
    ...) - presumably provided by ``ConditionalDensity`` or the concrete
    estimator; verify against those classes.
    """

    def fit(self, X, Y, verbose=False):
        """Fits the conditional density model with provided data.

        Args:
          X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
          Y: numpy array of y targets - shape: (n_samples, n_dim_y)
          verbose: verbosity flag; its effect is defined by the subclass

        Raises:
          NotImplementedError: always - subclasses must override this method
        """
        raise NotImplementedError

    def eval_by_cv(self, X, Y, n_splits=5, verbose=True):
        """Fits the conditional density model with cross-validation.

        No explicit scorer is passed to ``cross_validate``, so the splits are
        scored with the estimator's own :meth:`score`. The parameters of the
        estimator belonging to the best-scoring split are copied onto ``self``
        and ``self`` is then re-fitted on the full data.

        Args:
          X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
          Y: numpy array of y targets - shape: (n_samples, n_dim_y)
          n_splits: number of cross-validation folds (positive integer)
          verbose: the verbosity level
        """
        X, Y = self._handle_input_dimensionality(X, Y, fitting=True)
        cv_results = cross_validate(self, X=X, y=Y, cv=n_splits, return_estimator=True, verbose=verbose)

        # nanargmax skips splits whose score is NaN when picking the winner
        best_split_idx = np.nanargmax(cv_results['test_score'])
        best_estimator = cv_results['estimator'][best_split_idx]

        self.set_params(**best_estimator.get_params())
        self.fit(X, Y)

    def fit_by_cv(self, X, Y, n_folds=3, param_grid=None, verbose=True, n_jobs=-1):
        """Fits the conditional density model with hyperparameter search and cross-validation.

        - Determines the best hyperparameter configuration from a pre-defined
          set using cross-validation. Thereby, the conditional log-likelihood
          (see :meth:`score`) is used for evaluation.
        - Fits the model with the previously selected hyperparameter
          configuration.

        Args:
          X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
          Y: numpy array of y targets - shape: (n_samples, n_dim_y)
          n_folds: number of cross-validation folds (positive integer)
          param_grid: (optional) a dictionary with the hyperparameters of the
            model as key and a list of respective parametrizations as value.
            The hyperparameter search is performed over the cartesian product
            of the provided lists. If None, ``self._param_grid()`` is used.
            Example::

              {"n_centers": [20, 50, 100, 200],
               "center_sampling_method": ["agglomerative", "k_means", "random"],
               "keep_edges": [True, False]
              }
          verbose: if truthy, prints the search result; also forwarded to
            GridSearchCV as its verbosity level
          n_jobs: number of parallel jobs forwarded to GridSearchCV

        Returns:
          dict with the best hyperparameter configuration found
        """
        # save properties of data
        self.n_samples = X.shape[0]
        self.x_std = np.std(X, axis=0)
        self.y_std = np.std(Y, axis=0)

        if param_grid is None:
            param_grid = self._param_grid()

        # `fit_params` is intentionally not passed: it was always None here and
        # the constructor argument no longer exists in current scikit-learn.
        cv_model = GridSearchCV(self, param_grid, n_jobs=n_jobs, refit=True, cv=n_folds, verbose=verbose)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # don't print division by zero warning
            cv_model.fit(X, Y)

        best_params = cv_model.best_params_
        if verbose:
            print("Cross-Validation terminated")
            print("Best likelihood score: %.4f" % cv_model.best_score_)
            print("Best params:", best_params)

        self.set_params(**best_params)
        self.fit(X, Y)
        return best_params

    def pdf(self, X, Y):
        """Predicts the conditional likelihood p(y|x). Requires the model to be fitted.

        Args:
          X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
          Y: numpy array of y targets - shape: (n_samples, n_dim_y)

        Returns:
          conditional likelihood p(y|x) - numpy array of shape (n_query_samples, )

        Raises:
          NotImplementedError: always - subclasses must override this method
        """
        raise NotImplementedError

    def log_pdf(self, X, Y):
        """Predicts the conditional log-probability log p(y|x). Requires the model to be fitted.

        Args:
          X: numpy array to be conditioned on - shape: (n_samples, n_dim_x)
          Y: numpy array of y targets - shape: (n_samples, n_dim_y)

        Returns:
          conditional log-probability log p(y|x) - numpy array of shape (n_query_samples, )
        """
        # This method is numerically unfavorable and should be overwritten with a numerically stable method
        with warnings.catch_warnings():
            # silence warnings raised while taking the log (e.g. log of zero)
            warnings.simplefilter("ignore")
            log_prob = np.log(self.pdf(X, Y))
        return log_prob

    def predict_density(self, X, Y=None, resolution=50):
        """Computes conditional density p(y|x) over a predefined grid of y target values.

        Args:
          X: values/vectors to be conditioned on - shape: (n_instances, n_dim_x)
          Y: (optional) y values to be evaluated from p(y|x) - if not set, Y
            will be a grid with the specified resolution
          resolution: integer specifying the resolution of the evaluation grid

        Returns:
          tuple (P, Y)
            - P - density p(y|x) - shape (n_instances, resolution**n_dim_y)
            - Y - grid with the specified resolution - shape
              (resolution**n_dim_y, n_dim_y) or a copy of Y in case it was
              provided as argument

        Raises:
          NotImplementedError: always - subclasses must override this method
        """
        raise NotImplementedError

    def _param_grid(self):
        """Default hyperparameter grid used by :meth:`fit_by_cv` when none is given.

        Raises:
          NotImplementedError: always - subclasses must override this method
        """
        raise NotImplementedError

    def score(self, X, Y):
        """Computes the mean conditional log-likelihood of the provided data (X, Y).

        Args:
          X: numpy array to be conditioned on - shape: (n_query_samples, n_dim_x)
          Y: numpy array of y targets - shape: (n_query_samples, n_dim_y)

        Returns:
          average log likelihood of data
        """
        return np.mean(self.log_pdf(X, Y))

    def mean_(self, x_cond, n_samples=10**6):
        """Mean of the fitted distribution conditioned on x_cond.

        Uses ``_mean_pdf`` when ``self.has_pdf`` is truthy and ``_mean_mc``
        otherwise.

        Args:
          x_cond: different x values to condition on - numpy array of shape (n_values, ndim_x)
          n_samples: sample budget forwarded to the underlying routine

        Returns:
          Means E[y|x] corresponding to x_cond - numpy array of shape (n_values, ndim_y)
        """
        assert self.fitted, "model must be fitted"
        x_cond = self._handle_input_dimensionality(x_cond)
        assert x_cond.ndim == 2

        if self.has_pdf:
            return self._mean_pdf(x_cond, n_samples=n_samples)
        return self._mean_mc(x_cond, n_samples=n_samples)

    def std_(self, x_cond, n_samples=10 ** 6):
        """Standard deviation of the fitted distribution conditioned on x_cond.

        Args:
          x_cond: different x values to condition on - numpy array of shape (n_values, ndim_x)
          n_samples: sample budget forwarded to ``_std_pdf``

        Returns:
          Standard deviations sqrt(Var[y|x]) corresponding to x_cond - numpy array of shape (n_values, ndim_y)
        """
        assert self.fitted, "model must be fitted"
        x_cond = self._handle_input_dimensionality(x_cond)
        assert x_cond.ndim == 2
        return self._std_pdf(x_cond, n_samples=n_samples)

    def covariance(self, x_cond, n_samples=10**6):
        """Covariance of the fitted distribution conditioned on x_cond.

        Args:
          x_cond: different x values to condition on - numpy array of shape (n_values, ndim_x)
          n_samples: sample budget forwarded to ``_covariance_pdf``

        Returns:
          Covariances Cov[y|x] corresponding to x_cond - numpy array of shape (n_values, ndim_y, ndim_y)
        """
        assert self.fitted, "model must be fitted"
        x_cond = self._handle_input_dimensionality(x_cond)
        assert x_cond.ndim == 2
        return self._covariance_pdf(x_cond, n_samples=n_samples)

    def skewness(self, x_cond, n_samples=10**6):
        """Skewness of the fitted distribution conditioned on x_cond.

        Args:
          x_cond: different x values to condition on - numpy array of shape (n_values, ndim_x)
          n_samples: sample budget forwarded to ``_skewness_pdf``

        Returns:
          Skewness Skew[y|x] corresponding to x_cond, as returned by
          ``_skewness_pdf``. NOTE(review): the previous documentation stated
          shape (n_values, ndim_y, ndim_y), which looks copied from
          ``covariance`` - confirm against ``_skewness_pdf``.
        """
        assert self.fitted, "model must be fitted"
        x_cond = self._handle_input_dimensionality(x_cond)
        assert x_cond.ndim == 2
        return self._skewness_pdf(x_cond, n_samples=n_samples)

    def kurtosis(self, x_cond, n_samples=10**6):
        """Kurtosis of the fitted distribution conditioned on x_cond.

        Args:
          x_cond: different x values to condition on - numpy array of shape (n_values, ndim_x)
          n_samples: sample budget forwarded to ``_kurtosis_pdf``

        Returns:
          Kurtosis Kurt[y|x] corresponding to x_cond, as returned by
          ``_kurtosis_pdf``. NOTE(review): the previous documentation stated
          shape (n_values, ndim_y, ndim_y), which looks copied from
          ``covariance`` - confirm against ``_kurtosis_pdf``.
        """
        assert self.fitted, "model must be fitted"
        x_cond = self._handle_input_dimensionality(x_cond)
        assert x_cond.ndim == 2
        return self._kurtosis_pdf(x_cond, n_samples=n_samples)

    def mean_std(self, x_cond, n_samples=10 ** 6):
        """Computes mean and standard deviation of the fitted distribution conditioned on x_cond.

        The mean is computed once and handed to ``_std_pdf``, so this is
        cheaper than calling :meth:`mean_` and :meth:`std_` separately.

        Args:
          x_cond: different x values to condition on - numpy array of shape (n_values, ndim_x)
          n_samples: sample budget forwarded to the underlying routines

        Returns:
          tuple (mean, std) - means E[y|x] and standard deviations
          sqrt(Var[y|x]) corresponding to x_cond
        """
        assert self.fitted, "model must be fitted"
        # Normalise here as std_ does, so that _std_pdf receives the same
        # representation of x_cond that mean_ works on internally.
        # NOTE(review): assumes _handle_input_dimensionality passes an already
        # normalised 2-D array through unchanged (mean_ applies it again).
        x_cond = self._handle_input_dimensionality(x_cond)
        assert x_cond.ndim == 2

        mean = self.mean_(x_cond, n_samples=n_samples)
        std = self._std_pdf(x_cond, n_samples=n_samples, mean=mean)
        return mean, std

    def value_at_risk(self, x_cond, alpha=0.01, n_samples=10**6):
        """Computes the Value-at-Risk (VaR) of the fitted distribution. Only if ndim_y = 1.

        The quantile is taken from the cdf when ``self.has_cdf`` is truthy;
        if that result contains NaNs and sampling is available, it is
        recomputed by Monte Carlo. Without a cdf, sampling is used directly.

        Args:
          x_cond: different x values to condition on - numpy array of shape (n_values, ndim_x)
          alpha: quantile percentage of the distribution
          n_samples: number of samples for the Monte Carlo fallback

        Returns:
          VaR values for each x to condition on - numpy array of shape (n_values)

        Raises:
          NotImplementedError: if the model supports neither cdf nor sampling
        """
        assert self.fitted, "model must be fitted"
        assert self.ndim_y == 1, "Value at Risk can only be computed when ndim_y = 1"
        # Normalise the input like conditional_value_at_risk and the moment
        # methods do, instead of rejecting anything that is not already 2-D.
        # NOTE(review): assumes _handle_input_dimensionality passes an already
        # normalised 2-D array through unchanged.
        x_cond = self._handle_input_dimensionality(x_cond)
        assert x_cond.ndim == 2

        if self.has_cdf:
            VaR = self._quantile_cdf(x_cond, alpha=alpha)
            if np.isnan(VaR).any() and self.can_sample:  # try with sampling if failed
                VaR = self._quantile_mc(x_cond, alpha=alpha, n_samples=n_samples)
        elif self.can_sample:
            VaR = self._quantile_mc(x_cond, alpha=alpha, n_samples=n_samples)
        else:
            raise NotImplementedError("Distribution object must either support cdf or sampling in order to compute VaR")
        return VaR

    def conditional_value_at_risk(self, x_cond, alpha=0.01, n_samples=10**6):
        """Computes the Conditional Value-at-Risk (CVaR) / Expected Shortfall of the fitted distribution. Only if ndim_y = 1.

        Args:
          x_cond: different x values to condition on - numpy array of shape (n_values, ndim_x)
          alpha: quantile percentage of the distribution
          n_samples: number of samples for the Monte Carlo routines

        Returns:
          CVaR values for each x to condition on - numpy array of shape (n_values)

        Raises:
          NotImplementedError: if the model supports neither pdf nor sampling
        """
        assert self.fitted, "model must be fitted"
        assert self.ndim_y == 1, "Value at Risk can only be computed when ndim_y = 1"
        x_cond = self._handle_input_dimensionality(x_cond)
        assert x_cond.ndim == 2

        # the CVaR routines need the VaR thresholds as input
        VaRs = self.value_at_risk(x_cond, alpha=alpha, n_samples=n_samples)

        if self.has_pdf:
            return self._conditional_value_at_risk_mc_pdf(VaRs, x_cond, alpha=alpha, n_samples=n_samples)
        if self.can_sample:
            return self._conditional_value_at_risk_sampling(VaRs, x_cond, n_samples=n_samples)
        raise NotImplementedError("Distribution object must either support pdf or sampling in order to compute CVaR")

    def get_configuration(self, deep=True):
        """Get parameter configuration for this estimator.

        Extends the ``get_params`` mapping of the parent class with the class
        name (key ``'estimator'``) and a fixed set of attributes, each of which
        is reported as None when the instance does not have it.

        Args:
          deep: boolean, optional. If True, will return the parameters for
            this estimator and contained subobjects that are estimators.

        Returns:
          params - mapping of string to any. Parameter names mapped to their values.
        """
        config = super(BaseDensityEstimator, self).get_params(deep=deep)
        config['estimator'] = self.__class__.__name__

        extra_keys = ["n_centers", "center_sampling_method", "x_noise_std", "y_noise_std",
                      "random_seed", "ndim_x", "ndim_y"]
        for key in extra_keys:
            # missing attributes are recorded as None rather than left out
            config[key] = getattr(self, key, None)
        return config

    def tail_risk_measures(self, x_cond, alpha=0.01, n_samples=10 ** 6):
        """Computes the Value-at-Risk (VaR) and Conditional Value-at-Risk (CVaR).

        Args:
          x_cond: different x values to condition on - numpy array of shape (n_values, ndim_x)
          alpha: quantile percentage of the distribution
          n_samples: number of samples for the Monte Carlo routines

        Returns:
          - VaR values for each x to condition on - numpy array of shape (n_values)
          - CVaR values for each x to condition on - numpy array of shape (n_values)

        Raises:
          NotImplementedError: if the model supports neither pdf nor sampling
        """
        assert self.fitted, "model must be fitted"
        assert self.ndim_y == 1, "Value at Risk can only be computed when ndim_y = 1"
        # Normalise the input like conditional_value_at_risk does.
        # NOTE(review): assumes _handle_input_dimensionality passes an already
        # normalised 2-D array through unchanged.
        x_cond = self._handle_input_dimensionality(x_cond)
        assert x_cond.ndim == 2

        VaRs = self.value_at_risk(x_cond, alpha=alpha, n_samples=n_samples)

        if self.has_pdf:
            CVaRs = self._conditional_value_at_risk_mc_pdf(VaRs, x_cond, alpha=alpha, n_samples=n_samples)
        elif self.can_sample:
            CVaRs = self._conditional_value_at_risk_sampling(VaRs, x_cond, n_samples=n_samples)
        else:
            raise NotImplementedError("Distribution object must either support pdf or sampling in order to compute CVaR")

        # both risk measures must come back as one value per conditioning point
        assert VaRs.shape == CVaRs.shape == (len(x_cond),)
        return VaRs, CVaRs