Module bayesvalidrox.surrogate_models.reg_fast_laplace
Expand source code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.utils import as_float_array
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
class RegressionFastLaplace():
'''
    Sparse regression with Bayesian Compressive Sensing as described in
    Alg. 1 (Fast Laplace) of Ref.[1], which uses the updated formulas of
    Ref.[2]. Here sigma2 denotes the noise variance (sigma^2); its inverse
    beta = 1/sigma2 is the noise precision. The hyperparameter nu of
    Ref.[1] is fixed to 0. Adapted from the UQLab implementation in
    uqlab/lib/uq_regression/BCS/uq_bsc.m.
Parameters
----------
    n_iter: int, optional (DEFAULT = 1000)
        Maximum number of iterations.
    n_Kfold: int, optional (DEFAULT = 10)
        Number of cross-validation folds used by fit to select the noise
        variance sigma2 (also the number of candidate values tried).
    tol: float, optional (DEFAULT = 1e-7)
        The algorithm terminates when the absolute change in the precision
        parameter of the weights falls below this threshold.
    fit_intercept : boolean, optional (DEFAULT = False)
        Whether to calculate the intercept for this model. If set
        to False, no intercept is used in calculations
        (i.e. data is expected to be already centered).
copy_X : boolean, optional (DEFAULT = True)
If True, X will be copied; else, it may be overwritten.
    verbose : boolean, optional (DEFAULT = True)
        Verbose mode when fitting the model.
Attributes
----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior
        distribution).
    beta : float
        Estimated precision of the noise (1/sigma2).
    active_ : list of int
        Indices of the non-zero (active) coefficients.
    gamma : array, shape = (n_features)
        Estimated prior variances of the coefficients; zero for inactive
        coefficients.
    Lambda : float
        Estimated parameter of the Laplace prior, shared across
        coefficients.
    sigma_ : array, shape = (n_active, n_active)
        Estimated covariance matrix of the weights, computed only
        for non-zero (active) coefficients.
References
----------
    [1] Babacan, S. D., Molina, R., & Katsaggelos, A. K. (2009). Bayesian
        compressive sensing using Laplace priors. IEEE Transactions on
        Image Processing, 19(1), 53-63.
    [2] Tipping, M. E., & Faul, A. C. (2003). Fast marginal likelihood
        maximisation for sparse Bayesian models.
        (http://www.miketipping.com/papers/met-fastsbl.pdf)
'''
def __init__(self, n_iter=1000, n_Kfold=10, tol=1e-7, fit_intercept=False,
copy_X=True, verbose=True):
self.n_iter = n_iter
self.n_Kfold = n_Kfold
self.tol = tol
self.fit_intercept = fit_intercept
self.copy_X = copy_X
self.verbose = verbose
def _center_data(self, X,y):
''' Centers data'''
X = as_float_array(X,self.copy_X)
# normalisation should be done in preprocessing!
X_std = np.ones(X.shape[1], dtype = X.dtype)
if self.fit_intercept:
X_mean = np.average(X,axis = 0)
y_mean = np.average(y,axis = 0)
X -= X_mean
y = y - y_mean
else:
X_mean = np.zeros(X.shape[1],dtype = X.dtype)
y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
return X,y, X_mean, y_mean, X_std
    def fit(self, X, Y):
        '''Fit the model; the noise variance sigma2 is selected by k-fold
        cross-validation over a logarithmic grid of candidates.'''
        k_fold = KFold(n_splits=self.n_Kfold)
        varY = np.var(Y, ddof=1) if np.var(Y, ddof=1) != 0 else 1.0
        # candidate noise variances: len(Y)*Var(Y)*10^k, k in [-16, -1]
        sigma2s = len(Y)*varY*10**np.linspace(-16, -1, self.n_Kfold)
        errors = np.zeros((len(sigma2s), self.n_Kfold))
for s, sigma2 in enumerate(sigma2s):
for k, (train, test) in enumerate(k_fold.split(X, Y)):
self.fit_(X[train], Y[train], sigma2)
errors[s,k] = np.linalg.norm(Y[test] - self.predict(X[test]))**2/len(test)
KfCVerror = np.sum(errors,axis=1)/self.n_Kfold/varY
i_minCV = np.argmin(KfCVerror)
self.kfoldCVerror = np.min(KfCVerror)
return self.fit_(X, Y, sigma2s[i_minCV])
    def fit_(self, X, Y, sigma2):
        '''Fit the model for a fixed noise variance sigma2, following
        Alg. 1 (Fast Laplace) of Ref.[1].'''
        N, P = X.shape  # n_samples, n_features
X, y, X_mean, y_mean, X_std = self._center_data(X, Y)
self._x_mean_ = X_mean
self._y_mean = y_mean
self._x_std = X_std
beta = 1./sigma2
# precompute X'*Y , X'*X for faster iterations & allocate memory for
# sparsity & quality vectors X=Psi, y=Y
PsiTY = np.dot(X.T,y) #XY
PsiTPsi = np.dot(X.T,X) #XX
XXd = np.diag(PsiTPsi)
# initialize with constant regressor, or if that one does not exist,
# with the one that has the largest correlation with Y
        # map global regressor index -> 1-based position in Sigma/mu
        # (0 = inactive)
        ind_global_to_local = np.zeros(P, dtype=np.int32)
# identify constant regressors
constidx = np.where(~np.diff(X,axis=0).all(axis=0))[0]
if constidx.size != 0:
ind_start = constidx[0]
            ind_global_to_local[ind_start] = 1
else:
# start from a single basis vector with largest projection on targets
proj = np.divide(np.square(PsiTY) , XXd)
ind_start = np.argmax(proj)
            ind_global_to_local[ind_start] = 1
num_active = 1
active_indices = [ind_start]
deleted_indices = []
bcs_path = [ind_start]
gamma = np.zeros(P)
# for the initial value of gamma(ind_start), use the RVM formula
# gamma = (q^2 - s) / (s^2)
# and the fact that initially s = S = beta*Psi_i'*Psi_i and q = Q =
# beta*Psi_i'*Y
gamma[ind_start] = (np.square(PsiTY[ind_start]) - sigma2 * PsiTPsi[ind_start,ind_start]) / \
(np.square(PsiTPsi[ind_start, ind_start]))
Sigma = 1. / (beta * PsiTPsi[ind_start,ind_start] + 1./gamma[ind_start])
mu = Sigma * PsiTY[ind_start] * beta
tmp1 = beta * PsiTPsi[ind_start,:]
S = beta * np.diag(PsiTPsi).T - Sigma * np.square(tmp1)
Q = beta * PsiTY.T - mu*(tmp1)
        tmp2 = np.ones(P)  # alternative computation for the initial s, q
        q0tilde = PsiTY[ind_start]
        s0tilde = PsiTPsi[ind_start, ind_start]
        tmp2[ind_start] = s0tilde / (q0tilde**2) / beta
s = np.divide(S,tmp2)
q = np.divide(Q,tmp2)
Lambda = 2*(num_active - 1) / np.sum(gamma) # initialize lambda as well
Delta_L_max = []
for i in range(self.n_iter):
if self.verbose:
print(' lambda = {0:.3e}\n'.format(Lambda))
# Calculate the potential updated value of each gamma[i]
if Lambda == 0.0: # RVM
gamma_potential = np.multiply(((np.square(q) - s) > Lambda), \
np.divide(np.square(q) - s, np.square(s)))
else:
a = Lambda * np.square(s)
b = np.square(s) + 2*Lambda*s
c = Lambda + s - np.square(q) # <-- important decision boundary
gamma_potential = np.multiply((c < 0) ,
np.divide(-b + np.sqrt(np.square(b) - 4*np.multiply(a,c)),2*a))
l_gamma = - np.log(np.absolute(1 + np.multiply(gamma,s))) \
+ np.divide(np.multiply(np.square(q), gamma),(1 + np.multiply(gamma,s))) \
- Lambda*gamma # omitted the factor 1/2
# Contribution of each updated gamma(i) to L(gamma)
l_gamma_potential = - np.log(np.absolute(1 + np.multiply(gamma_potential,s))) \
+ np.divide(np.multiply(np.square(q), gamma_potential),(1 + np.multiply(gamma_potential,s))) \
- Lambda*gamma_potential # omitted the factor 1/2
# Check how L(gamma) would change if we replaced gamma(i) by the
# updated gamma_potential(i), for each i separately
Delta_L_potential = l_gamma_potential - l_gamma
# deleted indices should not be chosen again
if len(deleted_indices)!=0:
Delta_L_potential[deleted_indices] = -np.inf * np.ones(len(deleted_indices))
Delta_L_max.append(np.nanmax(Delta_L_potential))
ind_L_max = np.nanargmax(Delta_L_potential)
# in case there is only 1 regressor in the model and it would now be
# deleted
if len(active_indices) == 1 and ind_L_max == active_indices[0] \
and gamma_potential[ind_L_max] == 0.0:
Delta_L_potential[ind_L_max] = -np.inf
Delta_L_max[i] = np.max(Delta_L_potential)
ind_L_max = np.argmax(Delta_L_potential)
# If L did not change significantly anymore, break
if Delta_L_max[i] <= 0.0 or\
( i > 0 and \
all(np.absolute(Delta_L_max[i-1:]) \
< sum(Delta_L_max)*self.tol)) or \
( i > 0 and all(np.diff(bcs_path)[i-1:]==0.0)):
                if self.verbose:
                    print('Increase in L: {0:.3e} (eta = {1:.3e}) '
                          '-- break\n'.format(Delta_L_max[i], self.tol))
break
# Print information
if self.verbose:
print(' Delta L = {0:.3e} \n'.format(Delta_L_max[i]))
what_changed = int(gamma[ind_L_max]==0.0) - int(gamma_potential[ind_L_max]==0.0)
# Print information
if self.verbose:
if what_changed < 0:
print('{} - Remove regressor #{}..\n'.format(i+1,ind_L_max+1))
elif what_changed == 0:
print('{} - Recompute regressor #{}..\n'.format(i+1,ind_L_max+1))
else:
print('{} - Add regressor #{}..\n'.format(i+1,ind_L_max+1))
# --- Update all quantities ----
if what_changed == 1:
# adding a regressor
# update gamma
gamma[ind_L_max] = gamma_potential[ind_L_max]
Sigma_ii = 1.0 / (1.0/gamma[ind_L_max] + S[ind_L_max])
                # Sigma is still a scalar while only one regressor is active
                if np.ndim(Sigma) == 2:
                    x_i = np.matmul(
                        Sigma, PsiTPsi[active_indices, ind_L_max].reshape(-1, 1))
                else:
                    x_i = Sigma * PsiTPsi[active_indices, ind_L_max]
tmp_1 = - (beta * Sigma_ii) * x_i
Sigma = np.vstack((np.hstack(((beta**2 * Sigma_ii) * np.dot(x_i,x_i.T) + Sigma , tmp_1))\
, np.append(tmp_1.T,Sigma_ii)))
mu_i = Sigma_ii * Q[ind_L_max]
mu = np.vstack((mu - (beta * mu_i) * x_i, mu_i))
tmp2 = beta * (PsiTPsi[:,ind_L_max] - beta * \
np.squeeze(np.matmul(PsiTPsi[:,active_indices],x_i))).T # row vector
S = S - Sigma_ii * np.square(tmp2)
Q = Q - mu_i * tmp2
num_active += 1
ind_global_to_local[ind_L_max] = num_active # local index
active_indices.append(ind_L_max)
bcs_path.append(ind_L_max)
elif what_changed == 0:
# recomputation
                # zero if the regressor has not been chosen yet
                if not ind_global_to_local[ind_L_max]:
                    raise Exception(
                        'cannot recompute index {0} -- not yet part of '
                        'the model!'.format(ind_L_max))
gamma_i_new = gamma_potential[ind_L_max]
gamma_i_old = gamma[ind_L_max]
# update gamma
gamma[ind_L_max] = gamma_potential[ind_L_max]
# index of regressor in Sigma
local_ind = ind_global_to_local[ind_L_max]-1
kappa_i = 1.0 / (Sigma[local_ind, local_ind] + 1.0 / (1.0/gamma_i_new - 1.0/gamma_i_old))
                Sigma_i_col = Sigma[:, local_ind]  # column of interest in Sigma
                # rank-1 update of the posterior covariance; np.outer is
                # needed because Sigma_i_col is 1-D, so `*` would be
                # elementwise rather than an outer product
                Sigma = Sigma - kappa_i * np.outer(Sigma_i_col, Sigma_i_col)
                mu_i = mu[local_ind]
                mu = mu - (kappa_i * mu_i) * Sigma_i_col[:, None]
                tmp1 = beta * np.dot(Sigma_i_col.reshape(1, -1), PsiTPsi[active_indices])[0]  # row vector
                S = S + kappa_i * np.square(tmp1)
                Q = Q + (kappa_i * mu_i) * tmp1
                # no change in active_indices or ind_global_to_local
                bcs_path.append(ind_L_max + 0.1)
elif what_changed == -1:
gamma[ind_L_max] = 0
# index of regressor in Sigma
local_ind = ind_global_to_local[ind_L_max]-1
                Sigma_ii_inv = 1. / Sigma[local_ind, local_ind]
                Sigma_i_col = Sigma[:, local_ind]  # column to be deleted in Sigma
                mu_i = mu[local_ind]  # posterior mean of the deleted coefficient
                # rank-1 downdate before removing the row/column (np.outer,
                # since `*` on 1-D arrays would be elementwise)
                Sigma = Sigma - Sigma_ii_inv * np.outer(Sigma_i_col, Sigma_i_col)
                Sigma = np.delete(
                    np.delete(Sigma, local_ind, axis=0), local_ind, axis=1)
                mu = mu - (mu_i * Sigma_ii_inv) * Sigma_i_col[:, None]
                mu = np.delete(mu, local_ind, axis=0)
                tmp1 = beta * np.dot(Sigma_i_col, PsiTPsi[active_indices])  # row vector
                S = S + Sigma_ii_inv * np.square(tmp1)
                Q = Q + (mu_i * Sigma_ii_inv) * tmp1
                num_active -= 1
                ind_global_to_local[ind_L_max] = 0
                # shift the local indices of the regressors stored after
                # the deleted one
                ind_global_to_local[ind_global_to_local > local_ind] -= 1
del active_indices[local_ind]
deleted_indices.append(ind_L_max) # mark this index as deleted
# and therefore ineligible
bcs_path.append(-ind_L_max)
# same for all three cases
tmp3 = 1 - np.multiply(gamma,S)
s = np.divide(S,tmp3)
q = np.divide(Q,tmp3)
# Update lambda
Lambda = 2*(num_active - 1) / np.sum(gamma)
# Prepare the result object
self.coef_ = np.zeros(P)
self.coef_[active_indices] = np.squeeze(mu)
self.sigma_ = Sigma
self.active_ = active_indices
self.gamma = gamma
self.Lambda = Lambda
self.beta = beta
self.bcs_path = bcs_path
return self
def predict(self,X, return_std=False):
'''
        Computes the predictive distribution for a test set.
        The predictive distribution of each data point is a one-dimensional
        Gaussian and is therefore characterised by its mean and variance,
        based on Ref.[1] Section 3.3.2.
        Parameters
        ----------
        X: {array-like, sparse} (n_samples_test, n_features)
            Test data, matrix of explanatory variables.
        return_std: boolean, optional (DEFAULT = False)
            Whether to also return the standard deviation of the
            predictive distribution.
        Returns
        -------
        y_hat: numpy array of size (n_samples_test,)
            Estimated values of targets on the test set (i.e. mean of the
            predictive distribution).
        std_hat: numpy array of size (n_samples_test,)
            Standard deviation of the predictive distribution; returned
            only if return_std is True.
References
----------
        [1] Bishop, C. M. (2006). Pattern Recognition and Machine Learning.
            Springer.
'''
x = (X - self._x_mean_) / self._x_std
y_hat = np.dot(x,self.coef_) + self._y_mean
if return_std:
var_hat = 1./self.beta
var_hat += np.sum( np.dot(x[:,self.active_],self.sigma_) * x[:,self.active_], axis = 1)
std_hat = np.sqrt(var_hat)
return y_hat, std_hat
else:
return y_hat
Classes
class RegressionFastLaplace (n_iter=1000, n_Kfold=10, tol=1e-07, fit_intercept=False, copy_X=True, verbose=True)
Sparse regression with Bayesian Compressive Sensing as described in Alg. 1 (Fast Laplace) of Ref.[1], which uses the updated formulas of Ref.[2]. Here sigma2 denotes the noise variance (sigma^2); the hyperparameter nu of Ref.[1] is fixed to 0. Adapted from the UQLab implementation in uqlab/lib/uq_regression/BCS/uq_bsc.m.
Parameters
n_iter : int, optional (DEFAULT = 1000)
    Maximum number of iterations.
n_Kfold : int, optional (DEFAULT = 10)
    Number of cross-validation folds used by fit to select the noise variance sigma2 (also the number of candidate values tried).
tol : float, optional (DEFAULT = 1e-7)
    The algorithm terminates when the absolute change in the precision parameter of the weights falls below this threshold.
fit_intercept : boolean, optional (DEFAULT = False)
    Whether to calculate the intercept for this model. If set to False, no intercept is used in calculations (i.e. data is expected to be already centered).
copy_X : boolean, optional (DEFAULT = True)
    If True, X will be copied; else, it may be overwritten.
verbose : boolean, optional (DEFAULT = True)
    Verbose mode when fitting the model.
Attributes
coef_ : array, shape = (n_features)
    Coefficients of the regression model (mean of posterior distribution).
beta : float
    Estimated precision of the noise (1/sigma2).
active_ : list of int
    Indices of the non-zero (active) coefficients.
gamma : array, shape = (n_features)
    Estimated prior variances of the coefficients; zero for inactive coefficients.
Lambda : float
    Estimated parameter of the Laplace prior, shared across coefficients.
sigma_ : array, shape = (n_active, n_active)
    Estimated covariance matrix of the weights, computed only for non-zero (active) coefficients.
References
[1] Babacan, S. D., Molina, R., & Katsaggelos, A. K. (2009). Bayesian compressive sensing using Laplace priors. IEEE Transactions on Image Processing, 19(1), 53-63.
[2] Tipping, M. E., & Faul, A. C. (2003). Fast marginal likelihood maximisation for sparse Bayesian models. (http://www.miketipping.com/papers/met-fastsbl.pdf)
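A minimal usage sketch on synthetic data (all names and values below are illustrative, not part of the module):
import numpy as np
from bayesvalidrox.surrogate_models.reg_fast_laplace import RegressionFastLaplace

rng = np.random.RandomState(0)
Psi = rng.randn(100, 20)                 # design matrix (n_samples, n_features)
w_true = np.zeros(20)
w_true[[2, 5, 11]] = [1.5, -2.0, 0.7]    # sparse ground truth
Y = Psi @ w_true + 0.01*rng.randn(100)

clf = RegressionFastLaplace(n_iter=1000, n_Kfold=10, verbose=False)
clf.fit(Psi, Y)          # selects sigma2 by cross-validation, then refits
print(clf.active_)       # indices of the regressors kept in the model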
Methods
def fit(self, X, Y)
Fits the model. The noise variance sigma2 is selected by n_Kfold-fold cross-validation: candidate values len(Y)*Var(Y)*10^k, for k evenly spaced in [-16, -1], are scored by the mean squared prediction error on held-out folds, and the model is then refit on the full data with the best candidate.
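Continuing the synthetic example above, the fitted object also exposes the normalized cross-validation error of the selected sigma2 (a sketch; Psi and Y as defined there):
clf = RegressionFastLaplace(verbose=False)
clf.fit(Psi, Y)               # tries n_Kfold log-spaced sigma2 candidates
print(clf.kfoldCVerror)       # mean held-out squared error, normalized by Var(Y)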
def fit_(self, X, Y, sigma2)
Fits the model for a fixed noise variance sigma2, following Alg. 1 (Fast Laplace) of Ref.[1]: starting from a single regressor, each iteration adds, recomputes, or deletes the regressor whose update gives the largest increase in the marginal log-likelihood L(gamma), until the increase falls below tol.
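If an estimate of the noise variance is already available, fit_ can be called directly and the cross-validation loop is skipped (a sketch reusing Psi and Y from the example above; the sigma2 value is illustrative):
clf = RegressionFastLaplace(verbose=False)
clf.fit_(Psi, Y, sigma2=1e-4)     # fixed noise variance, no CV
print(clf.coef_[clf.active_])     # posterior means of the active coefficients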
def predict(self, X, return_std=False)
Computes the predictive distribution for a test set. The predictive distribution of each data point is a one-dimensional Gaussian and is therefore characterised by its mean and variance, based on Ref.[1] Section 3.3.2.
Parameters
X : {array-like, sparse} (n_samples_test, n_features)
    Test data, matrix of explanatory variables.
return_std : boolean, optional (DEFAULT = False)
    Whether to also return the standard deviation of the predictive distribution.
Returns
y_hat : numpy array of size (n_samples_test,)
    Estimated values of targets on the test set (i.e. mean of the predictive distribution).
std_hat : numpy array of size (n_samples_test,)
    Standard deviation of the predictive distribution; returned only if return_std is True.
References
[1] Bishop, C. M. (2006). Pattern Recognition and Machine Learning. Springer.
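A sketch of both calling conventions, reusing the fitted clf from the examples above:
y_hat = clf.predict(Psi[:5])                            # predictive mean only
y_hat, std_hat = clf.predict(Psi[:5], return_std=True)  # mean and std
# e.g. a 95% interval for each test point
lower, upper = y_hat - 1.96*std_hat, y_hat + 1.96*std_hat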