From 2e3f6aca8bded537b76e742c46a382de9855c40b Mon Sep 17 00:00:00 2001
From: faridm69 <faridmohammadi69@gmail.com>
Date: Wed, 17 Jun 2020 13:33:59 +0200
Subject: [PATCH] [BayesInference] added MCMC with PCE.

---
 BayesValidRox/BayesInference/MCMC.py | 398 +++++++++++++++++++++++++++
 1 file changed, 398 insertions(+)
 create mode 100755 BayesValidRox/BayesInference/MCMC.py

diff --git a/BayesValidRox/BayesInference/MCMC.py b/BayesValidRox/BayesInference/MCMC.py
new file mode 100755
index 000000000..029be7754
--- /dev/null
+++ b/BayesValidRox/BayesInference/MCMC.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun  3 16:42:12 2020
+
+@author: farid
+"""
+
+import numpy as np
+import emcee
+import pandas as pd
+from multiprocessing import Pool
+import scipy.stats as st
+from scipy.linalg import cholesky as chol
+import warnings
+
+class MCMC():
+    """
+    Wrapper around the emcee affine-invariant ensemble sampler.
+
+    Attributes
+    ----------
+    nwalkers : number of MCMC walkers (default 20)
+    nburn    : "burn-in" period discarded from each chain (default 100)
+    nsteps   : number of MCMC steps per walker (default 1000)
+    """
+    def __init__(self, BayesOpts, initsamples=None, nwalkers=20, nburn=100,
+                 nsteps=1000, verbose=False):
+        
+        self.BayesOpts      = BayesOpts
+        self.initsamples    = initsamples
+        self.nwalkers       = nwalkers
+        self.nburn          = nburn
+        self.nsteps         = nsteps
+        self.verbose        = verbose
+        
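+    # A minimal usage sketch (hypothetical driver code; the `BayesOpts` object
+    # and its attributes are assumed to be configured elsewhere in
+    # BayesValidRox and are not part of this file):
+    #
+    #     mcmc = MCMC(BayesOpts, nwalkers=20, nburn=100, nsteps=1000)
+    #     posterior_df = mcmc.run_sampler(Observation, TotalSigma2)
+    #     print(posterior_df.describe())
+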
+    def run_sampler(self, Observation, TotalSigma2):
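+        """
+        Run the ensemble sampler and return the posterior samples.
+
+        Parameters
+        ----------
+        Observation : array
+            Observed data used in the likelihood.
+        TotalSigma2 : array
+            Total variance (error) of the observations.
+
+        Returns
+        -------
+        Posterior_df : pandas.DataFrame
+            Posterior samples after burn-in, one column per input parameter.
+        """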
+        
+        PCEModel = self.BayesOpts.PCEModel
+        Model = PCEModel.ModelObj
+        priorDist = PCEModel.ExpDesign.JDist
+        
+        ndim = PCEModel.NofPa
+        
+        # Define emcee sampler
+        # Here we'll set up the computation. emcee combines multiple "walkers",
+        # each of which is its own MCMC chain. The number of trace results will
+        # be nwalkers * nsteps
+        
+        # Set the initial positions of the walkers
+        np.random.seed(0)
+        if self.initsamples is None:
+            initsamples = priorDist.sample(self.nwalkers).T
+        else:
+            # Pick samples from a uniform dist between min and max of each dim
+            initsamples = np.zeros((self.nwalkers, ndim))
+            for idxDim in range(ndim):
+                lower = np.min(self.initsamples[:, idxDim])
+                upper = np.max(self.initsamples[:, idxDim])
+                initsamples[:, idxDim] = st.uniform(
+                    loc=lower, scale=upper-lower).rvs(size=self.nwalkers)
+            
+        with Pool() as pool:
+            sampler = emcee.EnsembleSampler(self.nwalkers, ndim,
+                                            self.log_posterior,
+                                            args=[Observation, TotalSigma2],
+                                            pool=pool)
+            sampler.run_mcmc(initsamples, self.nsteps, progress=True)
+        # sampler = emcee.EnsembleSampler(nwalkers, ndim, log_posterior, args=[Observation, TotalSigma2])
+        # sampler.run_mcmc(starting_guesses, nsteps, progress=True)
+        
+        # sampler.chain has shape (nwalkers, nsteps, ndim);
+        # throw out the burn-in points and reshape:
+        emcee_trace = sampler.chain[:, self.nburn:, :].reshape(-1, ndim).T
+        
+        # logml_dict = self.Marginal_llk_emcee(sampler, self.nburn, logp=None, maxiter=5000)
+        # print('\nThe Bridge Sampling Estimation is %.5f.'%(logml_dict['logml']))
+        
+        # # Posterior-based expectation of posterior probablity
+        # postExpPostLikelihoods_emcee = np.mean(sampler.get_log_prob(flat=True)[self.nburn*self.nwalkers:])
+        
+        # # Posterior-based expectation of prior densities
+        # postExpPrior_emcee = np.mean(self.log_prior(emcee_trace.T))
+        
+        # # Posterior-based expectation of likelihoods
+        # postExpLikelihoods_emcee = postExpPostLikelihoods_emcee - postExpPrior_emcee
+        
+        # # Calculate Kullback-Leibler Divergence
+        # KLD_emcee = postExpLikelihoods_emcee - logml_dict['logml']
+        # print("Kullback-Leibler divergence: %.5f"%KLD_emcee)
+        
+        # # Information Entropy based on Entropy paper Eq. 38
+        # infEntropy_emcee = logml_dict['logml'] - postExpPrior_emcee - postExpLikelihoods_emcee
+        # print("Information Entropy: %.5f" %infEntropy_emcee)
+        
+        InputNames = [marginal.Name for marginal in PCEModel.Inputs.Marginals]
+            
+        Posterior_df = pd.DataFrame(emcee_trace.T, columns=InputNames)
+        
+        return Posterior_df
+    
+    def log_prior(self, theta):
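+        """
+        Evaluate the log prior density of the given parameter set(s).
+
+        Returns the log of the joint prior density for samples inside the
+        parameter ranges and -inf otherwise.
+        """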
+        
+        PCEModel = self.BayesOpts.PCEModel
+        priorDist = PCEModel.ExpDesign.JDist
+        params_range = PCEModel.ExpDesign.BoundTuples
+        ndimTheta = theta.ndim
+        theta = theta if ndimTheta != 1 else theta.reshape((1,-1))
+        nsamples = theta.shape[0]
+        
+        logprior = -np.inf*np.ones(nsamples)
+        
+        for i in range(nsamples):
+            # check_ranges returns True if theta lies outside the prior ranges
+            if not self.check_ranges(theta[i], params_range):
+                logprior[i] = np.log(priorDist.pdf(theta[i]))
+                
+        if nsamples == 1:
+            return logprior[0]
+        else:
+            return logprior
+        
+    # LogLikelihood
+    def log_likelihood(self, theta, Observation, TotalSigma2):
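+        """
+        Evaluate the log-likelihood of the observation for the given
+        parameter set(s), using either the PCE surrogate or the original
+        model depending on BayesOpts.emulator.
+        """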
+        
+        BayesOpts = self.BayesOpts
+        ndimTheta = theta.ndim
+        theta = theta if ndimTheta != 1 else theta.reshape((1,-1))
+        
+        if BayesOpts.emulator:
+            # Evaluate the PCEModel
+            meanPCEPriorPred, BayesOpts.stdPCEPriorPred = BayesOpts.eval_PCEmodel(theta)
+        else:
+            # Evaluate the origModel
+            meanPCEPriorPred = BayesOpts.eval_Model(theta)
+
+        return BayesOpts.normpdf(meanPCEPriorPred, Observation, TotalSigma2)
+    
+    
+    def log_posterior(self, theta, Observation, TotalSigma2):
+        """
+        Computes the posterior likelihood for the given parameterset.
+
+        Parameters
+        ----------
+        theta : TYPE
+            DESCRIPTION.
+        Observation : TYPE
+            DESCRIPTION.
+        TotalSigma2 : TYPE
+            DESCRIPTION.
+
+        Returns
+        -------
+        TYPE
+            DESCRIPTION.
+
+        """
+        ndimTheta = theta.ndim
+        nsamples = 1 if ndimTheta == 1 else theta.shape[0]
+        
+        if nsamples == 1:
+            log_prior = self.log_prior(theta)
+            if log_prior == -np.inf:
+                return -np.inf
+            else:
+                return log_prior + self.log_likelihood(theta, Observation,
+                                                       TotalSigma2)
+        else:
+            # Compute log prior
+            log_prior = self.log_prior(theta)
+            
+            # Initialize log_likelihood
+            log_likelihood = -np.inf*np.ones(nsamples)
+            
+            # find the indices for -inf sets
+            nonInfIdx = np.where(log_prior != -np.inf)[0]
+            
+            # Compute log-likelihoods
+            if nonInfIdx.size != 0:
+                log_likelihood[nonInfIdx] = self.log_likelihood(theta[nonInfIdx], Observation, TotalSigma2)
+            
+            return log_prior + log_likelihood
+    
+    def Marginal_llk_emcee(self, sampler, nburn=None, logp=None, maxiter=1000):
+        """
+        The Bridge Sampling Estimator of the Marginal Likelihood.
+    
+        Parameters
+        ----------
+        mtrace : MultiTrace, result of MCMC run
+        model : PyMC Model
+            Optional model. Default None, taken from context.
+        logp : Model Log-probability function, read from the model by default
+        maxiter : Maximum number of iterations
+    
+        Returns
+        -------
+        marg_llk : Estimated Marginal log-Likelihood.
+        """
+        r0, tol1, tol2 = 0.5, 1e-10, 1e-4
+        
+        if logp is None:
+            logp = sampler.log_prob_fn
+
+        # Split the samples into two parts:
+        # use the first 50% for fitting the proposal distribution and the
+        # second 50% in the iterative scheme.
+        if nburn is None:
+            mtrace = sampler.chain
+        else:
+            mtrace = sampler.chain[:, nburn:, :]
+            
+        nchain, len_trace, nrofVars = mtrace.shape
+        
+        N1_ = len_trace // 2
+        N1 = N1_*nchain
+        N2 = len_trace*nchain - N1
+        
+        samples_4_fit = np.zeros((nrofVars, N1))
+        samples_4_iter = np.zeros((nrofVars, N2))
+        effective_n = np.zeros((nrofVars))
+    
+        # matrix with already transformed samples
+        for var in range(nrofVars):
+            
+            # for fitting the proposal
+            x = mtrace[:,:N1_,var]
+            
+            samples_4_fit[var, :] = x.flatten()
+            # for the iterative scheme
+            x2 = mtrace[:,N1_:,var]
+            samples_4_iter[var, :] = x2.flatten()
+    
+            # effective sample size of samples_4_iter, scalar 
+            #(https://github.com/jwalton3141/jwalton3141.github.io/blob/master/assets/posts/ESS/rwmh.py)
+            effective_n[var] = self.my_ESS(x2)
+    
+        # median effective sample size (scalar)
+        neff = np.median(effective_n)
+        
+        # get mean & covariance matrix and generate samples from proposal
+        m = np.mean(samples_4_fit, axis=1)
+        V = np.cov(samples_4_fit)
+        L = chol(V, lower=True)
+    
+        # Draw N2 samples from the proposal distribution
+        gen_samples = m[:, None] + np.dot(L, st.norm.rvs(0, 1, 
+                                             size=samples_4_iter.shape))
+    
+        # Evaluate proposal distribution for posterior & generated samples
+        q12 = st.multivariate_normal.logpdf(samples_4_iter.T, m, V)
+        q22 = st.multivariate_normal.logpdf(gen_samples.T, m, V)
+    
+        # Evaluate unnormalized posterior for posterior & generated samples
+        q11 = logp(samples_4_iter.T)
+        q21 = logp(gen_samples.T)
+        
+        # Run iterative scheme:
+        tmp = self.iterative_scheme(N1, N2, q11, q12, q21, q22, r0, neff, tol1, maxiter, 'r')
+        if ~np.isfinite(tmp['logml']):
+            warnings.warn("""logml could not be estimated within maxiter, rerunning with 
+                          adjusted starting value. Estimate might be more variable than usual.""")
+            # use geometric mean as starting value
+            r0_2 = np.sqrt(tmp['r_vals'][-2]*tmp['r_vals'][-1])
+            tmp = self.iterative_scheme(q11, q12, q21, q22, r0_2, neff, tol2, maxiter, 'logml')
+    
+        return dict(logml = tmp['logml'], niter = tmp['niter'], method = "normal", 
+                    q11 = q11, q12 = q12, q21 = q21, q22 = q22)
+    
+    
+    def iterative_scheme(self, N1, N2, q11, q12, q21, q22, r0, neff, tol,
+                         maxiter, criterion):
+        """
+        Iterative scheme as proposed in Meng and Wong (1996) to estimate
+        the marginal likelihood.
+
+        Parameters
+        ----------
+        N1, N2 : int
+            Number of posterior and proposal samples, respectively.
+        q11, q12 : array
+            Unnormalized posterior and proposal log-densities evaluated at
+            the posterior samples.
+        q21, q22 : array
+            Unnormalized posterior and proposal log-densities evaluated at
+            the proposal samples.
+        r0 : float
+            Starting value of the iterative scheme.
+        neff : float
+            Median effective sample size of the posterior samples.
+        tol : float
+            Convergence tolerance.
+        maxiter : int
+            Maximum number of iterations.
+        criterion : str
+            Convergence criterion, either 'r' or 'logml'.
+
+        Returns
+        -------
+        dict
+            Estimated marginal log-likelihood ('logml') and number of
+            iterations ('niter'); the sequence of r values ('r_vals') is
+            also returned if the scheme did not converge.
+        """
+        l1 = q11 - q12
+        l2 = q21 - q22
+        lstar = np.median(l1) # To increase numerical stability, 
+                              # subtracting the median of l1 from l1 & l2 later
+        s1 = neff/(neff + N2)
+        s2 = N2/(neff + N2)
+        r = r0
+        r_vals = [r]
+        logml = np.log(r) + lstar
+        criterion_val = 1 + tol
+    
+        i = 0
+        while (i <= maxiter) & (criterion_val > tol):
+            rold = r
+            logmlold = logml
+            numi = np.exp(l2 - lstar)/(s1 * np.exp(l2 - lstar) + s2 * r)
+            deni = 1/(s1 * np.exp(l1 - lstar) + s2 * r)
+            if np.sum(~np.isfinite(numi)) + np.sum(~np.isfinite(deni)) > 0:
+                warnings.warn("Infinite value in iterative scheme, returning "
+                              "NaN. Try rerunning with more samples.")
+            r = (N1/N2) * np.sum(numi)/np.sum(deni)
+            r_vals.append(r)
+            logml = np.log(r) + lstar
+            i += 1
+            if criterion=='r':
+                criterion_val = np.abs((r - rold)/r)
+            elif criterion=='logml':
+                criterion_val = np.abs((logml - logmlold)/logml)
+    
+        if i >= maxiter:
+            return dict(logml = np.NaN, niter = i, r_vals = np.asarray(r_vals))
+        else:
+            return dict(logml = logml, niter = i)
+        
+    
+    def my_gelman_rubin(self, x):
+        """ Estimate the marginal posterior variance. Vectorised implementation. """
+        m_chains, n_iters = x.shape
+    
+        # Calculate between-chain variance
+        B_over_n = ((np.mean(x, axis=1) - np.mean(x))**2).sum() / (m_chains - 1)
+    
+        # Calculate within-chain variances
+        W = ((x - x.mean(axis=1, keepdims=True))**2).sum() / (m_chains*(n_iters - 1))
+    
+        # (over) estimate of variance
+        s2 = W * (n_iters - 1) / n_iters + B_over_n
+        
+        return s2
+    
+    def my_ESS(self, x):
+        """ 
+        Compute the effective sample size of estimand of interest. 
+        Vectorised implementation. 
+        """
+        m_chains, n_iters = x.shape
+    
+        variogram = lambda t: ((x[:, t:] - x[:, :(n_iters - t)])**2).sum() / (m_chains * (n_iters - t))
+    
+        post_var = self.my_gelman_rubin(x)
+    
+        t = 1
+        rho = np.ones(n_iters)
+        negative_autocorr = False
+    
+        # Iterate until the sum of consecutive estimates of autocorrelation is negative
+        while not negative_autocorr and (t < n_iters):
+            rho[t] = 1 - variogram(t) / (2 * post_var)
+    
+            if not t % 2:
+                negative_autocorr = sum(rho[t-1:t+1]) < 0
+    
+            t += 1
+        
+        return int(m_chains*n_iters / (1 + 2*rho[1:t].sum()))
+    
+    def check_ranges(self, theta, ranges):
+        """
+        Check whether theta lies outside the given ranges.
+
+        Parameters
+        ----------
+        theta : numpy array
+            Proposed parameter set.
+        ranges : list of tuples
+            (lower, upper) bounds for each parameter.
+
+        Returns
+        -------
+        c : bool
+            True if any component of theta lies outside its bounds,
+            False otherwise.
+        """
+        c = False
+        for i, bounds in enumerate(ranges):
+            if theta[i] < bounds[0] or theta[i] > bounds[1]:
+                c = True
+                break
+        return c
\ No newline at end of file
-- 
GitLab