From b9882ba56396052b53b6a2d527273673a03b4b9f Mon Sep 17 00:00:00 2001 From: kp992 Date: Thu, 14 Aug 2025 17:20:00 -0700 Subject: [PATCH 01/22] remove pyro --- lectures/bayes_nonconj.md | 415 ++++++++++---------------------------- 1 file changed, 112 insertions(+), 303 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 6d586e4e3..2664a41d2 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -4,7 +4,7 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.16.4 + jupytext_version: 1.16.7 kernelspec: display_name: Python 3 (ipykernel) language: python @@ -19,17 +19,17 @@ kernelspec: ```{code-cell} ipython3 :tags: [hide-output] -!pip install numpyro pyro-ppl torch jax +!pip install numpyro jax ``` -This lecture is a sequel to the {doc}`quantecon lecture `. +This lecture is a sequel to the {doc}`Two Meanings of Probability lecture `. That lecture offers a Bayesian interpretation of probability in a setting in which the likelihood function and the prior distribution over parameters just happened to form a **conjugate** pair in which - application of Bayes' Law produces a posterior distribution that has the same functional form as the prior -Having a likelihood and prior that are conjugate can simplify calculation of a posterior, faciltating analytical or nearly analytical calculations. +Having a likelihood and prior that are conjugate can simplify calculation of a posterior, facilitating analytical or nearly analytical calculations. But in many situations the likelihood and prior need not form a conjugate pair. @@ -42,13 +42,7 @@ In this lecture, we illustrate how modern Bayesians confront non-conjugate prior - first cleverly forming a Markov chain whose invariant distribution is the posterior distribution we want - simulating the Markov chain until it has converged and then sampling from the invariant distribution to approximate the posterior -We shall illustrate the approach by deploying two powerful Python modules that implement this approach as well as another closely related one to -be described below. - -The two Python modules are - -- `numpyro` -- `pymc4` +We shall illustrate the approach by deploying a powerful Python library, [NumPyro](https://num.pyro.ai/en/stable/getting_started.html) that implement this approach. As usual, we begin by importing some Python code. @@ -58,34 +52,23 @@ import seaborn as sns import matplotlib.pyplot as plt from scipy.stats import binom import scipy.stats as st -import torch -# jax import jax.numpy as jnp -from jax import lax, random +from jax import random as jax_random -# pyro -import pyro -from pyro import distributions as dist -import pyro.distributions.constraints as constraints -from pyro.infer import MCMC, NUTS, SVI, ELBO, Trace_ELBO -from pyro.optim import Adam - -# numpyro import numpyro from numpyro import distributions as ndist import numpyro.distributions.constraints as nconstraints from numpyro.infer import MCMC as nMCMC from numpyro.infer import NUTS as nNUTS from numpyro.infer import SVI as nSVI -from numpyro.infer import ELBO as nELBO from numpyro.infer import Trace_ELBO as nTrace_ELBO from numpyro.optim import Adam as nAdam ``` ## Unleashing MCMC on a Binomial Likelihood -This lecture begins with the binomial example in the {doc}`quantecon lecture `. +This lecture begins with the binomial example in the {doc}`Probability Meanings lecture `. 
That lecture computed a posterior @@ -96,11 +79,11 @@ This lecture instead computes posteriors - numerically by sampling from the posterior distribution through MCMC methods, and - using a variational inference (VI) approximation. -We use both the packages `pyro` and `numpyro` with assistance from `jax` to approximate a posterior distribution +We use `numpyro` with assistance from `jax` to approximate a posterior distribution We use several alternative prior distributions -We compare computed posteriors with ones associated with a conjugate prior as described in {doc}`the quantecon lecture ` +We compare computed posteriors with ones associated with a conjugate prior as described in {doc}`Probability Meanings lecture ` ### Analytical Posterior @@ -127,7 +110,7 @@ $$ We choose this as our prior for now because we know that a conjugate prior for the binomial likelihood function is a beta distribution. -After observing $k$ successes among $N$ sample observations, the posterior probability distributionof $ \theta $ is +After observing $k$ successes among $N$ sample observations, the posterior probability distribution of $ \theta $ is $$ \textrm{Prob}(\theta|k) = \frac{\textrm{Prob}(\theta,k)}{\textrm{Prob}(k)}=\frac{\textrm{Prob}(k|\theta)\textrm{Prob}(\theta)}{\textrm{Prob}(k)}=\frac{\textrm{Prob}(k|\theta) \textrm{Prob}(\theta)}{\int_0^1 \textrm{Prob}(k|\theta)\textrm{Prob}(\theta) d\theta} @@ -149,7 +132,7 @@ $$ \textrm{Prob}(\theta|k) \sim {Beta}(\alpha + k, \beta+N-k) $$ -The analytical posterior for a given conjugate beta prior is coded in the following Python code. +The analytical posterior for a given conjugate beta prior is coded in the following ```{code-cell} ipython3 def simulate_draw(theta, n): @@ -189,7 +172,7 @@ Suppose that we don't have a conjugate prior. Then we can't compute posteriors analytically. -Instead, we use computational tools to approximate the posterior distribution for a set of alternative prior distributions using both `Pyro` and `Numpyro` packages in Python. +Instead, we use computational tools to approximate the posterior distribution for a set of alternative prior distributions using `numpyro`. We first use the **Markov Chain Monte Carlo** (MCMC) algorithm . @@ -199,7 +182,7 @@ In that way we construct a sampling distribution that approximates the posterio After doing that we deply another procedure called **Variational Inference** (VI). -In particular, we implement Stochastic Variational Inference (SVI) machinery in both `Pyro` and `Numpyro`. +In particular, we implement Stochastic Variational Inference (SVI) machinery in `numpyro`. The MCMC algorithm supposedly generates a more accurate approximation since in principle it directly samples from the posterior distribution. @@ -217,7 +200,7 @@ a Kullback-Leibler (KL) divergence between true posterior and the putatitive pos ## Prior Distributions -In order to be able to apply MCMC sampling or VI, `Pyro` and `Numpyro` require that a prior distribution satisfy special properties: +In order to be able to apply MCMC sampling or VI, `numpyro` require that a prior distribution satisfy special properties: - we must be able sample from it; - we must be able to compute the log pdf pointwise; @@ -231,28 +214,21 @@ We will use the following priors: - a truncated log-normal distribution with support on $[0,1]$ with parameters $(\mu,\sigma)$. 
- - To implement this, let $Z\sim Normal(\mu,\sigma)$ and $\tilde{Z}$ be truncated normal with support $[\log(0),\log(1)]$, then $\exp(Z)$ has a log normal distribution with bounded support $[0,1]$. This can be easily coded since `Numpyro` has a built-in truncated normal distribution, and `Torch` provides a `TransformedDistribution` class that includes an exponential transformation. - - - Alternatively, we can use a rejection sampling strategy by assigning the probability rate to $0$ outside the bounds and rescaling accepted samples, i.e., realizations that are within the bounds, by the total probability computed via CDF of the original distribution. This can be implemented by defining a truncated distribution class with `pyro`'s `dist.Rejector` class. - - - We implement both methods in the below section and verify that they produce the same result. + - To implement this, let $Z\sim Normal(\mu,\sigma)$ and $\tilde{Z}$ be truncated normal with support $[\log(0),\log(1)]$, then $\exp(Z)$ has a log normal distribution with bounded support $[0,1]$. This can be easily coded since `numpyro` has a built-in truncated normal distribution, and `numpyro`'s `TransformedDistribution` class that includes an exponential transformation. - a shifted von Mises distribution that has support confined to $[0,1]$ with parameter $(\mu,\kappa)$. - Let $X\sim vonMises(0,\kappa)$. We know that $X$ has bounded support $[-\pi, \pi]$. We can define a shifted von Mises random variable $\tilde{X}=a+bX$ where $a=0.5, b=1/(2 \pi)$ so that $\tilde{X}$ is supported on $[0,1]$. - - This can be implemented using `Torch`'s `TransformedDistribution` class with its `AffineTransform` method. - - - If instead, we want the prior to be von-Mises distributed with center $\mu=0.5$, we can choose a high concentration level $\kappa$ so that most mass is located between $0$ and $1$. Then we can truncate the distribution using the above strategy. This can be implemented using `pyro`'s `dist.Rejector` class. We choose $\kappa > 40$ in this case. + - This can be implemented using `numpyro`'s `TransformedDistribution` class with its `AffineTransform` method. - a truncated Laplace distribution. - We also considered a truncated Laplace distribution because its density comes in a piece-wise non-smooth form and has a distinctive spiked shape. - - The truncated Laplace can be created using `Numpyro`'s `TruncatedDistribution` class. + - The truncated Laplace can be created using `numpyro`'s `TruncatedDistribution` class. 
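Before turning to the implementations in the next cell, here is a small self-contained sketch that checks the two transformation tricks above really do confine draws to $[0,1]$ (an illustration only: the concentration value, the lognormal parameters, and the sample size are arbitrary choices, not the lecture's settings).

```python
# Quick support check for the transformed priors described above (a sketch).
import jax.numpy as jnp
from jax import random
import numpyro.distributions as ndist

key = random.PRNGKey(0)

# Shifted von Mises: the affine map 0.5 + X/(2*pi) sends [-pi, pi] into [0, 1]
shifted_vm = ndist.TransformedDistribution(
    ndist.VonMises(0, 10),
    ndist.transforms.AffineTransform(loc=0.5, scale=1 / (2 * jnp.pi)),
)

# Truncated log normal: exp of a normal truncated above at log(1) lives in (0, 1]
trunc_ln = ndist.TransformedDistribution(
    ndist.TruncatedNormal(loc=0.0, scale=2.0, high=jnp.log(1.0)),
    ndist.transforms.ExpTransform(),
)

for name, d in [("shifted von Mises", shifted_vm), ("truncated log normal", trunc_ln)]:
    draws = d.sample(key, (10_000,))
    print(name, float(draws.min()), float(draws.max()))  # both extremes stay inside [0, 1]
```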
```{code-cell} ipython3 -# used by Numpyro def TruncatedLogNormal_trans(loc, scale): """ Obtains the truncated log normal distribution using numpyro's TruncatedNormal and ExpTransform @@ -279,48 +255,6 @@ def TruncatedLaplace(loc, scale): return ndist.TruncatedDistribution( base_dist, low=0.0, high=1.0 ) - -# used by Pyro -class TruncatedLogNormal(dist.Rejector): - """ - Define a TruncatedLogNormal distribution through rejection sampling in Pyro - """ - def __init__(self, loc, scale_0, upp=1): - self.upp = upp - propose = dist.LogNormal(loc, scale_0) - - def log_prob_accept(x): - return (x < upp).type_as(x).log() - - log_scale = dist.LogNormal(loc, scale_0).cdf(torch.as_tensor(upp)).log() - super(TruncatedLogNormal, self).__init__(propose, log_prob_accept, log_scale) - - @constraints.dependent_property - def support(self): - return constraints.interval(0, self.upp) - - -class TruncatedvonMises(dist.Rejector): - """ - Define a TruncatedvonMises distribution through rejection sampling in Pyro - """ - def __init__(self, kappa, mu=0.5, low=0.0, upp=1.0): - self.low, self.upp = low, upp - propose = dist.VonMises(mu, kappa) - - def log_prob_accept(x): - return ((x > low) & (x < upp)).type_as(x).log() - - log_scale = torch.log( - torch.tensor( - st.vonmises(kappa=kappa, loc=mu).cdf(upp) - - st.vonmises(kappa=kappa, loc=mu).cdf(low)) - ) - super(TruncatedvonMises, self).__init__(propose, log_prob_accept, log_scale) - - @constraints.dependent_property - def support(self): - return constraints.interval(self.low, self.upp) ``` ### Variational Inference @@ -390,7 +324,7 @@ A standard optimization routine can used to search for the optimal $\phi$ in our The parameterized distribution $q_{\phi}(\theta)$ is called the **variational distribution**. -We can implement Stochastic Variational Inference (SVI) in Pyro and Numpyro using the `Adam` gradient descent algorithm to approximate posterior. +We can implement Stochastic Variational Inference (SVI) numpyro using the `Adam` gradient descent algorithm to approximate posterior. We use two sets of variational distributions: Beta and TruncatedNormal with support $[0,1]$ @@ -416,9 +350,7 @@ The (`param`, `name_dist`) pair includes: - ('lognormal', loc, scale) - Note: This is the truncated log normal. -- ('vonMises', kappa), where kappa denotes concentration parameter, and center location is set to $0.5$. - - Note: When using `Pyro`, this is the truncated version of the original vonMises distribution; - - Note: When using `Numpyro`, this is the **shifted** distribution. +- ('vonMises', kappa), where kappa denotes concentration parameter, and center location is set to $0.5$. Using `numpyro`, this is the **shifted** distribution. - ('laplace', loc, scale) - Note: This is the truncated Laplace @@ -442,7 +374,7 @@ The class `BaysianInference` has several key methods : ```{code-cell} ipython3 class BayesianInference: - def __init__(self, param, name_dist, solver): + def __init__(self, param, name_dist): """ Parameters --------- @@ -450,61 +382,46 @@ class BayesianInference: a tuple object that contains all relevant parameters for the distribution dist : str. name of the distribution - 'beta', 'uniform', 'lognormal', 'vonMises', 'tent' - solver : str. - either pyro or numpyro """ self.param = param self.name_dist = name_dist - self.solver = solver - # jax requires explicit PRNG state to be passed - self.rng_key = random.PRNGKey(0) + self.rng_key = jax_random.PRNGKey(0) def sample_prior(self): """ - Define the prior distribution to sample from in Pyro/Numpyro models. 
+ Define the prior distribution to sample from in numpyro models. """ if self.name_dist=='beta': # unpack parameters alpha0, beta0 = self.param - if self.solver=='pyro': - sample = pyro.sample('theta', dist.Beta(alpha0, beta0)) - else: - sample = numpyro.sample('theta', ndist.Beta(alpha0, beta0), rng_key=self.rng_key) + sample = numpyro.sample('theta', ndist.Beta(alpha0, beta0), + rng_key=self.rng_key) elif self.name_dist=='uniform': # unpack parameters lb, ub = self.param - if self.solver=='pyro': - sample = pyro.sample('theta', dist.Uniform(lb, ub)) - else: - sample = numpyro.sample('theta', ndist.Uniform(lb, ub), rng_key=self.rng_key) + sample = numpyro.sample('theta', ndist.Uniform(lb, ub), + rng_key=self.rng_key) elif self.name_dist=='lognormal': # unpack parameters loc, scale = self.param - if self.solver=='pyro': - sample = pyro.sample('theta', TruncatedLogNormal(loc, scale)) - else: - sample = numpyro.sample('theta', TruncatedLogNormal_trans(loc, scale), rng_key=self.rng_key) + sample = numpyro.sample('theta', TruncatedLogNormal_trans(loc, scale), + rng_key=self.rng_key) elif self.name_dist=='vonMises': # unpack parameters kappa = self.param - if self.solver=='pyro': - sample = pyro.sample('theta', TruncatedvonMises(kappa)) - else: - sample = numpyro.sample('theta', ShiftedVonMises(kappa), rng_key=self.rng_key) + sample = numpyro.sample('theta', ShiftedVonMises(kappa), + rng_key=self.rng_key) elif self.name_dist=='laplace': # unpack parameters loc, scale = self.param - if self.solver=='pyro': - print("WARNING: Please use Numpyro for truncated Laplace.") - sample = None - else: - sample = numpyro.sample('theta', TruncatedLaplace(loc, scale), rng_key=self.rng_key) + sample = numpyro.sample('theta', TruncatedLaplace(loc, scale), + rng_key=self.rng_key) return sample @@ -515,20 +432,13 @@ class BayesianInference: """ self.bins = bins - if self.solver=='pyro': - with pyro.plate('show_prior', size=size): - sample = self.sample_prior() - # to numpy - sample_array = sample.numpy() - - elif self.solver=='numpyro': - with numpyro.plate('show_prior', size=size): - sample = self.sample_prior() - # to numpy - sample_array=jnp.asarray(sample) + with numpyro.plate('show_prior', size=size): + sample = self.sample_prior() + # to JAX array + sample_array = jnp.asarray(sample) # plot histogram and kernel density - if disp_plot==1: + if disp_plot == 1: sns.displot(sample_array, kde=True, stat='density', bins=bins, height=5, aspect=1.5) plt.xlim(0, 1) plt.show() @@ -540,17 +450,8 @@ class BayesianInference: """ Define the probabilistic model by specifying prior, conditional likelihood, and data conditioning """ - if not torch.is_tensor(data): - data = torch.tensor(data) - # set prior theta = self.sample_prior() - - # sample from conditional likelihood - if self.solver=='pyro': - output = pyro.sample('obs', dist.Binomial(len(data), theta), obs=torch.sum(data)) - else: - # Note: numpyro.sample() requires obs=np.ndarray - output = numpyro.sample('obs', ndist.Binomial(len(data), theta), obs=torch.sum(data).numpy()) + output = numpyro.sample('obs', ndist.Binomial(len(data), theta), obs=jnp.sum(data)) return output @@ -559,22 +460,11 @@ class BayesianInference: Computes numerically the posterior distribution with beta prior parametrized by (alpha0, beta0) given data using MCMC """ - # use pyro - if self.solver=='pyro': - # tensorize - data = torch.tensor(data) - nuts_kernel = NUTS(self.model) - mcmc = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps=num_warmup, disable_progbar=True) - mcmc.run(data) - 
- # use numpyro - elif self.solver=='numpyro': - data = np.array(data, dtype=float) - nuts_kernel = nNUTS(self.model) - mcmc = nMCMC(nuts_kernel, num_samples=num_samples, num_warmup=num_warmup, progress_bar=False) - mcmc.run(self.rng_key, data=data) - - # collect samples + data = jnp.array(data, dtype=float) + nuts_kernel = nNUTS(self.model) + mcmc = nMCMC(nuts_kernel, num_samples=num_samples, num_warmup=num_warmup, progress_bar=False) + mcmc.run(self.rng_key, data=data) + samples = mcmc.get_samples()['theta'] return samples @@ -584,20 +474,12 @@ class BayesianInference: Defines the candidate parametrized variational distribution that we train to approximate posterior with Pyro/Numpyro Here we use parameterized beta """ - if self.solver=='pyro': - alpha_q = pyro.param('alpha_q', torch.tensor(0.5), - constraint=constraints.positive) - beta_q = pyro.param('beta_q', torch.tensor(0.5), - constraint=constraints.positive) - pyro.sample('theta', dist.Beta(alpha_q, beta_q)) - - else: - alpha_q = numpyro.param('alpha_q', 10, - constraint=nconstraints.positive) - beta_q = numpyro.param('beta_q', 10, - constraint=nconstraints.positive) + alpha_q = numpyro.param('alpha_q', 10, + constraint=nconstraints.positive) + beta_q = numpyro.param('beta_q', 10, + constraint=nconstraints.positive) - numpyro.sample('theta', ndist.Beta(alpha_q, beta_q)) + numpyro.sample('theta', ndist.Beta(alpha_q, beta_q)) def truncnormal_guide(self, data): @@ -620,23 +502,11 @@ class BayesianInference: adam_params = {"lr": lr} if guide_dist=='beta': - if self.solver=='pyro': - optimizer = Adam(adam_params) - svi = SVI(self.model, self.beta_guide, optimizer, loss=Trace_ELBO()) - - elif self.solver=='numpyro': - optimizer = nAdam(step_size=lr) - svi = nSVI(self.model, self.beta_guide, optimizer, loss=nTrace_ELBO()) - + optimizer = nAdam(step_size=lr) + svi = nSVI(self.model, self.beta_guide, optimizer, loss=nTrace_ELBO()) elif guide_dist=='normal': - # only allow numpyro - if self.solver=='pyro': - print("WARNING: Please use Numpyro with TruncatedNormal guide") - svi = None - - elif self.solver=='numpyro': - optimizer = nAdam(step_size=lr) - svi = nSVI(self.model, self.truncnormal_guide, optimizer, loss=nTrace_ELBO()) + optimizer = nAdam(step_size=lr) + svi = nSVI(self.model, self.truncnormal_guide, optimizer, loss=nTrace_ELBO()) else: print("WARNING: Please input either 'beta' or 'normal'") svi = None @@ -656,29 +526,12 @@ class BayesianInference: # initiate SVI svi = self.SVI_init(guide_dist=guide_dist) - # do gradient steps - if self.solver=='pyro': - # tensorize data - if not torch.is_tensor(data): - data = torch.tensor(data) - # store loss vector - losses = np.zeros(n_steps) - for step in range(n_steps): - losses[step] = svi.step(data) - - # pyro only supports beta VI distribution - params = { - 'alpha_q': pyro.param('alpha_q').item(), - 'beta_q': pyro.param('beta_q').item() - } - - elif self.solver=='numpyro': - data = np.array(data, dtype=float) - result = svi.run(self.rng_key, n_steps, data, progress_bar=False) - params = dict( - (key, np.asarray(value)) for key, value in result.params.items() - ) - losses = np.asarray(result.losses) + data = jnp.array(data, dtype=float) + result = svi.run(self.rng_key, n_steps, data, progress_bar=False) + params = dict( + (key, jnp.asarray(value)) for key, value in result.params.items() + ) + losses = jnp.asarray(result.losses) return params, losses ``` @@ -692,40 +545,34 @@ Let's see how well our sampling algorithm does in approximating To examine our alternative prior distributions, 
we'll plot approximate prior distributions below by calling the `show_prior` method. -We verify that the rejection sampling strategy under `Pyro` produces the same log normal distribution as the truncated normal transformation under `Numpyro`. - ```{code-cell} ipython3 # truncated log normal -exampleLN = BayesianInference(param=(0,2), name_dist='lognormal', solver='numpyro') +exampleLN = BayesianInference(param=(0,2), name_dist='lognormal') exampleLN.show_prior(size=100000,bins=20) # truncated uniform -exampleUN = BayesianInference(param=(0.1,0.8), name_dist='uniform', solver='numpyro') +exampleUN = BayesianInference(param=(0.1,0.8), name_dist='uniform') exampleUN.show_prior(size=100000,bins=20) ``` The above graphs show that sampling seems to work well with both distributions. -Now let's see how well things work with a couple of von Mises distributions. +Now let's see how well things work with von Mises distributions. ```{code-cell} ipython3 # shifted von Mises -exampleVM = BayesianInference(param=10, name_dist='vonMises', solver='numpyro') +exampleVM = BayesianInference(param=10, name_dist='vonMises') exampleVM.show_prior(size=100000,bins=20) - -# truncated von Mises -exampleVM_trunc = BayesianInference(param=20, name_dist='vonMises', solver='pyro') -exampleVM_trunc.show_prior(size=100000,bins=20) ``` -These graphs look good too. +The graphs look good too. Now let's try with a Laplace distribution. ```{code-cell} ipython3 # truncated Laplace -exampleLP = BayesianInference(param=(0.5,0.05), name_dist='laplace', solver='numpyro') +exampleLP = BayesianInference(param=(0.5,0.05), name_dist='laplace') exampleLP.show_prior(size=100000,bins=40) ``` @@ -735,7 +582,7 @@ Having assured ourselves that our sampler seems to do a good job, let's put it t We construct a class `BayesianInferencePlot` to implement MCMC or VI algorithms and plot multiple posteriors for different updating data sizes and different possible prior. -This class takes as inputs the true data generating parameter 'theta', a list of updating data sizes for multiple posterior plotting, and a defined and parametrized `BayesianInference` class. +This class takes as inputs the true data generating parameter `theta`, a list of updating data sizes for multiple posterior plotting, and a defined and parametrized `BayesianInference` class. It has two key methods: @@ -779,9 +626,6 @@ class BayesianInferencePlot: def MCMC_plot(self, num_samples, num_warmup=1000): - """ - Parameters as in MCMC_sampling except that data is already defined - """ fig, ax = plt.subplots(figsize=(10, 6)) # plot prior @@ -818,7 +662,6 @@ class BayesianInferencePlot: def SVI_fitting(self, guide_dist, params): """ Fit the beta/truncnormal curve using parameters trained by SVI. - I create plot using PDF given by scipy.stats distributions since torch.dist do not have embedded PDF methods. """ # create x axis xaxis = np.linspace(0,1,1000) @@ -837,9 +680,6 @@ class BayesianInferencePlot: def SVI_plot(self, guide_dist, n_steps=2000): - """ - Parameters as in SVI_run except that data is already defined - """ fig, ax = plt.subplots(figsize=(10, 6)) # plot prior @@ -891,18 +731,16 @@ Let's compare outcomes when we use a Beta prior. For the same Beta prior, we shall * compute posteriors analytically -* compute posteriors using MCMC via `Pyro` and `Numpyro`. -* compute posteriors using VI via `Pyro` and `Numpyro`. +* compute posteriors using MCMC using `numpyro`. +* compute posteriors using VI using `numpyro`. 
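Because this prior is conjugate, the analytical benchmark is just the counting rule derived earlier, $\textrm{Prob}(\theta|k) \sim Beta(\alpha + k, \beta + N - k)$. As a quick hand check, here is a sketch with made-up counts (not the simulated data used below):

```python
# Closed-form conjugate update for a Beta(5, 5) prior (illustrative counts only).
import scipy.stats as st

alpha0, beta0 = 5, 5
N, k = 20, 16                                   # pretend: 16 successes in 20 trials
posterior = st.beta(alpha0 + k, beta0 + N - k)  # Beta(21, 9)
print(posterior.mean())                         # 0.7, pulled from the prior mean 0.5 toward k/N = 0.8
```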
-Let's start with the analytical method that we described in this quantecon lecture +Let's start with the analytical method that we described in this {doc}`Probability Meanings lecture ` ```{code-cell} ipython3 -# First examine Beta priors -BETA_pyro = BayesianInference(param=(5,5), name_dist='beta', solver='pyro') -BETA_numpyro = BayesianInference(param=(5,5), name_dist='beta', solver='numpyro') +# First examine Beta prior +BETA = BayesianInference(param=(5,5), name_dist='beta') -BETA_pyro_plot = BayesianInferencePlot(true_theta, num_list, BETA_pyro) -BETA_numpyro_plot = BayesianInferencePlot(true_theta, num_list, BETA_numpyro) +BETA_plot = BayesianInferencePlot(true_theta, num_list, BETA) # plot analytical Beta prior and posteriors @@ -913,8 +751,9 @@ fig, ax = plt.subplots(figsize=(10, 6)) # plot analytical beta prior ax.plot(xaxis, y_prior, label='Analytical Beta Prior', color='#4C4E52') -data, colorlist, N_list = BETA_pyro_plot.data, BETA_pyro_plot.colorlist, BETA_pyro_plot.N_list -# plot analytical beta posteriors +data, colorlist, N_list = BETA_plot.data, BETA_plot.colorlist, BETA_plot.N_list + +# Plot analytical beta posteriors for id, n in enumerate(N_list): func = analytical_beta_posterior(data[:n], alpha0=5, beta0=5) y_posterior = func.pdf(xaxis) @@ -931,8 +770,8 @@ Now let's use MCMC while still using a beta prior. We'll do this for both MCMC and VI. ```{code-cell} ipython3 -BayesianInferencePlot(true_theta, num_list, BETA_pyro).MCMC_plot(num_samples=MCMC_num_samples) -BayesianInferencePlot(true_theta, num_list, BETA_numpyro).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) +BayesianInferencePlot(true_theta, num_list, BETA).MCMC_plot(num_samples=MCMC_num_samples) +BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) ``` Here the MCMC approximation looks good. @@ -950,7 +789,7 @@ will be more accurate, as we shall see next. (Increasing the step size increases computational time though). 
```{code-cell} ipython3 -BayesianInferencePlot(true_theta, num_list, BETA_numpyro).SVI_plot(guide_dist='beta', n_steps=100000) +BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot(guide_dist='beta', n_steps=100000) ``` ## Non-conjugate Prior Distributions @@ -968,32 +807,28 @@ We first initialize the `BayesianInference` classes and then can directly call ` ```{code-cell} ipython3 # Initialize BayesianInference classes -# try uniform -STD_UNIFORM_pyro = BayesianInference(param=(0,1), name_dist='uniform', solver='pyro') -UNIFORM_numpyro = BayesianInference(param=(0.2,0.7), name_dist='uniform', solver='numpyro') +# Try uniform +STD_UNIFORM = BayesianInference(param=(0,1), name_dist='uniform') +UNIFORM = BayesianInference(param=(0.2,0.7), name_dist='uniform') -# try truncated lognormal -LOGNORMAL_numpyro = BayesianInference(param=(0,2), name_dist='lognormal', solver='numpyro') -LOGNORMAL_pyro = BayesianInference(param=(0,2), name_dist='lognormal', solver='pyro') +# Try truncated lognormal +LOGNORMAL = BayesianInference(param=(0,2), name_dist='lognormal') -# try von Mises -# shifted von Mises -VONMISES_numpyro = BayesianInference(param=10, name_dist='vonMises', solver='numpyro') -# truncated von Mises -VONMISES_pyro = BayesianInference(param=40, name_dist='vonMises', solver='pyro') +# Try Von Mises +VONMISES = BayesianInference(param=10, name_dist='vonMises') -# try laplace -LAPLACE_numpyro = BayesianInference(param=(0.5, 0.07), name_dist='laplace', solver='numpyro') +# Try Laplace +LAPLACE = BayesianInference(param=(0.5, 0.07), name_dist='laplace') ``` ```{code-cell} ipython3 # Uniform -example_CLASS = STD_UNIFORM_pyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = STD_UNIFORM +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) -example_CLASS = UNIFORM_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = UNIFORM +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) ``` @@ -1005,32 +840,23 @@ Note how when the true data-generating $\theta$ is located at $0.8$ as it is he ```{code-cell} ipython3 # Log Normal -example_CLASS = LOGNORMAL_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) - -example_CLASS = LOGNORMAL_pyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = LOGNORMAL +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) ``` ```{code-cell} ipython3 # Von Mises -example_CLASS = VONMISES_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = VONMISES +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: 
{example_CLASS.name_dist}') print('\nNOTE: Shifted von Mises') BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) - -example_CLASS = VONMISES_pyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') -print('\nNOTE: Truncated von Mises') -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) ``` ```{code-cell} ipython3 # Laplace -example_CLASS = LAPLACE_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = LAPLACE +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) ``` @@ -1044,30 +870,22 @@ SVI_num_steps = 50000 ```{code-cell} ipython3 # Uniform -example_CLASS = BayesianInference(param=(0,1), name_dist='uniform', solver='numpyro') -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = BayesianInference(param=(0,1), name_dist='uniform') +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='normal', n_steps=SVI_num_steps) ``` ```{code-cell} ipython3 -# Log Normal -example_CLASS = LOGNORMAL_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='normal', n_steps=SVI_num_steps) -``` - -```{code-cell} ipython3 -# Von Mises -example_CLASS = VONMISES_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') -print('\nNB: Shifted von Mises') +# Lognormal +example_CLASS = LOGNORMAL +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='normal', n_steps=SVI_num_steps) ``` ```{code-cell} ipython3 # Laplace -example_CLASS = LAPLACE_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = LAPLACE +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='normal', n_steps=SVI_num_steps) ``` @@ -1075,38 +893,29 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist=' ```{code-cell} ipython3 # Uniform -example_CLASS = STD_UNIFORM_pyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = STD_UNIFORM +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) ``` ```{code-cell} ipython3 # Log Normal -example_CLASS = LOGNORMAL_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') 
-BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) - -example_CLASS = LOGNORMAL_pyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = LOGNORMAL +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) ``` ```{code-cell} ipython3 # Von Mises -example_CLASS = VONMISES_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') -print('\nNB: Shifted von Mises') -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) - -example_CLASS = VONMISES_pyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') -print('\nNB: Truncated von Mises') +example_CLASS = VONMISES +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') +print('Shifted von Mises') BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) ``` ```{code-cell} ipython3 # Laplace -example_CLASS = LAPLACE_numpyro -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}\nSolver: {example_CLASS.solver}') +example_CLASS = LAPLACE +print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) ``` From fa1aa2e247c690491c41de0351ae620534546c5e Mon Sep 17 00:00:00 2001 From: kp992 Date: Thu, 14 Aug 2025 17:43:00 -0700 Subject: [PATCH 02/22] fix typos --- lectures/bayes_nonconj.md | 148 ++++++++++++++++++-------------------- 1 file changed, 71 insertions(+), 77 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 2664a41d2..ba9fcb44f 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -29,20 +29,20 @@ over parameters just happened to form a **conjugate** pair in which - application of Bayes' Law produces a posterior distribution that has the same functional form as the prior -Having a likelihood and prior that are conjugate can simplify calculation of a posterior, facilitating analytical or nearly analytical calculations. +Having a likelihood and prior that are conjugate can simplify calculation of a posterior, facilitating analytical or nearly analytical calculations. -But in many situations the likelihood and prior need not form a conjugate pair. +But in many situations the likelihood and prior need not form a conjugate pair. - - after all, a person's prior is his or her own business and would take a form conjugate to a likelihood only by remote coincidence +- after all, a person's prior is his or her own business and would take a form conjugate to a likelihood only by remote coincidence In these situations, computing a posterior can become very challenging. 
-In this lecture, we illustrate how modern Bayesians confront non-conjugate priors by using Monte Carlo techniques that involve +In this lecture, we illustrate how modern Bayesians confront non-conjugate priors by using Monte Carlo techniques that involve -- first cleverly forming a Markov chain whose invariant distribution is the posterior distribution we want +- first cleverly forming a Markov chain whose invariant distribution is the posterior distribution we want - simulating the Markov chain until it has converged and then sampling from the invariant distribution to approximate the posterior -We shall illustrate the approach by deploying a powerful Python library, [NumPyro](https://num.pyro.ai/en/stable/getting_started.html) that implement this approach. +We shall illustrate the approach by deploying a powerful Python library, [NumPyro](https://num.pyro.ai/en/stable/getting_started.html) that implements this approach. As usual, we begin by importing some Python code. @@ -79,12 +79,11 @@ This lecture instead computes posteriors - numerically by sampling from the posterior distribution through MCMC methods, and - using a variational inference (VI) approximation. -We use `numpyro` with assistance from `jax` to approximate a posterior distribution +We use `numpyro` with assistance from `jax` to approximate a posterior distribution. -We use several alternative prior distributions - -We compare computed posteriors with ones associated with a conjugate prior as described in {doc}`Probability Meanings lecture ` +We use several alternative prior distributions. +We compare computed posteriors with ones associated with a conjugate prior as described in {doc}`Probability Meanings lecture `. ### Analytical Posterior @@ -107,10 +106,9 @@ $$ f(\theta) = \textrm{Prob}(\theta) = \frac{\theta^{\alpha - 1} (1 - \theta)^{\beta - 1}}{B(\alpha, \beta)} $$ -We choose this as our prior for now because we know that a conjugate prior for the binomial likelihood function is a beta distribution. - +We choose this as our prior for now because we know that a conjugate prior for the binomial likelihood function is a beta distribution. -After observing $k$ successes among $N$ sample observations, the posterior probability distribution of $ \theta $ is +After observing $k$ successes among $N$ sample observations, the posterior probability distribution of $ \theta $ is $$ \textrm{Prob}(\theta|k) = \frac{\textrm{Prob}(\theta,k)}{\textrm{Prob}(k)}=\frac{\textrm{Prob}(k|\theta)\textrm{Prob}(\theta)}{\textrm{Prob}(k)}=\frac{\textrm{Prob}(k|\theta) \textrm{Prob}(\theta)}{\int_0^1 \textrm{Prob}(k|\theta)\textrm{Prob}(\theta) d\theta} @@ -124,8 +122,6 @@ $$ =\frac{(1 -\theta)^{\beta+N-k-1} \theta^{\alpha+k-1}}{\int_0^1 (1 - \theta)^{\beta+N-k-1} \theta^{\alpha+k-1} d\theta} . $$ - - Thus, $$ @@ -170,47 +166,47 @@ def analytical_beta_posterior(data, alpha0, beta0): Suppose that we don't have a conjugate prior. -Then we can't compute posteriors analytically. +Then we can't compute posteriors analytically. -Instead, we use computational tools to approximate the posterior distribution for a set of alternative prior distributions using `numpyro`. +Instead, we use computational tools to approximate the posterior distribution for a set of alternative prior distributions using `numpyro`. -We first use the **Markov Chain Monte Carlo** (MCMC) algorithm . +We first use the **Markov Chain Monte Carlo** (MCMC) algorithm. We implement the NUTS sampler to sample from the posterior. 
-In that way we construct a sampling distribution that approximates the posterior. +In that way we construct a sampling distribution that approximates the posterior. -After doing that we deply another procedure called **Variational Inference** (VI). +After doing that we deploy another procedure called **Variational Inference** (VI). In particular, we implement Stochastic Variational Inference (SVI) machinery in `numpyro`. -The MCMC algorithm supposedly generates a more accurate approximation since in principle it directly samples from the posterior distribution. +The MCMC algorithm supposedly generates a more accurate approximation since in principle it directly samples from the posterior distribution. -But it can be computationally expensive, especially when dimension is large. +But it can be computationally expensive, especially when dimension is large. -A VI approach can be cheaper, but it is likely to produce an inferior approximation to the posterior, for the simple reason that it requires guessing a parametric **guide functional form** that we use to approximate a posterior. +A VI approach can be cheaper, but it is likely to produce an inferior approximation to the posterior, for the simple reason that it requires guessing a parametric **guide functional form** that we use to approximate a posterior. -This guide function is likely at best to be an imperfect approximation. +This guide function is likely at best to be an imperfect approximation. By paying the cost of restricting the putative posterior to have a restricted functional form, -the problem of approximating a posteriors is transformed to a well-posed optimization problem that seeks parameters of the putative posterior that minimize -a Kullback-Leibler (KL) divergence between true posterior and the putatitive posterior distribution. +the problem of approximating a posterior is transformed to a well-posed optimization problem that seeks parameters of the putative posterior that minimize +a Kullback-Leibler (KL) divergence between true posterior and the putative posterior distribution. - - minimizing the KL divergence is equivalent with maximizing a criterion called the **Evidence Lower Bound** (ELBO), as we shall verify soon. +- minimizing the KL divergence is equivalent to maximizing a criterion called the **Evidence Lower Bound** (ELBO), as we shall verify soon. ## Prior Distributions -In order to be able to apply MCMC sampling or VI, `numpyro` require that a prior distribution satisfy special properties: +In order to be able to apply MCMC sampling or VI, `numpyro` requires that a prior distribution satisfy special properties: -- we must be able sample from it; -- we must be able to compute the log pdf pointwise; -- the pdf must be differentiable with respect to the parameters. +- we must be able to sample from it; +- we must be able to compute the log pdf pointwise; +- the pdf must be differentiable with respect to the parameters. We'll want to define a distribution `class`. -We will use the following priors: +We will use the following priors: -- a uniform distribution on $[\underline \theta, \overline \theta]$, where $0 \leq \underline \theta < \overline \theta \leq 1$. +- a uniform distribution on $[\underline \theta, \overline \theta]$, where $0 \leq \underline \theta < \overline \theta \leq 1$. - a truncated log-normal distribution with support on $[0,1]$ with parameters $(\mu,\sigma)$. @@ -220,7 +216,7 @@ We will use the following priors: - Let $X\sim vonMises(0,\kappa)$. 
We know that $X$ has bounded support $[-\pi, \pi]$. We can define a shifted von Mises random variable $\tilde{X}=a+bX$ where $a=0.5, b=1/(2 \pi)$ so that $\tilde{X}$ is supported on $[0,1]$. - - This can be implemented using `numpyro`'s `TransformedDistribution` class with its `AffineTransform` method. + - This can be implemented using `numpyro`'s `TransformedDistribution` class with its `AffineTransform` method. - a truncated Laplace distribution. @@ -259,17 +255,17 @@ def TruncatedLaplace(loc, scale): ### Variational Inference -Instead of directly sampling from the posterior, the **variational inference** methodw approximates an unknown posterior distribution with a family of tractable distributions/densities. +Instead of directly sampling from the posterior, the **variational inference** method approximates an unknown posterior distribution with a family of tractable distributions/densities. -It then seeks to minimizes a measure of statistical discrepancy between the approximating and true posteriors. +It then seeks to minimize a measure of statistical discrepancy between the approximating and true posteriors. -Thus variational inference (VI) approximates a posterior by solving a minimization problem. +Thus variational inference (VI) approximates a posterior by solving a minimization problem. -Let the latent parameter/variable that we want to infer be $\theta$. +Let the latent parameter/variable that we want to infer be $\theta$. -Let the prior be $p(\theta)$ and the likelihood be $p\left(Y\vert\theta\right)$. +Let the prior be $p(\theta)$ and the likelihood be $p\left(Y\vert\theta\right)$. -We want $p\left(\theta\vert Y\right)$. +We want $p\left(\theta\vert Y\right)$. Bayes' rule implies @@ -283,11 +279,11 @@ $$ p\left(Y\right)=\int d\theta p\left(Y\mid\theta\right)p\left(Y\right). $$ (eq:intchallenge) -The integral on the right side of {eq}`eq:intchallenge` is typically difficult to compute. +The integral on the right side of {eq}`eq:intchallenge` is typically difficult to compute. -Consider a **guide distribution** $q_{\phi}(\theta)$ parameterized by $\phi$ that we'll use to approximate the posterior. +Consider a **guide distribution** $q_{\phi}(\theta)$ parameterized by $\phi$ that we'll use to approximate the posterior. -We choose parameters $\phi$ of the guide distribution to minimize a Kullback-Leibler (KL) divergence between the approximate posterior $q_{\phi}(\theta)$ and the posterior: +We choose parameters $\phi$ of the guide distribution to minimize a Kullback-Leibler (KL) divergence between the approximate posterior $q_{\phi}(\theta)$ and the posterior: $$ D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y)) \equiv -\int d\theta q(\theta;\phi)\log\frac{p(\theta\mid Y)}{q(\theta;\phi)} @@ -312,24 +308,24 @@ $$ \end{aligned} $$ -For observed data $Y$, $p(\theta,Y)$ is a constant, so minimizing KL divergence is equivalent to maximizing +For observed data $Y$, $p(\theta,Y)$ is a constant, so minimizing KL divergence is equivalent to maximizing $$ ELBO\equiv\int d\theta q_{\phi}(\theta)\log\frac{p(\theta,Y)}{q_{\phi}(\theta)}=\mathbb{E}_{q_{\phi}(\theta)}\left[\log p(\theta,Y)-\log q_{\phi}(\theta)\right] $$ (eq:ELBO) -Formula {eq}`eq:ELBO` is called the evidence lower bound (ELBO). +Formula {eq}`eq:ELBO` is called the evidence lower bound (ELBO). -A standard optimization routine can used to search for the optimal $\phi$ in our parametrized distribution $q_{\phi}(\theta)$. 
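To make this maximization concrete, here is a minimal sketch of such a routine in `numpyro` for the binomial example (a sketch only, not the lecture's implementation, which is wrapped in a class below; the Beta(5,5) prior, the made-up data, the learning rate, and the step counts are all illustrative assumptions):

```python
# Maximize the ELBO for a Beta(5, 5) prior and binomial data with a Beta guide (a sketch).
import jax.numpy as jnp
from jax import random
import numpyro
import numpyro.distributions as ndist
from numpyro.distributions import constraints
from numpyro.infer import SVI, Trace_ELBO, MCMC, NUTS
from numpyro.optim import Adam

def model(data):
    theta = numpyro.sample("theta", ndist.Beta(5, 5))
    numpyro.sample("obs", ndist.Binomial(len(data), theta), obs=jnp.sum(data))

def guide(data):
    alpha_q = numpyro.param("alpha_q", 10.0, constraint=constraints.positive)
    beta_q = numpyro.param("beta_q", 10.0, constraint=constraints.positive)
    numpyro.sample("theta", ndist.Beta(alpha_q, beta_q))

data = jnp.array([1.0] * 16 + [0.0] * 4)              # pretend: 16 successes in 20 draws

# Variational inference: gradient steps on the (negative) ELBO
svi = SVI(model, guide, Adam(step_size=0.01), loss=Trace_ELBO())
result = svi.run(random.PRNGKey(0), 5_000, data, progress_bar=False)
print(result.params)                                  # fitted (alpha_q, beta_q) of the Beta guide

# For comparison, the NUTS sampler mentioned earlier targets the posterior directly
mcmc = MCMC(NUTS(model), num_warmup=1000, num_samples=2000, progress_bar=False)
mcmc.run(random.PRNGKey(1), data)
print(mcmc.get_samples()["theta"].mean())             # near the analytic posterior mean 21/30
```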
+A standard optimization routine can be used to search for the optimal $\phi$ in our parametrized distribution $q_{\phi}(\theta)$. -The parameterized distribution $q_{\phi}(\theta)$ is called the **variational distribution**. +The parameterized distribution $q_{\phi}(\theta)$ is called the **variational distribution**. -We can implement Stochastic Variational Inference (SVI) numpyro using the `Adam` gradient descent algorithm to approximate posterior. +We can implement Stochastic Variational Inference (SVI) in numpyro using the `Adam` gradient descent algorithm to approximate the posterior. -We use two sets of variational distributions: Beta and TruncatedNormal with support $[0,1]$ +We use two sets of variational distributions: Beta and TruncatedNormal with support $[0,1]$ - - Learnable parameters for the Beta distribution are (alpha, beta), both of which are positive. - - Learnable parameters for the Truncated Normal distribution are (loc, scale). +- Learnable parameters for the Beta distribution are (alpha, beta), both of which are positive. +- Learnable parameters for the Truncated Normal distribution are (loc, scale). ```{note} We restrict the truncated Normal parameter 'loc' to be in the interval $[0,1]$ @@ -337,7 +333,7 @@ We restrict the truncated Normal parameter 'loc' to be in the interval $[0,1]$ ## Implementation -We have constructed a Python class `BaysianInference` that requires the following arguments to be initialized: +We have constructed a Python class `BayesianInference` that requires the following arguments to be initialized: - `param`: a tuple/scalar of parameters dependent on distribution types - `name_dist`: a string that specifies distribution names @@ -345,7 +341,7 @@ We have constructed a Python class `BaysianInference` that requires the followin The (`param`, `name_dist`) pair includes: - ('beta', alpha, beta) -- ('uniform', upper_bound, lower_bound) +- ('uniform', lower_bound, upper_bound) - ('lognormal', loc, scale) - Note: This is the truncated log normal. @@ -355,16 +351,16 @@ The (`param`, `name_dist`) pair includes: - ('laplace', loc, scale) - Note: This is the truncated Laplace -The class `BaysianInference` has several key methods : +The class `BayesianInference` has several key methods : - `sample_prior`: - This can be used to draw a single sample from the given prior distribution. - `show_prior`: - - Plots the approximate prior distribution by repeatedly drawing samples and fitting a kernal density curve. + - Plots the approximate prior distribution by repeatedly drawing samples and fitting a kernel density curve. - `MCMC_sampling`: - INPUT: (data, num_samples, num_warmup=1000) - - Take a `np.array` data and generate MCMC sampling of posterior of size `num_samples`. + - Takes a `np.array` data and generates MCMC sampling of posterior of size `num_samples`. - `SVI_run`: - INPUT: (data, guide_dist, n_steps=10000) @@ -543,7 +539,7 @@ Let's see how well our sampling algorithm does in approximating - a log normal distribution - a uniform distribution -To examine our alternative prior distributions, we'll plot approximate prior distributions below by calling the `show_prior` method. +To examine our alternative prior distributions, we'll plot approximate prior distributions below by calling the `show_prior` method. ```{code-cell} ipython3 # truncated log normal @@ -557,7 +553,6 @@ exampleUN.show_prior(size=100000,bins=20) The above graphs show that sampling seems to work well with both distributions. 
- Now let's see how well things work with von Mises distributions. ```{code-cell} ipython3 @@ -580,15 +575,15 @@ Having assured ourselves that our sampler seems to do a good job, let's put it t ## Posteriors Via MCMC and VI -We construct a class `BayesianInferencePlot` to implement MCMC or VI algorithms and plot multiple posteriors for different updating data sizes and different possible prior. +We construct a class `BayesianInferencePlot` to implement MCMC or VI algorithms and plot multiple posteriors for different updating data sizes and different possible priors. This class takes as inputs the true data generating parameter `theta`, a list of updating data sizes for multiple posterior plotting, and a defined and parametrized `BayesianInference` class. It has two key methods: -- `BayesianInferencePlot.MCMC_plot()` takes wanted MCMC sample size as input and plot the output posteriors together with the prior defined in `BayesianInference` class. +- `BayesianInferencePlot.MCMC_plot()` takes desired MCMC sample size as input and plots the output posteriors together with the prior defined in `BayesianInference` class. -- `BayesianInferencePlot.SVI_plot()` takes wanted VI distribution class ('beta' or 'normal') as input and plot the posteriors together with the prior. +- `BayesianInferencePlot.SVI_plot()` takes desired VI distribution class ('beta' or 'normal') as input and plots the posteriors together with the prior. ```{code-cell} ipython3 class BayesianInferencePlot: @@ -709,9 +704,9 @@ class BayesianInferencePlot: plt.show() ``` -Let's set some parameters that we'll use in all of the examples below. +Let's set some parameters that we'll use in all of the examples below. -To save computer time at first, notice that we'll set `MCMC_num_samples = 2000` and `SVI_num_steps = 5000`. +To save computer time at first, notice that we'll set `MCMC_num_samples = 2000` and `SVI_num_steps = 5000`. (Later, to increase accuracy of approximations, we'll want to increase these.) @@ -731,8 +726,8 @@ Let's compare outcomes when we use a Beta prior. For the same Beta prior, we shall * compute posteriors analytically -* compute posteriors using MCMC using `numpyro`. -* compute posteriors using VI using `numpyro`. +* compute posteriors using MCMC using `numpyro`. +* compute posteriors using VI using `numpyro`. Let's start with the analytical method that we described in this {doc}`Probability Meanings lecture ` @@ -778,13 +773,12 @@ Here the MCMC approximation looks good. But the VI approximation doesn't look so good. -* even though we use the beta distribution as our guide, the VI approximated posterior distributions do not closely resemble the posteriors that we had just computed analytically. +* even though we use the beta distribution as our guide, the VI approximated posterior distributions do not closely resemble the posteriors that we had just computed analytically. (Here, our initial parameter for Beta guide is (0.5, 0.5).) - -But if we increase the number of steps from 5000 to 10000 in VI as we now shall do, we'll get VI-approximated posteriors -will be more accurate, as we shall see next. +But if we increase the number of steps from 5000 to 100000 in VI as we now shall do, we'll get VI-approximated posteriors +that will be more accurate, as we shall see next. (Increasing the step size increases computational time though). 
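One way to judge how many steps are enough, rather than guessing, is to inspect the ELBO trace that `SVI_run` already returns. Here is a sketch reusing the `BETA` object defined earlier (the 5000-step run is just for illustration):

```python
# Inspect the ELBO trace to judge SVI convergence before committing to longer runs (a sketch).
import matplotlib.pyplot as plt

fitted_params, losses = BETA.SVI_run(
    BayesianInferencePlot(true_theta, num_list, BETA).data,
    guide_dist='beta', n_steps=5000)

plt.plot(losses)
plt.xlabel('SVI step')
plt.ylabel('negative ELBO')
plt.show()
```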
@@ -794,14 +788,14 @@ BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot(guide_dist='beta', n_ ## Non-conjugate Prior Distributions -Having assured ourselves that our MCMC and VI methods can work well when we have conjugate prior and so can also compute analytically, we -next proceed to situations in which our prior is not a beta distribution, so we don't have a conjugate prior. +Having assured ourselves that our MCMC and VI methods can work well when we have a conjugate prior and so can also compute analytically, we +next proceed to situations in which our prior is not a beta distribution, so we don't have a conjugate prior. So we will have non-conjugate priors and are cast into situations in which we can't calculate posteriors analytically. ### MCMC -First, we implement and display MCMC. +First, we implement and display MCMC. We first initialize the `BayesianInference` classes and then can directly call `BayesianInferencePlot` to plot both MCMC and SVI approximating posteriors. @@ -832,11 +826,11 @@ print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {exam BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) ``` -In the situation depicted above, we have assumed a $Uniform(\underline{\theta}, \overline{\theta})$ prior that puts zero probability outside a bounded support that excludes the true value. +In the situation depicted above, we have assumed a $Uniform(\underline{\theta}, \overline{\theta})$ prior that puts zero probability outside a bounded support that excludes the true value. -Consequently, the posterior cannot put positive probability above $\overline{\theta}$ or below $\underline{\theta}$. +Consequently, the posterior cannot put positive probability above $\overline{\theta}$ or below $\underline{\theta}$. -Note how when the true data-generating $\theta$ is located at $0.8$ as it is here, when $n$ gets large, the posterior concentrate on the upper bound of the support of the prior, $0.7$ here. +Note how when the true data-generating $\theta$ is located at $0.8$ as it is here, when $n$ gets large, the posterior concentrates on the upper bound of the support of the prior, $0.7$ here. ```{code-cell} ipython3 # Log Normal @@ -866,7 +860,7 @@ To get more accuracy we will now increase the number of steps for Variational In SVI_num_steps = 50000 ``` -#### VI with a Truncated Normal Guide +#### VI with a Truncated Normal Guide ```{code-cell} ipython3 # Uniform @@ -889,7 +883,7 @@ print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {exam BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='normal', n_steps=SVI_num_steps) ``` -#### Variational Inference with a Beta Guide Distribution +#### Variational Inference with a Beta Guide Distribution ```{code-cell} ipython3 # Uniform From e3ad3d2b59a7e1817f4bb2ae5e7695d8bbf4e4ba Mon Sep 17 00:00:00 2001 From: kp992 Date: Thu, 14 Aug 2025 17:44:30 -0700 Subject: [PATCH 03/22] remove unused import --- lectures/bayes_nonconj.md | 1 - 1 file changed, 1 deletion(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index ba9fcb44f..c69ce2097 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -50,7 +50,6 @@ As usual, we begin by importing some Python code. 
import numpy as np import seaborn as sns import matplotlib.pyplot as plt -from scipy.stats import binom import scipy.stats as st import jax.numpy as jnp From 8e53787f484d1bb0d1293e00447b5471b395841b Mon Sep 17 00:00:00 2001 From: kp992 Date: Thu, 14 Aug 2025 17:55:56 -0700 Subject: [PATCH 04/22] fix code formatting --- lectures/bayes_nonconj.md | 333 +++++++++++++++++++++++--------------- 1 file changed, 203 insertions(+), 130 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index c69ce2097..7b9d9fd86 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -228,10 +228,11 @@ def TruncatedLogNormal_trans(loc, scale): """ Obtains the truncated log normal distribution using numpyro's TruncatedNormal and ExpTransform """ - base_dist = ndist.TruncatedNormal(low=jnp.log(0), high=jnp.log(1), loc=loc, scale=scale) - return ndist.TransformedDistribution( - base_dist,ndist.transforms.ExpTransform() - ) + base_dist = ndist.TruncatedNormal( + low=jnp.log(0), high=jnp.log(1), loc=loc, scale=scale + ) + return ndist.TransformedDistribution(base_dist, ndist.transforms.ExpTransform()) + def ShiftedVonMises(kappa): """ @@ -239,17 +240,16 @@ def ShiftedVonMises(kappa): """ base_dist = ndist.VonMises(0, kappa) return ndist.TransformedDistribution( - base_dist, ndist.transforms.AffineTransform(loc=0.5, scale=1/(2*jnp.pi)) - ) + base_dist, ndist.transforms.AffineTransform(loc=0.5, scale=1 / (2 * jnp.pi)) + ) + def TruncatedLaplace(loc, scale): """ Obtains the truncated Laplace distribution on [0,1] """ base_dist = ndist.Laplace(loc, scale) - return ndist.TruncatedDistribution( - base_dist, low=0.0, high=1.0 - ) + return ndist.TruncatedDistribution(base_dist, low=0.0, high=1.0) ``` ### Variational Inference @@ -383,73 +383,78 @@ class BayesianInference: # jax requires explicit PRNG state to be passed self.rng_key = jax_random.PRNGKey(0) - def sample_prior(self): """ Define the prior distribution to sample from in numpyro models. 
""" - if self.name_dist=='beta': + if self.name_dist == "beta": # unpack parameters alpha0, beta0 = self.param - sample = numpyro.sample('theta', ndist.Beta(alpha0, beta0), - rng_key=self.rng_key) + sample = numpyro.sample( + "theta", ndist.Beta(alpha0, beta0), rng_key=self.rng_key + ) - elif self.name_dist=='uniform': + elif self.name_dist == "uniform": # unpack parameters lb, ub = self.param - sample = numpyro.sample('theta', ndist.Uniform(lb, ub), - rng_key=self.rng_key) + sample = numpyro.sample( + "theta", ndist.Uniform(lb, ub), rng_key=self.rng_key + ) - elif self.name_dist=='lognormal': + elif self.name_dist == "lognormal": # unpack parameters loc, scale = self.param - sample = numpyro.sample('theta', TruncatedLogNormal_trans(loc, scale), - rng_key=self.rng_key) + sample = numpyro.sample( + "theta", TruncatedLogNormal_trans(loc, scale), rng_key=self.rng_key + ) - elif self.name_dist=='vonMises': + elif self.name_dist == "vonMises": # unpack parameters kappa = self.param - sample = numpyro.sample('theta', ShiftedVonMises(kappa), - rng_key=self.rng_key) + sample = numpyro.sample( + "theta", ShiftedVonMises(kappa), rng_key=self.rng_key + ) - elif self.name_dist=='laplace': + elif self.name_dist == "laplace": # unpack parameters loc, scale = self.param - sample = numpyro.sample('theta', TruncatedLaplace(loc, scale), - rng_key=self.rng_key) + sample = numpyro.sample( + "theta", TruncatedLaplace(loc, scale), rng_key=self.rng_key + ) return sample - def show_prior(self, size=1e5, bins=20, disp_plot=1): """ Visualizes prior distribution by sampling from prior and plots the approximated sampling distribution """ self.bins = bins - with numpyro.plate('show_prior', size=size): + with numpyro.plate("show_prior", size=size): sample = self.sample_prior() # to JAX array sample_array = jnp.asarray(sample) # plot histogram and kernel density if disp_plot == 1: - sns.displot(sample_array, kde=True, stat='density', bins=bins, height=5, aspect=1.5) + sns.displot( + sample_array, kde=True, stat="density", bins=bins, height=5, aspect=1.5 + ) plt.xlim(0, 1) plt.show() else: return sample_array - def model(self, data): """ Define the probabilistic model by specifying prior, conditional likelihood, and data conditioning """ theta = self.sample_prior() - output = numpyro.sample('obs', ndist.Binomial(len(data), theta), obs=jnp.sum(data)) + output = numpyro.sample( + "obs", ndist.Binomial(len(data), theta), obs=jnp.sum(data) + ) return output - def MCMC_sampling(self, data, num_samples, num_warmup=1000): """ Computes numerically the posterior distribution with beta prior parametrized by (alpha0, beta0) @@ -457,37 +462,35 @@ class BayesianInference: """ data = jnp.array(data, dtype=float) nuts_kernel = nNUTS(self.model) - mcmc = nMCMC(nuts_kernel, num_samples=num_samples, num_warmup=num_warmup, progress_bar=False) + mcmc = nMCMC( + nuts_kernel, + num_samples=num_samples, + num_warmup=num_warmup, + progress_bar=False, + ) mcmc.run(self.rng_key, data=data) - samples = mcmc.get_samples()['theta'] + samples = mcmc.get_samples()["theta"] return samples - def beta_guide(self, data): """ Defines the candidate parametrized variational distribution that we train to approximate posterior with Pyro/Numpyro Here we use parameterized beta """ - alpha_q = numpyro.param('alpha_q', 10, - constraint=nconstraints.positive) - beta_q = numpyro.param('beta_q', 10, - constraint=nconstraints.positive) - - numpyro.sample('theta', ndist.Beta(alpha_q, beta_q)) + alpha_q = numpyro.param("alpha_q", 10, constraint=nconstraints.positive) + 
beta_q = numpyro.param("beta_q", 10, constraint=nconstraints.positive) + numpyro.sample("theta", ndist.Beta(alpha_q, beta_q)) def truncnormal_guide(self, data): """ Defines the candidate parametrized variational distribution that we train to approximate posterior with Pyro/Numpyro Here we use truncated normal on [0,1] """ - loc = numpyro.param('loc', 0.5, - constraint=nconstraints.interval(0.0, 1.0)) - scale = numpyro.param('scale', 1, - constraint=nconstraints.positive) - numpyro.sample('theta', ndist.TruncatedNormal(loc, scale, low=0.0, high=1.0)) - + loc = numpyro.param("loc", 0.5, constraint=nconstraints.interval(0.0, 1.0)) + scale = numpyro.param("scale", 1, constraint=nconstraints.positive) + numpyro.sample("theta", ndist.TruncatedNormal(loc, scale, low=0.0, high=1.0)) def SVI_init(self, guide_dist, lr=0.0005): """ @@ -496,12 +499,14 @@ class BayesianInference: """ adam_params = {"lr": lr} - if guide_dist=='beta': + if guide_dist == "beta": optimizer = nAdam(step_size=lr) svi = nSVI(self.model, self.beta_guide, optimizer, loss=nTrace_ELBO()) - elif guide_dist=='normal': + elif guide_dist == "normal": optimizer = nAdam(step_size=lr) - svi = nSVI(self.model, self.truncnormal_guide, optimizer, loss=nTrace_ELBO()) + svi = nSVI( + self.model, self.truncnormal_guide, optimizer, loss=nTrace_ELBO() + ) else: print("WARNING: Please input either 'beta' or 'normal'") svi = None @@ -523,9 +528,7 @@ class BayesianInference: data = jnp.array(data, dtype=float) result = svi.run(self.rng_key, n_steps, data, progress_bar=False) - params = dict( - (key, jnp.asarray(value)) for key, value in result.params.items() - ) + params = dict((key, jnp.asarray(value)) for key, value in result.params.items()) losses = jnp.asarray(result.losses) return params, losses @@ -542,12 +545,12 @@ To examine our alternative prior distributions, we'll plot approximate prior dis ```{code-cell} ipython3 # truncated log normal -exampleLN = BayesianInference(param=(0,2), name_dist='lognormal') -exampleLN.show_prior(size=100000,bins=20) +exampleLN = BayesianInference(param=(0, 2), name_dist="lognormal") +exampleLN.show_prior(size=100000, bins=20) # truncated uniform -exampleUN = BayesianInference(param=(0.1,0.8), name_dist='uniform') -exampleUN.show_prior(size=100000,bins=20) +exampleUN = BayesianInference(param=(0.1, 0.8), name_dist="uniform") +exampleUN.show_prior(size=100000, bins=20) ``` The above graphs show that sampling seems to work well with both distributions. @@ -556,8 +559,8 @@ Now let's see how well things work with von Mises distributions. ```{code-cell} ipython3 # shifted von Mises -exampleVM = BayesianInference(param=10, name_dist='vonMises') -exampleVM.show_prior(size=100000,bins=20) +exampleVM = BayesianInference(param=10, name_dist="vonMises") +exampleVM.show_prior(size=100000, bins=20) ``` The graphs look good too. @@ -566,8 +569,8 @@ Now let's try with a Laplace distribution. ```{code-cell} ipython3 # truncated Laplace -exampleLP = BayesianInference(param=(0.5,0.05), name_dist='laplace') -exampleLP.show_prior(size=100000,bins=40) +exampleLP = BayesianInference(param=(0.5, 0.05), name_dist="laplace") +exampleLP.show_prior(size=100000, bins=40) ``` Having assured ourselves that our sampler seems to do a good job, let's put it to work in using MCMC to compute posterior probabilities. 
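As a side note before the plotting hunks below: the reformatted `MCMC_sampling` method above is a thin wrapper around numpyro's NUTS/MCMC pattern. A minimal standalone sketch of that flow — made-up data and a Beta(5, 5) prior chosen purely for illustration, not part of the lecture — might look roughly like this:

```python
# Minimal sketch of the NUTS/MCMC flow that MCMC_sampling wraps
# (made-up data; the Beta(5, 5) prior is only for illustration).
import jax.numpy as jnp
from jax import random
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS

def toy_model(data):
    theta = numpyro.sample("theta", dist.Beta(5, 5))
    numpyro.sample("obs", dist.Binomial(len(data), theta), obs=jnp.sum(data))

data = jnp.array([1.0, 1.0, 0.0, 1.0, 1.0])
mcmc = MCMC(NUTS(toy_model), num_warmup=500, num_samples=1000, progress_bar=False)
mcmc.run(random.PRNGKey(0), data=data)
print(mcmc.get_samples()["theta"].mean())   # approximate posterior mean of theta
```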
@@ -611,28 +614,29 @@ class BayesianInferencePlot: # plotting parameters self.binwidth = binwidth - self.linewidth=0.05 + self.linewidth = 0.05 self.colorlist = sns.color_palette(n_colors=len(N_list)) # data generation N_max = max(N_list) self.data = simulate_draw(theta, N_max) - def MCMC_plot(self, num_samples, num_warmup=1000): fig, ax = plt.subplots(figsize=(10, 6)) # plot prior prior_sample = self.BayesianInferenceClass.show_prior(disp_plot=0) sns.histplot( - data=prior_sample, kde=True, stat='density', + data=prior_sample, + kde=True, + stat="density", binwidth=self.binwidth, - color='#4C4E52', + color="#4C4E52", linewidth=self.linewidth, alpha=0.1, ax=ax, - label='Prior Distribution' - ) + label="Prior Distribution", + ) # plot posteriors for id, n in enumerate(self.N_list): @@ -640,65 +644,76 @@ class BayesianInferencePlot: self.data[:n], num_samples, num_warmup ) sns.histplot( - samples, kde=True, stat='density', + samples, + kde=True, + stat="density", binwidth=self.binwidth, linewidth=self.linewidth, alpha=0.2, - color=self.colorlist[id-1], - label=f'Posterior with $n={n}$' - ) + color=self.colorlist[id - 1], + label=f"Posterior with $n={n}$", + ) ax.legend() - ax.set_title('MCMC Sampling density of Posterior Distributions', fontsize=15) + ax.set_title("MCMC Sampling density of Posterior Distributions", fontsize=15) plt.xlim(0, 1) plt.show() - def SVI_fitting(self, guide_dist, params): """ Fit the beta/truncnormal curve using parameters trained by SVI. """ # create x axis - xaxis = np.linspace(0,1,1000) - if guide_dist=='beta': - y = st.beta.pdf(xaxis, a=params['alpha_q'], b=params['beta_q']) + xaxis = np.linspace(0, 1, 1000) + if guide_dist == "beta": + y = st.beta.pdf(xaxis, a=params["alpha_q"], b=params["beta_q"]) - elif guide_dist=='normal': + elif guide_dist == "normal": # rescale upper/lower bound. 
See Scipy's truncnorm doc lower, upper = (0, 1) - loc, scale = params['loc'], params['scale'] + loc, scale = params["loc"], params["scale"] a, b = (lower - loc) / scale, (upper - loc) / scale - y = st.truncnorm.pdf(xaxis, a=a, b=b, loc=params['loc'], scale=params['scale']) + y = st.truncnorm.pdf( + xaxis, a=a, b=b, loc=params["loc"], scale=params["scale"] + ) return (xaxis, y) - def SVI_plot(self, guide_dist, n_steps=2000): fig, ax = plt.subplots(figsize=(10, 6)) # plot prior prior_sample = self.BayesianInferenceClass.show_prior(disp_plot=0) sns.histplot( - data=prior_sample, kde=True, stat='density', + data=prior_sample, + kde=True, + stat="density", binwidth=self.binwidth, - color='#4C4E52', + color="#4C4E52", linewidth=self.linewidth, alpha=0.1, ax=ax, - label='Prior Distribution' - ) + label="Prior Distribution", + ) # plot posteriors for id, n in enumerate(self.N_list): - (params, losses) = self.BayesianInferenceClass.SVI_run(self.data[:n], guide_dist, n_steps) + (params, losses) = self.BayesianInferenceClass.SVI_run( + self.data[:n], guide_dist, n_steps + ) x, y = self.SVI_fitting(guide_dist, params) - ax.plot(x, y, + ax.plot( + x, + y, alpha=1, - color=self.colorlist[id-1], - label=f'Posterior with $n={n}$' - ) + color=self.colorlist[id - 1], + label=f"Posterior with $n={n}$", + ) ax.legend() - ax.set_title(f'SVI density of Posterior Distributions with {guide_dist} guide', fontsize=15) + ax.set_title( + f"SVI density of Posterior Distributions with {guide_dist} guide", + fontsize=15, + ) plt.xlim(0, 1) plt.show() ``` @@ -710,7 +725,7 @@ To save computer time at first, notice that we'll set `MCMC_num_samples = 2000` (Later, to increase accuracy of approximations, we'll want to increase these.) ```{code-cell} ipython3 -num_list = [5,10,50,100,1000] +num_list = [5, 10, 50, 100, 1000] MCMC_num_samples = 2000 SVI_num_steps = 5000 @@ -732,18 +747,18 @@ Let's start with the analytical method that we described in this {doc}`Probabili ```{code-cell} ipython3 # First examine Beta prior -BETA = BayesianInference(param=(5,5), name_dist='beta') +BETA = BayesianInference(param=(5, 5), name_dist="beta") BETA_plot = BayesianInferencePlot(true_theta, num_list, BETA) # plot analytical Beta prior and posteriors -xaxis = np.linspace(0,1,1000) +xaxis = np.linspace(0, 1, 1000) y_prior = st.beta.pdf(xaxis, 5, 5) fig, ax = plt.subplots(figsize=(10, 6)) # plot analytical beta prior -ax.plot(xaxis, y_prior, label='Analytical Beta Prior', color='#4C4E52') +ax.plot(xaxis, y_prior, label="Analytical Beta Prior", color="#4C4E52") data, colorlist, N_list = BETA_plot.data, BETA_plot.colorlist, BETA_plot.N_list @@ -752,9 +767,13 @@ for id, n in enumerate(N_list): func = analytical_beta_posterior(data[:n], alpha0=5, beta0=5) y_posterior = func.pdf(xaxis) ax.plot( - xaxis, y_posterior, color=colorlist[id-1], label=f'Analytical Beta Posterior with $n={n}$') + xaxis, + y_posterior, + color=colorlist[id - 1], + label=f"Analytical Beta Posterior with $n={n}$", + ) ax.legend() -ax.set_title('Analytical Beta Prior and Posterior', fontsize=15) +ax.set_title("Analytical Beta Prior and Posterior", fontsize=15) plt.xlim(0, 1) plt.show() ``` @@ -764,8 +783,12 @@ Now let's use MCMC while still using a beta prior. We'll do this for both MCMC and VI. 
```{code-cell} ipython3 -BayesianInferencePlot(true_theta, num_list, BETA).MCMC_plot(num_samples=MCMC_num_samples) -BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) +BayesianInferencePlot(true_theta, num_list, BETA).MCMC_plot( + num_samples=MCMC_num_samples +) +BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot( + guide_dist="beta", n_steps=SVI_num_steps +) ``` Here the MCMC approximation looks good. @@ -782,7 +805,9 @@ that will be more accurate, as we shall see next. (Increasing the step size increases computational time though). ```{code-cell} ipython3 -BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot(guide_dist='beta', n_steps=100000) +BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot( + guide_dist="beta", n_steps=100000 +) ``` ## Non-conjugate Prior Distributions @@ -801,28 +826,36 @@ We first initialize the `BayesianInference` classes and then can directly call ` ```{code-cell} ipython3 # Initialize BayesianInference classes # Try uniform -STD_UNIFORM = BayesianInference(param=(0,1), name_dist='uniform') -UNIFORM = BayesianInference(param=(0.2,0.7), name_dist='uniform') +STD_UNIFORM = BayesianInference(param=(0, 1), name_dist="uniform") +UNIFORM = BayesianInference(param=(0.2, 0.7), name_dist="uniform") # Try truncated lognormal -LOGNORMAL = BayesianInference(param=(0,2), name_dist='lognormal') +LOGNORMAL = BayesianInference(param=(0, 2), name_dist="lognormal") # Try Von Mises -VONMISES = BayesianInference(param=10, name_dist='vonMises') +VONMISES = BayesianInference(param=10, name_dist="vonMises") # Try Laplace -LAPLACE = BayesianInference(param=(0.5, 0.07), name_dist='laplace') +LAPLACE = BayesianInference(param=(0.5, 0.07), name_dist="laplace") ``` ```{code-cell} ipython3 # Uniform example_CLASS = STD_UNIFORM -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( + num_samples=MCMC_num_samples +) example_CLASS = UNIFORM -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( + num_samples=MCMC_num_samples +) ``` In the situation depicted above, we have assumed a $Uniform(\underline{\theta}, \overline{\theta})$ prior that puts zero probability outside a bounded support that excludes the true value. 
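The effect discussed in these context lines is easy to check outside the lecture code: with a flat prior on $[0.2, 0.7]$ the posterior is just the $Beta(k+1, n-k+1)$ density cut off at $0.7$ and renormalized, so its mass piles up against the upper bound when the true $\theta$ is $0.8$. A small scipy sketch with simulated, illustrative values only:

```python
import numpy as np
from scipy.stats import beta

true_theta, n = 0.8, 100
rng = np.random.default_rng(0)
k = rng.binomial(n, true_theta)            # simulated number of successes

# posterior under a Uniform(0.2, 0.7) prior: Beta(k+1, n-k+1) restricted to [0.2, 0.7]
grid = np.linspace(0.2, 0.7, 2001)
dens = beta(k + 1, n - k + 1).pdf(grid)
weights = dens / dens.sum()                # renormalize on the prior's support
print(round(float((grid * weights).sum()), 3))   # posterior mean sits just below 0.7
```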
@@ -834,23 +867,35 @@ Note how when the true data-generating $\theta$ is located at $0.8$ as it is her ```{code-cell} ipython3 # Log Normal example_CLASS = LOGNORMAL -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( + num_samples=MCMC_num_samples +) ``` ```{code-cell} ipython3 # Von Mises example_CLASS = VONMISES -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -print('\nNOTE: Shifted von Mises') -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +print("\nNOTE: Shifted von Mises") +BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( + num_samples=MCMC_num_samples +) ``` ```{code-cell} ipython3 # Laplace example_CLASS = LAPLACE -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot(num_samples=MCMC_num_samples) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( + num_samples=MCMC_num_samples +) ``` To get more accuracy we will now increase the number of steps for Variational Inference (VI) @@ -863,23 +908,35 @@ SVI_num_steps = 50000 ```{code-cell} ipython3 # Uniform -example_CLASS = BayesianInference(param=(0,1), name_dist='uniform') -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='normal', n_steps=SVI_num_steps) +example_CLASS = BayesianInference(param=(0, 1), name_dist="uniform") +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( + guide_dist="normal", n_steps=SVI_num_steps +) ``` ```{code-cell} ipython3 # Lognormal example_CLASS = LOGNORMAL -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='normal', n_steps=SVI_num_steps) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( + guide_dist="normal", n_steps=SVI_num_steps +) ``` ```{code-cell} ipython3 # Laplace example_CLASS = LAPLACE -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='normal', n_steps=SVI_num_steps) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( + guide_dist="normal", n_steps=SVI_num_steps +) ``` #### Variational Inference with a Beta Guide Distribution @@ -887,28 +944,44 @@ BayesianInferencePlot(true_theta, num_list, 
example_CLASS).SVI_plot(guide_dist=' ```{code-cell} ipython3 # Uniform example_CLASS = STD_UNIFORM -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( + guide_dist="beta", n_steps=SVI_num_steps +) ``` ```{code-cell} ipython3 # Log Normal example_CLASS = LOGNORMAL -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( + guide_dist="beta", n_steps=SVI_num_steps +) ``` ```{code-cell} ipython3 # Von Mises example_CLASS = VONMISES -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -print('Shifted von Mises') -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +print("Shifted von Mises") +BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( + guide_dist="beta", n_steps=SVI_num_steps +) ``` ```{code-cell} ipython3 # Laplace example_CLASS = LAPLACE -print(f'=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}') -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot(guide_dist='beta', n_steps=SVI_num_steps) +print( + f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" +) +BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( + guide_dist="beta", n_steps=SVI_num_steps +) ``` From 604bec9e629aa047badca6815daac7017e6f51e4 Mon Sep 17 00:00:00 2001 From: kp992 Date: Thu, 14 Aug 2025 20:46:11 -0700 Subject: [PATCH 05/22] fix doc reference --- lectures/bayes_nonconj.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 7b9d9fd86..63bc48d0e 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -22,7 +22,7 @@ kernelspec: !pip install numpyro jax ``` -This lecture is a sequel to the {doc}`Two Meanings of Probability lecture `. +This lecture is a sequel to the {doc}`prob_meaning`. That lecture offers a Bayesian interpretation of probability in a setting in which the likelihood function and the prior distribution over parameters just happened to form a **conjugate** pair in which @@ -67,7 +67,7 @@ from numpyro.optim import Adam as nAdam ## Unleashing MCMC on a Binomial Likelihood -This lecture begins with the binomial example in the {doc}`Probability Meanings lecture `. +This lecture begins with the binomial example in the {doc}`prob_meaning`. That lecture computed a posterior @@ -82,7 +82,7 @@ We use `numpyro` with assistance from `jax` to approximate a posterior distribut We use several alternative prior distributions. -We compare computed posteriors with ones associated with a conjugate prior as described in {doc}`Probability Meanings lecture `. 
+We compare computed posteriors with ones associated with a conjugate prior as described in {doc}`prob_meaning`. ### Analytical Posterior @@ -743,7 +743,7 @@ For the same Beta prior, we shall * compute posteriors using MCMC using `numpyro`. * compute posteriors using VI using `numpyro`. -Let's start with the analytical method that we described in this {doc}`Probability Meanings lecture ` +Let's start with the analytical method that we described in this {doc}`prob_meaning` ```{code-cell} ipython3 # First examine Beta prior From e8db23f0dec9ea0c97a386155cbb9262d85dfbe7 Mon Sep 17 00:00:00 2001 From: kp992 Date: Thu, 14 Aug 2025 21:00:57 -0700 Subject: [PATCH 06/22] fix legend warning --- lectures/bayes_nonconj.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 63bc48d0e..910ff9dbf 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -475,7 +475,7 @@ class BayesianInference: def beta_guide(self, data): """ - Defines the candidate parametrized variational distribution that we train to approximate posterior with Pyro/Numpyro + Defines the candidate parametrized variational distribution that we train to approximate posterior with numpyro Here we use parameterized beta """ alpha_q = numpyro.param("alpha_q", 10, constraint=nconstraints.positive) @@ -485,7 +485,7 @@ class BayesianInference: def truncnormal_guide(self, data): """ - Defines the candidate parametrized variational distribution that we train to approximate posterior with Pyro/Numpyro + Defines the candidate parametrized variational distribution that we train to approximate posterior with numpyro Here we use truncated normal on [0,1] """ loc = numpyro.param("loc", 0.5, constraint=nconstraints.interval(0.0, 1.0)) @@ -495,7 +495,6 @@ class BayesianInference: def SVI_init(self, guide_dist, lr=0.0005): """ Initiate SVI training mode with Adam optimizer - NOTE: truncnormal_guide can only be used with numpyro solver """ adam_params = {"lr": lr} @@ -653,7 +652,7 @@ class BayesianInferencePlot: color=self.colorlist[id - 1], label=f"Posterior with $n={n}$", ) - ax.legend() + ax.legend(loc="upper left") ax.set_title("MCMC Sampling density of Posterior Distributions", fontsize=15) plt.xlim(0, 1) plt.show() @@ -709,7 +708,7 @@ class BayesianInferencePlot: color=self.colorlist[id - 1], label=f"Posterior with $n={n}$", ) - ax.legend() + ax.legend(loc="upper left") ax.set_title( f"SVI density of Posterior Distributions with {guide_dist} guide", fontsize=15, @@ -772,7 +771,7 @@ for id, n in enumerate(N_list): color=colorlist[id - 1], label=f"Analytical Beta Posterior with $n={n}$", ) -ax.legend() +ax.legend(loc="upper left") ax.set_title("Analytical Beta Prior and Posterior", fontsize=15) plt.xlim(0, 1) plt.show() From facf167bba6db9f626b4095cda3a0be952161415 Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Mon, 18 Aug 2025 16:21:39 +0800 Subject: [PATCH 07/22] modify style - Adopt QuantEcon naming format - Adopt one-line docstring format in accordance with PEP 8 - Change integral notation from $\int dx\, f(x)$ to $\int f(x)\, dx$ - Reorder the illustration of (`param`, `name_dist`) pairs --- lectures/bayes_nonconj.md | 96 +++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 55 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 910ff9dbf..eb613de26 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -4,7 +4,7 @@ jupytext: extension: .md format_name: myst 
format_version: 0.13 - jupytext_version: 1.16.7 + jupytext_version: 1.17.2 kernelspec: display_name: Python 3 (ipykernel) language: python @@ -65,7 +65,7 @@ from numpyro.infer import Trace_ELBO as nTrace_ELBO from numpyro.optim import Adam as nAdam ``` -## Unleashing MCMC on a Binomial Likelihood +## Unleashing MCMC on a binomial likelihood This lecture begins with the binomial example in the {doc}`prob_meaning`. @@ -84,7 +84,7 @@ We use several alternative prior distributions. We compare computed posteriors with ones associated with a conjugate prior as described in {doc}`prob_meaning`. -### Analytical Posterior +### Analytical posterior Assume that the random variable $X\sim Binom\left(n,\theta\right)$. @@ -131,9 +131,7 @@ The analytical posterior for a given conjugate beta prior is coded in the follow ```{code-cell} ipython3 def simulate_draw(theta, n): - """ - Draws a Bernoulli sample of size n with probability P(Y=1) = theta - """ + """Draws a Bernoulli sample of size n with probability P(Y=1) = theta""" rand_draw = np.random.rand(n) draw = (rand_draw < theta).astype(int) return draw @@ -161,7 +159,7 @@ def analytical_beta_posterior(data, alpha0, beta0): return st.beta(alpha0 + up_num, beta0 + down_num) ``` -### Two Ways to Approximate Posteriors +### Two ways to approximate posteriors Suppose that we don't have a conjugate prior. @@ -193,7 +191,7 @@ a Kullback-Leibler (KL) divergence between true posterior and the putative poste - minimizing the KL divergence is equivalent to maximizing a criterion called the **Evidence Lower Bound** (ELBO), as we shall verify soon. -## Prior Distributions +## Prior distributions In order to be able to apply MCMC sampling or VI, `numpyro` requires that a prior distribution satisfy special properties: @@ -230,14 +228,12 @@ def TruncatedLogNormal_trans(loc, scale): """ base_dist = ndist.TruncatedNormal( low=jnp.log(0), high=jnp.log(1), loc=loc, scale=scale - ) + ) #TODO:is it fine to use log(0)? return ndist.TransformedDistribution(base_dist, ndist.transforms.ExpTransform()) def ShiftedVonMises(kappa): - """ - Obtains the shifted von Mises distribution using AffineTransform - """ + """Obtains the shifted von Mises distribution using AffineTransform""" base_dist = ndist.VonMises(0, kappa) return ndist.TransformedDistribution( base_dist, ndist.transforms.AffineTransform(loc=0.5, scale=1 / (2 * jnp.pi)) @@ -245,14 +241,12 @@ def ShiftedVonMises(kappa): def TruncatedLaplace(loc, scale): - """ - Obtains the truncated Laplace distribution on [0,1] - """ + """Obtains the truncated Laplace distribution on [0,1]""" base_dist = ndist.Laplace(loc, scale) return ndist.TruncatedDistribution(base_dist, low=0.0, high=1.0) ``` -### Variational Inference +### Variational inference Instead of directly sampling from the posterior, the **variational inference** method approximates an unknown posterior distribution with a family of tractable distributions/densities. @@ -275,7 +269,7 @@ $$ where $$ -p\left(Y\right)=\int d\theta p\left(Y\mid\theta\right)p\left(Y\right). +p\left(Y\right)=\int p\left(Y\mid\theta\right)p\left(Y\right) d\theta. $$ (eq:intchallenge) The integral on the right side of {eq}`eq:intchallenge` is typically difficult to compute. 
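For intuition about why this matters: in the conjugate case the evidence $p(Y)=\int p(Y\mid\theta)\,p(\theta)\,d\theta$ is available in closed form (the beta-binomial), so a crude Monte Carlo average of the likelihood over prior draws can be checked against it; outside that special case no such closed form exists, which is what motivates the ELBO derivation in the next hunk. A small sketch with illustrative numbers, not taken from the lecture:

```python
import numpy as np
from scipy.stats import beta, binom
from scipy.special import betaln, comb

a, b, N, k = 5, 5, 20, 15                     # illustrative values
rng = np.random.default_rng(0)

# Monte Carlo: average the binomial likelihood over draws from the Beta(a, b) prior
theta_draws = beta(a, b).rvs(size=10_000, random_state=rng)
mc_evidence = binom.pmf(k, N, theta_draws).mean()

# closed form, available only because the prior is conjugate (beta-binomial pmf)
exact = comb(N, k) * np.exp(betaln(k + a, N - k + b) - betaln(a, b))
print(float(mc_evidence), float(exact))       # the two estimates should closely agree
```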
@@ -298,19 +292,19 @@ Note that $$ \begin{aligned}D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y)) & =-\int d\theta q(\theta;\phi)\log\frac{P(\theta\mid Y)}{q(\theta;\phi)}\\ - & =-\int d\theta q(\theta)\log\frac{\frac{p(\theta,Y)}{p(Y)}}{q(\theta)}\\ - & =-\int d\theta q(\theta)\log\frac{p(\theta,Y)}{p(\theta)q(Y)}\\ - & =-\int d\theta q(\theta)\left[\log\frac{p(\theta,Y)}{q(\theta)}-\log p(Y)\right]\\ - & =-\int d\theta q(\theta)\log\frac{p(\theta,Y)}{q(\theta)}+\int d\theta q(\theta)\log p(Y)\\ - & =-\int d\theta q(\theta)\log\frac{p(\theta,Y)}{q(\theta)}+\log p(Y)\\ -\log p(Y)&=D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y))+\int d\theta q_{\phi}(\theta)\log\frac{p(\theta,Y)}{q_{\phi}(\theta)} + & =-\int q(\theta)\log\frac{\frac{p(\theta,Y)}{p(Y)}}{q(\theta)} d\theta\\ + & =-\int q(\theta)\log\frac{p(\theta,Y)}{p(\theta)q(Y)} d\theta\\ + & =-\int q(\theta)\left[\log\frac{p(\theta,Y)}{q(\theta)}-\log p(Y)\right] d\theta\\ + & =-\int q(\theta)\log\frac{p(\theta,Y)}{q(\theta)}+\int d\theta q(\theta)\log p(Y) d\theta\\ + & =-\int q(\theta)\log\frac{p(\theta,Y)}{q(\theta)}+\log p(Y) d\theta\\ +\log p(Y)&=D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y))+\int q_{\phi}(\theta)\log\frac{p(\theta,Y)}{q_{\phi}(\theta)} d\theta \end{aligned} $$ For observed data $Y$, $p(\theta,Y)$ is a constant, so minimizing KL divergence is equivalent to maximizing $$ -ELBO\equiv\int d\theta q_{\phi}(\theta)\log\frac{p(\theta,Y)}{q_{\phi}(\theta)}=\mathbb{E}_{q_{\phi}(\theta)}\left[\log p(\theta,Y)-\log q_{\phi}(\theta)\right] +ELBO\equiv\int q_{\phi}(\theta)\log\frac{p(\theta,Y)}{q_{\phi}(\theta)} d\theta=\mathbb{E}_{q_{\phi}(\theta)}\left[\log p(\theta,Y)-\log q_{\phi}(\theta)\right] $$ (eq:ELBO) Formula {eq}`eq:ELBO` is called the evidence lower bound (ELBO). @@ -338,16 +332,16 @@ We have constructed a Python class `BayesianInference` that requires the followi - `name_dist`: a string that specifies distribution names The (`param`, `name_dist`) pair includes: -- ('beta', alpha, beta) +- (alpha, beta, 'beta') -- ('uniform', lower_bound, upper_bound) +- (lower_bound, upper_bound, 'uniform') -- ('lognormal', loc, scale) +- (loc, scale, 'lognormal') - Note: This is the truncated log normal. -- ('vonMises', kappa), where kappa denotes concentration parameter, and center location is set to $0.5$. Using `numpyro`, this is the **shifted** distribution. +- (kappa, 'vonMises'), where kappa denotes concentration parameter, and center location is set to $0.5$. Using `numpyro`, this is the **shifted** distribution. -- ('laplace', loc, scale) +- (loc, scale, 'laplace') - Note: This is the truncated Laplace The class `BayesianInference` has several key methods : @@ -384,9 +378,7 @@ class BayesianInference: self.rng_key = jax_random.PRNGKey(0) def sample_prior(self): - """ - Define the prior distribution to sample from in numpyro models. 
- """ + """Define the prior distribution to sample from in numpyro models.""" if self.name_dist == "beta": # unpack parameters alpha0, beta0 = self.param @@ -493,9 +485,7 @@ class BayesianInference: numpyro.sample("theta", ndist.TruncatedNormal(loc, scale, low=0.0, high=1.0)) def SVI_init(self, guide_dist, lr=0.0005): - """ - Initiate SVI training mode with Adam optimizer - """ + """Initiate SVI training mode with Adam optimizer""" adam_params = {"lr": lr} if guide_dist == "beta": @@ -533,7 +523,7 @@ class BayesianInference: return params, losses ``` -## Alternative Prior Distributions +## Alternative prior distributions Let's see how well our sampling algorithm does in approximating @@ -574,7 +564,7 @@ exampleLP.show_prior(size=100000, bins=40) Having assured ourselves that our sampler seems to do a good job, let's put it to work in using MCMC to compute posterior probabilities. -## Posteriors Via MCMC and VI +## Posteriors via MCMC and VI We construct a class `BayesianInferencePlot` to implement MCMC or VI algorithms and plot multiple posteriors for different updating data sizes and different possible priors. @@ -604,9 +594,7 @@ class BayesianInferencePlot: """ def __init__(self, theta, N_list, BayesianInferenceClass, binwidth=0.02): - """ - Enter Parameters for data generation and plotting - """ + """Enter Parameters for data generation and plotting""" self.theta = theta self.N_list = N_list self.BayesianInferenceClass = BayesianInferenceClass @@ -634,7 +622,7 @@ class BayesianInferencePlot: linewidth=self.linewidth, alpha=0.1, ax=ax, - label="Prior Distribution", + label="Prior distribution", ) # plot posteriors @@ -653,7 +641,7 @@ class BayesianInferencePlot: label=f"Posterior with $n={n}$", ) ax.legend(loc="upper left") - ax.set_title("MCMC Sampling density of Posterior Distributions", fontsize=15) + ax.set_title("MCMC sampling density of posterior distributions", fontsize=15) plt.xlim(0, 1) plt.show() @@ -667,7 +655,6 @@ class BayesianInferencePlot: y = st.beta.pdf(xaxis, a=params["alpha_q"], b=params["beta_q"]) elif guide_dist == "normal": - # rescale upper/lower bound. See Scipy's truncnorm doc lower, upper = (0, 1) loc, scale = params["loc"], params["scale"] @@ -692,7 +679,7 @@ class BayesianInferencePlot: linewidth=self.linewidth, alpha=0.1, ax=ax, - label="Prior Distribution", + label="Prior distribution", ) # plot posteriors @@ -710,7 +697,7 @@ class BayesianInferencePlot: ) ax.legend(loc="upper left") ax.set_title( - f"SVI density of Posterior Distributions with {guide_dist} guide", + f"SVI density of posterior distributions with {guide_dist} guide", fontsize=15, ) plt.xlim(0, 1) @@ -732,7 +719,7 @@ SVI_num_steps = 5000 true_theta = 0.8 ``` -### Beta Prior and Posteriors: +### Beta prior and posteriors: Let's compare outcomes when we use a Beta prior. 
@@ -745,19 +732,18 @@ For the same Beta prior, we shall Let's start with the analytical method that we described in this {doc}`prob_meaning` ```{code-cell} ipython3 -# First examine Beta prior +# first examine Beta prior BETA = BayesianInference(param=(5, 5), name_dist="beta") BETA_plot = BayesianInferencePlot(true_theta, num_list, BETA) - # plot analytical Beta prior and posteriors xaxis = np.linspace(0, 1, 1000) y_prior = st.beta.pdf(xaxis, 5, 5) fig, ax = plt.subplots(figsize=(10, 6)) # plot analytical beta prior -ax.plot(xaxis, y_prior, label="Analytical Beta Prior", color="#4C4E52") +ax.plot(xaxis, y_prior, label="Analytical Beta prior", color="#4C4E52") data, colorlist, N_list = BETA_plot.data, BETA_plot.colorlist, BETA_plot.N_list @@ -769,10 +755,10 @@ for id, n in enumerate(N_list): xaxis, y_posterior, color=colorlist[id - 1], - label=f"Analytical Beta Posterior with $n={n}$", + label=f"Analytical Beta posterior with $n={n}$", ) ax.legend(loc="upper left") -ax.set_title("Analytical Beta Prior and Posterior", fontsize=15) +ax.set_title("Analytical Beta prior and posterior", fontsize=15) plt.xlim(0, 1) plt.show() ``` @@ -809,7 +795,7 @@ BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot( ) ``` -## Non-conjugate Prior Distributions +## Non-conjugate prior distributions Having assured ourselves that our MCMC and VI methods can work well when we have a conjugate prior and so can also compute analytically, we next proceed to situations in which our prior is not a beta distribution, so we don't have a conjugate prior. @@ -903,7 +889,7 @@ To get more accuracy we will now increase the number of steps for Variational In SVI_num_steps = 50000 ``` -#### VI with a Truncated Normal Guide +#### VI with a truncated Normal guide ```{code-cell} ipython3 # Uniform @@ -938,7 +924,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( ) ``` -#### Variational Inference with a Beta Guide Distribution +#### Variational inference with a Beta guide distribution ```{code-cell} ipython3 # Uniform @@ -952,7 +938,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( ``` ```{code-cell} ipython3 -# Log Normal +# log Normal example_CLASS = LOGNORMAL print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" From 9e92870ad3231eda1b84b1a15400774e6183eaec Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Mon, 18 Aug 2025 17:36:26 +0800 Subject: [PATCH 08/22] test setting figure by cell metadata --- lectures/bayes_nonconj.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index eb613de26..1a5b541f8 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -768,9 +768,20 @@ Now let's use MCMC while still using a beta prior. We'll do this for both MCMC and VI. 
```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + MCMC sampling density of posterior distributions + name: mcmc +--- + BayesianInferencePlot(true_theta, num_list, BETA).MCMC_plot( num_samples=MCMC_num_samples ) +``` + +```{code-cell} ipython3 BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot( guide_dist="beta", n_steps=SVI_num_steps ) From 3056dfbe309007cc0858f2b0dec4a8db4c1f9a01 Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Tue, 19 Aug 2025 16:49:31 +0800 Subject: [PATCH 09/22] modify figure style --- lectures/bayes_nonconj.md | 167 ++++++++++++++++++++++++++++++++------ 1 file changed, 141 insertions(+), 26 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 1a5b541f8..be9ee3c80 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -228,7 +228,7 @@ def TruncatedLogNormal_trans(loc, scale): """ base_dist = ndist.TruncatedNormal( low=jnp.log(0), high=jnp.log(1), loc=loc, scale=scale - ) #TODO:is it fine to use log(0)? + ) return ndist.TransformedDistribution(base_dist, ndist.transforms.ExpTransform()) @@ -279,7 +279,7 @@ Consider a **guide distribution** $q_{\phi}(\theta)$ parameterized by $\phi$ tha We choose parameters $\phi$ of the guide distribution to minimize a Kullback-Leibler (KL) divergence between the approximate posterior $q_{\phi}(\theta)$ and the posterior: $$ - D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y)) \equiv -\int d\theta q(\theta;\phi)\log\frac{p(\theta\mid Y)}{q(\theta;\phi)} + D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y)) \equiv -\int q(\theta;\phi)\log\frac{p(\theta\mid Y)}{q(\theta;\phi)} d\theta $$ Thus, we want a **variational distribution** $q$ that solves @@ -291,7 +291,7 @@ $$ Note that $$ -\begin{aligned}D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y)) & =-\int d\theta q(\theta;\phi)\log\frac{P(\theta\mid Y)}{q(\theta;\phi)}\\ +\begin{aligned}D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y)) & =-\int q(\theta;\phi)\log\frac{P(\theta\mid Y)}{q(\theta;\phi)} d\theta\\ & =-\int q(\theta)\log\frac{\frac{p(\theta,Y)}{p(Y)}}{q(\theta)} d\theta\\ & =-\int q(\theta)\log\frac{p(\theta,Y)}{p(\theta)q(Y)} d\theta\\ & =-\int q(\theta)\left[\log\frac{p(\theta,Y)}{q(\theta)}-\log p(Y)\right] d\theta\\ @@ -533,10 +533,26 @@ Let's see how well our sampling algorithm does in approximating To examine our alternative prior distributions, we'll plot approximate prior distributions below by calling the `show_prior` method. ```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + Truncated log normal distribution + name: fig_lognormal_dist +--- # truncated log normal exampleLN = BayesianInference(param=(0, 2), name_dist="lognormal") exampleLN.show_prior(size=100000, bins=20) +``` +```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + Truncated uniform distribution + name: fig_uniform_dist +--- # truncated uniform exampleUN = BayesianInference(param=(0.1, 0.8), name_dist="uniform") exampleUN.show_prior(size=100000, bins=20) @@ -548,6 +564,13 @@ Now let's see how well things work with von Mises distributions. ```{code-cell} ipython3 # shifted von Mises +--- +mystnb: + figure: + caption: | + Shifted von Mises distribution + name: fig_vonmises_dist +--- exampleVM = BayesianInference(param=10, name_dist="vonMises") exampleVM.show_prior(size=100000, bins=20) ``` @@ -557,6 +580,13 @@ The graphs look good too. Now let's try with a Laplace distribution. 
```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + Truncated Laplace distribution + name: fig_laplace_dist +--- # truncated Laplace exampleLP = BayesianInference(param=(0.5, 0.05), name_dist="laplace") exampleLP.show_prior(size=100000, bins=40) @@ -609,7 +639,7 @@ class BayesianInferencePlot: self.data = simulate_draw(theta, N_max) def MCMC_plot(self, num_samples, num_warmup=1000): - fig, ax = plt.subplots(figsize=(10, 6)) + fig, ax = plt.subplots() # plot prior prior_sample = self.BayesianInferenceClass.show_prior(disp_plot=0) @@ -641,14 +671,11 @@ class BayesianInferencePlot: label=f"Posterior with $n={n}$", ) ax.legend(loc="upper left") - ax.set_title("MCMC sampling density of posterior distributions", fontsize=15) plt.xlim(0, 1) plt.show() def SVI_fitting(self, guide_dist, params): - """ - Fit the beta/truncnormal curve using parameters trained by SVI. - """ + """Fit the beta/truncnormal curve using parameters trained by SVI.""" # create x axis xaxis = np.linspace(0, 1, 1000) if guide_dist == "beta": @@ -666,7 +693,7 @@ class BayesianInferencePlot: return (xaxis, y) def SVI_plot(self, guide_dist, n_steps=2000): - fig, ax = plt.subplots(figsize=(10, 6)) + fig, ax = plt.subplots() # plot prior prior_sample = self.BayesianInferenceClass.show_prior(disp_plot=0) @@ -696,10 +723,6 @@ class BayesianInferencePlot: label=f"Posterior with $n={n}$", ) ax.legend(loc="upper left") - ax.set_title( - f"SVI density of posterior distributions with {guide_dist} guide", - fontsize=15, - ) plt.xlim(0, 1) plt.show() ``` @@ -732,6 +755,13 @@ For the same Beta prior, we shall Let's start with the analytical method that we described in this {doc}`prob_meaning` ```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + Analytical Beta prior and posterior + name: fig_analytical +--- # first examine Beta prior BETA = BayesianInference(param=(5, 5), name_dist="beta") @@ -741,7 +771,7 @@ BETA_plot = BayesianInferencePlot(true_theta, num_list, BETA) xaxis = np.linspace(0, 1, 1000) y_prior = st.beta.pdf(xaxis, 5, 5) -fig, ax = plt.subplots(figsize=(10, 6)) +fig, ax = plt.subplots() # plot analytical beta prior ax.plot(xaxis, y_prior, label="Analytical Beta prior", color="#4C4E52") @@ -758,7 +788,6 @@ for id, n in enumerate(N_list): label=f"Analytical Beta posterior with $n={n}$", ) ax.legend(loc="upper left") -ax.set_title("Analytical Beta prior and posterior", fontsize=15) plt.xlim(0, 1) plt.show() ``` @@ -772,8 +801,8 @@ We'll do this for both MCMC and VI. 
mystnb: figure: caption: | - MCMC sampling density of posterior distributions - name: mcmc + MCMC density with Beta prior + name: fig_mcmc_beta --- BayesianInferencePlot(true_theta, num_list, BETA).MCMC_plot( @@ -782,6 +811,14 @@ BayesianInferencePlot(true_theta, num_list, BETA).MCMC_plot( ``` ```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + SVI density with Beta guide + name: fig_svi_beta +--- + BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot( guide_dist="beta", n_steps=SVI_num_steps ) @@ -825,7 +862,7 @@ We first initialize the `BayesianInference` classes and then can directly call ` STD_UNIFORM = BayesianInference(param=(0, 1), name_dist="uniform") UNIFORM = BayesianInference(param=(0.2, 0.7), name_dist="uniform") -# Try truncated lognormal +# Try truncated log normal LOGNORMAL = BayesianInference(param=(0, 2), name_dist="lognormal") # Try Von Mises @@ -836,6 +873,13 @@ LAPLACE = BayesianInference(param=(0.5, 0.07), name_dist="laplace") ``` ```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + MCMC density with uniform prior + name: fig_mcmc_uniform +--- # Uniform example_CLASS = STD_UNIFORM print( @@ -861,7 +905,14 @@ Consequently, the posterior cannot put positive probability above $\overline{\th Note how when the true data-generating $\theta$ is located at $0.8$ as it is here, when $n$ gets large, the posterior concentrates on the upper bound of the support of the prior, $0.7$ here. ```{code-cell} ipython3 -# Log Normal +--- +mystnb: + figure: + caption: | + MCMC density with log normal prior + name: fig_mcmc_lognormal +--- +# log normal example_CLASS = LOGNORMAL print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" @@ -872,7 +923,14 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( ``` ```{code-cell} ipython3 -# Von Mises +--- +mystnb: + figure: + caption: | + MCMC density with von Mises prior + name: fig_mcmc_vonmises +--- +# von Mises example_CLASS = VONMISES print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" @@ -884,6 +942,13 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( ``` ```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + MCMC density with Laplace prior + name: fig_mcmc_laplace +--- # Laplace example_CLASS = LAPLACE print( @@ -894,15 +959,23 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( ) ``` +### VI + To get more accuracy we will now increase the number of steps for Variational Inference (VI) ```{code-cell} ipython3 SVI_num_steps = 50000 ``` - -#### VI with a truncated Normal guide +#### VI with a truncated normal guide ```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + SVI density with uniform prior and normal guide + name: fig_svi_uniform_normal +--- # Uniform example_CLASS = BayesianInference(param=(0, 1), name_dist="uniform") print( @@ -914,7 +987,14 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( ``` ```{code-cell} ipython3 -# Lognormal +--- +mystnb: + figure: + caption: | + SVI density with log normal prior and normal guide + name: fig_svi_lognormal_normal +--- +# log normal example_CLASS = LOGNORMAL print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" @@ -925,6 +1005,13 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( ``` ```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + SVI density with Laplace prior and normal guide + name: 
fig_svi_laplace_normal +--- # Laplace example_CLASS = LAPLACE print( @@ -938,7 +1025,14 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( #### Variational inference with a Beta guide distribution ```{code-cell} ipython3 -# Uniform +--- +mystnb: + figure: + caption: | + SVI density with uniform prior and Beta guide + name: fig_svi_uniform_beta +--- +# uniform example_CLASS = STD_UNIFORM print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" @@ -949,7 +1043,14 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( ``` ```{code-cell} ipython3 -# log Normal +--- +mystnb: + figure: + caption: | + SVI density with log normal prior and Beta guide + name: fig_svi_lognormal_beta +--- +# log normal example_CLASS = LOGNORMAL print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" @@ -960,7 +1061,14 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( ``` ```{code-cell} ipython3 -# Von Mises +# von Mises +--- +mystnb: + figure: + caption: | + SVI density with von Mises prior and Beta guide + name: fig_svi_vonmises_beta +--- example_CLASS = VONMISES print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" @@ -972,6 +1080,13 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( ``` ```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + SVI density with Laplace prior and Beta guide + name: fig_svi_laplace_beta +--- # Laplace example_CLASS = LAPLACE print( From b9963979cef07bde419e0d4cf9ccc501647d2a9e Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Tue, 19 Aug 2025 17:38:51 +0800 Subject: [PATCH 10/22] reorder of comment and metadata cell --- lectures/bayes_nonconj.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index be9ee3c80..94deda03d 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -295,8 +295,8 @@ $$ & =-\int q(\theta)\log\frac{\frac{p(\theta,Y)}{p(Y)}}{q(\theta)} d\theta\\ & =-\int q(\theta)\log\frac{p(\theta,Y)}{p(\theta)q(Y)} d\theta\\ & =-\int q(\theta)\left[\log\frac{p(\theta,Y)}{q(\theta)}-\log p(Y)\right] d\theta\\ - & =-\int q(\theta)\log\frac{p(\theta,Y)}{q(\theta)}+\int d\theta q(\theta)\log p(Y) d\theta\\ - & =-\int q(\theta)\log\frac{p(\theta,Y)}{q(\theta)}+\log p(Y) d\theta\\ + & =-\int q(\theta)\log\frac{p(\theta,Y)}{q(\theta)}+\int q(\theta)\log p(Y) d\theta\\ + & =-\int q(\theta)\log\frac{p(\theta,Y)}{q(\theta)} d\theta+\log p(Y)\\ \log p(Y)&=D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y))+\int q_{\phi}(\theta)\log\frac{p(\theta,Y)}{q_{\phi}(\theta)} d\theta \end{aligned} $$ @@ -563,7 +563,6 @@ The above graphs show that sampling seems to work well with both distributions. Now let's see how well things work with von Mises distributions. 
```{code-cell} ipython3 -# shifted von Mises --- mystnb: figure: @@ -571,6 +570,7 @@ mystnb: Shifted von Mises distribution name: fig_vonmises_dist --- +# shifted von Mises exampleVM = BayesianInference(param=10, name_dist="vonMises") exampleVM.show_prior(size=100000, bins=20) ``` @@ -815,8 +815,8 @@ BayesianInferencePlot(true_theta, num_list, BETA).MCMC_plot( mystnb: figure: caption: | - SVI density with Beta guide - name: fig_svi_beta + SVI density with Beta prior and Beta guide + name: fig_svi_beta_beta --- BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot( @@ -1061,7 +1061,6 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( ``` ```{code-cell} ipython3 -# von Mises --- mystnb: figure: @@ -1069,6 +1068,7 @@ mystnb: SVI density with von Mises prior and Beta guide name: fig_svi_vonmises_beta --- +# von Mises example_CLASS = VONMISES print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" From b1976af0971fd414297b8e2d80cc4086bdfc7a79 Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Wed, 20 Aug 2025 16:17:04 +0800 Subject: [PATCH 11/22] fix figure tag and caption --- lectures/bayes_nonconj.md | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 94deda03d..fb767f9e8 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -759,7 +759,7 @@ Let's start with the analytical method that we described in this {doc}`prob_mean mystnb: figure: caption: | - Analytical Beta prior and posterior + Analytical density (Beta prior) name: fig_analytical --- # first examine Beta prior @@ -801,7 +801,7 @@ We'll do this for both MCMC and VI. mystnb: figure: caption: | - MCMC density with Beta prior + MCMC density (Beta prior) name: fig_mcmc_beta --- @@ -815,7 +815,7 @@ BayesianInferencePlot(true_theta, num_list, BETA).MCMC_plot( mystnb: figure: caption: | - SVI density with Beta prior and Beta guide + SVI density (Beta prior, Beta guide) name: fig_svi_beta_beta --- @@ -877,8 +877,8 @@ LAPLACE = BayesianInference(param=(0.5, 0.07), name_dist="laplace") mystnb: figure: caption: | - MCMC density with uniform prior - name: fig_mcmc_uniform + MCMC density (uniform prior) + name: fig_mcmc_stduniform --- # Uniform example_CLASS = STD_UNIFORM @@ -888,7 +888,16 @@ print( BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( num_samples=MCMC_num_samples ) +``` +```{code-cell} ipython3 +--- +mystnb: + figure: + caption: | + MCMC density (uniform prior) + name: fig_mcmc_uniform +--- example_CLASS = UNIFORM print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" @@ -909,7 +918,7 @@ Note how when the true data-generating $\theta$ is located at $0.8$ as it is her mystnb: figure: caption: | - MCMC density with log normal prior + MCMC density (log normal prior) name: fig_mcmc_lognormal --- # log normal @@ -927,7 +936,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( mystnb: figure: caption: | - MCMC density with von Mises prior + MCMC density (von Mises prior) name: fig_mcmc_vonmises --- # von Mises @@ -946,7 +955,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( mystnb: figure: caption: | - MCMC density with Laplace prior + MCMC density (Laplace prior) name: fig_mcmc_laplace --- # Laplace @@ -973,7 +982,7 @@ SVI_num_steps = 50000 mystnb: figure: caption: | - SVI density with uniform prior and normal guide + SVI 
density (uniform prior, normal guide) name: fig_svi_uniform_normal --- # Uniform @@ -991,7 +1000,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( mystnb: figure: caption: | - SVI density with log normal prior and normal guide + SVI density (log normal prior, normal guide) name: fig_svi_lognormal_normal --- # log normal @@ -1009,7 +1018,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( mystnb: figure: caption: | - SVI density with Laplace prior and normal guide + SVI density (Laplace prior, normal guide) name: fig_svi_laplace_normal --- # Laplace @@ -1029,7 +1038,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( mystnb: figure: caption: | - SVI density with uniform prior and Beta guide + SVI density (uniform prior, Beta guide) name: fig_svi_uniform_beta --- # uniform @@ -1047,7 +1056,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( mystnb: figure: caption: | - SVI density with log normal prior and Beta guide + SVI density (log normal prior, Beta guide) name: fig_svi_lognormal_beta --- # log normal @@ -1065,7 +1074,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( mystnb: figure: caption: | - SVI density with von Mises prior and Beta guide + SVI density (von Mises prior, Beta guide) name: fig_svi_vonmises_beta --- # von Mises @@ -1084,7 +1093,7 @@ BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( mystnb: figure: caption: | - SVI density with Laplace prior and Beta guide + SVI density (Laplace prior, Beta guide) name: fig_svi_laplace_beta --- # Laplace From 9fc3768880964b9deb74528096275fd552917d32 Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Thu, 21 Aug 2025 17:35:08 +0800 Subject: [PATCH 12/22] convert to jax-based pattern --- lectures/bayes_nonconj.md | 639 +++++++++++++++++++++----------------- 1 file changed, 353 insertions(+), 286 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index fb767f9e8..e12c9b9d3 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -52,6 +52,8 @@ import seaborn as sns import matplotlib.pyplot as plt import scipy.stats as st +from dataclasses import dataclass, field +from typing import NamedTuple import jax.numpy as jnp from jax import random as jax_random @@ -207,7 +209,7 @@ We will use the following priors: - a truncated log-normal distribution with support on $[0,1]$ with parameters $(\mu,\sigma)$. - - To implement this, let $Z\sim Normal(\mu,\sigma)$ and $\tilde{Z}$ be truncated normal with support $[\log(0),\log(1)]$, then $\exp(Z)$ has a log normal distribution with bounded support $[0,1]$. This can be easily coded since `numpyro` has a built-in truncated normal distribution, and `numpyro`'s `TransformedDistribution` class that includes an exponential transformation. + - To implement this, let $Z\sim Normal(\mu,\sigma)$ and $\tilde{Z}$ be truncated normal with support $[-\infty,\log(1)]$, then $\exp(Z)$ has a log normal distribution with bounded support $[0,1]$. This can be easily coded since `numpyro` has a built-in truncated normal distribution, and `numpyro`'s `TransformedDistribution` class that includes an exponential transformation. - a shifted von Mises distribution that has support confined to $[0,1]$ with parameter $(\mu,\kappa)$. 
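A quick way to sanity-check the construction described in the bullet above (and the `low=-jnp.inf` call in the hunk that follows) is to build the transformed distribution directly and sample it. A minimal sketch, assuming the same `numpyro`/`jax` imports the lecture already uses:

```python
import jax.numpy as jnp
from jax import random
import numpyro.distributions as ndist

# exp of a normal truncated above at log(1) = 0 lives on (0, 1]
base = ndist.TruncatedNormal(loc=0.0, scale=2.0, low=-jnp.inf, high=jnp.log(1.0))
trunc_lognormal = ndist.TransformedDistribution(base, ndist.transforms.ExpTransform())

draws = trunc_lognormal.sample(random.PRNGKey(0), (1000,))
print(float(draws.min()), float(draws.max()))   # both extremes fall inside (0, 1]
```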
@@ -227,7 +229,7 @@ def TruncatedLogNormal_trans(loc, scale): Obtains the truncated log normal distribution using numpyro's TruncatedNormal and ExpTransform """ base_dist = ndist.TruncatedNormal( - low=jnp.log(0), high=jnp.log(1), loc=loc, scale=scale + low=-jnp.inf, high=jnp.log(1), loc=loc, scale=scale ) return ndist.TransformedDistribution(base_dist, ndist.transforms.ExpTransform()) @@ -353,7 +355,7 @@ The class `BayesianInference` has several key methods : - `MCMC_sampling`: - INPUT: (data, num_samples, num_warmup=1000) - - Takes a `np.array` data and generates MCMC sampling of posterior of size `num_samples`. + - Takes a `jnp.array` data and generates MCMC sampling of posterior of size `num_samples`. - `SVI_run`: - INPUT: (data, guide_dist, n_steps=10000) @@ -362,165 +364,180 @@ The class `BayesianInference` has several key methods : - RETURN: (params, losses) - the learned parameters in a `dict` and the vector of loss at each step. ```{code-cell} ipython3 -class BayesianInference: - def __init__(self, param, name_dist): - """ - Parameters - --------- - param : tuple. - a tuple object that contains all relevant parameters for the distribution - dist : str. - name of the distribution - 'beta', 'uniform', 'lognormal', 'vonMises', 'tent' - """ - self.param = param - self.name_dist = name_dist - # jax requires explicit PRNG state to be passed - self.rng_key = jax_random.PRNGKey(0) - - def sample_prior(self): - """Define the prior distribution to sample from in numpyro models.""" - if self.name_dist == "beta": - # unpack parameters - alpha0, beta0 = self.param - sample = numpyro.sample( - "theta", ndist.Beta(alpha0, beta0), rng_key=self.rng_key - ) - - elif self.name_dist == "uniform": - # unpack parameters - lb, ub = self.param - sample = numpyro.sample( - "theta", ndist.Uniform(lb, ub), rng_key=self.rng_key - ) - - elif self.name_dist == "lognormal": - # unpack parameters - loc, scale = self.param - sample = numpyro.sample( - "theta", TruncatedLogNormal_trans(loc, scale), rng_key=self.rng_key - ) - - elif self.name_dist == "vonMises": - # unpack parameters - kappa = self.param - sample = numpyro.sample( - "theta", ShiftedVonMises(kappa), rng_key=self.rng_key - ) - - elif self.name_dist == "laplace": - # unpack parameters - loc, scale = self.param - sample = numpyro.sample( - "theta", TruncatedLaplace(loc, scale), rng_key=self.rng_key - ) - - return sample - - def show_prior(self, size=1e5, bins=20, disp_plot=1): - """ - Visualizes prior distribution by sampling from prior and plots the approximated sampling distribution - """ - self.bins = bins - - with numpyro.plate("show_prior", size=size): - sample = self.sample_prior() - # to JAX array - sample_array = jnp.asarray(sample) - - # plot histogram and kernel density - if disp_plot == 1: - sns.displot( - sample_array, kde=True, stat="density", bins=bins, height=5, aspect=1.5 - ) - plt.xlim(0, 1) - plt.show() - else: - return sample_array - - def model(self, data): - """ - Define the probabilistic model by specifying prior, conditional likelihood, and data conditioning - """ - theta = self.sample_prior() - output = numpyro.sample( - "obs", ndist.Binomial(len(data), theta), obs=jnp.sum(data) +class BayesianInference(NamedTuple): + """ + Parameters + --------- + param : tuple. + a tuple object that contains all relevant parameters for the distribution + dist : str. 
+ name of the distribution - 'beta', 'uniform', 'lognormal', 'vonMises', 'tent' + """ + param: tuple + name_dist: str + # jax requires explicit PRNG state to be passed + rng_key: jax_random.PRNGKey = jax_random.PRNGKey(0) + + +def sample_prior(model: BayesianInference): + """Define the prior distribution to sample from in numpyro models.""" + if model.name_dist == "beta": + # unpack parameters + alpha0, beta0 = model.param + sample = numpyro.sample( + "theta", ndist.Beta(alpha0, beta0), rng_key=model.rng_key + ) + + elif model.name_dist == "uniform": + # unpack parameters + lb, ub = model.param + sample = numpyro.sample( + "theta", ndist.Uniform(lb, ub), rng_key=model.rng_key ) - return output - - def MCMC_sampling(self, data, num_samples, num_warmup=1000): - """ - Computes numerically the posterior distribution with beta prior parametrized by (alpha0, beta0) - given data using MCMC - """ - data = jnp.array(data, dtype=float) - nuts_kernel = nNUTS(self.model) - mcmc = nMCMC( - nuts_kernel, - num_samples=num_samples, - num_warmup=num_warmup, - progress_bar=False, + + elif model.name_dist == "lognormal": + # unpack parameters + loc, scale = model.param + sample = numpyro.sample( + "theta", TruncatedLogNormal_trans(loc, scale), rng_key=model.rng_key + ) + + elif model.name_dist == "vonMises": + # unpack parameters + kappa = model.param + sample = numpyro.sample( + "theta", ShiftedVonMises(kappa), rng_key=model.rng_key + ) + + elif model.name_dist == "laplace": + # unpack parameters + loc, scale = model.param + sample = numpyro.sample( + "theta", TruncatedLaplace(loc, scale), rng_key=model.rng_key ) - mcmc.run(self.rng_key, data=data) - - samples = mcmc.get_samples()["theta"] - return samples - - def beta_guide(self, data): - """ - Defines the candidate parametrized variational distribution that we train to approximate posterior with numpyro - Here we use parameterized beta - """ - alpha_q = numpyro.param("alpha_q", 10, constraint=nconstraints.positive) - beta_q = numpyro.param("beta_q", 10, constraint=nconstraints.positive) - - numpyro.sample("theta", ndist.Beta(alpha_q, beta_q)) - - def truncnormal_guide(self, data): - """ - Defines the candidate parametrized variational distribution that we train to approximate posterior with numpyro - Here we use truncated normal on [0,1] - """ - loc = numpyro.param("loc", 0.5, constraint=nconstraints.interval(0.0, 1.0)) - scale = numpyro.param("scale", 1, constraint=nconstraints.positive) - numpyro.sample("theta", ndist.TruncatedNormal(loc, scale, low=0.0, high=1.0)) - - def SVI_init(self, guide_dist, lr=0.0005): - """Initiate SVI training mode with Adam optimizer""" - adam_params = {"lr": lr} - - if guide_dist == "beta": - optimizer = nAdam(step_size=lr) - svi = nSVI(self.model, self.beta_guide, optimizer, loss=nTrace_ELBO()) - elif guide_dist == "normal": - optimizer = nAdam(step_size=lr) - svi = nSVI( - self.model, self.truncnormal_guide, optimizer, loss=nTrace_ELBO() - ) - else: - print("WARNING: Please input either 'beta' or 'normal'") - svi = None - - return svi - - def SVI_run(self, data, guide_dist, n_steps=10000): - """ - Runs SVI and returns optimized parameters and losses - - Returns - -------- - params : the learned parameters for guide - losses : a vector of loss at each step - """ - - # initiate SVI - svi = self.SVI_init(guide_dist=guide_dist) - - data = jnp.array(data, dtype=float) - result = svi.run(self.rng_key, n_steps, data, progress_bar=False) - params = dict((key, jnp.asarray(value)) for key, value in result.params.items()) - losses = 
jnp.asarray(result.losses) - - return params, losses + + return sample + + +def show_prior( + model: BayesianInference, size=1e5, bins=20, disp_plot=1 + ): + """ + Visualizes prior distribution by sampling from prior and plots the approximated sampling distribution + """ + with numpyro.plate("show_prior", size=size): + sample = sample_prior(model) + # to JAX array + sample_array = jnp.asarray(sample) + + # plot histogram and kernel density + if disp_plot == 1: + sns.displot( + sample_array, kde=True, stat="density", bins=bins, height=5, aspect=1.5 + ) + plt.xlim(0, 1) + plt.show() + else: + return sample_array + + +def set_model(model: BayesianInference, data): + """ + Define the probabilistic model by specifying prior, conditional likelihood, and data conditioning + """ + theta = sample_prior(model) + output = numpyro.sample( + "obs", ndist.Binomial(len(data), theta), obs=jnp.sum(data) + ) + + +def MCMC_sampling( + model: BayesianInference, data, num_samples, num_warmup=1000 + ): + """ + Computes numerically the posterior distribution with beta prior parametrized by (alpha0, beta0) + given data using MCMC + """ + data = jnp.array(data, dtype=float) + nuts_kernel = nNUTS(set_model) + mcmc = nMCMC( + nuts_kernel, + num_samples=num_samples, + num_warmup=num_warmup, + progress_bar=False, + ) + mcmc.run(model.rng_key, model=model, data=data) + + samples = mcmc.get_samples()["theta"] + return samples + + +# arguments in this function are used to align with the arguments in set_model() +# this is required by svi.run() +def beta_guide(model: BayesianInference, data): + """ + Defines the candidate parametrized variational distribution that we train to approximate posterior with numpyro + Here we use parameterized beta + """ + alpha_q = numpyro.param("alpha_q", 10, constraint=nconstraints.positive) + beta_q = numpyro.param("beta_q", 10, constraint=nconstraints.positive) + + numpyro.sample("theta", ndist.Beta(alpha_q, beta_q)) + + +# similar with beta_guide() +def truncnormal_guide(model: BayesianInference, data): + """ + Defines the candidate parametrized variational distribution that we train to approximate posterior with numpyro + Here we use truncated normal on [0,1] + """ + loc = numpyro.param("loc", 0.5, constraint=nconstraints.interval(0.0, 1.0)) + scale = numpyro.param("scale", 1, constraint=nconstraints.positive) + numpyro.sample("theta", ndist.TruncatedNormal(loc, scale, low=0.0, high=1.0)) + + +def SVI_init(model: BayesianInference, guide_dist, lr=0.0005): + """Initiate SVI training mode with Adam optimizer""" + adam_params = {"lr": lr} + + if guide_dist == "beta": + optimizer = nAdam(step_size=lr) + svi = nSVI( + set_model, beta_guide, optimizer, loss=nTrace_ELBO() + ) + elif guide_dist == "normal": + optimizer = nAdam(step_size=lr) + svi = nSVI( + set_model, truncnormal_guide, optimizer, loss=nTrace_ELBO() + ) + else: + print("WARNING: Please input either 'beta' or 'normal'") + svi = None + + return svi + + +def SVI_run(model: BayesianInference, data, guide_dist, n_steps=10000): + """ + Runs SVI and returns optimized parameters and losses + + Returns + -------- + params : the learned parameters for guide + losses : a vector of loss at each step + """ + + # initiate SVI + svi = SVI_init(model, guide_dist) + + data = jnp.array(data, dtype=float) + result = svi.run( + model.rng_key, n_steps, model, data, progress_bar=False + ) + params = dict((key, jnp.asarray(value)) for key, value in result.params.items()) + losses = jnp.asarray(result.losses) + + return params, losses ``` ## Alternative 
prior distributions @@ -542,7 +559,7 @@ mystnb: --- # truncated log normal exampleLN = BayesianInference(param=(0, 2), name_dist="lognormal") -exampleLN.show_prior(size=100000, bins=20) +show_prior(exampleLN, size=100000, bins=20) ``` ```{code-cell} ipython3 @@ -555,7 +572,7 @@ mystnb: --- # truncated uniform exampleUN = BayesianInference(param=(0.1, 0.8), name_dist="uniform") -exampleUN.show_prior(size=100000, bins=20) +show_prior(exampleUN, size=100000, bins=20) ``` The above graphs show that sampling seems to work well with both distributions. @@ -572,7 +589,7 @@ mystnb: --- # shifted von Mises exampleVM = BayesianInference(param=10, name_dist="vonMises") -exampleVM.show_prior(size=100000, bins=20) +show_prior(exampleVM, size=100000, bins=20) ``` The graphs look good too. @@ -589,7 +606,7 @@ mystnb: --- # truncated Laplace exampleLP = BayesianInference(param=(0.5, 0.05), name_dist="laplace") -exampleLP.show_prior(size=100000, bins=40) +show_prior(exampleLP, size=100000, bins=20) ``` Having assured ourselves that our sampler seems to do a good job, let's put it to work in using MCMC to compute posterior probabilities. @@ -607,6 +624,7 @@ It has two key methods: - `BayesianInferencePlot.SVI_plot()` takes desired VI distribution class ('beta' or 'normal') as input and plots the posteriors together with the prior. ```{code-cell} ipython3 +@dataclass class BayesianInferencePlot: """ Easily implement the MCMC and VI inference for a given instance of BayesianInference class and @@ -623,108 +641,121 @@ class BayesianInferencePlot: """ - def __init__(self, theta, N_list, BayesianInferenceClass, binwidth=0.02): - """Enter Parameters for data generation and plotting""" - self.theta = theta - self.N_list = N_list - self.BayesianInferenceClass = BayesianInferenceClass + """Enter Parameters for data generation and plotting""" + theta: float + N_list: list + BayesianInferenceClass: BayesianInference - # plotting parameters - self.binwidth = binwidth - self.linewidth = 0.05 - self.colorlist = sns.color_palette(n_colors=len(N_list)) + # plotting parameters + binwidth: float = 0.02 + linewidth: float = 0.05 + colorlist: list = field(init=False) - # data generation - N_max = max(N_list) - self.data = simulate_draw(theta, N_max) + # data generation + N_max: float = field(init=False) + data: np.array = field(init=False) - def MCMC_plot(self, num_samples, num_warmup=1000): - fig, ax = plt.subplots() + def __post_init__(self): + self.colorlist = sns.color_palette(n_colors=len(self.N_list)) + self.N_max = max(self.N_list) + self.data = simulate_draw(self.theta, self.N_max) - # plot prior - prior_sample = self.BayesianInferenceClass.show_prior(disp_plot=0) - sns.histplot( - data=prior_sample, - kde=True, - stat="density", - binwidth=self.binwidth, - color="#4C4E52", - linewidth=self.linewidth, - alpha=0.1, - ax=ax, - label="Prior distribution", - ) - # plot posteriors - for id, n in enumerate(self.N_list): - samples = self.BayesianInferenceClass.MCMC_sampling( - self.data[:n], num_samples, num_warmup - ) - sns.histplot( - samples, - kde=True, - stat="density", - binwidth=self.binwidth, - linewidth=self.linewidth, - alpha=0.2, - color=self.colorlist[id - 1], - label=f"Posterior with $n={n}$", - ) - ax.legend(loc="upper left") - plt.xlim(0, 1) - plt.show() +def MCMC_plot( + plot_model: BayesianInferencePlot, num_samples, num_warmup=1000 + ): + fig, ax = plt.subplots() - def SVI_fitting(self, guide_dist, params): - """Fit the beta/truncnormal curve using parameters trained by SVI.""" - # create x axis - xaxis = 
np.linspace(0, 1, 1000) - if guide_dist == "beta": - y = st.beta.pdf(xaxis, a=params["alpha_q"], b=params["beta_q"]) - - elif guide_dist == "normal": - # rescale upper/lower bound. See Scipy's truncnorm doc - lower, upper = (0, 1) - loc, scale = params["loc"], params["scale"] - a, b = (lower - loc) / scale, (upper - loc) / scale - - y = st.truncnorm.pdf( - xaxis, a=a, b=b, loc=params["loc"], scale=params["scale"] - ) - return (xaxis, y) - - def SVI_plot(self, guide_dist, n_steps=2000): - fig, ax = plt.subplots() - - # plot prior - prior_sample = self.BayesianInferenceClass.show_prior(disp_plot=0) + # plot prior + prior_sample = show_prior( + plot_model.BayesianInferenceClass, disp_plot=0 + ) + sns.histplot( + data=prior_sample, + kde=True, + stat="density", + binwidth=plot_model.binwidth, + color="#4C4E52", + linewidth=plot_model.linewidth, + alpha=0.1, + ax=ax, + label="Prior distribution", + ) + + # plot posteriors + for id, n in enumerate(plot_model.N_list): + samples = MCMC_sampling( + plot_model.BayesianInferenceClass, plot_model.data[:n], num_samples, num_warmup + ) sns.histplot( - data=prior_sample, + samples, kde=True, stat="density", - binwidth=self.binwidth, - color="#4C4E52", - linewidth=self.linewidth, - alpha=0.1, - ax=ax, - label="Prior distribution", + binwidth=plot_model.binwidth, + linewidth=plot_model.linewidth, + alpha=0.2, + color=plot_model.colorlist[id - 1], + label=f"Posterior with $n={n}$", + ) + ax.legend(loc="upper left") + plt.xlim(0, 1) + plt.show() + + +def SVI_fitting(guide_dist, params): + """Fit the beta/truncnormal curve using parameters trained by SVI.""" + # create x axis + xaxis = jnp.linspace(0, 1, 1000) + if guide_dist == "beta": + y = st.beta.pdf(xaxis, a=params["alpha_q"], b=params["beta_q"]) + + elif guide_dist == "normal": + # rescale upper/lower bound. See Scipy's truncnorm doc + lower, upper = (0, 1) + loc, scale = params["loc"], params["scale"] + a, b = (lower - loc) / scale, (upper - loc) / scale + + y = st.truncnorm.pdf( + xaxis, a=a, b=b, loc=loc, scale=scale ) + return (xaxis, y) + + +def SVI_plot( + plot_model: BayesianInferencePlot, guide_dist, n_steps=2000 + ): + fig, ax = plt.subplots() + + # plot prior + prior_sample = show_prior(plot_model.BayesianInferenceClass, disp_plot=0) + sns.histplot( + data=prior_sample, + kde=True, + stat="density", + binwidth=plot_model.binwidth, + color="#4C4E52", + linewidth=plot_model.linewidth, + alpha=0.1, + ax=ax, + label="Prior distribution", + ) - # plot posteriors - for id, n in enumerate(self.N_list): - (params, losses) = self.BayesianInferenceClass.SVI_run( - self.data[:n], guide_dist, n_steps - ) - x, y = self.SVI_fitting(guide_dist, params) - ax.plot( - x, - y, - alpha=1, - color=self.colorlist[id - 1], - label=f"Posterior with $n={n}$", - ) - ax.legend(loc="upper left") - plt.xlim(0, 1) - plt.show() + # plot posteriors + for id, n in enumerate(plot_model.N_list): + (params, losses) = SVI_run( + plot_model.BayesianInferenceClass, plot_model.data[:n], guide_dist, n_steps + ) + x, y = SVI_fitting(guide_dist, params) + ax.plot( + x, + y, + alpha=1, + color=plot_model.colorlist[id - 1], + label=f"Posterior with $n={n}$", + ) + ax.legend(loc="upper left") + plt.xlim(0, 1) + plt.show() ``` Let's set some parameters that we'll use in all of the examples below. 
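The refactored `BayesianInferencePlot` above uses the `dataclass` idiom in which derived attributes are declared with `field(init=False)` and filled in by `__post_init__`. A minimal, self-contained sketch of that idiom follows; the class and field names are invented for illustration and do not come from the lecture.

```python
# Sketch of the dataclass pattern used above: derived fields are excluded
# from __init__ and computed in __post_init__ from the user-supplied ones.
from dataclasses import dataclass, field

@dataclass
class Experiment:                    # hypothetical container
    theta: float
    n_list: list
    n_max: int = field(init=False)   # derived attribute, not passed by callers

    def __post_init__(self):
        self.n_max = max(self.n_list)

exp = Experiment(theta=0.8, n_list=[5, 10, 50])
print(exp.n_max)   # 50
```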
@@ -768,7 +799,7 @@ BETA = BayesianInference(param=(5, 5), name_dist="beta") BETA_plot = BayesianInferencePlot(true_theta, num_list, BETA) # plot analytical Beta prior and posteriors -xaxis = np.linspace(0, 1, 1000) +xaxis = jnp.linspace(0, 1, 1000) y_prior = st.beta.pdf(xaxis, 5, 5) fig, ax = plt.subplots() @@ -805,8 +836,8 @@ mystnb: name: fig_mcmc_beta --- -BayesianInferencePlot(true_theta, num_list, BETA).MCMC_plot( - num_samples=MCMC_num_samples +MCMC_plot( + BETA_plot, num_samples=MCMC_num_samples ) ``` @@ -819,8 +850,8 @@ mystnb: name: fig_svi_beta_beta --- -BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot( - guide_dist="beta", n_steps=SVI_num_steps +SVI_plot( + BETA_plot, guide_dist="beta", n_steps=SVI_num_steps ) ``` @@ -838,8 +869,8 @@ that will be more accurate, as we shall see next. (Increasing the step size increases computational time though). ```{code-cell} ipython3 -BayesianInferencePlot(true_theta, num_list, BETA).SVI_plot( - guide_dist="beta", n_steps=100000 +SVI_plot( + BETA_plot, guide_dist="beta", n_steps=100000 ) ``` @@ -885,8 +916,11 @@ example_CLASS = STD_UNIFORM print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( - num_samples=MCMC_num_samples +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +MCMC_plot( + example_plotCLASS, num_samples=MCMC_num_samples ) ``` @@ -902,8 +936,11 @@ example_CLASS = UNIFORM print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( - num_samples=MCMC_num_samples +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +MCMC_plot( + example_plotCLASS, num_samples=MCMC_num_samples ) ``` @@ -926,8 +963,11 @@ example_CLASS = LOGNORMAL print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( - num_samples=MCMC_num_samples +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +MCMC_plot( + example_plotCLASS, num_samples=MCMC_num_samples ) ``` @@ -945,8 +985,11 @@ print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) print("\nNOTE: Shifted von Mises") -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( - num_samples=MCMC_num_samples +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +MCMC_plot( + example_plotCLASS, num_samples=MCMC_num_samples ) ``` @@ -963,8 +1006,11 @@ example_CLASS = LAPLACE print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).MCMC_plot( - num_samples=MCMC_num_samples +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +MCMC_plot( + example_plotCLASS, num_samples=MCMC_num_samples ) ``` @@ -990,8 +1036,11 @@ example_CLASS = BayesianInference(param=(0, 1), name_dist="uniform") print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( - guide_dist="normal", n_steps=SVI_num_steps +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +SVI_plot( + example_plotCLASS, 
guide_dist="normal", n_steps=SVI_num_steps ) ``` @@ -1008,8 +1057,11 @@ example_CLASS = LOGNORMAL print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( - guide_dist="normal", n_steps=SVI_num_steps +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +SVI_plot( + example_plotCLASS, guide_dist="normal", n_steps=SVI_num_steps ) ``` @@ -1026,8 +1078,11 @@ example_CLASS = LAPLACE print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( - guide_dist="normal", n_steps=SVI_num_steps +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +SVI_plot( + example_plotCLASS, guide_dist="normal", n_steps=SVI_num_steps ) ``` @@ -1046,8 +1101,11 @@ example_CLASS = STD_UNIFORM print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( - guide_dist="beta", n_steps=SVI_num_steps +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +SVI_plot( + example_plotCLASS, guide_dist="beta", n_steps=SVI_num_steps ) ``` @@ -1064,8 +1122,11 @@ example_CLASS = LOGNORMAL print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( - guide_dist="beta", n_steps=SVI_num_steps +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +SVI_plot( + example_plotCLASS, guide_dist="beta", n_steps=SVI_num_steps ) ``` @@ -1083,8 +1144,11 @@ print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) print("Shifted von Mises") -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( - guide_dist="beta", n_steps=SVI_num_steps +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +SVI_plot( + example_plotCLASS, guide_dist="beta", n_steps=SVI_num_steps ) ``` @@ -1101,7 +1165,10 @@ example_CLASS = LAPLACE print( f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" ) -BayesianInferencePlot(true_theta, num_list, example_CLASS).SVI_plot( - guide_dist="beta", n_steps=SVI_num_steps +example_plotCLASS = BayesianInferencePlot( + true_theta, num_list, example_CLASS + ) +SVI_plot( + example_plotCLASS, guide_dist="beta", n_steps=SVI_num_steps ) ``` From 9c3a5faa6015150621fc5629779b9c7241210739 Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Mon, 1 Sep 2025 11:14:40 +0800 Subject: [PATCH 13/22] fix comment --- lectures/bayes_nonconj.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index e12c9b9d3..d83559834 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -640,8 +640,6 @@ class BayesianInferencePlot: a class initiated using BayesianInference() """ - - """Enter Parameters for data generation and plotting""" theta: float N_list: list BayesianInferenceClass: BayesianInference From 731dec14c7472d5b340efc396f8b368686c62d6b Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Mon, 1 Sep 2025 15:09:08 +0800 Subject: [PATCH 14/22] recover to test compiling --- lectures/bayes_nonconj.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lectures/bayes_nonconj.md 
b/lectures/bayes_nonconj.md index d83559834..e12c9b9d3 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -640,6 +640,8 @@ class BayesianInferencePlot: a class initiated using BayesianInference() """ + + """Enter Parameters for data generation and plotting""" theta: float N_list: list BayesianInferenceClass: BayesianInference From b8a7514d381d0fb6aff05e0e9f089972c5a5f4b7 Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Tue, 2 Sep 2025 09:29:46 +0800 Subject: [PATCH 15/22] fix comment --- lectures/bayes_nonconj.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index e12c9b9d3..fd85a8579 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -141,7 +141,8 @@ def simulate_draw(theta, n): def analytical_beta_posterior(data, alpha0, beta0): """ - Computes analytically the posterior distribution with beta prior parametrized by (alpha, beta) + Computes analytically the posterior distribution + with beta prior parametrized by (alpha, beta) given # num observations Parameters @@ -226,7 +227,8 @@ We will use the following priors: ```{code-cell} ipython3 def TruncatedLogNormal_trans(loc, scale): """ - Obtains the truncated log normal distribution using numpyro's TruncatedNormal and ExpTransform + Obtains the truncated log normal distribution + using numpyro's TruncatedNormal and ExpTransform """ base_dist = ndist.TruncatedNormal( low=-jnp.inf, high=jnp.log(1), loc=loc, scale=scale @@ -443,7 +445,8 @@ def show_prior( def set_model(model: BayesianInference, data): """ - Define the probabilistic model by specifying prior, conditional likelihood, and data conditioning + Define the probabilistic model by specifying prior, + conditional likelihood, and data conditioning """ theta = sample_prior(model) output = numpyro.sample( @@ -455,7 +458,8 @@ def MCMC_sampling( model: BayesianInference, data, num_samples, num_warmup=1000 ): """ - Computes numerically the posterior distribution with beta prior parametrized by (alpha0, beta0) + Computes numerically the posterior distribution + with beta prior parametrized by (alpha0, beta0) given data using MCMC """ data = jnp.array(data, dtype=float) @@ -476,7 +480,8 @@ def MCMC_sampling( # this is required by svi.run() def beta_guide(model: BayesianInference, data): """ - Defines the candidate parametrized variational distribution that we train to approximate posterior with numpyro + Defines the candidate parametrized variational distribution + that we train to approximate posterior with numpyro Here we use parameterized beta """ alpha_q = numpyro.param("alpha_q", 10, constraint=nconstraints.positive) @@ -488,7 +493,8 @@ def beta_guide(model: BayesianInference, data): # similar with beta_guide() def truncnormal_guide(model: BayesianInference, data): """ - Defines the candidate parametrized variational distribution that we train to approximate posterior with numpyro + Defines the candidate parametrized variational distribution + that we train to approximate posterior with numpyro Here we use truncated normal on [0,1] """ loc = numpyro.param("loc", 0.5, constraint=nconstraints.interval(0.0, 1.0)) @@ -638,10 +644,8 @@ class BayesianInferencePlot: a list of sample size BayesianInferenceClass : class. 
a class initiated using BayesianInference() - """ - """Enter Parameters for data generation and plotting""" theta: float N_list: list BayesianInferenceClass: BayesianInference From dedd7ea0974b9d63d579caf38757a9f32dd1eed9 Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Tue, 2 Sep 2025 16:52:55 +0800 Subject: [PATCH 16/22] fix typo --- lectures/bayes_nonconj.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index fd85a8579..6009b0e51 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -273,7 +273,7 @@ $$ where $$ -p\left(Y\right)=\int p\left(Y\mid\theta\right)p\left(Y\right) d\theta. +p\left(Y\right)=\int p\left(Y\mid\theta\right)p\left(\theta\right) d\theta. $$ (eq:intchallenge) The integral on the right side of {eq}`eq:intchallenge` is typically difficult to compute. @@ -297,7 +297,7 @@ Note that $$ \begin{aligned}D_{KL}(q(\theta;\phi)\;\|\;p(\theta\mid Y)) & =-\int q(\theta;\phi)\log\frac{P(\theta\mid Y)}{q(\theta;\phi)} d\theta\\ & =-\int q(\theta)\log\frac{\frac{p(\theta,Y)}{p(Y)}}{q(\theta)} d\theta\\ - & =-\int q(\theta)\log\frac{p(\theta,Y)}{p(\theta)q(Y)} d\theta\\ + & =-\int q(\theta)\log\frac{p(\theta,Y)}{q(\theta)p(Y)} d\theta\\ & =-\int q(\theta)\left[\log\frac{p(\theta,Y)}{q(\theta)}-\log p(Y)\right] d\theta\\ & =-\int q(\theta)\log\frac{p(\theta,Y)}{q(\theta)}+\int q(\theta)\log p(Y) d\theta\\ & =-\int q(\theta)\log\frac{p(\theta,Y)}{q(\theta)} d\theta+\log p(Y)\\ @@ -406,7 +406,7 @@ def sample_prior(model: BayesianInference): elif model.name_dist == "vonMises": # unpack parameters - kappa = model.param + kappa, = model.param sample = numpyro.sample( "theta", ShiftedVonMises(kappa), rng_key=model.rng_key ) @@ -425,7 +425,8 @@ def show_prior( model: BayesianInference, size=1e5, bins=20, disp_plot=1 ): """ - Visualizes prior distribution by sampling from prior and plots the approximated sampling distribution + Visualizes prior distribution by sampling from prior + and plots the approximated sampling distribution """ with numpyro.plate("show_prior", size=size): sample = sample_prior(model) From 021f0c53ecf379caf374ca8d0ad04db80179ff7d Mon Sep 17 00:00:00 2001 From: xuanguang-li Date: Tue, 2 Sep 2025 17:53:38 +0800 Subject: [PATCH 17/22] fix building error --- lectures/bayes_nonconj.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 6009b0e51..76e04d396 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -406,7 +406,7 @@ def sample_prior(model: BayesianInference): elif model.name_dist == "vonMises": # unpack parameters - kappa, = model.param + kappa = model.param sample = numpyro.sample( "theta", ShiftedVonMises(kappa), rng_key=model.rng_key ) From de835a18df614f03815b6fa4581a39770056bf73 Mon Sep 17 00:00:00 2001 From: Kenko LI Date: Sat, 6 Sep 2025 16:48:56 +0800 Subject: [PATCH 18/22] modify styling --- lectures/bayes_nonconj.md | 489 +++++++++++++++++++++----------------- 1 file changed, 268 insertions(+), 221 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 76e04d396..d44bbeac4 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -53,18 +53,18 @@ import matplotlib.pyplot as plt import scipy.stats as st from dataclasses import dataclass, field -from typing import NamedTuple +from typing import NamedTuple, Sequence import jax.numpy as jnp -from jax import random as jax_random +from jax import random import numpyro 
-from numpyro import distributions as ndist -import numpyro.distributions.constraints as nconstraints -from numpyro.infer import MCMC as nMCMC -from numpyro.infer import NUTS as nNUTS -from numpyro.infer import SVI as nSVI -from numpyro.infer import Trace_ELBO as nTrace_ELBO -from numpyro.optim import Adam as nAdam +from numpyro import distributions as dist +import numpyro.distributions.constraints as constraints +from numpyro.infer import MCMC +from numpyro.infer import NUTS +from numpyro.infer import SVI +from numpyro.infer import Trace_ELBO +from numpyro.optim import Adam ``` ## Unleashing MCMC on a binomial likelihood @@ -132,24 +132,24 @@ $$ The analytical posterior for a given conjugate beta prior is coded in the following ```{code-cell} ipython3 -def simulate_draw(theta, n): - """Draws a Bernoulli sample of size n with probability P(Y=1) = theta""" +def simulate_draw(θ, n): + """Draws a Bernoulli sample of size n with probability P(Y=1) = θ""" rand_draw = np.random.rand(n) - draw = (rand_draw < theta).astype(int) + draw = (rand_draw < θ).astype(int) return draw -def analytical_beta_posterior(data, alpha0, beta0): +def analytical_beta_posterior(data, α0, β0): """ Computes analytically the posterior distribution - with beta prior parametrized by (alpha, beta) + with beta prior parametrized by (α, β) given # num observations Parameters --------- num : int. the number of observations after which we calculate the posterior - alpha0, beta0 : float. + α0, β0 : float. the parameters for the beta distribution as a prior Returns @@ -159,7 +159,7 @@ def analytical_beta_posterior(data, alpha0, beta0): num = len(data) up_num = data.sum() down_num = num - up_num - return st.beta(alpha0 + up_num, beta0 + down_num) + return st.beta(α0 + up_num, β0 + down_num) ``` ### Two ways to approximate posteriors @@ -230,24 +230,24 @@ def TruncatedLogNormal_trans(loc, scale): Obtains the truncated log normal distribution using numpyro's TruncatedNormal and ExpTransform """ - base_dist = ndist.TruncatedNormal( + base_dist = dist.TruncatedNormal( low=-jnp.inf, high=jnp.log(1), loc=loc, scale=scale ) - return ndist.TransformedDistribution(base_dist, ndist.transforms.ExpTransform()) + return dist.TransformedDistribution(base_dist, dist.transforms.ExpTransform()) -def ShiftedVonMises(kappa): +def ShiftedVonMises(κ): """Obtains the shifted von Mises distribution using AffineTransform""" - base_dist = ndist.VonMises(0, kappa) - return ndist.TransformedDistribution( - base_dist, ndist.transforms.AffineTransform(loc=0.5, scale=1 / (2 * jnp.pi)) + base_dist = dist.VonMises(0, κ) + return dist.TransformedDistribution( + base_dist, dist.transforms.AffineTransform(loc=0.5, scale=1 / (2 * jnp.pi)) ) def TruncatedLaplace(loc, scale): """Obtains the truncated Laplace distribution on [0,1]""" - base_dist = ndist.Laplace(loc, scale) - return ndist.TruncatedDistribution(base_dist, low=0.0, high=1.0) + base_dist = dist.Laplace(loc, scale) + return dist.TruncatedDistribution(base_dist, low=0.0, high=1.0) ``` ### Variational inference @@ -321,7 +321,7 @@ We can implement Stochastic Variational Inference (SVI) in numpyro using the `Ad We use two sets of variational distributions: Beta and TruncatedNormal with support $[0,1]$ -- Learnable parameters for the Beta distribution are (alpha, beta), both of which are positive. +- Learnable parameters for the Beta distribution are ($\alpha$, $\beta$), both of which are positive. - Learnable parameters for the Truncated Normal distribution are (loc, scale). 
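Once SVI has converged, the learned guide parameters fully characterize the approximate posterior, so summary statistics can be read off the fitted guide directly. A small sketch with made-up numbers standing in for learned `(alpha_q, beta_q)` values; it is an illustration, not lecture code.

```python
# Sketch: turn learned Beta-guide parameters into posterior summaries.
import scipy.stats as st

alpha_q, beta_q = 41.0, 11.0      # hypothetical learned values, for illustration
guide = st.beta(alpha_q, beta_q)
print(guide.mean())               # approximate posterior mean of theta
print(guide.interval(0.95))       # central 95% credible interval
```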
```{note} @@ -336,14 +336,14 @@ We have constructed a Python class `BayesianInference` that requires the followi - `name_dist`: a string that specifies distribution names The (`param`, `name_dist`) pair includes: -- (alpha, beta, 'beta') +- ($\alpha$, $\beta$, 'beta') - (lower_bound, upper_bound, 'uniform') - (loc, scale, 'lognormal') - Note: This is the truncated log normal. -- (kappa, 'vonMises'), where kappa denotes concentration parameter, and center location is set to $0.5$. Using `numpyro`, this is the **shifted** distribution. +- ($\kappa$, 'vonMises'), where $\kappa$ denotes concentration parameter, and center location is set to $0.5$. Using `numpyro`, this is the **shifted** distribution. - (loc, scale, 'laplace') - Note: This is the truncated Laplace @@ -378,23 +378,41 @@ class BayesianInference(NamedTuple): param: tuple name_dist: str # jax requires explicit PRNG state to be passed - rng_key: jax_random.PRNGKey = jax_random.PRNGKey(0) + rng_key: random.PRNGKey + + +def create_bayesian_inference( + param: tuple, + name_dist: str, + *, + rng_key: random.PRNGKey = None +) -> BayesianInference: + """Factory function to create a BayesianInference instance""" + + if rng_key is None: + rng_key = random.PRNGKey(0) + + return BayesianInference( + param=param, + name_dist=name_dist, + rng_key=rng_key + ) def sample_prior(model: BayesianInference): """Define the prior distribution to sample from in numpyro models.""" if model.name_dist == "beta": # unpack parameters - alpha0, beta0 = model.param + α0, β0 = model.param sample = numpyro.sample( - "theta", ndist.Beta(alpha0, beta0), rng_key=model.rng_key + "theta", dist.Beta(α0, β0), rng_key=model.rng_key ) elif model.name_dist == "uniform": # unpack parameters lb, ub = model.param sample = numpyro.sample( - "theta", ndist.Uniform(lb, ub), rng_key=model.rng_key + "theta", dist.Uniform(lb, ub), rng_key=model.rng_key ) elif model.name_dist == "lognormal": @@ -406,9 +424,9 @@ def sample_prior(model: BayesianInference): elif model.name_dist == "vonMises": # unpack parameters - kappa = model.param + κ = model.param sample = numpyro.sample( - "theta", ShiftedVonMises(kappa), rng_key=model.rng_key + "theta", ShiftedVonMises(κ), rng_key=model.rng_key ) elif model.name_dist == "laplace": @@ -423,7 +441,7 @@ def sample_prior(model: BayesianInference): def show_prior( model: BayesianInference, size=1e5, bins=20, disp_plot=1 - ): +): """ Visualizes prior distribution by sampling from prior and plots the approximated sampling distribution @@ -451,21 +469,21 @@ def set_model(model: BayesianInference, data): """ theta = sample_prior(model) output = numpyro.sample( - "obs", ndist.Binomial(len(data), theta), obs=jnp.sum(data) + "obs", dist.Binomial(len(data), theta), obs=jnp.sum(data) ) def MCMC_sampling( model: BayesianInference, data, num_samples, num_warmup=1000 - ): +): """ Computes numerically the posterior distribution - with beta prior parametrized by (alpha0, beta0) + with beta prior parametrized by (α0, β0) given data using MCMC """ data = jnp.array(data, dtype=float) - nuts_kernel = nNUTS(set_model) - mcmc = nMCMC( + nuts_kernel = NUTS(set_model) + mcmc = MCMC( nuts_kernel, num_samples=num_samples, num_warmup=num_warmup, @@ -485,10 +503,10 @@ def beta_guide(model: BayesianInference, data): that we train to approximate posterior with numpyro Here we use parameterized beta """ - alpha_q = numpyro.param("alpha_q", 10, constraint=nconstraints.positive) - beta_q = numpyro.param("beta_q", 10, constraint=nconstraints.positive) + α_q = 
numpyro.param("alpha_q", 10, constraint=constraints.positive) + β_q = numpyro.param("beta_q", 10, constraint=constraints.positive) - numpyro.sample("theta", ndist.Beta(alpha_q, beta_q)) + numpyro.sample("theta", dist.Beta(α_q, β_q)) # similar with beta_guide() @@ -498,9 +516,9 @@ def truncnormal_guide(model: BayesianInference, data): that we train to approximate posterior with numpyro Here we use truncated normal on [0,1] """ - loc = numpyro.param("loc", 0.5, constraint=nconstraints.interval(0.0, 1.0)) - scale = numpyro.param("scale", 1, constraint=nconstraints.positive) - numpyro.sample("theta", ndist.TruncatedNormal(loc, scale, low=0.0, high=1.0)) + loc = numpyro.param("loc", 0.5, constraint=constraints.interval(0.0, 1.0)) + scale = numpyro.param("scale", 1, constraint=constraints.positive) + numpyro.sample("theta", dist.TruncatedNormal(loc, scale, low=0.0, high=1.0)) def SVI_init(model: BayesianInference, guide_dist, lr=0.0005): @@ -508,14 +526,14 @@ def SVI_init(model: BayesianInference, guide_dist, lr=0.0005): adam_params = {"lr": lr} if guide_dist == "beta": - optimizer = nAdam(step_size=lr) - svi = nSVI( - set_model, beta_guide, optimizer, loss=nTrace_ELBO() + optimizer = Adam(step_size=lr) + svi = SVI( + set_model, beta_guide, optimizer, loss=Trace_ELBO() ) elif guide_dist == "normal": - optimizer = nAdam(step_size=lr) - svi = nSVI( - set_model, truncnormal_guide, optimizer, loss=nTrace_ELBO() + optimizer = Adam(step_size=lr) + svi = SVI( + set_model, truncnormal_guide, optimizer, loss=Trace_ELBO() ) else: print("WARNING: Please input either 'beta' or 'normal'") @@ -565,8 +583,8 @@ mystnb: name: fig_lognormal_dist --- # truncated log normal -exampleLN = BayesianInference(param=(0, 2), name_dist="lognormal") -show_prior(exampleLN, size=100000, bins=20) +example_ln = create_bayesian_inference(param=(0, 2), name_dist="lognormal") +show_prior(example_ln, size=100000, bins=20) ``` ```{code-cell} ipython3 @@ -578,8 +596,8 @@ mystnb: name: fig_uniform_dist --- # truncated uniform -exampleUN = BayesianInference(param=(0.1, 0.8), name_dist="uniform") -show_prior(exampleUN, size=100000, bins=20) +example_un = create_bayesian_inference(param=(0.1, 0.8), name_dist="uniform") +show_prior(example_un, size=100000, bins=20) ``` The above graphs show that sampling seems to work well with both distributions. @@ -595,8 +613,8 @@ mystnb: name: fig_vonmises_dist --- # shifted von Mises -exampleVM = BayesianInference(param=10, name_dist="vonMises") -show_prior(exampleVM, size=100000, bins=20) +example_vm = create_bayesian_inference(param=10, name_dist="vonMises") +show_prior(example_vm, size=100000, bins=20) ``` The graphs look good too. @@ -612,8 +630,8 @@ mystnb: name: fig_laplace_dist --- # truncated Laplace -exampleLP = BayesianInference(param=(0.5, 0.05), name_dist="laplace") -show_prior(exampleLP, size=100000, bins=20) +example_lp = create_bayesian_inference(param=(0.5, 0.05), name_dist="laplace") +show_prior(example_lp, size=100000, bins=20) ``` Having assured ourselves that our sampler seems to do a good job, let's put it to work in using MCMC to compute posterior probabilities. @@ -622,7 +640,7 @@ Having assured ourselves that our sampler seems to do a good job, let's put it t We construct a class `BayesianInferencePlot` to implement MCMC or VI algorithms and plot multiple posteriors for different updating data sizes and different possible priors. 
-This class takes as inputs the true data generating parameter `theta`, a list of updating data sizes for multiple posterior plotting, and a defined and parametrized `BayesianInference` class. +This class takes as inputs the true data generating parameter `θ`, a list of updating data sizes for multiple posterior plotting, and a defined and parametrized `BayesianInference` class. It has two key methods: @@ -631,49 +649,73 @@ It has two key methods: - `BayesianInferencePlot.SVI_plot()` takes desired VI distribution class ('beta' or 'normal') as input and plots the posteriors together with the prior. ```{code-cell} ipython3 -@dataclass -class BayesianInferencePlot: +class BayesianInferencePlot(NamedTuple): """ - Easily implement the MCMC and VI inference for a given instance of BayesianInference class and - plot the prior together with multiple posteriors + Easily implement the MCMC and VI inference for a given instance of + BayesianInference class and plot the prior together with multiple posteriors Parameters ---------- - theta : float. + θ : float. the true DGP parameter N_list : list. a list of sample size - BayesianInferenceClass : class. - a class initiated using BayesianInference() + bayesian_model : BayesianInference. + a class initiated using create_bayesian_inference() + binwidth : float. + plotting parameter for histogram bin width + linewidth : float. + plotting parameter for line width + colorlist : list. + list of colors for plotting + N_max : int. + maximum sample size + data : np.ndarray. + generated data array """ - - theta: float - N_list: list - BayesianInferenceClass: BayesianInference - - # plotting parameters - binwidth: float = 0.02 - linewidth: float = 0.05 - colorlist: list = field(init=False) - - # data generation - N_max: float = field(init=False) - data: np.array = field(init=False) - - def __post_init__(self): - self.colorlist = sns.color_palette(n_colors=len(self.N_list)) - self.N_max = max(self.N_list) - self.data = simulate_draw(self.theta, self.N_max) + θ: float + N_list: Sequence[int] + bayesian_model: BayesianInference + binwidth: float + linewidth: float + colorlist: list + N_max: int + data: np.ndarray + + +def create_bayesian_inference_plot( + θ: float, + N_list: Sequence[int], + bayesian_model: BayesianInference, + *, + binwidth: float = 0.02, + linewidth: float = 0.05, +) -> BayesianInferencePlot: + """Factory function to create a BayesianInferencePlot instance""" + + colorlist = sns.color_palette(n_colors=len(N_list)) + N_max = int(max(N_list)) + data = simulate_draw(θ, N_max) + return BayesianInferencePlot( + θ=θ, + N_list=list(map(int, N_list)), + bayesian_model=bayesian_model, + binwidth=binwidth, + linewidth=linewidth, + colorlist=colorlist, + N_max=N_max, + data=data, + ) def MCMC_plot( plot_model: BayesianInferencePlot, num_samples, num_warmup=1000 - ): +): fig, ax = plt.subplots() # plot prior prior_sample = show_prior( - plot_model.BayesianInferenceClass, disp_plot=0 + plot_model.bayesian_model, disp_plot=0 ) sns.histplot( data=prior_sample, @@ -690,7 +732,7 @@ def MCMC_plot( # plot posteriors for id, n in enumerate(plot_model.N_list): samples = MCMC_sampling( - plot_model.BayesianInferenceClass, plot_model.data[:n], num_samples, num_warmup + plot_model.bayesian_model, plot_model.data[:n], num_samples, num_warmup ) sns.histplot( samples, @@ -728,11 +770,11 @@ def SVI_fitting(guide_dist, params): def SVI_plot( plot_model: BayesianInferencePlot, guide_dist, n_steps=2000 - ): +): fig, ax = plt.subplots() # plot prior - prior_sample = 
show_prior(plot_model.BayesianInferenceClass, disp_plot=0) + prior_sample = show_prior(plot_model.bayesian_model, disp_plot=0) sns.histplot( data=prior_sample, kde=True, @@ -748,7 +790,7 @@ def SVI_plot( # plot posteriors for id, n in enumerate(plot_model.N_list): (params, losses) = SVI_run( - plot_model.BayesianInferenceClass, plot_model.data[:n], guide_dist, n_steps + plot_model.bayesian_model, plot_model.data[:n], guide_dist, n_steps ) x, y = SVI_fitting(guide_dist, params) ax.plot( @@ -765,17 +807,17 @@ def SVI_plot( Let's set some parameters that we'll use in all of the examples below. -To save computer time at first, notice that we'll set `MCMC_num_samples = 2000` and `SVI_num_steps = 5000`. +To save computer time at first, notice that we'll set `mcmc_num_samples = 2000` and `svi_num_steps = 5000`. (Later, to increase accuracy of approximations, we'll want to increase these.) ```{code-cell} ipython3 num_list = [5, 10, 50, 100, 1000] -MCMC_num_samples = 2000 -SVI_num_steps = 5000 +mcmc_num_samples = 2000 +svi_num_steps = 5000 -# theta is the data generating process -true_theta = 0.8 +# θ is the data generating process +true_θ = 0.8 ``` ### Beta prior and posteriors: @@ -799,9 +841,9 @@ mystnb: name: fig_analytical --- # first examine Beta prior -BETA = BayesianInference(param=(5, 5), name_dist="beta") +beta = create_bayesian_inference(param=(5, 5), name_dist="beta") -BETA_plot = BayesianInferencePlot(true_theta, num_list, BETA) +beta_plot = create_bayesian_inference_plot(true_θ, num_list, beta) # plot analytical Beta prior and posteriors xaxis = jnp.linspace(0, 1, 1000) @@ -811,11 +853,11 @@ fig, ax = plt.subplots() # plot analytical beta prior ax.plot(xaxis, y_prior, label="Analytical Beta prior", color="#4C4E52") -data, colorlist, N_list = BETA_plot.data, BETA_plot.colorlist, BETA_plot.N_list +data, colorlist, N_list = beta_plot.data, beta_plot.colorlist, beta_plot.N_list # Plot analytical beta posteriors for id, n in enumerate(N_list): - func = analytical_beta_posterior(data[:n], alpha0=5, beta0=5) + func = analytical_beta_posterior(data[:n], α0=5, β0=5) y_posterior = func.pdf(xaxis) ax.plot( xaxis, @@ -842,7 +884,7 @@ mystnb: --- MCMC_plot( - BETA_plot, num_samples=MCMC_num_samples + beta_plot, num_samples=mcmc_num_samples ) ``` @@ -856,7 +898,7 @@ mystnb: --- SVI_plot( - BETA_plot, guide_dist="beta", n_steps=SVI_num_steps + beta_plot, guide_dist="beta", n_steps=svi_num_steps ) ``` @@ -875,7 +917,7 @@ that will be more accurate, as we shall see next. ```{code-cell} ipython3 SVI_plot( - BETA_plot, guide_dist="beta", n_steps=100000 + beta_plot, guide_dist="beta", n_steps=100000 ) ``` @@ -886,7 +928,7 @@ next proceed to situations in which our prior is not a beta distribution, so we So we will have non-conjugate priors and are cast into situations in which we can't calculate posteriors analytically. -### MCMC +### Markov chain Monte Carlo First, we implement and display MCMC. 
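The experiments below summarize the MCMC output graphically; the same draws can also be reduced to a point estimate and an equal-tailed credible interval. A minimal sketch of such a numerical summary; the helper name is invented, and it would take the array of posterior draws produced by the NUTS sampler.

```python
# Sketch: numerical summary of posterior draws for theta.
import jax.numpy as jnp

def summarize_draws(draws):
    draws = jnp.asarray(draws)
    lo, hi = jnp.quantile(draws, jnp.array([0.025, 0.975]))
    return {"mean": float(draws.mean()),
            "95% interval": (float(lo), float(hi))}

# e.g. summarize_draws(samples) where samples come from the MCMC sampling routine
print(summarize_draws(jnp.array([0.78, 0.81, 0.79, 0.83, 0.80])))
```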
@@ -895,17 +937,63 @@ We first initialize the `BayesianInference` classes and then can directly call ` ```{code-cell} ipython3 # Initialize BayesianInference classes # Try uniform -STD_UNIFORM = BayesianInference(param=(0, 1), name_dist="uniform") -UNIFORM = BayesianInference(param=(0.2, 0.7), name_dist="uniform") +std_uniform = create_bayesian_inference(param=(0, 1), name_dist="uniform") +uniform = create_bayesian_inference(param=(0.2, 0.7), name_dist="uniform") # Try truncated log normal -LOGNORMAL = BayesianInference(param=(0, 2), name_dist="lognormal") +lognormal = create_bayesian_inference(param=(0, 2), name_dist="lognormal") # Try Von Mises -VONMISES = BayesianInference(param=10, name_dist="vonMises") +vonmises = create_bayesian_inference(param=10, name_dist="vonMises") # Try Laplace -LAPLACE = BayesianInference(param=(0.5, 0.07), name_dist="laplace") +laplace = create_bayesian_inference(param=(0.5, 0.07), name_dist="laplace") +``` + +To conduct our experiments more concisely, here we define two experiment functions that will print the model information and plot the result. + + +```{code-cell} ipython3 +def plot_mcmc_experiment( + bayesian_model: BayesianInference, + true_θ: float, + num_list: Sequence[int], + num_samples: int, + num_warmup: int = 1000, + description: str = "" +): + """ + Helper function to run and plot MCMC experiments for a given Bayesian model + """ + print( + f"=======INFO=======\nParameters: {bayesian_model.param}\nPrior Dist: {bayesian_model.name_dist}" + ) + if description: + print(description) + + plot_model = create_bayesian_inference_plot(true_θ, num_list, bayesian_model) + MCMC_plot(plot_model, num_samples=num_samples, num_warmup=num_warmup) + + +def plot_svi_experiment( + bayesian_model: BayesianInference, + true_θ: float, + num_list: Sequence[int], + guide_dist: str, + n_steps: int, + description: str = "" +): + """ + Helper function to run and plot SVI experiments for a given Bayesian model + """ + print( + f"=======INFO=======\nParameters: {bayesian_model.param}\nPrior Dist: {bayesian_model.name_dist}" + ) + if description: + print(description) + + plot_model = create_bayesian_inference_plot(true_θ, num_list, bayesian_model) + SVI_plot(plot_model, guide_dist=guide_dist, n_steps=n_steps) ``` ```{code-cell} ipython3 @@ -917,15 +1005,11 @@ mystnb: name: fig_mcmc_stduniform --- # Uniform -example_CLASS = STD_UNIFORM -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -MCMC_plot( - example_plotCLASS, num_samples=MCMC_num_samples +plot_mcmc_experiment( + std_uniform, + true_θ, + num_list, + mcmc_num_samples ) ``` @@ -937,15 +1021,11 @@ mystnb: MCMC density (uniform prior) name: fig_mcmc_uniform --- -example_CLASS = UNIFORM -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -MCMC_plot( - example_plotCLASS, num_samples=MCMC_num_samples +plot_mcmc_experiment( + uniform, + true_θ, + num_list, + mcmc_num_samples ) ``` @@ -964,15 +1044,11 @@ mystnb: name: fig_mcmc_lognormal --- # log normal -example_CLASS = LOGNORMAL -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -MCMC_plot( - example_plotCLASS, num_samples=MCMC_num_samples 
+plot_mcmc_experiment( + lognormal, + true_θ, + num_list, + mcmc_num_samples ) ``` @@ -985,16 +1061,12 @@ mystnb: name: fig_mcmc_vonmises --- # von Mises -example_CLASS = VONMISES -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -print("\nNOTE: Shifted von Mises") -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -MCMC_plot( - example_plotCLASS, num_samples=MCMC_num_samples +plot_mcmc_experiment( + vonmises, + true_θ, + num_list, + mcmc_num_samples, + description="\nNOTE: Shifted von Mises" ) ``` @@ -1007,24 +1079,20 @@ mystnb: name: fig_mcmc_laplace --- # Laplace -example_CLASS = LAPLACE -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -MCMC_plot( - example_plotCLASS, num_samples=MCMC_num_samples +plot_mcmc_experiment( + laplace, + true_θ, + num_list, + mcmc_num_samples ) ``` -### VI +### Variational inference To get more accuracy we will now increase the number of steps for Variational Inference (VI) ```{code-cell} ipython3 -SVI_num_steps = 50000 +svi_num_steps = 50000 ``` #### VI with a truncated normal guide @@ -1037,15 +1105,12 @@ mystnb: name: fig_svi_uniform_normal --- # Uniform -example_CLASS = BayesianInference(param=(0, 1), name_dist="uniform") -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -SVI_plot( - example_plotCLASS, guide_dist="normal", n_steps=SVI_num_steps +plot_svi_experiment( + create_bayesian_inference(param=(0, 1), name_dist="uniform"), + true_θ, + num_list, + "normal", + svi_num_steps ) ``` @@ -1058,15 +1123,12 @@ mystnb: name: fig_svi_lognormal_normal --- # log normal -example_CLASS = LOGNORMAL -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -SVI_plot( - example_plotCLASS, guide_dist="normal", n_steps=SVI_num_steps +plot_svi_experiment( + lognormal, + true_θ, + num_list, + "normal", + svi_num_steps ) ``` @@ -1079,15 +1141,12 @@ mystnb: name: fig_svi_laplace_normal --- # Laplace -example_CLASS = LAPLACE -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -SVI_plot( - example_plotCLASS, guide_dist="normal", n_steps=SVI_num_steps +plot_svi_experiment( + laplace, + true_θ, + num_list, + "normal", + svi_num_steps ) ``` @@ -1102,15 +1161,12 @@ mystnb: name: fig_svi_uniform_beta --- # uniform -example_CLASS = STD_UNIFORM -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -SVI_plot( - example_plotCLASS, guide_dist="beta", n_steps=SVI_num_steps +plot_svi_experiment( + std_uniform, + true_θ, + num_list, + "beta", + svi_num_steps ) ``` @@ -1123,15 +1179,12 @@ mystnb: name: fig_svi_lognormal_beta --- # log normal -example_CLASS = LOGNORMAL -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -SVI_plot( - example_plotCLASS, 
guide_dist="beta", n_steps=SVI_num_steps +plot_svi_experiment( + lognormal, + true_θ, + num_list, + "beta", + svi_num_steps ) ``` @@ -1144,16 +1197,13 @@ mystnb: name: fig_svi_vonmises_beta --- # von Mises -example_CLASS = VONMISES -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -print("Shifted von Mises") -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -SVI_plot( - example_plotCLASS, guide_dist="beta", n_steps=SVI_num_steps +plot_svi_experiment( + vonmises, + true_θ, + num_list, + "beta", + svi_num_steps, + description="Shifted von Mises" ) ``` @@ -1166,14 +1216,11 @@ mystnb: name: fig_svi_laplace_beta --- # Laplace -example_CLASS = LAPLACE -print( - f"=======INFO=======\nParameters: {example_CLASS.param}\nPrior Dist: {example_CLASS.name_dist}" -) -example_plotCLASS = BayesianInferencePlot( - true_theta, num_list, example_CLASS - ) -SVI_plot( - example_plotCLASS, guide_dist="beta", n_steps=SVI_num_steps +plot_svi_experiment( + laplace, + true_θ, + num_list, + "beta", + svi_num_steps ) ``` From 5d12d723d28b6f85b31e3fb8c196db3c24e6835f Mon Sep 17 00:00:00 2001 From: Kenko LI Date: Sat, 6 Sep 2025 17:42:44 +0800 Subject: [PATCH 19/22] delete dataclasses import --- lectures/bayes_nonconj.md | 1 - 1 file changed, 1 deletion(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index d44bbeac4..c965b7897 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -52,7 +52,6 @@ import seaborn as sns import matplotlib.pyplot as plt import scipy.stats as st -from dataclasses import dataclass, field from typing import NamedTuple, Sequence import jax.numpy as jnp from jax import random From f7f9c3b1b761b74c1c50141c19c7d654907a9946 Mon Sep 17 00:00:00 2001 From: Kenko LI Date: Wed, 10 Sep 2025 16:45:13 +0800 Subject: [PATCH 20/22] modify styling --- lectures/bayes_nonconj.md | 111 +++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 43 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index c965b7897..44a8c0d8e 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -209,7 +209,7 @@ We will use the following priors: - a truncated log-normal distribution with support on $[0,1]$ with parameters $(\mu,\sigma)$. - - To implement this, let $Z\sim Normal(\mu,\sigma)$ and $\tilde{Z}$ be truncated normal with support $[-\infty,\log(1)]$, then $\exp(Z)$ has a log normal distribution with bounded support $[0,1]$. This can be easily coded since `numpyro` has a built-in truncated normal distribution, and `numpyro`'s `TransformedDistribution` class that includes an exponential transformation. + - To implement this, let $Z\sim N(\mu,\sigma)$ and $\tilde{Z}$ be truncated normal with support $[-\infty,\log(1)]$, then $\exp(Z)$ has a log normal distribution with bounded support $[0,1]$. This can be easily coded since `numpyro` has a built-in truncated normal distribution, and `numpyro`'s `TransformedDistribution` class that includes an exponential transformation. - a shifted von Mises distribution that has support confined to $[0,1]$ with parameter $(\mu,\kappa)$. @@ -224,7 +224,7 @@ We will use the following priors: - The truncated Laplace can be created using `numpyro`'s `TruncatedDistribution` class. 
```{code-cell} ipython3 -def TruncatedLogNormal_trans(loc, scale): +def truncated_log_normal_trans(loc, scale): """ Obtains the truncated log normal distribution using numpyro's TruncatedNormal and ExpTransform @@ -232,18 +232,21 @@ def TruncatedLogNormal_trans(loc, scale): base_dist = dist.TruncatedNormal( low=-jnp.inf, high=jnp.log(1), loc=loc, scale=scale ) - return dist.TransformedDistribution(base_dist, dist.transforms.ExpTransform()) + return dist.TransformedDistribution( + base_dist, dist.transforms.ExpTransform() + ) -def ShiftedVonMises(κ): +def shifted_von_mises(κ): """Obtains the shifted von Mises distribution using AffineTransform""" base_dist = dist.VonMises(0, κ) return dist.TransformedDistribution( - base_dist, dist.transforms.AffineTransform(loc=0.5, scale=1 / (2 * jnp.pi)) + base_dist, + dist.transforms.AffineTransform(loc=0.5, scale=1 / (2 * jnp.pi)) ) -def TruncatedLaplace(loc, scale): +def truncated_laplace(loc, scale): """Obtains the truncated Laplace distribution on [0,1]""" base_dist = dist.Laplace(loc, scale) return dist.TruncatedDistribution(base_dist, low=0.0, high=1.0) @@ -354,11 +357,11 @@ The class `BayesianInference` has several key methods : - `show_prior`: - Plots the approximate prior distribution by repeatedly drawing samples and fitting a kernel density curve. -- `MCMC_sampling`: +- `mcmc_sampling`: - INPUT: (data, num_samples, num_warmup=1000) - Takes a `jnp.array` data and generates MCMC sampling of posterior of size `num_samples`. -- `SVI_run`: +- `svi_run`: - INPUT: (data, guide_dist, n_steps=10000) - guide_dist = 'normal' - use a **truncated** normal distribution as the parametrized guide - guide_dist = 'beta' - use a beta distribution as the parametrized guide @@ -371,25 +374,24 @@ class BayesianInference(NamedTuple): --------- param : tuple. a tuple object that contains all relevant parameters for the distribution - dist : str. - name of the distribution - 'beta', 'uniform', 'lognormal', 'vonMises', 'tent' + name_dist : str. + name of the distribution - 'beta', 'uniform', 'lognormal', 'vonMises', 'laplace' + rng_key : jax.random.PRNGKey + PRNG key for random number generation. 
""" param: tuple name_dist: str - # jax requires explicit PRNG state to be passed rng_key: random.PRNGKey def create_bayesian_inference( param: tuple, name_dist: str, - *, - rng_key: random.PRNGKey = None + seed: int = 0 ) -> BayesianInference: """Factory function to create a BayesianInference instance""" - if rng_key is None: - rng_key = random.PRNGKey(0) + rng_key = random.PRNGKey(seed) return BayesianInference( param=param, @@ -418,21 +420,23 @@ def sample_prior(model: BayesianInference): # unpack parameters loc, scale = model.param sample = numpyro.sample( - "theta", TruncatedLogNormal_trans(loc, scale), rng_key=model.rng_key + "theta", + truncated_log_normal_trans(loc, scale), + rng_key=model.rng_key ) elif model.name_dist == "vonMises": # unpack parameters κ = model.param sample = numpyro.sample( - "theta", ShiftedVonMises(κ), rng_key=model.rng_key + "theta", shifted_von_mises(κ), rng_key=model.rng_key ) elif model.name_dist == "laplace": # unpack parameters loc, scale = model.param sample = numpyro.sample( - "theta", TruncatedLaplace(loc, scale), rng_key=model.rng_key + "theta", truncated_laplace(loc, scale), rng_key=model.rng_key ) return sample @@ -453,7 +457,12 @@ def show_prior( # plot histogram and kernel density if disp_plot == 1: sns.displot( - sample_array, kde=True, stat="density", bins=bins, height=5, aspect=1.5 + sample_array, + kde=True, + stat="density", + bins=bins, + height=5, + aspect=1.5 ) plt.xlim(0, 1) plt.show() @@ -472,7 +481,7 @@ def set_model(model: BayesianInference, data): ) -def MCMC_sampling( +def mcmc_sampling( model: BayesianInference, data, num_samples, num_warmup=1000 ): """ @@ -517,10 +526,13 @@ def truncnormal_guide(model: BayesianInference, data): """ loc = numpyro.param("loc", 0.5, constraint=constraints.interval(0.0, 1.0)) scale = numpyro.param("scale", 1, constraint=constraints.positive) - numpyro.sample("theta", dist.TruncatedNormal(loc, scale, low=0.0, high=1.0)) + numpyro.sample( + "theta", + dist.TruncatedNormal(loc, scale, low=0.0, high=1.0) + ) -def SVI_init(model: BayesianInference, guide_dist, lr=0.0005): +def svi_init(model: BayesianInference, guide_dist, lr=0.0005): """Initiate SVI training mode with Adam optimizer""" adam_params = {"lr": lr} @@ -541,7 +553,7 @@ def SVI_init(model: BayesianInference, guide_dist, lr=0.0005): return svi -def SVI_run(model: BayesianInference, data, guide_dist, n_steps=10000): +def svi_run(model: BayesianInference, data, guide_dist, n_steps=10000): """ Runs SVI and returns optimized parameters and losses @@ -552,13 +564,15 @@ def SVI_run(model: BayesianInference, data, guide_dist, n_steps=10000): """ # initiate SVI - svi = SVI_init(model, guide_dist) + svi = svi_init(model, guide_dist) data = jnp.array(data, dtype=float) result = svi.run( model.rng_key, n_steps, model, data, progress_bar=False ) - params = dict((key, jnp.asarray(value)) for key, value in result.params.items()) + params = dict( + (key, jnp.asarray(value)) for key, value in result.params.items() + ) losses = jnp.asarray(result.losses) return params, losses @@ -643,9 +657,9 @@ This class takes as inputs the true data generating parameter `θ`, a list of up It has two key methods: -- `BayesianInferencePlot.MCMC_plot()` takes desired MCMC sample size as input and plots the output posteriors together with the prior defined in `BayesianInference` class. +- `BayesianInferencePlot.mcmc_plot()` takes desired MCMC sample size as input and plots the output posteriors together with the prior defined in `BayesianInference` class. 
-- `BayesianInferencePlot.SVI_plot()` takes desired VI distribution class ('beta' or 'normal') as input and plots the posteriors together with the prior. +- `BayesianInferencePlot.svi_plot()` takes desired VI distribution class ('beta' or 'normal') as input and plots the posteriors together with the prior. ```{code-cell} ipython3 class BayesianInferencePlot(NamedTuple): @@ -707,7 +721,7 @@ def create_bayesian_inference_plot( ) -def MCMC_plot( +def mcmc_plot( plot_model: BayesianInferencePlot, num_samples, num_warmup=1000 ): fig, ax = plt.subplots() @@ -730,8 +744,11 @@ def MCMC_plot( # plot posteriors for id, n in enumerate(plot_model.N_list): - samples = MCMC_sampling( - plot_model.bayesian_model, plot_model.data[:n], num_samples, num_warmup + samples = mcmc_sampling( + plot_model.bayesian_model, + plot_model.data[:n], + num_samples, + num_warmup ) sns.histplot( samples, @@ -748,7 +765,7 @@ def MCMC_plot( plt.show() -def SVI_fitting(guide_dist, params): +def svi_fitting(guide_dist, params): """Fit the beta/truncnormal curve using parameters trained by SVI.""" # create x axis xaxis = jnp.linspace(0, 1, 1000) @@ -767,7 +784,7 @@ def SVI_fitting(guide_dist, params): return (xaxis, y) -def SVI_plot( +def svi_plot( plot_model: BayesianInferencePlot, guide_dist, n_steps=2000 ): fig, ax = plt.subplots() @@ -788,10 +805,10 @@ def SVI_plot( # plot posteriors for id, n in enumerate(plot_model.N_list): - (params, losses) = SVI_run( + (params, losses) = svi_run( plot_model.bayesian_model, plot_model.data[:n], guide_dist, n_steps ) - x, y = SVI_fitting(guide_dist, params) + x, y = svi_fitting(guide_dist, params) ax.plot( x, y, @@ -882,7 +899,7 @@ mystnb: name: fig_mcmc_beta --- -MCMC_plot( +mcmc_plot( beta_plot, num_samples=mcmc_num_samples ) ``` @@ -896,7 +913,7 @@ mystnb: name: fig_svi_beta_beta --- -SVI_plot( +svi_plot( beta_plot, guide_dist="beta", n_steps=svi_num_steps ) ``` @@ -915,7 +932,7 @@ that will be more accurate, as we shall see next. (Increasing the step size increases computational time though). 
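One rough way to judge how many steps are enough is to inspect the ELBO trace that `svi_run` returns and check that the loss has flattened out by the final iterations. Here is a sketch, assuming the `beta_plot` object and the helpers defined above:

```python
# plot the negative ELBO recorded during SVI training; a curve that is still
# falling at the end suggests that n_steps should be increased
params, losses = svi_run(
    beta_plot.bayesian_model, beta_plot.data, guide_dist="beta", n_steps=20000
)
plt.plot(losses)
plt.xlabel("SVI step")
plt.ylabel("negative ELBO")
plt.show()
```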
```{code-cell} ipython3 -SVI_plot( +svi_plot( beta_plot, guide_dist="beta", n_steps=100000 ) ``` @@ -965,13 +982,17 @@ def plot_mcmc_experiment( Helper function to run and plot MCMC experiments for a given Bayesian model """ print( - f"=======INFO=======\nParameters: {bayesian_model.param}\nPrior Dist: {bayesian_model.name_dist}" + f"=======INFO=======\n" + f"Parameters: {bayesian_model.param}\n" + f"Prior Dist: {bayesian_model.name_dist}" ) if description: print(description) - plot_model = create_bayesian_inference_plot(true_θ, num_list, bayesian_model) - MCMC_plot(plot_model, num_samples=num_samples, num_warmup=num_warmup) + plot_model = create_bayesian_inference_plot( + true_θ, num_list, bayesian_model + ) + mcmc_plot(plot_model, num_samples=num_samples, num_warmup=num_warmup) def plot_svi_experiment( @@ -986,13 +1007,17 @@ def plot_svi_experiment( Helper function to run and plot SVI experiments for a given Bayesian model """ print( - f"=======INFO=======\nParameters: {bayesian_model.param}\nPrior Dist: {bayesian_model.name_dist}" + f"=======INFO=======\n" + f"Parameters: {bayesian_model.param}\n" + f"Prior Dist: {bayesian_model.name_dist}" ) if description: print(description) - plot_model = create_bayesian_inference_plot(true_θ, num_list, bayesian_model) - SVI_plot(plot_model, guide_dist=guide_dist, n_steps=n_steps) + plot_model = create_bayesian_inference_plot( + true_θ, num_list, bayesian_model + ) + svi_plot(plot_model, guide_dist=guide_dist, n_steps=n_steps) ``` ```{code-cell} ipython3 From b8d6ff609402a42b6ffc4ee37aee57d2c7374d6a Mon Sep 17 00:00:00 2001 From: Kenko LI Date: Wed, 10 Sep 2025 17:14:15 +0800 Subject: [PATCH 21/22] retry building --- lectures/bayes_nonconj.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 44a8c0d8e..769b5ca41 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -1044,7 +1044,7 @@ mystnb: caption: | MCMC density (uniform prior) name: fig_mcmc_uniform ---- +--- plot_mcmc_experiment( uniform, true_θ, From 3f2b51b63447d3bac9720307ed00230b9271db82 Mon Sep 17 00:00:00 2001 From: Matt McKay Date: Thu, 11 Sep 2025 10:17:09 +1000 Subject: [PATCH 22/22] Update lectures/bayes_nonconj.md --- lectures/bayes_nonconj.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lectures/bayes_nonconj.md b/lectures/bayes_nonconj.md index 769b5ca41..ab78d35b2 100644 --- a/lectures/bayes_nonconj.md +++ b/lectures/bayes_nonconj.md @@ -59,10 +59,7 @@ from jax import random import numpyro from numpyro import distributions as dist import numpyro.distributions.constraints as constraints -from numpyro.infer import MCMC -from numpyro.infer import NUTS -from numpyro.infer import SVI -from numpyro.infer import Trace_ELBO +from numpyro.infer import MCMC, NUTS, SVI, Trace_ELBO from numpyro.optim import Adam ```
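For orientation, here is a minimal sketch, not taken from the lecture source, of how the pieces imported above (a NUTS kernel inside `MCMC`, and `SVI` with `Trace_ELBO` and the `Adam` optimizer) fit together on the lecture's binomial setting. A uniform prior stands in for whichever alternative prior one wants, and the model, guide, and parameter names are illustrative only.

```python
import jax.numpy as jnp
from jax import random
import numpyro
from numpyro import distributions as dist
import numpyro.distributions.constraints as constraints
from numpyro.infer import MCMC, NUTS, SVI, Trace_ELBO
from numpyro.optim import Adam


def model(data):
    # uniform prior keeps the sketch short; swap in any alternative prior here
    theta = numpyro.sample("theta", dist.Uniform(0.0, 1.0))
    # binomial likelihood: number of successes out of len(data) trials
    numpyro.sample(
        "obs",
        dist.Binomial(total_count=len(data), probs=theta),
        obs=jnp.sum(data),
    )


def beta_guide(data):
    # parametrized beta family used as the variational approximation
    alpha_q = numpyro.param("alpha_q", 2.0, constraint=constraints.positive)
    beta_q = numpyro.param("beta_q", 2.0, constraint=constraints.positive)
    numpyro.sample("theta", dist.Beta(alpha_q, beta_q))


data = jnp.array(random.bernoulli(random.PRNGKey(0), 0.8, (50,)), dtype=float)

# MCMC: draw posterior samples with the NUTS kernel
mcmc = MCMC(NUTS(model), num_warmup=500, num_samples=2000, progress_bar=False)
mcmc.run(random.PRNGKey(1), data)
print("MCMC posterior mean:", mcmc.get_samples()["theta"].mean())

# SVI: fit the beta guide by stochastically maximizing the ELBO
svi = SVI(model, beta_guide, Adam(step_size=5e-4), Trace_ELBO())
result = svi.run(random.PRNGKey(2), 10_000, data, progress_bar=False)
print("Fitted guide parameters:", result.params)
```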