Mathematical Foundations of AI & ML
Unit 8: The Probabilistic View of Learning
FAU Erlangen-Nürnberg
By the end of this lecture, students can:

- distinguish aleatory (irreducible) from epistemic (reducible) uncertainty
- work with univariate and multivariate Gaussian distributions
- compute entropy and KL divergence
- derive maximum likelihood estimates for Gaussian models
- perform Bayesian updates and contrast MAP estimation with the full posterior
```{mermaid}
%%| echo: false
%%| fig-align: center
graph TD
U["Total Uncertainty"] --> A["Aleatory"]
U --> E["Epistemic"]
A --> A1["Irreducible Noise"]
A --> A2["Bayes Error Floor"]
E --> E1["Reducible by Data"]
E --> E2["Model Uncertainty"]
style A fill:#f96,stroke:#333
style E fill:#69f,stroke:#333
```
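The demo below is driven by two sliders, one for the noise level and one for the sample size. A minimal input cell for them (slider ranges and defaults are assumptions):

```{ojs}
//| echo: false
//| panel: input
viewof aleatory_noise = Inputs.range([0.05, 1], {value: 0.3, step: 0.05, label: "Aleatory Noise (σ)"})
viewof n_samples_unc = Inputs.range([5, 500], {value: 30, step: 5, label: "Samples (N)"})
```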
```{ojs}
//| echo: false
// Ground-truth function generating the data
true_func = (x) => Math.sin(x * Math.PI) + 0.5 * x;
// Generate Data
unc_data = {
const points = [];
const rng = d3.randomNormal(0, aleatory_noise);
for (let i = 0; i < n_samples_unc; i++) {
const x = d3.randomUniform(-2, 2)();
points.push({x: x, y: true_func(x) + rng()});
}
return points.sort((a,b) => a.x - b.x);
}
// Epistemic uncertainty band: instead of fitting a model, approximate a
// confidence interval around the true function; its width shrinks as 1/sqrt(N)
unc_ci_width = aleatory_noise * 1.96 / Math.sqrt(n_samples_unc);
Plot.plot({
width: 800,
height: 400,
x: {domain: [-2.1, 2.1], label: "x"},
y: {domain: [-3, 3], label: "y"},
marks: [
// True function
Plot.line(d3.range(-2, 2.1, 0.1), {x: d => d, y: d => true_func(d), stroke: "gray", strokeDasharray: "4,4", title: "True Process"}),
// Epistemic Uncertainty Band
Plot.areaY(d3.range(-2, 2.1, 0.1), {
x: d => d,
y1: d => true_func(d) - unc_ci_width,
y2: d => true_func(d) + unc_ci_width,
fill: "steelblue",
fillOpacity: 0.3,
title: "Epistemic Uncertainty (Model Confidence)"
}),
// Aleatory Noise (Data points)
Plot.dot(unc_data, {x: "x", y: "y", r: 4, fill: "red", fillOpacity: 0.6, title: "Observations (Aleatory Noise)"})
]
})
```

Increase N: notice how the blue band (epistemic uncertainty) shrinks, while the red dots (aleatory noise) remain scattered.
The univariate Gaussian density with mean \(\mu\) and variance \(\sigma^2\) is

\[ p(x \mid \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\!\left(-\frac{(x-\mu)^2}{2\sigma^2}\right) \]
```{ojs}
//| echo: false
//| panel: input
viewof g_mean = Inputs.range([-3, 3], {value: 0, step: 0.1, label: "Mean (μ)"})
viewof g_std = Inputs.range([0.1, 3], {value: 1, step: 0.1, label: "Std Dev (σ)"})
viewof show_intervals = Inputs.checkbox(["1σ (68%)", "2σ (95%)", "3σ (99.7%)"], {label: "Show Intervals", value: ["1σ (68%)"]})
```

```{ojs}
//| echo: false
gaussian_pdf = (x, mu, sigma) => {
const variance = sigma * sigma;
return (1 / Math.sqrt(2 * Math.PI * variance)) * Math.exp(-Math.pow(x - mu, 2) / (2 * variance));
}
g_x_vals = d3.range(-6, 6.05, 0.05);
g_data = g_x_vals.map(x => ({x: x, y: gaussian_pdf(x, g_mean, g_std)}));
Plot.plot({
width: 800,
height: 350,
x: {domain: [-6, 6], label: "x"},
y: {domain: [0, 4.2], label: "Density p(x)"},
marks: [
// 3 sigma
show_intervals.includes("3σ (99.7%)") ? Plot.areaY(g_data.filter(d => d.x >= g_mean - 3*g_std && d.x <= g_mean + 3*g_std), {x: "x", y: "y", fill: "#ffbaba", fillOpacity: 0.5}) : null,
// 2 sigma
show_intervals.includes("2σ (95%)") ? Plot.areaY(g_data.filter(d => d.x >= g_mean - 2*g_std && d.x <= g_mean + 2*g_std), {x: "x", y: "y", fill: "#ff7b7b", fillOpacity: 0.6}) : null,
// 1 sigma
show_intervals.includes("1σ (68%)") ? Plot.areaY(g_data.filter(d => d.x >= g_mean - 1*g_std && d.x <= g_mean + 1*g_std), {x: "x", y: "y", fill: "#ff5252", fillOpacity: 0.8}) : null,
// PDF Line
Plot.line(g_data, {x: "x", y: "y", stroke: "white", strokeWidth: 3}),
// Mean Line
Plot.ruleX([g_mean], {stroke: "white", strokeDasharray: "4,4"})
]
})
```
In \(d\) dimensions, with mean vector \(\boldsymbol{\mu}\) and covariance matrix \(\boldsymbol{\Sigma}\), the density becomes

\[ p(\mathbf{x} \mid \boldsymbol{\mu}, \boldsymbol{\Sigma}) = (2\pi)^{-d/2} |\boldsymbol{\Sigma}|^{-1/2} \exp\!\left(-\frac{1}{2}(\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right) \]
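In 2D the determinant and inverse can be written out by hand. A small plain-JavaScript sketch (the function name and parameterization are illustrative, chosen to match the sliders below) evaluating the density at a point:

```js
// Bivariate Gaussian density at (x, y) with mean (mx, my), variances vx, vy,
// and correlation rho, so Sigma = [[vx, c], [c, vy]] with c = rho*sqrt(vx*vy)
function gaussian2D(x, y, mx, my, vx, vy, rho) {
  const c = rho * Math.sqrt(vx * vy);
  const det = vx * vy - c * c;                          // |Sigma|
  const dx = x - mx, dy = y - my;
  // (x - mu)^T Sigma^{-1} (x - mu), using the explicit 2x2 inverse
  const quad = (vy * dx * dx - 2 * c * dx * dy + vx * dy * dy) / det;
  return Math.exp(-0.5 * quad) / (2 * Math.PI * Math.sqrt(det));
}

console.log(gaussian2D(0, 0, 0, 0, 1, 1, 0)); // 1/(2π) ≈ 0.159 at the mode of a standard bivariate normal
```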


```{ojs}
//| echo: false
//| panel: input
viewof cov_var_x = Inputs.range([0.1, 5], {value: 2, step: 0.1, label: "Variance X (σ_x²)"})
viewof cov_var_y = Inputs.range([0.1, 5], {value: 2, step: 0.1, label: "Variance Y (σ_y²)"})
viewof cov_rho = Inputs.range([-0.99, 0.99], {value: 0, step: 0.05, label: "Correlation (ρ)"})
```

```{ojs}
//| echo: false
// Generate 2D Gaussian samples
cov_data = {
const N = 500;
const points = [];
// Cholesky decomposition of [[var_x, cov], [cov, var_y]]
// cov = rho * sqrt(var_x * var_y)
const cv = cov_rho * Math.sqrt(cov_var_x * cov_var_y);
const L11 = Math.sqrt(cov_var_x);
const L21 = cv / L11;
const L22 = Math.sqrt(cov_var_y - L21 * L21);
const rng = d3.randomNormal(0, 1);
for (let i = 0; i < N; i++) {
const z1 = rng();
const z2 = rng();
const x = L11 * z1;
const y = L21 * z1 + L22 * z2;
points.push({x: x, y: y});
}
return points;
}
Plot.plot({
width: 600,
height: 600,
x: {domain: [-8, 8], label: "Feature 1 (X)"},
y: {domain: [-8, 8], label: "Feature 2 (Y)"},
aspectRatio: 1,
marks: [
Plot.dot(cov_data, {x: "x", y: "y", r: 3, fill: "steelblue", fillOpacity: 0.4}),
Plot.density(cov_data, {x: "x", y: "y", stroke: "white", thresholds: 5})
]
})
```

For a continuous distribution \(p(x)\), the (differential) entropy is
\[ H(p) \;=\; -\int p(x)\, \log p(x)\, dx \;=\; -\mathbb{E}_p[\log p(X)] \]
(discrete analogue: \(H(p) = -\sum_x p(x) \log p(x)\)).
For distributions \(q\) and \(p\) on the same space, the Kullback–Leibler (KL) divergence is
\[ \mathrm{KL}(q \,\|\, p) \;=\; \mathbb{E}_q\!\left[\log \tfrac{q(x)}{p(x)}\right] \;=\; \int q(x)\, \log \tfrac{q(x)}{p(x)}\, dx \]
For two 1D Gaussians, KL admits a closed form:
\[ \mathrm{KL}\!\left(\mathcal{N}(\mu_1,\sigma_1^2)\,\|\,\mathcal{N}(\mu_2,\sigma_2^2)\right) \;=\; \log\frac{\sigma_2}{\sigma_1} \;+\; \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2} \;-\; \frac{1}{2} \]
The form used to regularize variational autoencoders — \(p = \mathcal{N}(\mathbf{0}, I)\) vs. \(q = \mathcal{N}(\boldsymbol{\mu},\, \mathrm{diag}(\sigma_1^2,\dots,\sigma_d^2))\) — is the per-dimension sum:
\[ \mathrm{KL}(q\,\|\,p) \;=\; \tfrac{1}{2} \sum_{j=1}^{d} \left( \mu_j^2 + \sigma_j^2 - \log \sigma_j^2 - 1 \right) \]
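Both closed forms are one-liners to check numerically. A minimal plain-JavaScript sketch (function names are illustrative):

```js
// Closed-form KL( N(mu1, s1^2) || N(mu2, s2^2) ) between two 1D Gaussians
function klGaussian1D(mu1, s1, mu2, s2) {
  return Math.log(s2 / s1) + (s1 * s1 + (mu1 - mu2) ** 2) / (2 * s2 * s2) - 0.5;
}

// Per-dimension VAE regularizer: KL( N(mu, diag(sigma^2)) || N(0, I) )
function klVae(mu, sigma) {
  let kl = 0;
  for (let j = 0; j < mu.length; j++) {
    const v = sigma[j] * sigma[j];
    kl += 0.5 * (mu[j] * mu[j] + v - Math.log(v) - 1);
  }
  return kl;
}

console.log(klGaussian1D(0, 1, 0, 1)); // 0: identical distributions
console.log(klVae([0, 0], [1, 1]));    // 0: q already equals the standard normal prior
```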
For i.i.d. data \(\mathcal{D} = \{\mathbf{x}_1, \dots, \mathbf{x}_N\}\), the likelihood of the parameters is

\[ \mathcal{L}(\boldsymbol{\theta}) = p(\mathcal{D} \mid \boldsymbol{\theta}) = \prod_{i=1}^{N} p(\mathbf{x}_i \mid \boldsymbol{\theta}) \]
\[ \ell(\boldsymbol{\theta}) = \log \mathcal{L}(\boldsymbol{\theta}) = \sum_{i=1}^{N} \log p(\mathbf{x}_i \mid \boldsymbol{\theta}) \]
\[ \hat{\boldsymbol{\theta}}_{\text{MLE}} = \arg\max_{\boldsymbol{\theta}} \ell(\boldsymbol{\theta}) \]
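The demo below lets you hand-tune a guessed mean and standard deviation against five fixed data points. A minimal input cell driving it (slider ranges and defaults assumed):

```{ojs}
//| echo: false
//| panel: input
viewof mle_mu = Inputs.range([-3, 3], {value: 0, step: 0.1, label: "Guess Mean (μ)"})
viewof mle_sigma = Inputs.range([0.1, 3], {value: 1, step: 0.1, label: "Guess Std Dev (σ)"})
```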
```{ojs}
//| echo: false
// 5 fixed data points
mle_fixed_data = [{x: -0.5}, {x: 0.2}, {x: 1.1}, {x: 1.5}, {x: 2.2}]
// Calculate true MLE
mle_true_mu = d3.mean(mle_fixed_data, d => d.x);
mle_true_var = d3.mean(mle_fixed_data, d => Math.pow(d.x - mle_true_mu, 2));
mle_true_sigma = Math.sqrt(mle_true_var);
// Calculate log likelihood for current guess
mle_log_likelihood = {
let ll = 0;
for(let p of mle_fixed_data) {
const variance = mle_sigma * mle_sigma;
const pdf = (1 / Math.sqrt(2 * Math.PI * variance)) * Math.exp(-Math.pow(p.x - mle_mu, 2) / (2 * variance));
ll = ll + Math.log(pdf);
}
return ll;
}
// Max log likelihood for the true parameters
mle_max_ll = {
let ll = 0;
for(let p of mle_fixed_data) {
const variance = mle_true_var;
const pdf = (1 / Math.sqrt(2 * Math.PI * variance)) * Math.exp(-Math.pow(p.x - mle_true_mu, 2) / (2 * variance));
ll = ll + Math.log(pdf);
}
return ll;
}
mle_pdf_curve = d3.range(-5, 5.05, 0.05).map(x => {
const variance = mle_sigma * mle_sigma;
return {x: x, y: (1 / Math.sqrt(2 * Math.PI * variance)) * Math.exp(-Math.pow(x - mle_mu, 2) / (2 * variance))}
});
html`
<div style="margin-bottom: 20px;">
<strong>Current Log-Likelihood: <span style="color: ${mle_log_likelihood > mle_max_ll - 0.5 ? '#a8ff9e' : '#ff9e9e'}">${mle_log_likelihood.toFixed(2)}</span></strong><br>
<!-- <progress> has no min attribute and rejects negative values, so shift by the -30 floor -->
<progress value="${Math.max(0, mle_log_likelihood + 30)}" max="${mle_max_ll + 30}" style="width: 100%; height: 20px; accent-color: ${mle_log_likelihood > mle_max_ll - 0.5 ? '#a8ff9e' : '#ff9e9e'};"></progress>
</div>
`
Plot.plot({
width: 800,
height: 350,
x: {domain: [-5, 5], label: "Data Value (x)"},
y: {domain: [0, 1.5], label: "Likelihood p(x|μ,σ)"},
marks: [
Plot.ruleY([0]),
// The guessed PDF
Plot.areaY(mle_pdf_curve, {x: "x", y: "y", fill: "steelblue", fillOpacity: 0.3}),
Plot.line(mle_pdf_curve, {x: "x", y: "y", stroke: "white", strokeWidth: 2}),
// The data points projected onto the PDF
Plot.dot(mle_fixed_data, {
x: "x",
y: d => {
const variance = mle_sigma * mle_sigma;
return (1 / Math.sqrt(2 * Math.PI * variance)) * Math.exp(-Math.pow(d.x - mle_mu, 2) / (2 * variance));
},
r: 6, stroke: "#ff7b7b", fill: "none", strokeWidth: 2
}),
// Droplines to axis
Plot.ruleX(mle_fixed_data, {
x: "x",
y1: 0,
y2: d => {
const variance = mle_sigma * mle_sigma;
return (1 / Math.sqrt(2 * Math.PI * variance)) * Math.exp(-Math.pow(d.x - mle_mu, 2) / (2 * variance));
},
stroke: "#ff7b7b", strokeDasharray: "2,2"
}),
// Data points on axis
Plot.dot(mle_fixed_data, {x: "x", y: 0, r: 6, fill: "#ff7b7b"})
]
})
```

\[ \ell(\mu) = -\frac{N}{2}\log(2\pi\sigma^2) - \frac{1}{2\sigma^2}\sum_{i=1}^{N}(x_i - \mu)^2 \]
\[ \hat{\mu}_{\text{MLE}} = \frac{1}{N}\sum_{i=1}^{N} x_i = \bar{x} \]
\[ \hat{\sigma}^2_{\text{MLE}} = \frac{1}{N}\sum_{i=1}^{N}(x_i - \hat{\mu})^2 \]
\[ \ell(\boldsymbol{\theta}) = -\frac{N}{2}\log(2\pi\sigma^2) - \frac{1}{2\sigma^2}\sum_{i=1}^{N}(y_i - f_{\boldsymbol{\theta}}(\mathbf{x}_i))^2 \]

Maximizing the likelihood of a regression model under Gaussian noise is therefore equivalent to minimizing the sum of squared errors.
\[ \hat{\boldsymbol{\mu}} = \frac{1}{N}\sum_{i=1}^{N}\mathbf{x}_i, \quad \hat{\boldsymbol{\Sigma}} = \frac{1}{N}\sum_{i=1}^{N}(\mathbf{x}_i - \hat{\boldsymbol{\mu}})(\mathbf{x}_i - \hat{\boldsymbol{\mu}})^\top \]
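Both estimators are plain averages. A short plain-JavaScript sketch (illustrative, not tied to the cells above) computing them:

```js
// MLE for a multivariate Gaussian: sample mean and biased (1/N) covariance
function gaussianMle(points) {            // points: array of length-d vectors
  const N = points.length, d = points[0].length;
  const mu = new Array(d).fill(0);
  for (const p of points) p.forEach((v, j) => { mu[j] += v / N; });
  const Sigma = Array.from({length: d}, () => new Array(d).fill(0));
  for (const p of points)
    for (let j = 0; j < d; j++)
      for (let k = 0; k < d; k++)
        Sigma[j][k] += (p[j] - mu[j]) * (p[k] - mu[k]) / N;
  return {mu, Sigma};
}

console.log(gaussianMle([[0, 0], [2, 2], [4, 4]]));
// mu = [2, 2]; all entries of Sigma equal 8/3: the coordinates are perfectly correlated
```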


\[ p(\boldsymbol{\theta} \mid \mathcal{D}) = \frac{p(\mathcal{D} \mid \boldsymbol{\theta}) \, p(\boldsymbol{\theta})}{p(\mathcal{D})} \]

where the evidence (marginal likelihood) is the normalizing constant

\[ p(\mathcal{D}) = \int p(\mathcal{D} \mid \boldsymbol{\theta}) \, p(\boldsymbol{\theta}) \, d\boldsymbol{\theta} \]
For a Gaussian likelihood with known noise variance \(\sigma^2\) and a Gaussian prior \(\mathcal{N}(\mu_0, \sigma_0^2)\) on the mean, the posterior is again Gaussian:

\[ \mu_N = \frac{\sigma^2 \mu_0 + N \sigma_0^2 \bar{x}}{\sigma^2 + N \sigma_0^2}, \quad \sigma_N^2 = \frac{\sigma^2 \sigma_0^2}{\sigma^2 + N \sigma_0^2} \]
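Equivalently, in precision form (which is how the cell below computes it): precisions add, and the posterior mean is a precision-weighted average. Conjugacy also means updating one observation at a time reproduces the batch result; a minimal plain-JavaScript sketch (names illustrative):

```js
// One conjugate update of a Gaussian prior N(mu0, var0) on the mean,
// given a single observation x with known noise variance sigma2
function updateGaussian(mu0, var0, x, sigma2) {
  const varN = 1 / (1 / var0 + 1 / sigma2);       // precisions add
  const muN = varN * (mu0 / var0 + x / sigma2);   // precision-weighted mean
  return {mu: muN, v: varN};
}

// Streaming three points reproduces the batch formula (N = 3, x̄ = 2,
// mu0 = 0, sigma0^2 = 3, sigma^2 = 1): posterior N(1.8, 0.3)
let post = {mu: 0, v: 3};
for (const x of [1.5, 2.0, 2.5]) post = updateGaussian(post.mu, post.v, x, 1);
console.log(post); // { mu: 1.8, v: 0.3 }
```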


```{ojs}
//| echo: false
//| panel: input
viewof bayes_prior_mu = Inputs.range([-5, 5], {value: 0, step: 0.1, label: "Prior Mean (μ₀)"})
viewof bayes_prior_var = Inputs.range([0.1, 10], {value: 3, step: 0.1, label: "Prior Var (σ₀²)"})
viewof bayes_data_mu = Inputs.range([-5, 5], {value: 2.5, step: 0.1, label: "Data Mean (Sample x̄)"})
viewof bayes_data_var = Inputs.range([0.1, 10], {value: 1, step: 0.1, label: "Data Noise (σ²)"})
viewof bayes_N = Inputs.range([0, 50], {value: 3, step: 1, label: "Samples Observed (N)"})
```

```{ojs}
//| echo: false
// Calculate Posterior params
bayes_post_var = 1.0 / ( (1.0/bayes_prior_var) + (bayes_N/bayes_data_var) )
bayes_post_mu = bayes_post_var * ( (bayes_prior_mu/bayes_prior_var) + (bayes_N * bayes_data_mu / bayes_data_var) )
bayes_x_vals = d3.range(-8, 8.05, 0.05);
bayes_curves = {
const data = [];
for(let x of bayes_x_vals) {
// Prior
const p_prior = (1 / Math.sqrt(2 * Math.PI * bayes_prior_var)) * Math.exp(-Math.pow(x - bayes_prior_mu, 2) / (2 * bayes_prior_var));
// Likelihood (conceptually, the likelihood of the mean parameter given the data)
// Scaled for visualization so it fits on same plot
const likelihood_var = bayes_data_var / (bayes_N > 0 ? bayes_N : 0.0001);
const p_like_unscaled = (1 / Math.sqrt(2 * Math.PI * likelihood_var)) * Math.exp(-Math.pow(x - bayes_data_mu, 2) / (2 * likelihood_var));
// Scale likelihood to have max height ~ 1 for visual clarity against prior
const p_like = bayes_N === 0 ? 0 : p_like_unscaled * Math.sqrt(2 * Math.PI * likelihood_var) * 0.5;
// Posterior
const p_post = (1 / Math.sqrt(2 * Math.PI * bayes_post_var)) * Math.exp(-Math.pow(x - bayes_post_mu, 2) / (2 * bayes_post_var));
data.push({x: x, val: p_prior, type: "Prior P(θ)"});
if (bayes_N > 0) data.push({x: x, val: p_like, type: "Likelihood (scaled)"});
data.push({x: x, val: p_post, type: "Posterior P(θ|D)"});
}
return data;
}
Plot.plot({
width: 800,
height: 400,
x: {domain: [-8, 8], label: "Mean Parameter (μ)"},
y: {domain: [0, 1.2], label: "Density"},
color: {
domain: ["Prior P(θ)", "Likelihood (scaled)", "Posterior P(θ|D)"],
range: ["#888888", "#5ca7ff", "#ff4d4d"]
},
marks: [
Plot.line(bayes_curves, {x: "x", y: "val", stroke: "type", strokeWidth: 3}),
Plot.areaY(bayes_curves, {x: "x", y: "val", fill: "type", fillOpacity: 0.15}),
// Highlight MAP / Data Mean points on axis
Plot.ruleX([bayes_prior_mu], {stroke: "#888888", strokeDasharray: "4,4"}),
bayes_N > 0 ? Plot.ruleX([bayes_data_mu], {stroke: "#5ca7ff", strokeDasharray: "4,4"}) : null,
Plot.ruleX([bayes_post_mu], {stroke: "#ff4d4d", strokeWidth: 2})
]
})
```

| Aspect | Frequentist | Bayesian |
|---|---|---|
| Parameters | Fixed, unknown | Random variables |
| Inference | Point estimate + CI | Full posterior distribution |
| Prior knowledge | Not incorporated | Formally included |
| Uncertainty | Sampling variability | Posterior width |
| Interpretation | Long-run frequency | Degree of belief |
\[ \hat{\boldsymbol{\theta}}_{\text{MAP}} = \arg\max_{\boldsymbol{\theta}} \, p(\boldsymbol{\theta} \mid \mathcal{D}) = \arg\max_{\boldsymbol{\theta}} \left[\log p(\mathcal{D} \mid \boldsymbol{\theta}) + \log p(\boldsymbol{\theta})\right] \]

With a Gaussian prior, the \(\log p(\boldsymbol{\theta})\) term becomes an L2 penalty, so MAP estimation recovers weight decay.
\[ p(\mathbf{x}_{\text{new}} \mid \mathcal{D}) = \int p(\mathbf{x}_{\text{new}} \mid \boldsymbol{\theta}) \, p(\boldsymbol{\theta} \mid \mathcal{D}) \, d\boldsymbol{\theta} \]
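For the conjugate Gaussian model above, this integral is available in closed form: the predictive variance is the sum of the noise variance (aleatory) and the posterior variance (epistemic), so it never drops below the noise floor.

\[ p(x_{\text{new}} \mid \mathcal{D}) = \mathcal{N}\!\left(x_{\text{new}} \mid \mu_N,\; \sigma^2 + \sigma_N^2\right) \]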

A single Gaussian cannot represent multimodal targets; a mixture with input-dependent parameters (as in a mixture density network) can:

\[ p(y \mid x) = \sum_{k=1}^{K} \pi_k(x) \, \mathcal{N}\!\left(y \mid \mu_k(x), \sigma_k^2(x)\right) \]
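Evaluating such a mixture is a weighted sum of Gaussian densities; a small plain-JavaScript sketch (fixed parameters standing in for network outputs at a given \(x\)):

```js
// Evaluate p(y | x) for a K-component Gaussian mixture with weights pi,
// means mu, and standard deviations sigma (all fixed here; in a mixture
// density network they would be outputs of a network evaluated at x)
function mixturePdf(y, pi, mu, sigma) {
  let p = 0;
  for (let k = 0; k < pi.length; k++) {
    const v = sigma[k] * sigma[k];
    p += pi[k] * Math.exp(-((y - mu[k]) ** 2) / (2 * v)) / Math.sqrt(2 * Math.PI * v);
  }
  return p;
}

// Bimodal conditional: two equally weighted components at -1 and +1
console.log(mixturePdf(0, [0.5, 0.5], [-1, 1], [0.5, 0.5])); // low density between the modes
console.log(mixturePdf(1, [0.5, 0.5], [-1, 1], [0.5, 0.5])); // high density at a mode
```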

