Mathematical Foundations of AI & ML
Unit 7: Generalization, Bias-Variance, and Regularization
FAU Erlangen-Nürnberg
By the end of this lecture, students can explain generalization error, analyze the bias–variance tradeoff, and apply ridge and lasso regularization to control model complexity.
Training Data: 15 samples · Noise: \(\sigma = 0.2\)
d3 = require("d3@7")
d3_reg = require("d3-regression@1")
seed = 42
randomVal = d3.randomNormal.source(d3.randomLcg(seed))(0, 0.2)
trueFunc = (x) => Math.sin(1.5 * Math.PI * x)
points = {
const train = Array.from({length: 15}, (_, i) => {
let x = (i+0.5) / 15 + (d3.randomUniform.source(d3.randomLcg(seed+i))() - 0.5) * 0.05;
return {x: x, y: trueFunc(x) + randomVal(), type: "train"};
});
const test = Array.from({length: 100}, (_, i) => {
let x = (i+0.5) / 100;
return {x: x, y: trueFunc(x) + randomVal(), type: "test"};
});
return [...train, ...test];
}
// Convenience views of the combined sample array, split by role.
trainPts = points.filter(({type}) => type === "train")
testPts = points.filter(({type}) => type === "test")
polyModel = {
try {
return d3_reg.regressionPoly().x(d => d.x).y(d => d.y).order(polyDegree)(trainPts);
} catch(e) {
// fallback if d3-regression fails for very high degrees/collinearity
return {predict: (x) => 0};
}
}
mse = {
const trainMSE = d3.mean(trainPts, d => Math.pow(polyModel.predict(d.x) - d.y, 2));
const testMSE = d3.mean(testPts, d => Math.pow(polyModel.predict(d.x) - d.y, 2));
return {train: trainMSE || 0, test: testMSE || 0};
}
// Overfitting demo: train/test scatter, the true function (dashed gray), and
// the current polynomial fit (orange). Fixed axis domains + clip keep wild
// high-degree fits from blowing up the chart.
Plot.plot({
width: 1000,
height: 600,
y: {domain: [-2, 2], grid: true},
x: {domain: [0, 1], grid: true},
color: {domain: ["train", "test"], range: ["red", "steelblue"]},
marks: [
// NOTE(review): `r` given as a function is a channel and goes through Plot's
// radius scale — 6/3 may not be literal pixel radii; verify against Plot docs.
Plot.dot(points, {x: "x", y: "y", fill: "type", fillOpacity: 0.8, r: d => d.type === "train" ? 6 : 3}),
Plot.line(d3.range(0, 1.01, 0.01), {x: d => d, y: d => trueFunc(d), stroke: "gray", strokeDasharray: "4,4", strokeWidth: 2, title: "True Function"}),
Plot.line(d3.range(0, 1.01, 0.01).map(x => ({x: x, y: polyModel.predict(x)})), {x: "x", y: "y", stroke: "orange", strokeWidth: 4, title: "Fitted Model", clip: true})
]
})\[ \text{EPE}(\mathbf{x}) = \mathbb{E}_{\mathcal{D}} \mathbb{E}_{y|\mathbf{x}} \big[ (y - \hat{f}_{\mathcal{D}}(\mathbf{x}))^2 \big] \]
\[ y - \hat{f}(\mathbf{x}) = \underbrace{(y - f(\mathbf{x}))}_{\text{noise}} + \underbrace{(f(\mathbf{x}) - \mathbb{E}_{\mathcal{D}}[\hat{f}(\mathbf{x})])}_{\text{bias}} + \underbrace{(\mathbb{E}_{\mathcal{D}}[\hat{f}(\mathbf{x})] - \hat{f}(\mathbf{x}))}_{\text{variance term}} \]
\[ \text{EPE}(\mathbf{x}) = \underbrace{\sigma^2_{\text{noise}}}_{\text{irreducible}} + \underbrace{\big(\mathbb{E}_{\mathcal{D}}[\hat{f}(\mathbf{x})] - f(\mathbf{x})\big)^2}_{\text{Bias}^2} + \underbrace{\mathbb{E}_{\mathcal{D}}\big[(\hat{f}(\mathbf{x}) - \mathbb{E}_{\mathcal{D}}[\hat{f}(\mathbf{x})])^2\big]}_{\text{Variance}} \]
graph LR
C[Model Complexity] --> B[Bias decreases]
C --> V[Variance increases]
B --> E[Total Error]
V --> E
N[Noise] --> E
style E fill:#f9f,stroke:#333,stroke-width:4px
style C fill:#ccf,stroke:#333
bvDatasets = {
const sets = [];
for(let i=0; i<numSamples; i++) {
const rng = d3.randomNormal.source(d3.randomLcg(seed * 10 + i))(0, 0.4);
const pts = Array.from({length: 12}, (_, j) => {
let x = (j+0.5)/12 + (d3.randomUniform.source(d3.randomLcg(seed * 100 + i*12 + j))() - 0.5) * 0.05;
return {x: x, y: trueFunc(x) + rng()};
});
sets.push(pts);
}
return sets;
}
bvModels = bvDatasets.map(pts => d3_reg.regressionPoly().x(d=>d.x).y(d=>d.y).order(bvDegree)(pts))
// Every individual fit, sampled on a coarse x-grid and tagged with a lineId so
// Plot can render each model as its own faint curve.
bvLines = bvModels.flatMap((model, i) =>
  d3.range(0, 1.05, 0.05).map((x) => ({x, y: model.predict(x), lineId: i, type: "individual"}))
)
bvAverageLine = {
const xs = d3.range(0, 1.05, 0.05);
return xs.map(x => {
const sum = bvModels.reduce((acc, m) => acc + m.predict(x), 0);
return {x: x, y: sum / numSamples, type: "average"};
});
}
// Bias–variance demo: each faint blue curve is one model fit to its own noisy
// sample; the red dashed curve (toggle) is their pointwise average. Spread of
// the blue curves visualizes variance; gap between red and gray visualizes bias.
Plot.plot({
width: 1000,
height: 600,
y: {domain: [-2, 2], grid: true},
x: {domain: [0, 1], grid: true},
marks: [
Plot.line(d3.range(0, 1.01, 0.01), {x: d => d, y: d => trueFunc(d), stroke: "gray", strokeDasharray: "4,4", strokeWidth: 3, title: "True Function"}),
Plot.line(bvLines, {x: "x", y: "y", z: "lineId", stroke: "steelblue", strokeOpacity: 0.2, strokeWidth: 2, clip: true}),
// Plot ignores null marks, so the toggle can simply drop the average line.
showAverage ? Plot.line(bvAverageLine, {x: "x", y: "y", stroke: "red", strokeWidth: 5, strokeDasharray: "2,2", title: "Average Fit", clip: true}) : null
]
})\[ J_{\text{reg}}(\mathbf{w}) = \underbrace{\frac{1}{N}\sum_{i=1}^{N} L(\hat{y}_i, y_i)}_{\text{data fit}} + \underbrace{\lambda \cdot \Omega(\mathbf{w})}_{\text{complexity penalty}} \]
\[ \hat{\mathbf{w}} = \arg\min_{\mathbf{w}} \left[ R_N(\mathbf{w}) + \lambda \, \Omega(\mathbf{w}) \right] \]
\[ L_{\text{ridge}} = \sum_{i=1}^{N}(\hat{y}_i - y_i)^2 + \lambda \|\mathbf{w}\|_2^2 \]
\[ \hat{\mathbf{w}}_{\text{ridge}} = (\mathbf{X}^\top\mathbf{X} + \lambda\mathbf{I})^{-1}\mathbf{X}^\top\mathbf{y} \]
\[ L_{\text{lasso}} = \sum_{i=1}^{N}(\hat{y}_i - y_i)^2 + \lambda \|\mathbf{w}\|_1 \]
constraintRegion = {
if (regType === "Ridge (L2)") {
return d3.range(0, 2 * Math.PI + 0.1, 0.1).map(theta => {
const r = constraintT;
return {w1: r * Math.cos(theta), w2: r * Math.sin(theta)};
});
} else {
return [
{w1: constraintT, w2: 0},
{w1: 0, w2: constraintT},
{w1: -constraintT, w2: 0},
{w1: 0, w2: -constraintT},
{w1: constraintT, w2: 0}
];
}
}
minPointPerfect = {
const unconstrainedLoss = 0;
let inRegion = false;
if (regType === "Ridge (L2)") {
inRegion = (2*2 + 1*1) <= constraintT*constraintT;
} else {
inRegion = (2 + 1) <= constraintT;
}
if (inRegion) return {w1: 2, w2: 1};
let minLoss = Infinity;
let bestW1 = 0, bestW2 = 0;
if (regType === "Ridge (L2)") {
for (let theta = 0; theta < 2*Math.PI; theta += 0.01) {
let w1 = constraintT * Math.cos(theta);
let w2 = constraintT * Math.sin(theta);
let loss = Math.pow(w1 - 2, 2) + 3 * Math.pow(w2 - 1, 2);
if (loss < minLoss) { minLoss = loss; bestW1 = w1; bestW2 = w2; }
}
} else {
const segments = [
{sw1: constraintT, sw2: 0, ew1: 0, ew2: constraintT},
{sw1: 0, sw2: constraintT, ew1: -constraintT, ew2: 0},
{sw1: -constraintT, sw2: 0, ew1: 0, ew2: -constraintT},
{sw1: 0, sw2: -constraintT, ew1: constraintT, ew2: 0}
];
for (const seg of segments) {
for (let alpha = 0; alpha <= 1; alpha += 0.005) {
let w1 = seg.sw1 * (1 - alpha) + seg.ew1 * alpha;
let w2 = seg.sw2 * (1 - alpha) + seg.ew2 * alpha;
let loss = Math.pow(w1 - 2, 2) + 3 * Math.pow(w2 - 1, 2);
if (loss < minLoss) { minLoss = loss; bestW1 = w1; bestW2 = w2; }
}
}
}
if (regType === "Lasso (L1)") {
if (Math.abs(bestW1) < 0.05) { bestW1 = 0; bestW2 = constraintT * Math.sign(bestW2); }
if (Math.abs(bestW2) < 0.05) { bestW2 = 0; bestW1 = constraintT * Math.sign(bestW1); }
}
return {w1: bestW1, w2: bestW2};
}
contourLines = {
const lines = [];
const minLoss = Math.pow(minPointPerfect.w1 - 2, 2) + 3 * Math.pow(minPointPerfect.w2 - 1, 2);
const levels = [0.5, 2, 4, 8, 12, minLoss];
for (const C of levels) {
if (C < 0) continue;
const pts = [];
for (let alpha = 0; alpha <= 2*Math.PI + 0.1; alpha += 0.05) {
pts.push({
w1: 2 + Math.sqrt(C) * Math.cos(alpha),
w2: 1 + Math.sqrt(C/3) * Math.sin(alpha),
level: C,
isMin: Math.abs(C - minLoss) < 0.01
});
}
lines.push(pts);
}
return lines;
}
Plot.plot({
width: 800,
height: 600,
x: {domain: [-2, 3], grid: true, label: "Coefficient w1"},
y: {domain: [-2, 3], grid: true, label: "Coefficient w2"},
marks: [
...contourLines.map(pts =>
Plot.line(pts, {x: "w1", y: "w2", stroke: d => d.isMin[0] ? "red" : "gray", strokeWidth: d => d.isMin[0] ? 3 : 1})
),
Plot.line(constraintRegion, {x: "w1", y: "w2", stroke: "steelblue", fill: "steelblue", fillOpacity: 0.2, strokeWidth: 2}),
Plot.dot([minPointPerfect], {x: "w1", y: "w2", fill: "red", r: 8}),
Plot.dot([{w1: 2, w2: 1}], {x: "w1", y: "w2", fill: "black", r: 5, stroke: "white"}),
Plot.text([{w1: 2, w2: 1.15}], {x: "w1", y: "w2", text: () => "OLS Unconstrained Min", fill: "black"})
]
})| Property | Ridge (L2) | Lasso (L1) |
|---|---|---|
| Penalty | \(\sum_j w_j^2\) | \(\sum_j \lvert w_j \rvert\) |
| Sparsity | No (shrinks all) | Yes (zeroes some) |
| Closed form | Yes | No (requires optimization) |
| Correlated features | Keeps all, shrinks equally | Selects one arbitrarily |
| Best for | Many relevant features | Few relevant features |
\[ \Omega(\mathbf{w}) = \alpha \|\mathbf{w}\|_1 + (1 - \alpha) \|\mathbf{w}\|_2^2 \]
\[ \text{CV}(k) = \frac{1}{k} \sum_{j=1}^{k} R_{\text{test}}^{(j)} \]
Scenario: A student uses the test set to tune \(\lambda\), then reports test set accuracy as the model’s generalization performance. What goes wrong?
Answer: (C) — tuning \(\lambda\) on the test set makes it part of model selection, so the reported accuracy is an optimistically biased estimate of generalization; an untouched held-out set (or nested cross-validation) is needed for an unbiased estimate.



Week 7: Overfitting & Regularization — IsingDataset (16×16)

© Philipp Pelz - Mathematical Foundations of AI & ML