\[\def\gets{:=} \def\N{\mathbb{N}} \def\Z{\mathbb{Z}} \def\R{\mathbb{R}} \def\Q{\mathbb{Q}} \def\O{\mathrm{O}} \def\Prop{\mathbb{P}} \def\Algorithm{\textbf{Algorithm}~} \def\Subroutine{\textbf{Subroutine}~} \def\Let{\textbf{Let}~} \def\Find{\textbf{Find}~} \def\Compute{\textbf{Compute}~} \def\For{\textbf{For}~} \def\While{\textbf{While}~} \def\If{\textbf{If}~} \def\Else{\textbf{Else}~} \def\ElseIf{\textbf{Else If}~} \def\Return{\textbf{Return}~} \def\return{~\textbf{return}~} \def\Output{\textbf{Output}~} \def\output{~\textbf{output}~} \def\tot{~\textrm{to}~} \def\andt{~\textrm{and}~} \def\ort{~\textrm{or}~} \def\Forallt{\textbf{For all}~} \def\forallt{~\textbf{for all}~} \def\Foreacht{\textbf{For each}~} \def\foreacht{~\textbf{for each}~} \newcommand{\abs}[1]{|#1|} \newcommand{\lrabs}[1]{\left|#1\right|} \newcommand{\parens}[1]{\left(#1\right)} \newcommand{\brackets}[1]{\left[#1\right]} \newcommand{\floor}[1]{\left\lfloor#1\right\rfloor} \newcommand{\ceil}[1]{\left\lceil#1\right\rceil} \def\gcd{\mathrm{gcd}} \def\qst{q_{start}} \def\qacc{q_{accept}} \def\qrej{q_{reject}} \newcommand{\lang}[1]{L_\mathrm{#1}} \def\atm{\lang{ACC}} \def\htm{\lang{HALT}} \def\ehtm{\lang{\varepsilon\text{-}HALT}} \def\eqtm{\lang{EQ}} \def\etm{\lang{\varnothing}} \def\oninput{\text{ on input }} \def\oninputs{\text{ on inputs }} \def\Run{\textbf{Run}~} \def\ont{~\textrm{on}~} \def\Accept{\textbf{Accept}~} \def\accept{~\textbf{accept}~} \def\accepts{~\textrm{accepts}~} \def\Reject{\textbf{Reject}~} \def\reject{~\textbf{reject}~} \def\rejects{~\textrm{rejects}~} \def\Halt{\textbf{Halt}~} \def\halt{~\textbf{halt}~} \newcommand{\inner}[1]{\langle #1 \rangle} \def\Tr{\le_\mathrm{T}} \def\pmr{\le_\mathrm{p}} \def\Construct{\textbf{Construct}~} \def\bquote{\text{"}} \def\equote{\text{"}} \def\RD{\mathsf{R}} \def\RE{\mathsf{RE}} \def\coRE{\mathsf{coRE}} \def\DTIME{\mathsf{DTIME}} \def\P{\mathsf{P}} \def\NP{\mathsf{NP}} \def\coNP{\mathsf{coNP}} \def\MAZE{\mathrm{MAZE}} \def\TSP{\mathrm{TSP}} \def\SAT{\mathrm{SAT}} \def\TSAT{\mathrm{3SAT}} \def\VC{\mathrm{VERTEX\text{-}COVER}} \def\SC{\mathrm{SET\text{-}COVER}} \def\HC{\mathrm{HAMCYCLE}} \def\HP{\mathrm{HAMPATH}} \def\IS{\mathrm{INDEPENDENT\text{-}SET}} \def\CLIQUE{\mathrm{CLIQUE}} \def\SSUM{\mathrm{SUBSET\text{-}SUM}} \def\KNAPSACK{\mathrm{KNAPSACK}} \def\MAXCUT{\mathrm{MAX\text{-}CUT}} \def\Ex{\mathbb{E}} \newcommand{\lrPr}[1]{\Pr\brackets{#1}} \newcommand{\lrEx}[1]{\Ex\brackets{#1}} \newcommand{\lrexp}[1]{\exp\parens{#1}} \def\MSAT{\mathrm{Max}\text{-}\mathrm{3SAT}} \def\PRIMES{\mathrm{PRIMES}} \def\RP{\mathsf{RP}} \def\coRP{\mathsf{coRP}} \def\BPP{\mathsf{BPP}} \def\ZPP{\mathsf{ZPP}} \def\Xj{X^{(j)}} \def\BQP{\mathsf{BQP}} \def\NPI{\mathsf{NPI}} \newcommand{\expp}[1]{\exp\left(#1\right)}\]

Appendix

Proof of the Simplified Chernoff Bounds

We previously showed that the unsimplified Chernoff bounds

\[\begin{split}\Pr[X \ge (1 + \delta)\mu] &\le \parens{\frac{e^\delta}{(1+\delta)^{1+\delta}}}^\mu ~~~~\text{for $\delta > 0$}\\ \Pr[X \le (1 - \delta)\mu] &\le \parens{\frac{e^{-\delta}}{(1-\delta)^{1-\delta}}}^\mu ~~~~\text{for $0 < \delta < 1$}\end{split}\]

hold. We now demonstrate that the simplified bounds

\[\begin{split}\large \begin{aligned} \Pr[X \ge (1 + \delta)\mu] &\le e^{-\frac{\delta^2\mu}{2+\delta}} ~~~~\text{for $\delta > 0$}\\ \Pr[X \le (1 - \delta)\mu] &\le e^{-\frac{\delta^2\mu}{2}} ~~~~\text{for $0 < \delta < 1$} \end{aligned}\end{split}\]

follow from the unsimplified bounds.

Proof 190

We first consider the upper-tail Chernoff bound. For \(\delta > 0\), we have

\[\begin{split}\large \begin{aligned} \Pr[X \ge (1 + \delta)\mu] &\le \parens{\frac{e^\delta}{(1+\delta)^{1+\delta}}}^\mu\\ &= \parens{\frac{e^\delta}{(e^{\ln(1+\delta)})^{1+\delta}}}^\mu\\ &= \parens{\frac{e^\delta}{e^{(1+\delta)\ln(1+\delta)}}}^\mu\\ &= \parens{e^{\delta-(1+\delta)\ln(1+\delta)}}^\mu \end{aligned}\end{split}\]

We make use of the standard logarithmic inequality

\[\ln(1+x) \ge \frac{2x}{2+x}\]

for \(x > 0\). This gives us

\[\begin{split}\large \begin{aligned} -\ln(1+x) &\le -\frac{2x}{2+x}\\ e^{-\ln(1+x)} &\le e^{-\frac{2x}{2+x}} \end{aligned}\end{split}\]
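(As a quick sanity check, at \(x = 1\) the original inequality reads \(\ln 2 \approx 0.693 \ge \frac{2}{3} \approx 0.667\).)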

Applying this to our Chernoff-bound expression with \(x = \delta\), after multiplying both sides of the inequality by \(1 + \delta > 0\), we get

\[\begin{split}\large \begin{aligned} \Pr[X \ge (1 + \delta)\mu] &\le \parens{e^{\delta-(1+\delta)\ln(1+\delta)}}^\mu\\ &\le \parens{e^{\delta-(1+\delta)\frac{2\delta}{2+\delta}}}^\mu\\ &= \parens{e^{\frac{\delta(2+\delta)-2\delta(1+\delta)} {2+\delta}}}^\mu\\ &= \parens{e^{\frac{2\delta+\delta^2-2\delta-2\delta^2} {2+\delta}}}^\mu\\ &= \parens{e^{\frac{-\delta^2}{2+\delta}}}^\mu\\ &= e^{-\frac{\delta^2\mu}{2+\delta}} \end{aligned}\end{split}\]

resulting in the simplified upper-tail bound.
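Note that for \(0 < \delta \le 1\), we have \(2 + \delta \le 3\), so the upper-tail bound is often quoted in the slightly weaker but simpler form

\[\Pr[X \ge (1 + \delta)\mu] \le e^{-\frac{\delta^2\mu}{3}} ~~~~\text{for $0 < \delta \le 1$}\]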

For the lower-tail Chernoff bound, we have for \(0 < \delta < 1\):

\[\begin{split}\large \begin{aligned} \Pr[X \le (1 - \delta)\mu] &\le \parens{\frac{e^{-\delta}}{(1-\delta)^{1-\delta}}}^\mu\\ &= \parens{\frac{e^{-\delta}}{(e^{\ln(1-\delta)})^{1-\delta}}}^\mu\\ &= \parens{\frac{e^{-\delta}}{e^{(1-\delta)\ln(1-\delta)}}}^\mu\\ &= \parens{e^{-\delta-(1-\delta)\ln(1-\delta)}}^\mu \end{aligned}\end{split}\]

The Taylor series for the natural logarithm gives us

\[\ln(1-x) = -x - \frac{x^2}{2} - \frac{x^3}{3} - \frac{x^4}{4} - \dots ~~~~\text{for $0 < x < 1$}\]

Multiplying both sides by \(1 - x\) and collecting like powers of \(x\), we obtain

\[\begin{split}(1-x)\ln(1-x) &= -x - \frac{x^2}{2} - \frac{x^3}{3} - \dots + x^2 + \frac{x^3}{2} + \frac{x^4}{3} + \dots\\ &= -x + \frac{x^2}{2} + \sum_{k \ge 3} \parens{\frac{1}{k-1} - \frac{1}{k}} x^k\\ &\ge -x + \frac{x^2}{2}\end{split}\]

The last step follows since \(\frac{1}{k-1} - \frac{1}{k} > 0\) for all \(k \ge 3\), so that every term of the trailing sum is positive for \(0 < x < 1\). Negating both sides, we get

\[-(1-x)\ln(1-x) \le x - \frac{x^2}{2}\]

Applying this result to our Chernoff-bound expression with \(x = \delta\), we obtain

\[\begin{split}\large \begin{aligned} \Pr[X \le (1 - \delta)\mu] &\le \parens{e^{-\delta-(1-\delta)\ln(1-\delta)}}^\mu\\ &\le \parens{e^{-\delta+\delta-\frac{\delta^2}{2}}}^\mu\\ &= \parens{e^{-\frac{\delta^2}{2}}}^\mu\\ &= e^{-\frac{\delta^2\mu}{2}} \end{aligned}\end{split}\]

completing our proof of the simplified lower-tail bound.
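To get a sense of the scale of these bounds, consider a hypothetical instance: let \(X\) be the number of heads in \(1000\) flips of a fair coin, so that \(\mu = 500\), and take \(\delta = 0.2\). The simplified bounds give

\[\begin{split}\Pr[X \ge 600] &\le e^{-\frac{(0.2)^2 \cdot 500}{2.2}} \approx 1.1 \times 10^{-4}\\ \Pr[X \le 400] &\le e^{-\frac{(0.2)^2 \cdot 500}{2}} = e^{-10} \approx 4.5 \times 10^{-5}\end{split}\]

so a deviation of 20 percent or more from the mean is exceedingly unlikely.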

Proof of the Upper-Tail Hoeffding’s Inequality

The upper-tail Hoeffding’s inequality we use (Theorem 155) states that for a sequence of independent indicator random variables \(X_1, \dots, X_n\),

\[\lrPr{\frac{1}{n}X \ge p + \varepsilon} \le e^{-2\varepsilon^2 n}\]

where \(X = X_1 + \dots + X_n\), \(\Ex[X_i] = p_i\), and \(\lrEx{\frac{1}{n}X} = \frac{1}{n}\sum_i p_i = p\). We proceed to prove a more general variant of this bound:

Theorem 191

Let \(X = X_1 + \dots + X_n\), where the \(X_i\) are independent random variables such that \(X_i \in [0, 1]\) and have expectations \(\Ex[X_i] = p_i\). Let

\[p = \lrEx{\frac{1}{n}X} = \frac{1}{n}\sum_i p_i\]

Let \(\varepsilon > 0\) be a deviation from the expectation. Then

\[\lrPr{\frac{1}{n}X \ge p + \varepsilon} \le \expp{-2\varepsilon^2 n}\]

(Note that \(\expp{x}\) is alternate notation for \(e^{x}\).)

Here, we do not require that the \(X_i\) are indicators, just that they are in the range \([0, 1]\).
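As a concrete (hypothetical) illustration: when estimating a proportion from \(n = 1000\) independent samples, the probability that the sample mean \(\frac{1}{n}X\) overshoots its expectation \(p\) by \(\varepsilon = 0.05\) is at most

\[\expp{-2 \cdot (0.05)^2 \cdot 1000} = e^{-5} \approx 0.0067\]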

First, we establish a preliminary fact about the expectation of the product of independent random variables.

Lemma 192

Let \(X\) and \(Y\) be independent random variables. Then

\[\Ex[XY] = \Ex[X]\cdot\Ex[Y]\]
Proof 193

By the definition of expectation for discrete random variables, we have

\[\begin{split}\Ex[XY] &= \sum_{x,y} xy\Pr[X = x, Y = y]\\ &= \sum_x \sum_y xy\Pr[X = x]\cdot\Pr[Y = y]\\ &= \sum_x \parens{x\Pr[X = x] \sum_y y\Pr[Y = y]}\\ &= \sum_x x\Pr[X = x]\cdot\Ex[Y]\\ &= \Ex[Y] \sum_x x\Pr[X = x]\\ &= \Ex[Y]\cdot\Ex[X]\end{split}\]

In the second step, we used the fact that \(X\) and \(Y\) are independent, so that \(\Pr[X = x, Y = y] = \Pr[X = x]\cdot\Pr[Y = y]\).

By induction, we can conclude that

\[\lrEx{\prod_i X_i} = \prod_i \Ex[X_i]\]

when the \(X_i\) are independent.
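For instance, for three independent random variables \(X_1, X_2, X_3\), the inductive step gives

\[\Ex[X_1 X_2 X_3] = \Ex[X_1 X_2] \cdot \Ex[X_3] = \Ex[X_1] \cdot \Ex[X_2] \cdot \Ex[X_3]\]

where the first equality applies Lemma 192 to the independent random variables \(X_1 X_2\) and \(X_3\), and the second applies it to \(X_1\) and \(X_2\).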

We now proceed to prove Theorem 191. We have

\[\begin{split}\lrPr{\frac{1}{n}X \ge p + \varepsilon} &= \Pr[X \ge n(p + \varepsilon)]\\ &= \Pr[e^{sX} \ge \expp{sn(p + \varepsilon)}]\end{split}\]

The latter step holds for all \(s > 0\), since \(e^{sx}\) is then a strictly increasing function of \(x\). The process of working with the random variable \(e^{sX}\) rather than \(X\) directly is the Chernoff bounding technique; the idea is that small deviations in \(X\) turn into large deviations in \(e^{sX}\), so that Markov’s inequality (Theorem 133) produces a more useful result. We will choose an appropriate value of \(s\) later. For any value of \(s\), \(e^{sX}\) is nonnegative, so we can apply Markov’s inequality to obtain:

\[\begin{split}\Pr[e^{sX} \ge \expp{sn(p + \varepsilon)}] &\le \Ex[e^{sX}]/\expp{sn(p + \varepsilon)}\\ &= \expp{-sn(p + \varepsilon)} \cdot \Ex[e^{sX}]\\ &= \expp{-sn(p + \varepsilon)} \cdot \Ex[\expp{s(X_1 + X_2 + \dots + X_n)}]\\ &= \expp{-sn(p + \varepsilon)} \cdot \Ex[e^{sX_1} e^{sX_2} \dots e^{sX_n}]\\ &= \expp{-sn(p + \varepsilon)} \prod_i \Ex[e^{sX_i}]\end{split}\]

In the last step, we applied the fact that the \(X_i\) are independent to conclude that the random variables \(e^{sX_i}\) are also independent, and therefore the expectation of their product is the product of their expectations.
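To see why the exponentiation is worthwhile, note what Markov’s inequality yields when applied to \(X\) directly:

\[\Pr[X \ge n(p + \varepsilon)] \le \frac{\Ex[X]}{n(p + \varepsilon)} = \frac{p}{p + \varepsilon}\]

This is a constant independent of \(n\), whereas the bound we derive below decays exponentially in \(n\).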

We now need to establish a bound on \(\Ex[e^{sX_i}]\).

Lemma 194

Let \(X\) be a random variable such that \(X \in [0, 1]\) and \(\Ex[X] = p\). Then

\[\Ex[e^{sX}] \le \expp{sp + s^2/8}\]

for all \(s \in \R\).

Proof 195

We first observe that because \(e^{sx}\) is a convex function of \(x\), we can bound it from above on the interval \([0, 1]\) by the line segment connecting its endpoint values \((0, 1)\) and \((1, e^s)\):

(Figure: an exponential function \(f(x)\) is convex, so every point of its graph on the interval \([0, 1]\) lies at or beneath the line through the endpoints \(f(0)\) and \(f(1)\).)


This line has slope \(e^s - 1\) and y-intercept \(1\), giving us \(xe^s - x + 1\). Thus,

\[e^{sx} \le xe^s - x + 1\]

in the interval \(0 \le x \le 1\). Then

\[\begin{split}\Ex[e^{sX}] &\le \Ex[Xe^s - X + 1]\\ &= (e^s - 1)\Ex[X] + 1\\ &= pe^s - p + 1\end{split}\]

where the second step follows from linearity of expectation. We now have an upper bound on \(\Ex[e^{sX}]\), but it is not a convenient one – we would like it to be of the form \(e^\alpha\), so that we can better use it in proving Theorem 191, for which we’ve already obtained a bound that contains an exponential. Using the fact that \(z = e^{\ln z}\), we get

\[\begin{split}\Ex[e^{sX}] &\le pe^s - p + 1\\ &= \expp{\ln (pe^s - p + 1)}\end{split}\]

To further bound this expression, let

\[\phi(s) = \ln (pe^s - p + 1)\]

Then finding an upper bound for \(\phi(s)\) will in turn allow us to bound \(e^{\phi(s)}\). By the Lagrange form of Taylor’s theorem, we get

\[\phi(s) = \phi(0) + s\phi'(0) + \frac{1}{2}s^2\phi''(v)\]

for some \(v\) between \(0\) and \(s\). To differentiate \(\phi(s)\), we let \(f(s) = \ln s\) and \(g(s) = pe^s - p + 1\), so that \(\phi(s) = (f \circ g)(s)\). Then by the chain rule, we have

\[\begin{split}\phi'(s) &= (f \circ g)'(s)\\ &= f'(g(s)) \cdot g'(s)\\ &= \frac{1}{g(s)} \cdot g'(s)~~~~~~~~~\text{(since $f'(s) = \frac{d}{ds}\ln s = \frac{1}{s}$)}\\ &= \frac{1}{pe^s - p + 1} \cdot pe^s\\ &= \frac{pe^s}{pe^s - p + 1}\end{split}\]
(Note: the function \(g(s) = pe^s - p + 1\) is strictly positive for \(p \in [0, 1]\), so it is safe to divide by it.)

To compute \(\phi''(s)\), we now let \(\hat{f}(s) = pe^s\), so that \(\phi'(s) = \hat{f}(s)/g(s)\). Then by the quotient rule, we get

\[\begin{split}\phi''(s) &= \frac{\hat{f}'(s)g(s) - \hat{f}(s)g'(s)}{g^2(s)}\\ &= \frac{pe^s(pe^s - p + 1) - pe^s pe^s}{(pe^s - p + 1)^2}\\ &= \frac{pe^s}{pe^s - p + 1}\cdot\frac{(pe^s - p + 1) - pe^s}{pe^s - p + 1}\\ &= \frac{pe^s}{pe^s - p + 1}\parens{1 - \frac{pe^s}{pe^s - p + 1}}\\ &= t(1 - t)\end{split}\]

where \(t = \frac{pe^s}{pe^s - p + 1}\). Observe that \(t(1 - t)\) is a parabola with a maximum at \(t = \frac{1}{2}\):

\[\begin{split}h(t) &= t(1 - t) = t - t^2\\ h'(t) &= 1 - 2t\\ h''(t) &= -2\end{split}\]

Setting \(h'(t) = 1 - 2t = 0\), we get that \(t = \frac{1}{2}\) is an extremum, and since the second derivative \(h''(\frac{1}{2})\) is negative, it is a maximum. Thus, we have

\[\begin{split}\phi''(s) &= t(1 - t)\\ &\le \frac{1}{2}\parens{1 - \frac{1}{2}}\\ &= \frac{1}{4}\end{split}\]

We can now plug everything into our result from applying Taylor’s theorem:

\[\begin{split}\phi(s) &= \phi(0) + s\phi'(0) + \frac{1}{2}s^2\phi''(v)\\ &= \ln(p - p + 1) + s\frac{p}{p - p + 1} + \frac{1}{2}s^2\phi''(v)\\ &= sp + \frac{1}{2}s^2\phi''(v)\\ &\le sp + \frac{1}{2}s^2\cdot \frac{1}{4}\\ &= sp + \frac{s^2}{8}\end{split}\]

Thus,

\[\begin{split}\Ex[e^{sX}] &\le e^{\phi(s)}\\ &\le \expp{sp + s^2/8}\end{split}\]

as claimed.
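As a quick sanity check of Lemma 194 with hypothetical numbers, take \(X\) to be an indicator with \(p = \frac{1}{2}\), and let \(s = 1\). Then

\[\Ex[e^{sX}] = \frac{1 + e}{2} \approx 1.859 \le \expp{\frac{1}{2} + \frac{1}{8}} \approx 1.868\]

so the bound holds, and is quite tight in this case.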

Continuing our proof of Theorem 191, we have

\[\Pr[e^{sX} \ge \expp{sn(p + \varepsilon)}] \le \expp{-sn(p + \varepsilon)} \prod_i \Ex[e^{sX_i}]\]

Applying Lemma 194, we get

\[\begin{split}\Pr[e^{sX} \ge \expp{sn(p + \varepsilon)}] &\le \expp{-sn(p + \varepsilon)} \prod_i \Ex[e^{sX_i}]\\ &\le \expp{-sn(p + \varepsilon)} \prod_i \expp{sp_i + s^2/8}\\ &= \expp{-sn(p + \varepsilon)} \cdot \expp{\sum_i (sp_i + s^2/8)}\\ &= \expp{-sn(p + \varepsilon)} \cdot \expp{snp + s^2n/8}\\ &= \expp{-sn\varepsilon + s^2n/8}\end{split}\]

We now choose \(s\) to minimize the exponent \(r(s) = -sn\varepsilon + s^2n/8\). We have:

\[\begin{split}r(s) &= -sn\varepsilon + \frac{n}{8}s^2\\ r'(s) &= -n\varepsilon + \frac{n}{8}\cdot 2s = -n\varepsilon + \frac{n}{4}s\\ r''(s) &= \frac{n}{4}\end{split}\]

We have another parabola, and since the second derivative is positive, we obtain a minimum for \(r(s)\) at

\[\begin{split}r'(s) &= -n\varepsilon + \frac{n}{4}s = 0\\ s &= 4\varepsilon\end{split}\]

Then

\[\begin{split}r(4\varepsilon) &= -4n\varepsilon^2 + \frac{n}{8}\cdot 16\varepsilon^2\\ &= -4n\varepsilon^2 + 2n\varepsilon^2\\ &= -2n\varepsilon^2\end{split}\]

Thus,

\[\begin{split}\lrPr{\frac{1}{n}X \ge p + \varepsilon} &\le \Pr[e^{sX} \ge \expp{sn(p + \varepsilon)}]\\ &\le \expp{-2n\varepsilon^2}\end{split}\]

completing our proof of Theorem 191.

General Case of Hoeffding’s Inequality

We can further generalize Theorem 191 to the case where the individual random variables \(X_i\) are in the range \([a_i, b_i]\). We start by generalizing Lemma 194 to obtain Hoeffding’s lemma.

Lemma 196 (Hoeffding’s Lemma)

Let \(X\) be a random variable such that \(X \in [a, b]\), where \(a < b\), and \(\Ex[X] = p\). Then

\[\Ex[e^{sX}] \le \expp{sp + s^2(b - a)^2/8}\]

for all \(s \in \R\).

Proof 197

Let \(X' = \frac{X - a}{b - a}\). Then \(X'\) is a random variable such that \(X' \in [0, 1]\), and

\[\Ex[X'] = \lrEx{\frac{X - a}{b - a}} = \frac{p - a}{b - a}\]

by linearity of expectation. Let \(p' = \frac{p - a}{b - a}\). Applying Lemma 194 to \(X'\) with a parameter \(s'\) to be chosen shortly, we get

\[\Ex[e^{s'X'}] \le \expp{s'p' + s'^2/8}\]

Now consider \(e^{sX}\). We have \(X = a + (b - a)X'\), so

\[\begin{split}\Ex[e^{sX}] &= \Ex[\expp{sa + s(b - a)X'}]\\ &= e^{sa}\Ex[\expp{s(b - a)X'}]\end{split}\]

Let \(s' = s(b - a)\). From the result above, we have

\[\begin{split}e^{sa}\Ex[\expp{s(b - a)X'}] &= e^{sa}\Ex[e^{s'X'}]\\ &\le e^{sa}\expp{s'p' + s'^2/8}\\ &= \expp{sa + s'p' + s'^2/8}\end{split}\]

Substituting \(p' = \frac{p - a}{b - a}\) and \(s' = s(b - a)\), we obtain the following for the exponent:

\[\begin{split}sa + s'p' + \frac{1}{8}s'^2 &= sa + s(b - a)\frac{p - a}{b - a} + \frac{1}{8}s^2(b - a)^2\\ &= sa + s(p - a) + \frac{1}{8}s^2(b - a)^2\\ &= sp + \frac{1}{8}s^2(b - a)^2\end{split}\]

Thus,

\[\begin{split}\Ex[e^{sX}] &\le \expp{sa + s'p' + s'^2/8}\\ &= \expp{sp + s^2(b - a)^2/8}\end{split}\]

as claimed.
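Observe that for \(a = 0\) and \(b = 1\), we have \((b - a)^2 = 1\), and Hoeffding’s lemma reduces exactly to Lemma 194.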

We can now derive the general case of Hoeffding’s inequality.

Theorem 198 (Hoeffding’s Inequality – General Case)

Let \(X = X_1 + \dots + X_n\), where the \(X_i\) are independent random variables such that \(X_i \in [a_i, b_i]\) for \(a_i < b_i\) and \(\Ex[X_i] = p_i\). Let

\[p = \lrEx{\frac{1}{n}X} = \frac{1}{n}\sum_i p_i\]

Let \(\varepsilon > 0\) be a deviation from the expectation. Then

\[\lrPr{\frac{1}{n}X \ge p + \varepsilon} \le \lrexp{-\frac{2n^2\varepsilon^2}{\sum_i (b_i - a_i)^2}}\]
Proof 199

We proceed as in the proof of Theorem 191 to obtain

\[\begin{split}\lrPr{\frac{1}{n}X \ge p + \varepsilon} &= \Pr[X \ge n(p + \varepsilon)]\\ &= \Pr[e^{sX} \ge \expp{sn(p + \varepsilon)}]\\ &\le \Ex[e^{sX}]/\expp{sn(p + \varepsilon)}~~~~~~~~~\text{(Markov's inequality)}\\ &= \expp{-sn(p + \varepsilon)} \cdot \Ex[e^{sX}]\\ &= \expp{-sn(p + \varepsilon)} \cdot \Ex[\expp{s(X_1 + X_2 + \dots + X_n)}]\\ &= \expp{-sn(p + \varepsilon)} \cdot \Ex[e^{sX_1} e^{sX_2} \dots e^{sX_n}]\\ &= \expp{-sn(p + \varepsilon)} \prod_i \Ex[e^{sX_i}]~~~~~~~~~\text{(independence of the $X_i$)}\end{split}\]

Applying Hoeffding’s lemma (Lemma 196), we obtain

\[\begin{split}\expp{-sn(p + \varepsilon)} \prod_i \Ex[e^{sX_i}] &\le \expp{-sn(p + \varepsilon)} \prod_i \expp{sp_i + s^2(b_i - a_i)^2/8}\\ &= \expp{-sn(p + \varepsilon)} \cdot \lrexp{\sum_i (sp_i + s^2(b_i - a_i)^2/8)}\\ &= \expp{-sn(p + \varepsilon)} \cdot \lrexp{snp + \frac{s^2}{8} \sum_i (b_i - a_i)^2}\\ &= \lrexp{-sn\varepsilon + \frac{s^2}{8} \sum_i (b_i - a_i)^2}\end{split}\]

We choose \(s\) to minimize the exponent \(r(s)\):

\[\begin{split}r(s) &= -sn\varepsilon + \frac{s^2}{8} \sum_i (b_i - a_i)^2\\ r'(s) &= -n\varepsilon + \frac{s}{4} \sum_i (b_i - a_i)^2\\ r''(s) &= \frac{1}{4} \sum_i (b_i - a_i)^2\end{split}\]

Since \(a_i < b_i\), we have \(b_i - a_i > 0\) for all \(i\), so that \(r''(s)\) is positive. Thus, we obtain a minimum for \(r(s)\) when \(r'(s) = 0\):

\[\begin{split}r'(s) &= -n\varepsilon + \frac{s}{4} \sum_i (b_i - a_i)^2 = 0\\ s &= \frac{4n\varepsilon}{\sum_i (b_i - a_i)^2}\end{split}\]

Then

\[\begin{split}r\parens{\frac{4n\varepsilon}{\sum_i (b_i - a_i)^2}} &= -n\varepsilon\frac{4n\varepsilon}{\sum_i (b_i - a_i)^2} + \frac{16n^2\varepsilon^2}{\parens{\sum_i (b_i - a_i)^2}^2}\cdot \frac{1}{8}\sum_i (b_i - a_i)^2\\ &= -\frac{4n^2\varepsilon^2}{\sum_i (b_i - a_i)^2} + \frac{2n^2\varepsilon^2}{\sum_i (b_i - a_i)^2}\\ &= -\frac{2n^2\varepsilon^2}{\sum_i (b_i - a_i)^2}\end{split}\]

Thus,

\[\begin{split}\lrPr{\frac{1}{n}X \ge p + \varepsilon} &\le \Pr[e^{sX} \ge \expp{sn(p + \varepsilon)}]\\ &\le \lrexp{-\frac{2n^2\varepsilon^2}{\sum_i (b_i - a_i)^2}}\end{split}\]

completing our proof of the general case of Hoeffding’s inequality.
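As a consistency check, when each \(X_i \in [0, 1]\), we have \(\sum_i (b_i - a_i)^2 = n\), and the general bound becomes

\[\lrPr{\frac{1}{n}X \ge p + \varepsilon} \le \lrexp{-\frac{2n^2\varepsilon^2}{n}} = \expp{-2n\varepsilon^2}\]

recovering Theorem 191.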