%!TEX root = main.tex

\appendix
\onecolumn
\section*{Overview}
\begin{table}[H]
\centering
\hspace*{-1cm}\begin{tabular}{lllll}
\toprule
Name & Function $\varphi(x)$ & Range of Values & $\varphi'(x)$ & Used by \\\midrule %
Sign function$^\dagger$ & $\begin{cases}+1 &\text{if } x \geq 0\\-1 &\text{if } x < 0\end{cases}$ & $\Set{-1,1}$ & $0$ & \cite{971754} \\
\parbox[t]{2.6cm}{Heaviside\\step function$^\dagger$} & $\begin{cases}+1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & $\Set{0, 1}$ & $0$ & \cite{mcculloch1943logical}\\
Logistic function & $\frac{1}{1+e^{-x}}$ & $(0, 1)$ & $\frac{e^x}{(e^x +1)^2}$ & \cite{duch1999survey} \\
Tanh & $\frac{e^x - e^{-x}}{e^x + e^{-x}} = \tanh(x)$ & $(-1, 1)$ & $\sech^2(x)$ & \cite{LeNet-5,Thoma:2014}\\
\gls{ReLU}$^\dagger$ & $\max(0, x)$ & $[0, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & \cite{AlexNet-2012}\\
\parbox[t]{2.6cm}{\gls{LReLU}$^\dagger$\footnotemark\\(\gls{PReLU})} & $\max(\alpha x, x)$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha &\text{if } x < 0\end{cases}$ & \cite{maas2013rectifier,he2015delving} \\
Softplus & $\log(e^x + 1)$ & $(0, +\infty)$ & $\frac{e^x}{e^x + 1}$ & \cite{dugas2001incorporating,glorot2011deep} \\
\gls{ELU} & $\begin{cases}x &\text{if } x > 0\\\alpha (e^x - 1) &\text{if } x \leq 0\end{cases}$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha e^x &\text{otherwise}\end{cases}$ & \cite{clevert2015fast} \\
Softmax$^\ddagger$ & $o(\mathbf{x})_j = \frac{e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & $(0, 1)^K$ & $o(\mathbf{x})_j \cdot \frac{\sum_{k=1}^K e^{x_k} - e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & \cite{AlexNet-2012,Thoma:2014}\\
Maxout$^\ddagger$ & $o(\mathbf{x}) = \max_{x \in \mathbf{x}} x$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x_i = \max \mathbf{x}\\0 &\text{otherwise}\end{cases}$ & \cite{goodfellow2013maxout} \\
\bottomrule
\end{tabular}
\caption[Activation functions]{Overview of activation functions. Functions
marked with $\dagger$ are not differentiable at 0 and functions
marked with $\ddagger$ operate on all elements of a layer
simultaneously. The hyperparameter $\alpha \in (0, 1)$ of Leaky
ReLU is typically $\alpha = 0.01$; for ELU, $\alpha = 1$ is a
common choice. Other activation functions, such as randomized leaky
ReLUs, exist~\cite{xu2015empirical}, but are far less commonly used.\\
Some functions are smoothed versions of others: the logistic
function for the Heaviside step function, tanh for the sign
function, and softplus for ReLU.\\
Softmax is the standard activation function for the last layer of
a classification network as it produces a probability
distribution. See \Cref{fig:activation-functions-plot} for a plot
of some of them.}
\label{table:activation-functions-overview}
\end{table}
\footnotetext{$\alpha$ is a hyperparameter in leaky ReLU, but a learnable parameter in the parametric ReLU function.}
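
For reference, the smoothing relations mentioned in the caption can be verified
directly from the definitions above: the derivative of softplus is the logistic
function, tanh is a shifted and rescaled logistic function, and the softmax
outputs sum to one,
\begin{align*}
\frac{\mathrm{d}}{\mathrm{d}x} \log(e^x + 1) &= \frac{e^x}{e^x + 1} = \frac{1}{1 + e^{-x}},\\
\tanh(x) &= \frac{e^x - e^{-x}}{e^x + e^{-x}} = \frac{2}{1 + e^{-2x}} - 1,\\
\sum_{j=1}^K o(\mathbf{x})_j &= \frac{\sum_{j=1}^K e^{x_j}}{\sum_{k=1}^K e^{x_k}} = 1.
\end{align*}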

\section*{Evaluation Results}
\glsunset{LReLU}
\begin{table}[H]
\centering
\begin{tabular}{@{\extracolsep{4pt}}lcccccc@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} \\\cline{2-5}\cline{6-7}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Training set & Test set \\\midrule
Identity & \SI{66.25}{\percent} & $\boldsymbol{\sigma=0.77}$ &\SI{56.74}{\percent} & $\boldsymbol{\sigma=0.51}$ & \SI{68.77}{\percent} & \SI{58.78}{\percent}\\
Logistic & \SI{51.87}{\percent} & $\sigma=3.64$ &\SI{46.54}{\percent} & $\sigma=3.22$ & \SI{61.19}{\percent} & \SI{54.58}{\percent}\\
Logistic$^-$ & \SI{66.49}{\percent} & $\sigma=1.99$ &\SI{57.84}{\percent} & $\sigma=1.15$ & \SI{69.04}{\percent} & \SI{60.10}{\percent}\\
Softmax & \SI{75.22}{\percent} & $\sigma=2.41$ &\SI{59.49}{\percent} & $\sigma=1.25$ & \SI{78.87}{\percent} & \SI{63.06}{\percent}\\
Tanh & \SI{67.27}{\percent} & $\sigma=2.38$ &\SI{55.70}{\percent} & $\sigma=1.44$ & \SI{70.21}{\percent} & \SI{58.10}{\percent}\\
Softsign & \SI{66.43}{\percent} & $\sigma=1.74$ &\SI{55.75}{\percent} & $\sigma=0.93$ & \SI{69.78}{\percent} & \SI{58.40}{\percent}\\
\gls{ReLU} & \SI{78.62}{\percent} & $\sigma=2.15$ &\SI{62.18}{\percent} & $\sigma=0.99$ & \SI{81.81}{\percent} & \SI{64.57}{\percent}\\
\gls{ReLU}$^-$ & \SI{76.01}{\percent} & $\sigma=2.31$ &\SI{62.87}{\percent} & $\sigma=1.08$ & \SI{78.18}{\percent} & \SI{64.81}{\percent}\\
Softplus & \SI{66.75}{\percent} & $\sigma=2.45$ &\SI{56.68}{\percent} & $\sigma=1.32$ & \SI{71.27}{\percent} & \SI{60.26}{\percent}\\
S2ReLU & \SI{63.32}{\percent} & $\sigma=1.69$ &\SI{56.99}{\percent} & $\sigma=1.14$ & \SI{65.80}{\percent} & \SI{59.20}{\percent}\\
\gls{LReLU} & \SI{74.92}{\percent} & $\sigma=2.49$ &\SI{61.86}{\percent} & $\sigma=1.23$ & \SI{77.67}{\percent} & \SI{64.01}{\percent}\\
\gls{PReLU} & \textbf{\SI{80.01}{\percent}} & $\sigma=2.03$ &\SI{62.16}{\percent} & $\sigma=0.73$ & \textbf{\SI{83.50}{\percent}} & \textbf{\SI{64.79}{\percent}}\\
\gls{ELU} & \SI{76.64}{\percent} & $\sigma=1.48$ &\textbf{\SI{63.38}{\percent}} & $\sigma=0.55$ & \SI{78.30}{\percent} & \SI{64.70}{\percent}\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on CIFAR-100]{Training and
test accuracy of adjusted baseline models trained with different
activation functions on CIFAR-100. For \gls{LReLU}, $\alpha = 0.3$ was
chosen.}
\label{table:CIFAR-100-accuracies-activation-functions}
\end{table}
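
The tables report, per activation function, the mean accuracy and standard
deviation $\sigma$ over ten separately trained models as well as the accuracy
of an ensemble of those ten models. The following NumPy sketch illustrates one
way such numbers can be computed; it is only an illustration and assumes that
the ensemble averages the ten models' predicted class probabilities, which is
not stated in this appendix (function names are placeholders):
\begin{verbatim}
import numpy as np

def single_model_stats(probs, labels):
    """Mean accuracy and standard deviation over the single models.

    probs:  array of shape (n_models, n_samples, n_classes) with the
            models' softmax outputs.
    labels: array of shape (n_samples,) with integer class labels.
    """
    accuracies = [(p.argmax(axis=1) == labels).mean() for p in probs]
    return np.mean(accuracies), np.std(accuracies)

def ensemble_accuracy(probs, labels):
    """Accuracy of an ensemble that averages the models' probabilities."""
    mean_probs = probs.mean(axis=0)          # average over the models
    predictions = mean_probs.argmax(axis=1)  # most probable class per sample
    return (predictions == labels).mean()
\end{verbatim}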

\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.92}{\percent} & $\sigma=0.40$ & \SI{84.69}{\percent} & $\sigma=0.08$ & \SI{88.59}{\percent} & \SI{85.43}{\percent} & \hphantom{0}92 -- 140 & 114.5\\%TODO: Really?
Logistic & \SI{81.46}{\percent} & $\sigma=5.08$ & \SI{79.67}{\percent} & $\sigma=4.85$ & \SI{86.38}{\percent} & \SI{84.60}{\percent} & \hphantom{0}\textbf{58} -- \hphantom{0}\textbf{91} & \textbf{77.3}\\
Softmax & \SI{88.19}{\percent} & $\sigma=0.31$ & \SI{84.70}{\percent} & $\sigma=0.15$ & \SI{88.69}{\percent} & \SI{85.43}{\percent} & 124 -- 171 & 145.8\\
Tanh & \SI{88.41}{\percent} & $\sigma=0.36$ & \SI{84.46}{\percent} & $\sigma=0.27$ & \SI{89.24}{\percent} & \SI{85.45}{\percent} & \hphantom{0}89 -- 123 & 108.7\\
Softsign & \SI{88.00}{\percent} & $\sigma=0.47$ & \SI{84.46}{\percent} & $\sigma=0.23$ & \SI{88.77}{\percent} & \SI{85.33}{\percent} & \hphantom{0}77 -- 119 & 104.1\\
\gls{ReLU} & \SI{88.93}{\percent} & $\sigma=0.46$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.21$ & \SI{89.35}{\percent} & \SI{85.95}{\percent} & \hphantom{0}96 -- 132 & 102.8\\
Softplus & \SI{88.42}{\percent} & $\boldsymbol{\sigma=0.29}$ & \SI{85.16}{\percent} & $\sigma=0.15$ & \SI{88.90}{\percent} & \SI{85.73}{\percent} & 108 -- 143 & 121.0\\
\gls{LReLU} & \SI{88.61}{\percent} & $\sigma=0.41$ & \SI{85.21}{\percent} & $\boldsymbol{\sigma=0.05}$ & \SI{89.07}{\percent} & \SI{85.83}{\percent} & \hphantom{0}87 -- 117 & 104.5\\
\gls{PReLU} & \textbf{\SI{89.62}{\percent}} & $\sigma=0.41$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.17$& \textbf{\SI{90.10}{\percent}} & \SI{86.01}{\percent} & \hphantom{0}85 -- 111 & 100.5\\
\gls{ELU} & \SI{89.49}{\percent} & $\sigma=0.42$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.10$ & \SI{89.94}{\percent} & \textbf{\SI{86.03}{\percent}} & \hphantom{0}73 -- 113 & 92.4\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on HASYv2]{Training and test
accuracy of adjusted baseline models trained with different activation
functions on HASYv2. For \gls{LReLU}, $\alpha = 0.3$ was chosen.}
\label{table:HASYv2-accuracies-activation-functions}
\end{table}

\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.49}{\percent} & $\sigma=2.50$ & \SI{69.86}{\percent} & $\sigma=1.41$ & \SI{89.78}{\percent} & \SI{71.90}{\percent} & \hphantom{0}51 -- \hphantom{0}65 & 53.4\\
Logistic & \SI{45.32}{\percent} & $\sigma=14.88$ & \SI{40.85}{\percent} & $\sigma=12.56$ & \SI{51.06}{\percent} & \SI{45.49}{\percent} & \hphantom{0}38 -- \hphantom{0}93 & 74.6\\
Softmax & \SI{87.90}{\percent} & $\sigma=3.58$ & \SI{67.91}{\percent} & $\sigma=2.32$ & \SI{91.51}{\percent} & \SI{70.96}{\percent} & 108 -- 150 & 127.5\\
Tanh & \SI{85.38}{\percent} & $\sigma=4.04$ & \SI{67.65}{\percent} & $\sigma=2.01$ & \SI{90.47}{\percent} & \SI{71.29}{\percent} & \hphantom{0}48 -- \hphantom{0}92 & 65.2\\
Softsign & \SI{88.57}{\percent} & $\sigma=4.00$ & \SI{69.32}{\percent} & $\sigma=1.68$ & \SI{93.04}{\percent} & \SI{72.40}{\percent} & \hphantom{0}55 -- 117 & 83.2\\
\gls{ReLU} & \SI{94.35}{\percent} & $\sigma=3.38$ & \SI{71.01}{\percent} & $\sigma=1.63$ & \SI{98.20}{\percent} & \SI{74.85}{\percent} & \hphantom{0}52 -- \hphantom{0}98 & 75.5\\
Softplus & \SI{83.03}{\percent} & $\sigma=2.07$ & \SI{68.28}{\percent} & $\sigma=1.74$ & \SI{93.04}{\percent} & \SI{75.99}{\percent} & \hphantom{0}56 -- \hphantom{0}89 & 68.9\\
\gls{LReLU} & \SI{93.83}{\percent} & $\sigma=3.89$ & \SI{74.66}{\percent} & $\sigma=2.11$ & \SI{97.56}{\percent} & \SI{78.08}{\percent} & \hphantom{0}52 -- 120 & 80.1\\
\gls{PReLU} & \SI{95.53}{\percent} & $\sigma=1.92$ & \SI{71.69}{\percent} & $\sigma=1.37$ & \SI{98.17}{\percent} & \SI{74.69}{\percent} & \hphantom{0}59 -- 101 & 78.8\\
\gls{ELU} & \SI{95.42}{\percent} & $\sigma=3.57$ & \SI{75.09}{\percent} & $\sigma=2.39$ & \SI{98.54}{\percent} & \SI{78.66}{\percent} & \hphantom{0}66 -- \hphantom{0}72 & 67.2\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on STL-10]{Training and test
accuracy of adjusted baseline models trained with different activation
functions on STL-10. For \gls{LReLU}, $\alpha = 0.3$ was chosen.}
\label{table:STL-10-accuracies-activation-functions}
\end{table}

\begin{figure}[ht]
\centering
\begin{tikzpicture}
\definecolor{color1}{HTML}{E66101}
\definecolor{color2}{HTML}{FDB863}
\definecolor{color3}{HTML}{B2ABD2}
\definecolor{color4}{HTML}{5E3C99}
\begin{axis}[
legend pos=north west,
legend cell align={left},
axis x line=middle,
axis y line=middle,
x tick label style={/pgf/number format/fixed,
                    /pgf/number format/fixed zerofill,
                    /pgf/number format/precision=1},
y tick label style={/pgf/number format/fixed,
                    /pgf/number format/fixed zerofill,
                    /pgf/number format/precision=1},
grid = major,
width=16cm,
height=8cm,
grid style={dashed, gray!30},
xmin=-2, % start the diagram at this x-coordinate
xmax= 2, % end the diagram at this x-coordinate
ymin=-1, % start the diagram at this y-coordinate
ymax= 2, % end the diagram at this y-coordinate
xlabel=x,
ylabel=y,
tick align=outside,
enlargelimits=false]
\addplot[domain=-2:2, color1, ultra thick,samples=500] {1/(1+exp(-x))};
\addplot[domain=-2:2, color2, ultra thick,samples=500] {tanh(x)};
\addplot[domain=-2:2, color4, ultra thick,samples=500] {max(0, x)};
\addplot[domain=-2:2, color4, ultra thick,samples=500, dashed] {ln(exp(x) + 1)};
% ELU with alpha = 1: x for x > 0, e^x - 1 otherwise
\addplot[domain=-2:2, color3, ultra thick,samples=500, dotted] {max(0, x) + min(0, exp(x) - 1)};
\addlegendentry{$\varphi_1(x)=\frac{1}{1+e^{-x}}$}
\addlegendentry{$\varphi_2(x)=\tanh(x)$}
\addlegendentry{$\varphi_3(x)=\max(0, x)$}
\addlegendentry{$\varphi_4(x)=\log(e^x + 1)$}
\addlegendentry{$\varphi_5(x)=\max(0, x) + \min(0, e^x - 1)$}
\end{axis}
\end{tikzpicture}
\caption[Activation functions]{Activation functions plotted on $[-2, +2]$.
$\tanh$ and ELU are able to produce negative numbers. The image of
ELU, ReLU and Softplus is not bounded on the positive side, whereas
$\tanh$ and the logistic function are always below~1.}
\label{fig:activation-functions-plot}
\end{figure}
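
The boundedness claims in the caption correspond to simple limits; as a brief
cross-check (with $\alpha = 1$ as plotted above),
\[
\lim_{x \to +\infty} \tanh(x) = \lim_{x \to +\infty} \frac{1}{1 + e^{-x}} = 1,
\qquad
\lim_{x \to -\infty} \alpha\,(e^x - 1) = -\alpha ,
\]
so $\tanh$ and the logistic function approach, but never reach, $1$, while ELU
is bounded from below by $-\alpha$ and unbounded above.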

\glsreset{LReLU}
\twocolumn