📚 The CoCalc Library - books, templates and other resources
License: OTHER
% LaTeX source for ``Think OS:1% A Brief Introduction to Operating Systems''2% Copyright 2015 Allen B. Downey.34% License: Creative Commons Attribution-NonCommercial 3.0 Unported License.5% http://creativecommons.org/licenses/by-nc/3.0/6%78%\documentclass[10pt,b5paper]{book}9\documentclass[12pt]{book}10\usepackage[width=5.5in,height=8.5in,11hmarginratio=3:2,vmarginratio=1:1]{geometry}1213% for some of these packages, you might have to install14% texlive-latex-extra (in Ubuntu)1516\usepackage[T1]{fontenc}17\usepackage{textcomp}18\usepackage{mathpazo}19%\usepackage{pslatex}2021\usepackage{url}22\usepackage{fancyhdr}23\usepackage{fancyvrb}24\usepackage{graphicx}25\usepackage{subfig}26\usepackage{amsmath}27\usepackage{amsthm}28%\usepackage{amssymb}29\usepackage{makeidx}30\usepackage{setspace}31\usepackage{upquote}32\usepackage{listings}33\usepackage{color}3435\title{Think OS}36\author{Allen B. Downey}3738\newcommand{\thetitle}{Think OS: A Brief Introduction to Operating Systems}39\newcommand{\theversion}{0.7.3}404142% Commands that control the appearance of the listings43\definecolor{light-gray}{gray}{0.95}4445\lstset{basicstyle=\tt, frame=single,46backgroundcolor=\color{light-gray}, escapeinside={(*}{*)},47numbers=left, numberstyle=\tiny, numbersep=10pt}484950\makeindex5152\begin{document}5354\frontmatter555657%%% EXERCISE5859\newtheoremstyle{exercise}% name of the style to be used60{\topsep}% measure of space to leave above the theorem. E.g.: 3pt61{\topsep}% measure of space to leave below the theorem. E.g.: 3pt62{}% name of font to use in the body of the theorem63{0pt}% measure of space to indent64{\bfseries}% name of head font65{}% punctuation between head and body66{ }% space after theorem head; " " = normal interword space67{}% Manually specify head6869\theoremstyle{exercise}70\newtheorem{exercise}{Exercise}[chapter]7172\sloppy73%\setlength{\topmargin}{-0.375in}74%\setlength{\oddsidemargin}{0.0in}75%\setlength{\evensidemargin}{0.0in}7677% Uncomment these to center on 8.5 x 1178%\setlength{\topmargin}{0.625in}79%\setlength{\oddsidemargin}{0.875in}80%\setlength{\evensidemargin}{0.875in}8182%\setlength{\textheight}{7.2in}8384\setlength{\headsep}{3ex}85\setlength{\parindent}{0.0in}86\setlength{\parskip}{1.7ex plus 0.5ex minus 0.5ex}87\renewcommand{\baselinestretch}{1.02}8889% see LaTeX Companion page 6290\setlength{\topsep}{-0.0\parskip}91\setlength{\partopsep}{-0.5\parskip}92\setlength{\itemindent}{0.0in}93\setlength{\listparindent}{0.0in}9495% see LaTeX Companion page 2696% these are copied from /usr/local/teTeX/share/texmf/tex/latex/base/book.cls97% all I changed is afterskip9899\makeatletter100101\renewcommand{\section}{\@startsection102{section} {1} {0mm}%103{-3.5ex \@plus -1ex \@minus -.2ex}%104{0.7ex \@plus.2ex}%105{\normalfont\Large\bfseries}}106\renewcommand\subsection{\@startsection {subsection}{2}{0mm}%107{-3.25ex\@plus -1ex \@minus -.2ex}%108{0.3ex \@plus .2ex}%109{\normalfont\large\bfseries}}110\renewcommand\subsubsection{\@startsection {subsubsection}{3}{0mm}%111{-3.25ex\@plus -1ex \@minus -.2ex}%112{0.3ex \@plus .2ex}%113{\normalfont\normalsize\bfseries}}114115% The following line adds a little extra space to the column116% in which the Section numbers appear in the table of contents117\renewcommand{\l@section}{\@dottedtocline{1}{1.5em}{3.0em}}118\setcounter{tocdepth}{1}119120\newcommand{\adjustpage}[1]{\enlargethispage{#1\baselineskip}}121122123% Note: the following command seems to cause problems for Acroreader124% on Windows, so for now I am overriding 
it.125%\newcommand{\clearemptydoublepage}{126% \newpage{\pagestyle{empty}\cleardoublepage}}127\newcommand{\clearemptydoublepage}{\cleardoublepage}128129%\newcommand{\blankpage}{\pagestyle{empty}\vspace*{1in}\newpage}130\newcommand{\blankpage}{\vspace*{1in}\newpage}131132% HEADERS133134\renewcommand{\chaptermark}[1]{\markboth{#1}{}}135\renewcommand{\sectionmark}[1]{\markright{\thesection\ #1}{}}136137\lhead[\fancyplain{}{\bfseries\thepage}]%138{\fancyplain{}{\bfseries\rightmark}}139\rhead[\fancyplain{}{\bfseries\leftmark}]%140{\fancyplain{}{\bfseries\thepage}}141\cfoot{}142143\pagestyle{fancyplain}144145146% turn off the rule under the header147%\setlength{\headrulewidth}{0pt}148149% the following is a brute-force way to prevent the headers150% from getting transformed into all-caps151\renewcommand\MakeUppercase{}152153% Exercise environment154\newtheoremstyle{myex}% name155{9pt}% Space above156{9pt}% Space below157{}% Body font158{}% Indent amount (empty = no indent, \parindent = para indent)159{\bfseries}% Thm head font160{}% Punctuation after thm head161{0.5em}% Space after thm head: " " = normal interword space;162% \newline = linebreak163{}% Thm head spec (can be left empty, meaning `normal')164165\theoremstyle{myex}166167168169%--title page--------------------------------------------------170\pagebreak171\thispagestyle{empty}172173\begin{flushright}174\vspace*{2.0in}175176\begin{spacing}{3}177{\huge Think OS}\\178{\Large A Brief Introduction to Operating Systems}179\end{spacing}180181\vspace{0.25in}182183Version \theversion184185\vspace{1in}186187188{\Large189Allen B. Downey\\190}191192193\vspace{0.5in}194195{\Large Green Tea Press}196197{\small Needham, Massachusetts}198199%\includegraphics[width=1in]{figs/logo1.eps}200\vfill201202\end{flushright}203204205%--copyright--------------------------------------------------206\pagebreak207\thispagestyle{empty}208209{\small210Copyright \copyright ~2015 Allen B. Downey.211212213\vspace{0.2in}214215\begin{flushleft}216Green Tea Press \\2179 Washburn Ave \\218Needham MA 02492219\end{flushleft}220221Permission is granted to copy, distribute, and/or modify this document222under the terms of the Creative Commons Attribution-NonCommercial 3.0 Unported223License, which is available at \url{http://creativecommons.org/licenses/by-nc/3.0/}.224225The original form of this book is \LaTeX\ source code. Compiling this226code has the effect of generating a device-independent representation227of a textbook, which can be converted to other formats and printed.228229The \LaTeX\ source for this book is available from230\url{http://greenteapress.com/thinkos}.231232The cover for this book is based on a photo by Paul Friel233(\url{http://flickr.com/people/frielp/}), who made it available under234the Creative Commons Attribution license. The original photo235is at \url{http://flickr.com/photos/frielp/11999738/}.236237\vspace{0.2in}238239} % end small240241242\chapter{Preface}243\label{preface}244245In many computer science programs, Operating Systems is an advanced246topic. By the time students take it, they know how to program247in C, and they have probably taken a class in Computer Architecture.248Usually the goal of the class is to expose students to the design249and implementation of operating systems, with the implied assumption250that some of them will do research in this area, or write part of251an OS.252253This book is intended for a different audience, and it has different254goals. 
I developed it for a class at Olin College called Software255Systems.256257Most students taking this class learned to program in Python,258so one of the goals is to help them learn C.259For that part of the class, I use Griffiths and Griffiths, {\it Head260First C}, from O'Reilly Media. This book is meant to complement261that one.262263Few of my students will ever write an operating system, but many of264them will write low-level applications in C or work on embedded265systems. My class includes material from operating systems, networks,266databases, and embedded systems, but it emphasizes the topics267programmers need to know.268269This book does not assume that you have studied Computer Architecture.270As we go along, I will explain what we need.271272If this book is successful, it should give you a better understanding273of what is happening when programs run, and what you can do to make274them run better and faster.275276Chapter 1 explains some of the differences between compiled and277interpreted languages, with some insight into how compilers work.278Recommended reading: {\it Head First C} Chapter 1.279280Chapter 2 explains how the operating system uses processes to281protect running programs from interfering with each other.282283Chapter 3 explains virtual memory and address translation.284Recommended reading: {\it Head First C} Chapter 2.285286Chapter 4 is about file systems and data streams.287Recommended reading: {\it Head First C} Chapter 3.288289Chapter 5 describes how numbers, letters, and other values are290encoded, and presents the bitwise operators.291292Chapter 6 explains how to use dynamic memory management, and how293it works.294Recommended reading: {\it Head First C} Chapter 6.295296Chapter 7 is about caching and the memory hierarchy.297298Chapter 8 is about multitasking and scheduling.299300Chapter 9 is about POSIX threads and mutexes.301Recommended reading: {\it Head First C} Chapter 12 and302{\it Little Book of Semaphores} Chapters 1 and 2.303304Chapter 10 is about POSIX condition variables and the producer/consumer305problem. Recommended reading: {\it Little Book of Semaphores} Chapters 3306and 4.307308Chapter 11 is about using POSIX semaphores and implementing semaphores in C.309310\section*{A note on this draft}311312The current version of this book is an early draft. While I am313working on the text, I have not yet included the figures. So314there are a few places where, I'm sure, the explanation will be315greatly improved when the figures are ready.316317318\section{Using the code}319\label{code}320321Example code for this book is available from322\url{https://github.com/AllenDowney/ThinkOS}. Git is a version323control system that allows you to keep track of the files that324make up a project. A collection of files under Git's control is325called a {\bf repository}. GitHub is a hosting service that provides326storage for Git repositories and a convenient web interface.327\index{repository}328\index{Git}329\index{GitHub}330331The GitHub homepage for my repository provides several ways to332work with the code:333334\begin{itemize}335336\item You can create a copy of my repository337on GitHub by pressing the {\sf Fork} button. If you don't already338have a GitHub account, you'll need to create one. After forking, you'll339have your own repository on GitHub that you can use to keep track340of code you write while working on this book. 
Then you can341clone the repo, which means that you copy the files342to your computer.343\index{fork}344345\item Or you could clone346my repository. You don't need a GitHub account to do this, but you347won't be able to write your changes back to GitHub.348\index{clone}349350\item If you don't want to use Git at all, you can download the files351in a Zip file using the button in the lower-right corner of the352GitHub page.353354\end{itemize}355356357\section*{Contributor List}358359360If you have a suggestion or correction, please send email to361{\tt downey@allendowney.com}. If I make a change based on your362feedback, I will add you to the contributor list363(unless you ask to be omitted).364\index{contributors}365366If you include at least part of the sentence the367error appears in, that makes it easy for me to search. Page and368section numbers are fine, too, but not quite as easy to work with.369Thanks!370371\small372373\begin{itemize}374375\item I am grateful to the students in Software Systems at Olin376College, who tested an early draft of this book in Spring 2014.377They corrected many errors and made many helpful suggestions.378I appreciate their pioneering spirit!379380\item James P Giannoules spotted a copy-and-paste error.381382\item Andy Engle knows the difference between GB and GiB.383384\item Aashish Karki noted some broken syntax.385386% ENDCONTRIB387388\end{itemize}389390Other people who found typos and errors include391Jim Tyson, Donald Robertson, Jeremy Vermast, Yuzhong Huang, Ian Hill.392393\normalsize394395\clearemptydoublepage396397% TABLE OF CONTENTS398399\tableofcontents400401\clearemptydoublepage402403% inline syntax formatting404\newcommand{\vb}{\verb}%}405406% START THE BOOK407\mainmatter408409410\chapter{Compilation}411412413\section{Compiled and interpreted languages}414415People often describe programming languages as either compiled or interpreted.416``Compiled'' means that programs are translated into machine language and417then executed by hardware; ``interpreted'' means that programs418are read and executed by a software interpreter.419Usually C is considered a compiled language and Python is420considered an interpreted language. But the distinction is not always421clear-cut.422423First, many languages can be either compiled or interpreted. For424example, there are C interpreters and Python compilers.425Second, there are languages like Java that use a hybrid426approach, compiling programs into an intermediate language and then427running the translated program in an interpreter. Java uses an428intermediate language called Java bytecode, which is similar to429machine language, but it is executed by a software interpreter, the430Java virtual machine (JVM).431432So being compiled or interpreted is not an intrinsic433characteristic of a language; nevertheless, there are some general434differences between compiled and interpreted languages.435436437\section{Static types}438439Many interpreted languages support dynamic types, but compiled440languages are usually limited to static types. In a statically-typed441language, you can tell by looking at the program what type each442variable refers to. In a dynamically-typed language,443you don't always know the type of a variable until the444program is running. 
In general, {\bf static} refers to things that445happen at compile time (while a program is being compiled), and {\bf dynamic} refers to things that happen446at run time (while a program is running).447448For example, in Python you can write a function like this:449450\begin{verbatim}451def add(x, y):452return x + y453\end{verbatim}454455Looking at this code, you can't tell what type {\tt x} and {\tt y}456will refer to at run time. This function might be called several457times, each time with values with different types. Any values that458support the addition operator will work; any other types will cause an459exception or {\bf runtime error}.460461In C you would write the same function like this:462463\begin{verbatim}464int add(int x, int y) {465return x + y;466}467\end{verbatim}468469The first line of the function includes {\bf type declarations} for the470parameters and the return value: {\tt x} and {\tt y} are declared to471be integers, which means that we can check at compile time472whether the addition operator is legal for this type (it is). The473return value is also declared to be an integer.474475Because of these declarations, when this function is called elsewhere476in the program, the compiler can check whether the arguments provided477have the right type, and whether the return value is used correctly.478479These checks happen before the program starts executing, so errors can480be found earlier. More importantly, errors can be found in parts481of the program that have never run. Furthermore, these checks don't482have to happen at run time, which is one of the reasons compiled483languages generally run faster than interpreted languages.484485Declaring types at compile time also saves space. In dynamic486languages, variable names are stored in memory while the program runs,487and they are often accessible by the program. For example, in Python488the built-in function {\tt locals} returns a dictionary that contains489variable names and their values. Here's an example in a Python490interpreter:491492\begin{verbatim}493>>> x = 5494>>> print locals()495{'x': 5, '__builtins__': <module '__builtin__' (built-in)>,496'__name__': '__main__', '__doc__': None, '__package__': None}497\end{verbatim}498499This shows that the name of the variable is stored in memory while500the program is running (along with some other values that are part501of the default runtime environment).502503In compiled languages, variable names exist at compile-time but not at504run time. The compiler chooses a location for each variable and505records these locations as part of the compiled program.\footnote{This506is a simplification; we will go into more detail later.} The507location of a variable is called its {\bf address}. At run time, the508value of each variable is stored at its address, but the names of the509variables are not stored at all (unless they are added by the compiler510for purposes of debugging).511512%This difference is reflected in the way people draw diagrams for513%different languages. In Python, every variable is a reference to514%a value, so the usual diagram shows a name with an arrow pointing515%to its value. 
In C, a variable is the name of a location in memory516%that stores a value, so the usual diagram is a name next to a517%box that contains a value.518519%\begin{figure}520% descriptive.py521%\centerline{\includegraphics[width=2.5in]{figs/variables.pdf}}522%\caption{Diagrams that represent variables in Python (left) and523%C (right).}524%\label{variables}525%\end{figure}526527528529\section{The compilation process}530531As a programmer, you should have a mental model of what happens532during compilation. If you understand the process, it will help533you interpret error messages, debug your code, and avoid534common pitfalls.535536The steps of compilation are:537538\begin{enumerate}539540\item Preprocessing: C is one of several languages that include541{\bf preprocessing directives} that take effect before the program is542compiled. For example, the \verb"#include" directive causes the543source code from another file to be inserted at the location of the544directive.545546\item Parsing: During parsing, the compiler reads the source code and547builds an internal representation of the program, called an548{\bf abstract syntax tree}. Errors549detected during this step are generally syntax errors.550551\item Static checking: The compiler checks whether variables and552values have the right type, whether functions are called with the553right number and type of arguments, etc. Errors detected during554this step are sometimes called {\bf static semantic} errors.555556\item Code generation: The compiler reads the internal representation557of the program and generates machine code or byte code.558559\item Linking: If the program uses values and functions defined in a560library, the compiler has to find the appropriate library and561include the required code.562563\item Optimization: At several points in the process, the compiler564can transform the program to generate code that runs faster or565uses less space. Most optimizations are simple changes that eliminate566obvious waste, but some compilers perform sophisticated analyses and567transformations.568569\end{enumerate}570571Normally when you run {\tt gcc}, it runs all of these steps and572generates an executable file. For example, here is a minimal C573program:574575\begin{verbatim}576#include <stdio.h>577int main()578{579printf("Hello World\n");580}581\end{verbatim}582583If you save this code in a file called584{\tt hello.c}, you can compile and run it like this:585586\begin{verbatim}587$ gcc hello.c588$ ./a.out589\end{verbatim}590591By default, {\tt gcc} stores the executable code in a file592called {\tt a.out} (which originally stood for ``assembler output'').593The second line runs the executable. The prefix \verb"./" tells594the shell to look for it in the current directory.595596It is usually a good idea to use the {\tt -o} flag to provide a597better name for the executable:598599\begin{verbatim}600$ gcc hello.c -o hello601$ ./hello602\end{verbatim}603604605\section{Object code}606607The {\tt -c} flag tells {\tt gcc} to compile the program and608generate machine code, but not to link it or generate an executable:609610\begin{verbatim}611$ gcc hello.c -c612\end{verbatim}613614The result is a file named {\tt hello.o}, where the {\tt o} stands for615{\bf object code}, which is the compiled program. Object code is not616executable, but it can be linked into an executable.617618The UNIX command {\tt nm} reads an object file and generates619information about the names it defines and uses. 
For example:620621\begin{verbatim}622$ nm hello.o6230000000000000000 T main624U puts625\end{verbatim}626627This output indicates that {\tt hello.o} defines the name {\tt main}628and uses a function named {\tt puts}, which stands for ``put string''.629In this example, {\tt gcc} performs an optimization by replacing630{\tt printf}, which is a large and complicated function, with631{\tt puts}, which is relatively simple.632633You can control how much optimization {\tt gcc} does with634the {\tt -O} flag. By default, it does very little optimization, which635can make debugging easier. The option {\tt -O1} turns on the most636common and safe optimizations. Higher numbers turn on additional637optimizations that require longer compilation time.638639In theory, optimization should not change the behavior of the program,640other than to speed it up. But if your program has a subtle bug,641you might find that optimization makes the bug appear or disappear.642It is usually a good idea to turn off optimization while you are developing643new code. Once the program is working and passing appropriate tests,644you can turn on optimization and confirm that the tests still pass.645646647\section{Assembly code}648649Similar to the {\tt -c} flag, the {\tt -S} flag tells {\tt gcc}650to compile the program and generate assembly code, which is basically651a human-readable form of machine code.652653\begin{verbatim}654$ gcc hello.c -S655\end{verbatim}656657The result is a file named {\tt hello.s}, which might look something like658this:659660\begin{verbatim}661.file "hello.c"662.section .rodata663.LC0:664.string "Hello World"665.text666.globl main667.type main, @function668main:669.LFB0:670.cfi_startproc671pushq %rbp672.cfi_def_cfa_offset 16673.cfi_offset 6, -16674movq %rsp, %rbp675.cfi_def_cfa_register 6676movl $.LC0, %edi677call puts678movl $0, %eax679popq %rbp680.cfi_def_cfa 7, 8681ret682.cfi_endproc683.LFE0:684.size main, .-main685.ident "GCC: (Ubuntu/Linaro 4.7.3-1ubuntu1) 4.7.3"686.section .note.GNU-stack,"",@progbits687\end{verbatim}688689{\tt gcc} is usually configured to generate code for the machine you690are running on, so for me it generates x86 assembly language,691which runs on a wide variety of processors from Intel, AMD, and692others. If you are running on a different architecture, you might693see different code.694695696\section{Preprocessing}697698Taking another step backward through the compilation process, you699can use the {\tt -E} flag to run the preprocessor only:700701\begin{verbatim}702$ gcc hello.c -E703\end{verbatim}704705The result is the output from the preprocessor. In this example,706it contains the included code from {\tt stdio.h}, and all the files707included from {\tt stdio.h}, and all the files included from those708files, and so on. On my machine, the total is more than 800 lines709of code. Since almost every C program includes {\tt stdio.h}, those710800 lines of code get compiled a lot. If, like many C programs,711you also include {\tt stdlib.h}, the result is more than 1800 lines712of code.713714715\section{Understanding errors}716717Now that we know the steps in the compilation process, it is easier718to understand error messages. 
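
One way to get comfortable with these messages is to break {\tt hello.c}
on purpose, one mistake at a time, and recompile; the exact wording of
each message depends on your version of {\tt gcc}.  For instance, here
is a variant (not part of the book's example code) that misspells the
header name:

\begin{verbatim}
#include <stdioo.h>    /* misspelled on purpose */

int main()
{
    printf("Hello World\n");
}
\end{verbatim}

Deleting the semicolon, or calling {\tt printff} instead of
{\tt printf}, are two more mistakes worth trying.
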
For example, if there is an error719in a \verb"#include" directive, you'll get a message from the720preprocessor:721722\begin{verbatim}723hello.c:1:20: fatal error: stdioo.h: No such file or directory724compilation terminated.725\end{verbatim}726727If there's a syntax error, you get a message from the compiler:728729\begin{verbatim}730hello.c: In function 'main':731hello.c:6:1: error: expected ';' before '}' token732\end{verbatim}733734If you use a function that's not defined in any of the standard735libraries, you get a message from the linker:736737\begin{verbatim}738/tmp/cc7iAUbN.o: In function `main':739hello.c:(.text+0xf): undefined reference to `printff'740collect2: error: ld returned 1 exit status741\end{verbatim}742743{\tt ld} is the name of the UNIX linker, so named because ``loading''744is another step in the compilation process that is closely related745to linking.746747Once the program starts, C does very little runtime checking,748so there are only a few runtime errors you are likely to see. If you749divide by zero, or perform another illegal floating-point operation, you750will get a ``Floating point exception.'' And if you try to read or751write an incorrect location in memory, you will get a ``Segmentation752fault.''753754% TODO: -Wall and lint755756757\chapter{Processes}758759\section{Abstraction and virtualization}760761Before we talk about processes, I want to define a few words:762763\begin{itemize}764765\item Abstraction: An abstraction is a simplified representation766of something complicated. For example, if you drive a car, you767understand that when you turn the wheel left, the car goes left,768and vice versa. Of course, the steering wheel is connected to769a sequence of mechanical and (often) hydraulic systems that turn770the wheels, and the wheels interact with the road in ways that771can be complex, but as a driver, you normally don't have to think772about any of those details. You can get along very well with773a simple mental model of steering. Your mental model is an774abstraction.775776Similarly, when you use a web browser, you understand that when777you click on a link, the browser displays the page the link refers778to. The software and network communication that make that possible779are complex, but as a user, you don't have to know the780details.781782A large part of software engineering is designing abstractions like783these that allow users and other programmers to use powerful784and complicated systems without having to know about the details785of their implementation.786787\item Virtualization: An important kind of abstraction is788virtualization, which is the process of creating a desirable789illusion.790791For example, many public libraries participate in inter-library792collaborations that allow them to borrow books from each other.793When I request a book, sometimes the book is on the shelf at my794local library, but other times it has to be transferred from another795collection. Either way, I get a notification when it is available796for pickup. I don't need to know where it came from, and I don't797need to know which books my library has. As a whole, the system798creates the illusion that my library has every book in the world.799800The collection physically located at my local library might be small,801but the collection available to me virtually includes every book802in the inter-library collaboration.803804As another example, most computers are only connected to one805network, but that network is connected to others, and so on. 
What806we call the Internet is a collection of networks and a set of807protocols that forward packets from one network to the next.808From the point of view of a user or programmer, the system behaves809as if every computer on the Internet is connected to every other810computer. The number of physical connections is small, but the811number of virtual connections is very large.812813\end{itemize}814815The word ``virtual'' is often used in the context of a virtual816machine, which is software that creates the illusion of a dedicated817computer running a particular operating system, when in reality818the virtual machine might be running, along with many other virtual819machines, on a computer running a different operating system.820821In the context of virtualization, we sometimes call what is822really happening ``physical'', and what is virtually happening823either ``logical'' or ``abstract.''824825826\section{Isolation}827828One of the most important principles of engineering is isolation:829when you are designing a system with multiple components, it is usually830a good idea to isolate them from each other so that a change in one831component doesn't have undesired effects on other components.832833One of the most important goals of an operating system is to isolate834each running program from the others so that programmers don't have to835think about every possible interaction. The software object that836provides this isolation is a {\bf process}.837838A process is a software object that represents a running program.839I mean ``software object'' in the sense of object-oriented programming;840in general, an object contains data and provides methods841that operate on the data. A process is an object that contains the842following data:843844\begin{itemize}845846\item The text of the program, usually a sequence of847machine language instructions.848849\item Data associated with the program, including static data (allocated850at compile time) and dynamic data (allocated at run time).851852\item The state of any pending input/output operations. For example,853if the process is waiting for data to be read from disk or for a854packet to arrive on a network, the status of these operations is855part of the process.856857\item The hardware state of the program, which includes data stored858in registers, status information, and the program counter, which859indicates which instruction is currently executing.860861\end{itemize}862863Usually one process runs one program, but it is also possible for864a process to load and run a new program.865866It is also possible, and common, to run the same program in more than one867process. In that case, the processes share the same program text868but generally have different data and hardware states.869870Most operating systems provide a fundamental set of capabilities871to isolate processes from each other:872873\begin{itemize}874875\item Multitasking: Most operating systems have the ability to876interrupt a running process at almost any time, save its hardware877state, and then resume the process later. In general, programmers878don't have to think about these interruptions. The program behaves879as if it is running continuously on a dedicated processor,880except that the time between instructions is unpredictable.881882\item Virtual memory: Most operating systems create the883illusion that each process has its own chunk of memory, isolated884from all other processes. 
Again, programmers generally don't885have to think about how virtual memory works; they can proceed886as if every program has a dedicated chunk of memory.887888\item Device abstraction: Processes running on the same computer share889the disk drive, the network interface, the graphics card, and other890hardware. If processes interacted with this hardware directly,891without coordination, chaos would ensue. For example, network data892intended for one process might be read by another. Or multiple893processes might try to store data in the same location on a hard894drive. It is up to the operating system to maintain order by895providing appropriate abstractions.896897\end{itemize}898899As a programmer, you don't need to know much about how these900capabilities are implemented. But if you are901curious, you will find a lot of interesting things902going on under the metaphorical hood. And if you know what's903going on, it can make you a better programmer.904905906\section{UNIX processes}907\label{unixps}908909While I write this book, the process I910am most aware of is my text editor, emacs. Every once in a while911I switch to a terminal window, which is a window running a UNIX shell912that provides a command-line interface.913914When I move the mouse, the window manager wakes up, sees that the915mouse is over the terminal window, and wakes up the terminal.916The terminal wakes up the shell.917If I type {\tt make} in the shell, it creates a918new process to run Make, which creates another process to run LaTeX919and then another process to display the results.920921If I need to look something up, I might switch to another desktop,922which wakes up the window manager again. If I click on the icon for a923web browser, the window manager creates a process to run the web924browser. Some browsers, like Chrome, create a new process for each925window and each tab.926927And those are just the processes I am aware of. At the same time928there are many other processes running in the {\bf background}.929Many of them are performing operations related to the operating930system.931932The UNIX command {\tt ps} prints information about running processes.933If you run it in a terminal, you might see something like this:934935\begin{verbatim}936PID TTY TIME CMD9372687 pts/1 00:00:00 bash9382801 pts/1 00:01:24 emacs93924762 pts/1 00:00:00 ps940\end{verbatim}941942The first column is the unique numerical process ID. The second943column is the terminal that created the process;944``TTY'' stands for teletypewriter, which was the original945mechanical terminal.946947The third column is the total processor time used by the process,948in hours, minutes, and seconds.949The last column is the name of the running program. In950this example, {\tt bash} is the name of the shell that interprets951the commands I type in the terminal, emacs is my text editor, and952ps is the program generating this output.953954By default, {\tt ps} lists only the processes associated with955the current terminal. If you use the {\tt -e} flag, you get every956process (including processes belonging to other users, which is957a security flaw, in my opinion).958959On my system there are currently 233 processes.960Here are some of them:961962\begin{verbatim}963PID TTY TIME CMD9641 ? 00:00:17 init9652 ? 00:00:00 kthreadd9663 ? 00:00:02 ksoftirqd/09674 ? 00:00:00 kworker/0:09688 ? 00:00:00 migration/09699 ? 00:00:00 rcu_bh97010 ? 00:00:16 rcu_sched97147 ? 00:00:00 cpuset97248 ? 00:00:00 khelper97349 ? 00:00:00 kdevtmpfs97450 ? 00:00:00 netns97551 ? 
00:00:00 bdi-default
   52 ?        00:00:00 kintegrityd
   53 ?        00:00:00 kblockd
   54 ?        00:00:00 ata_sff
   55 ?        00:00:00 khubd
   56 ?        00:00:00 md
   57 ?        00:00:00 devfreq_wq
\end{verbatim}

{\tt init} is the first process created when the operating system
starts.  It creates many of the other processes, and then sits idle
until the processes it created are done.

{\tt kthreadd} is a process the operating system uses to create new
{\bf threads}.  We'll talk more about threads later, but for now you
can think of a thread as a kind of process.  The {\tt k} at the
beginning stands for {\bf kernel}, which is the part of the operating
system responsible for core capabilities like creating threads.  The
extra {\tt d} at the end stands for {\bf daemon}, which is another
name for processes like this that run in the background and provide
operating system services.  In this context, ``daemon'' is used in the
sense of a helpful spirit, with no connotation of evil.

Based on the name, you can infer that {\tt ksoftirqd} is also a kernel
daemon; specifically, it handles software interrupt requests, or
``soft IRQ''.

{\tt kworker} is a worker process created by the kernel to do some
kind of processing for the kernel.

There are often multiple processes running these kernel services.
On my system at the moment, there are 8 {\tt ksoftirqd} processes
and 35 {\tt kworker} processes.

I won't go into more details about the other processes, but if you
are interested you can search for more information about them.
You should run {\tt ps} on your system and compare your results
to mine.

%TODO: using gdb here?


\chapter{Virtual memory}

\section{A bit of information theory}

A {\bf bit} is a binary digit; it is also a unit of information.  If
you have one bit, you can specify one of two possibilities, usually
written 0 and 1.  If you have two bits, there are 4 possible
combinations, 00, 01, 10, and 11.  In general, if you have $b$ bits,
you can indicate one of $2^b$ values.  A {\bf byte} is 8 bits, so it
can hold one of 256 values.

Going in the other direction, suppose you want to store a letter
of the alphabet.  There are 26 letters, so how many bits do you
need?  With 4 bits, you can specify one of 16 values, so that's
not enough.  With 5 bits, you can specify up to 32 values, so
that's enough for all the letters, with a few values left over.

In general, if you want to specify one of $N$ values, you should
choose the smallest value of $b$ so that $2^b \ge N$.  Taking the
log base 2 of both sides yields $b \ge \log_2 N$.

Suppose I flip a coin and tell you the outcome.  I have given
you one bit of information.  If I roll a six-sided die and tell
you the outcome, I have given you $\log_2 6$ bits of information.
And in general, if the probability of the outcome is 1 in $N$,
then the outcome contains $\log_2 N$ bits of information.

Equivalently, if the probability of the outcome is $p$, then
the information content is $-\log_2 p$.  This quantity is called
the {\bf self-information} of the outcome.  It measures
how surprising the outcome is, which is why it is also called
{\bf surprisal}.  If your horse has only one chance in 16 of winning,
and he wins, you get 4 bits of information (along with the
payout).  But if the favorite wins 75\% of the time, the news
of the win contains only 0.42 bits.

Intuitively, unexpected news carries a lot of information;
conversely, if there is something you were already confident of,
confirming it contributes only a small amount of information.

For several topics in this book, we will need to be comfortable
converting back and forth between the number of bits, $b$, and the
number of values they can encode, $N = 2^b$.

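If you want to check these calculations yourself, here is a short
program (not part of the book's example code) that computes the
smallest $b$ such that $2^b \ge N$, and the self-information of an
outcome with probability $p$.  It uses {\tt log2} from the C math
library, so it has to be linked with {\tt -lm}.

\begin{verbatim}
#include <math.h>
#include <stdio.h>

/* Smallest number of bits b such that 2^b >= n. */
int bits_needed(double n)
{
    return (int) ceil(log2(n));
}

/* Self-information, in bits, of an outcome with probability p. */
double self_information(double p)
{
    return -log2(p);
}

int main()
{
    printf("26 letters:         %d bits\n", bits_needed(26));
    printf("die roll:           %.3f bits\n", self_information(1.0 / 6));
    printf("1-in-16 long shot:  %.3f bits\n", self_information(1.0 / 16));
    printf("75%% favorite:       %.3f bits\n", self_information(0.75));
    return 0;
}
\end{verbatim}

If you save this as {\tt bits.c} (the name is arbitrary), you can
compile and run it with {\tt gcc bits.c -o bits -lm} and {\tt ./bits};
the last two lines should report 4 bits for the long shot and about
0.42 bits for the favorite.
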
\section{Memory and storage}

While a process is running, most of its data is held in {\bf main
memory}, which is usually some kind of random access memory (RAM).
On most current computers, main memory is {\bf volatile}, which means
that when the computer shuts down, the contents of main memory are
lost.  A typical desktop computer has 2--8 GiB of memory.  GiB stands
for ``gibibyte,'' which is $2^{30}$ bytes.

If the process reads and writes files, those files are usually stored
on a hard disk drive (HDD) or solid state drive (SSD).  These storage
devices are {\bf non-volatile}, so they are used for long-term
storage.  Currently a typical desktop computer has an HDD with a
capacity of 500 GB to 2 TB.  GB stands for ``gigabyte,'' which is
$10^9$ bytes.  TB stands for ``terabyte,'' which is $10^{12}$ bytes.

You might have noticed that I used the binary unit GiB for the size of
main memory and the decimal units GB and TB for the size of the HDD.
For historical and technical reasons, memory is measured in binary
units, and disk drives are measured in decimal units.  In this book I
will be careful to distinguish binary and decimal units, but you
should be aware that the word ``gigabyte'' and the abbreviation GB are
often used ambiguously.

In casual use, the term ``memory'' is sometimes used for HDDs and SSDs
as well as RAM, but the properties of these devices are very
different, so we will need to distinguish them.  I will use
{\bf storage} to refer to HDDs and SSDs.


\section{Address spaces}

Each byte in main memory is specified by an integer {\bf physical
address}.  The set of valid physical addresses is called the physical
{\bf address space}.  It usually runs from 0 to $N-1$, where $N$ is
the size of main memory.  On a system with 1 GiB of physical memory,
the highest valid address is $2^{30}-1$, which is 1,073,741,823 in
decimal, or 0x3fff ffff in hexadecimal (the prefix 0x indicates a
hexadecimal number).

However, most operating systems provide {\bf virtual memory}, which
means that programs never deal with physical addresses, and don't
have to know how much physical memory is available.

Instead, programs work with {\bf virtual addresses}, which are
numbered from 0 to $M-1$, where $M$ is the number of valid virtual
addresses.  The size of the virtual address space is determined by the
operating system and the hardware it runs on.

You have probably heard people talk about 32-bit and 64-bit systems.
These terms indicate the size of the registers, which is usually also
the size of a virtual address.  On a 32-bit system, virtual addresses
are 32 bits, which means that the virtual address space runs from 0 to
0xffff ffff.  The size of this address space is $2^{32}$ bytes, or 4
GiB.

On a 64-bit system, the size of the virtual address space is $2^{64}$
bytes, or $2^4 \cdot 1024^6$ bytes.
That's 16 exbibytes, which is1121about a billion times bigger than current physical memories. It might1122seem strange that a virtual address space can be so much bigger1123than physical memory, but we will see soon how that works.11241125When a program reads and writes values in memory, it generates virtual1126addresses. The hardware, with help from the operating system,1127translates to physical addresses before accessing main memory. This1128translation is done on a per-process basis, so even if two processes1129generate the same virtual address, they would map to different1130locations in physical memory.11311132Thus, virtual memory is one important way the operating system1133isolates processes from each other. In general, a process cannot1134access data belonging to another process, because there is no1135virtual address it can generate that maps to physical memory1136allocated to another process.1137113811391140\section{Memory segments}11411142The data of a running process is organized into five segments:11431144\begin{itemize}11451146\item The {\bf code segment} contains the program text; that is, the1147machine language instructions that make up the program.11481149\item The {\bf static segment} contains immutable values, like string1150literals. For example, if your program contains the string1151{\tt "Hello, World"}, those characters will be stored in the1152static segment.11531154\item The {\bf global segment} contains global variables and local variables that are declared {\tt static}.11551156\item The {\bf heap segment} contains chunks of memory allocated1157at run time, most often by calling the C library function1158{\tt malloc}.11591160\item The {\bf stack segment} contains the call stack, which is a1161sequence of stack frames. Each time a function is called, a stack1162frame is allocated to contain the1163parameters and local variables of the function. When the function1164completes, its stack frame is removed from the stack.11651166\end{itemize}11671168The arrangement of these segments is determined partly by the1169compiler and partly by the operating system. The details vary1170from one system to another, but in the most common arrangement:11711172\begin{itemize}11731174\item The text segment is near the ``bottom'' of memory, that is,1175at addresses near 0.11761177\item The static segment is often just above the text segment, that is,1178at higher addresses.11791180\item The global segment is often just above the static segment.11811182\item The heap is often above the global segment. As it expands,1183it grows up toward larger addresses.11841185\item The stack is near the top of memory; that is, near the1186highest addresses in the virtual address space. 
As the1187stack expands, it grows down toward smaller addresses.11881189\end{itemize}11901191%TODO: Figure out how to handle the code that is in both ExercisesInC1192% and the repo for the book.11931194To determine the layout of these segments on your system, try running1195this program, which is in {\tt aspace.c} in the repository for this1196book (see Section~\ref{code}).11971198\begin{verbatim}1199#include <stdio.h>1200#include <stdlib.h>12011202int global;12031204int main ()1205{1206int local = 5;1207void *p = malloc(128);1208char *s = "Hello, World";12091210printf ("Address of main is %p\n", main);1211printf ("Address of global is %p\n", &global);1212printf ("Address of local is %p\n", &local);1213printf ("p points to %p\n", p);1214printf ("s points to %p\n", s);1215}1216\end{verbatim}12171218{\tt main} is the name of a function; when it is used as a variable,1219it refers to the address of the first machine language instruction1220in {\tt main}, which we expect to be in the text segment.12211222{\tt global} is a global variable, so we expect it to be in the1223global segment. {\tt local} is a local variable, so we expect it1224to be on the stack.12251226{\tt s} refers to a ``string literal", which is a string that appears1227as part of the program (as opposed to a string that is read from a file,1228input by a user, etc.). We expect the location of the string to be1229in the static segment (as opposed to the pointer, {\tt s}, which is1230a local variable).12311232{\tt p} contains an address returned by {\tt malloc}, which allocates1233space in the heap. ``malloc'' stands for ``memory allocate.''12341235The format sequence \verb"%p" tells {\tt printf} to format each1236address as a ``pointer'', so it displays the results in hexadecimal.12371238When I run this program, the output looks like this (I added spaces1239to make it easier to read):12401241\begin{verbatim}1242Address of main is 0x 40057d1243Address of global is 0x 60104c1244Address of local is 0x7ffe6085443c1245p points to 0x 16c30101246s points to 0x 4006a412471248\end{verbatim}12491250As expected, the address of {\tt main} is the lowest, followed by1251the location of the string literal. The location of1252{\tt global} is next, then the address {\tt p} points to.1253The address of {\tt local} is much bigger.12541255The largest address has 12 hexadecimal digits. Each hex digit1256corresponds to 4 bits, so it is a 48-bit address. That suggests1257that the usable part of the virtual address space is $2^{48}$ bytes.12581259As an exercise, run this program on your computer and compare your1260results to mine. Add a second call to {\tt malloc} and check whether1261the heap on your system grows up (toward larger addresses). Add a1262function that prints the address of a local variable, and check1263whether the stack grows down.126412651266\section{Static local variables}12671268Local variables on the stack are sometimes called {\bf automatic},1269because they are allocated automatically when a function is called,1270and freed automatically when the function returns.12711272In C there is another kind of local variable, called {\bf static},1273which is allocated in the global segment. 
It is initialized when1274the program starts and keeps its value from one function call to1275the next.12761277For example, the following function keeps track of how many times1278it has been called.12791280\begin{verbatim}1281int times_called()1282{1283static int counter = 0;1284counter++;1285return counter;1286}1287\end{verbatim}12881289The keyword {\tt static} indicates that {\tt counter} is a static1290local variable. The initialization happens only once, when the program1291starts.12921293If you add this function to {\tt aspace.c} you can confirm that1294{\tt counter} is allocated in the global segment along with global1295variables, not in the stack.129612971298\section{Address translation}1299\label{address_translation}13001301How does a virtual address (VA) get translated to a physical address (PA)?1302The basic mechanism is simple, but a simple1303implementation would be too slow and take too much space. So actual1304implementations are a bit more complicated.13051306\begin{figure}1307\centerline{\includegraphics[width=3in]{figs/address_translation.pdf}}1308\caption{Diagram of the address translation process.}1309\label{addtrans}1310\end{figure}13111312Most processors provide a memory management unit (MMU) that sits between the CPU and main memory. The MMU performs fast translation between VAs and PAs.13131314\begin{enumerate}13151316\item When a program reads or writes a variable, the CPU generates a1317VA.13181319\item The MMU splits the VA into two parts, called the page number and1320the offset. A ``page'' is a chunk of memory; the size of a page1321depends on the operating system and the hardware, but common sizes1322are 1--4 KiB.13231324\item The MMU looks up the page number in the translation lookaside buffer (TLB) and gets the corresponding physical page number. Then it combines1325the physical page number with the offset to produce a PA.13261327\item The PA is passed to main memory, which reads or writes the given1328location.13291330\end{enumerate}13311332The TLB contains cached copies of data from the page table (which is stored in kernel memory). The page table contains the mapping from virtual page numbers to physical page numbers. Since each process has its own page table, the TLB has to make sure it only uses entries from the page table of the process that's running.13331334Figure~\ref{addtrans} shows a diagram of this process.1335To see how it all works, suppose that the VA is 32 bits and the physical memory is 1 GiB, divided into 1 KiB pages.13361337\begin{itemize}13381339\item Since 1 GiB is $2^{30}$ bytes and 1 KiB is $2^{10}$ bytes, there1340are $2^{20}$ physical pages, sometimes called ``frames.''13411342\item The size of the virtual address space is $2^{32}$ B and the size1343of a page is $2^{10}$ B, so there are $2^{22}$ virtual pages.13441345\item The size of the offset is determined by the page size. In this1346example the page size is $2^{10}$ B, so it takes 10 bits to specify1347a byte on a page.13481349\item If a VA is 32 bits and the offset is 10 bits, the remaining135022 bits make up the virtual page number.13511352\item Since there are $2^{20}$ physical pages, each physical page1353number is 20 bits. Adding in the 10 bit offset, the resulting1354PAs are 30 bits.13551356\end{itemize}13571358So far this all seems feasible. But let's think about how big a page1359table might have to be. 
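
Before we do, here is a sketch (mine, not from the book's repository)
of what this translation looks like in C, using the numbers from the
example and the simplest possible page table, a flat array indexed by
virtual page number:

\begin{verbatim}
#include <stdio.h>

#define PAGE_BITS  10                 /* 1 KiB pages */
#define VPN_BITS   22                 /* 32-bit virtual addresses */
#define NUM_VPAGES (1 << VPN_BITS)    /* 2^22 virtual pages */

/* Entry i holds the physical page number for virtual page i.
   A real entry would also hold permission and validity bits. */
unsigned page_table[NUM_VPAGES];

unsigned translate(unsigned va)
{
    unsigned vpn = va >> PAGE_BITS;                 /* top 22 bits */
    unsigned offset = va & ((1 << PAGE_BITS) - 1);  /* low 10 bits */
    unsigned ppn = page_table[vpn];                 /* 20-bit frame number */
    return (ppn << PAGE_BITS) | offset;             /* 30-bit physical address */
}

int main()
{
    page_table[3] = 17;   /* pretend virtual page 3 maps to frame 17 */
    printf("VA 0x%x maps to PA 0x%x\n", 0xc05u, translate(0xc05u));
    return 0;
}
\end{verbatim}

The page table in this sketch is just one big array, which raises the
question of how much memory such an array needs.
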
The simplest implementation of a page1360table is an array with one entry for each virtual page.1361Each entry would contain a physical page number, which is 20 bits1362in this example, plus some additional information about each1363frame. So we expect 3--4 bytes per entry. But with $2^{22}$ virtual pages,1364the page table would require $2^{24}$ bytes, or 16 MiB.13651366And since we need a page table for each process, a system running1367256 processes would need $2^{32}$ bytes, or 4 GiB, just for page tables!1368And that's just with 32-bit virtual addresses. With 48- or 64-bit1369VAs, the numbers are ridiculous.13701371Fortunately, we don't actually need that much space, because1372most processes don't use even a small fraction of their1373virtual address space. And if a process doesn't use a virtual1374page, we don't need an entry in the page table for it.13751376Another way to say the same thing is that page tables are ``sparse'',1377which implies that the simple implementation, an array of page1378table entries, is a bad idea. Fortunately, there are several1379good implementations for sparse arrays.13801381One option is a multilevel page table, which is what many operating1382systems, including Linux, use. Another option is an associative table, where each entry includes both the virtual page number and the physical page number. Searching an associative table can be slow in software, but in hardware we1383can search the entire table in parallel, so associative arrays are1384often used to represent the page table entries in the TLB.13851386You can read more about these implementations at1387\url{http://en.wikipedia.org/wiki/Page_table}; you might find the1388details interesting. But the fundamental idea is that page tables are1389sparse, so we have to choose a good implementation for sparse arrays.13901391I mentioned earlier that the operating system can interrupt a running1392process, save its state, and then run another process. This mechanism1393is called a {\bf context switch}. Since each process has its own1394page table, the operating system has to work with the MMU to make1395sure each process gets the right page table. In older machines,1396the page table information in the MMU had to be replaced during every1397context switch, which was expensive. In newer systems, each page1398table entry in the MMU includes the process ID, so page tables from1399multiple processes can be in the MMU at the same time.1400140114021403\chapter{Files and file systems}14041405When a process completes (or crashes), any data stored in main1406memory is lost. But data stored on a hard disk drive (HDD) or1407solid state drive (SSD) is ``persistent;'' that is, it survives1408after the process completes, even if the computer shuts down.14091410Hard disk drives are complicated. Data is stored in blocks, which1411are laid out in sectors, which make up tracks, which are arranged1412in concentric circles on platters.14131414Solid state drives are simpler in one sense, because blocks are1415numbered sequentially, but they raise a different complication: each1416block can be written a limited number of times before it becomes1417unreliable.14181419As a programmer, you don't want to deal with these complications.1420What you want is an appropriate abstraction of persistent storage1421hardware. 
The most common abstraction is called a ``file system.''14221423Abstractly:14241425\begin{itemize}14261427\item A ``file system'' is a mapping from each file's name to its contents.1428If you think of the names as keys, and the contents as values,1429a file system is a kind of key-value database1430(see \url{https://en.wikipedia.org/wiki/Key-value_database}).14311432\item A ``file'' is a sequence of bytes.14331434\end{itemize}14351436File names are usually strings, and they are usually ``hierarchical'';1437that is, the string specifies a path from a top-level directory (or1438folder), through a series of subdirectories, to a specific file.14391440The primary difference between the abstraction and the underlying1441mechanism is that files are byte-based and persistent storage is1442block-based. The operating system translates byte-based file operations1443in the C library into block-based operations on storage devices.1444Typical block sizes are 1--8 KiB.14451446For example, the following code opens a file and reads the first byte:14471448\begin{verbatim}1449FILE *fp = fopen("/home/downey/file.txt", "r");1450char c = fgetc(fp);1451fclose(fp);1452\end{verbatim}14531454When this code runs:14551456\begin{enumerate}14571458\item {\tt fopen} uses the filename to find the top-level directory,1459called \verb"/", the subdirectory {\tt home}, and the1460sub-subdirectory {\tt downey}.14611462\item It finds the file named {\tt file.txt} and ``opens'' it for1463reading, which means it creates a data structure that represents the1464file being read. Among other things, this data structure1465keeps track of how much of the file has been read, called the ``file1466position''.14671468In DOS, this data structure is called a File Control Block, but I1469want to avoid that term because in UNIX it means something else. In1470UNIX, there seems to be no good name for it. It is an entry in the1471open file table, so I will call it an OpenFileTableEntry.14721473\item When we call {\tt fgetc}, the operating system checks whether1474the next character of the file is already in memory. If so, it1475reads the next character, advances the file position, and returns1476the result.14771478\item If the next character is not in memory, the operating1479system issues an I/O request to get the next block. Disk drives are1480slow, so a process waiting for a block from disk is usually1481interrupted so another process can run until the data arrives.14821483\item When the I/O operation is complete, the new block of data is1484stored in memory, and the process resumes. It reads the first1485character and stores it as a local variable.14861487\item When the process closes the file, the operating system completes1488or cancels any pending operations, removes data stored in1489memory, and frees the OpenFileTableEntry.14901491\end{enumerate}14921493The process for writing a file is similar, but there are some1494additional steps. Here is an example that opens a file for1495writing and changes the first character.14961497\begin{verbatim}1498FILE *fp = fopen("/home/downey/file.txt", "w");1499fputc('b', fp);1500fclose(fp);1501\end{verbatim}15021503When this code runs:15041505\begin{enumerate}15061507\item Again, {\tt fopen} uses the filename to find the file. 
If it1508does not already exist, it creates a new file and adds an entry in1509the parent directory, {\tt /home/downey}.15101511\item The operating system creates an OpenFileTableEntry that1512indicates that the file is open for writing, and sets the file1513position to 0.15141515\item {\tt fputc} attempts to write (or re-write) the first byte of1516the file. If the file already exists, the operating system has to1517load the first block into memory. Otherwise it allocates a new1518block in memory and requests a new block on disk.15191520\item After the block in memory is modified, it might not be copied1521back to the disk right away. In general, data written to a file is1522``buffered'', which means it is stored in memory and only written to1523disk when there is at least one block to write.15241525\item When the file is closed, any buffered data is written to disk1526and the OpenFileTableEntry is freed.15271528\end{enumerate}15291530To summarize, the C library provides the abstraction of a file1531system that maps from file names to streams of bytes. This abstraction1532is built on top of storage devices that are actually organized1533in blocks.153415351536\section{Disk performance}15371538\newcommand{\mus}{$\mu$s~}15391540I mentioned earlier that disk drives are slow. On current HDDs, the1541average time to read a block from disk to memory might be 5--25 ms1542(see \url{https://en.wikipedia.org/wiki/Hard_disk_drive_performance_characteristics}).1543SSDs are faster, taking 25 \mus to read a 4 KiB block and 250 \mus to1544write one (see \url{http://en.wikipedia.org/wiki/Ssd#Controller}).15451546To put these numbers in perspective, let's compare them to the clock1547cycle of the CPU. A processor with clock rate 2 GHz completes one1548clock cycle every 0.5 ns. The time to get a byte from memory to1549the CPU is typically around 100 ns. If the processor completes one1550instruction per clock cycle, it would complete 200 instructions1551while waiting for a byte from memory.15521553In one microsecond, it would complete 2000 instructions,1554so while waiting 25 \mus for a byte from an SSD, it would complete 50,000.15551556In one millisecond, it would complete 2,000,000 instructions,1557so while waiting 20 ms for a byte from a HDD, it might complete155840 million. If there's nothing for the CPU to do while it waits,1559it would be idle. That's why the operating system generally1560switches to another process while it is waiting for data from disk.15611562The gap in performance between main memory and persistent storage is1563one of the major challenges of computer system design. Operating1564systems and hardware provide several features intended to ``fill in''1565this gap:15661567\begin{itemize}15681569\item Block transfers: The time it takes to load a single byte from1570disk is 5--25 ms. By comparison, the additional time to load an 81571KiB block is negligible. So systems generally try to read large1572blocks each time they access the disk.15731574\item Prefetching: Sometimes the operating system can predict that a1575process will read a block and start loading it before it is1576requested. For example, if you open a file and read the first1577block, there is a good chance you will go on to read the second1578block. 
The operating system might start loading additional blocks1579before they are requested.15801581\item Buffering: As I mentioned, when you write a file, the operating1582system stores the data in memory and only writes it to disk later.1583If you modify the block several times while it is in memory, the1584system only has to write it to disk once.15851586\item Caching: If a process has used a block recently, it is likely to1587use it again soon. If the operating system keeps a copy of the1588block in memory, it can handle future requests at memory speed.15891590\end{itemize}15911592Some of these features are also implemented in hardware. For example,1593some disk drives provide a cache that stores recently-used blocks,1594and many disk drives read more than one block at a time, even if only1595one is requested.15961597These mechanisms generally improve the performance of1598programs, but they don't change the behavior. Usually programmers1599don't have to think about them, with two exceptions: (1) if the1600performance of a program is unexpectedly bad, you might have to know1601something about these mechanisms to diagnose the problem, and (2)1602when data is buffered, it can be harder to debug a program. For1603example, if a program prints a value and then crashes, the value1604might not appear, because it might be in a buffer. Similarly, if a1605program writes data to disk and then the computer loses power, the1606data might be lost if it is in a cache and not yet on disk.160716081609\section{Disk metadata}16101611The blocks that make up a file might be arranged contiguously on1612disk, and file system performance is generally better if they are,1613but most operating systems don't require contiguous allocation.1614They are free to place a block anywhere on disk, and they use1615various data structures to keep track of them.16161617In many UNIX file systems, that data structure is called an ``inode,''1618which stands for ``index node''. More generally, information about1619files, including the location of their blocks, is called ``metadata''.1620(The content of the file is data, so information about the file is1621data about data, hence ``meta''.)16221623Since inodes reside on disk along with the rest of the data, they are1624designed to fit neatly into disk blocks. A UNIX inode contains1625information about a file, including the user ID of the file owner;1626permission flags indicating who is allowed to read, write, or execute1627it; and timestamps that indicate when it was last modified and1628accessed. In addition, it contains block numbers for the first 121629blocks that make up the file.16301631If the block size is 8 KiB, the first 12 blocks make up 96 KiB.1632On most systems, that's big enough for a large majority of files,1633but it's definitely not big enough for all of them. That's1634why the inode also contains a pointer to an ``indirection block'',1635which contains nothing but pointers to other blocks.16361637The number of pointers in an indirection block depends on the sizes of1638the blocks and the block numbers, but it is often 1024. With 10241639block numbers and 8 KiB blocks, an indirection block can address 81640MiB. That's big enough for all but the largest files, but still not1641big enough for all.16421643That's why the inode also contains a pointer to a ``double indirection1644block'', which contains pointers to indirection blocks. 
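
To make these capacities concrete, here is a minimal sketch (assuming
8 KiB blocks and 1024 block numbers per indirection block, as in the
example above; real file systems vary) that computes how much data the
direct blocks and a single indirection block can address:

\begin{verbatim}
#include <stdio.h>

int main(void)
{
    long block_size = 8192;        /* 8 KiB blocks */
    long ptrs_per_block = 1024;    /* block numbers per indirection block */

    long direct = 12 * block_size;              /* the first 12 blocks */
    long single = ptrs_per_block * block_size;  /* one indirection block */

    printf("direct blocks reach %ld KiB\n", direct / 1024);
    printf("single indirection reaches %ld MiB\n", single / (1024 * 1024));

    /* each additional level of indirection multiplies the reach by 1024 */
    return 0;
}
\end{verbatim}

It prints 96 KiB and 8 MiB, matching the figures above.
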
With 1024 indirection blocks, we can address 8 GiB.

And if that's not big enough, there is (finally) a triple indirection
block, which contains pointers to double indirection blocks, yielding
a maximum file size of 8 TiB.  When UNIX inodes were designed, that
seemed big enough to serve for a long time.  But that was a long time
ago.

As an alternative to indirection blocks, some file systems, like FAT,
use a File Allocation Table that contains one entry for each block,
called a ``cluster'' in this context.  A root directory contains a
pointer to the first cluster in each file.  The FAT entry for each
cluster points to the next cluster in the file, similar to a linked
list.  For more details, see
\url{http://en.wikipedia.org/wiki/File_Allocation_Table}.


\section{Block allocation}

File systems have to keep track of which blocks belong to each file;
they also have to keep track of which blocks are available for use.
When a new file is created, the file system finds an available
block and allocates it.  When a file is deleted, the file system
makes its blocks available for re-allocation.

The goals of the block allocation system are:

\begin{itemize}

\item Speed: Allocating and freeing blocks should be fast.

\item Minimal space overhead: The data structures used by the allocator
should be small, leaving as much space as possible for data.

\item Minimal fragmentation: If some blocks are left unused, or some
are only partially used, the unused space is called
``fragmentation''.

\item Maximum contiguity: Data that is likely to be used at the same
time should be physically contiguous, if possible, to improve
performance.

\end{itemize}

It is hard to design a file system that achieves all of these
goals, especially since file system performance depends on
``workload characteristics'' like file sizes, access
patterns, etc.  A file system that is well tuned for one workload
might not perform as well for another.

For this reason, most operating systems support several kinds of file
systems, and file system design is an active area of research and
development.  In the last decade, Linux systems have migrated
from ext2, which was a conventional UNIX file system, to ext3,
a ``journaling'' file system intended to improve speed and
contiguity, and more recently to ext4, which can handle larger files
and file systems.  Within the next few years, there might be
another migration to the B-tree file system, Btrfs.


\section{Everything is a file?}

The file abstraction is really a ``stream of bytes'' abstraction,
which turns out to be useful for many things, not just file systems.

One example is the UNIX pipe, which is a simple form of inter-process
communication.  Processes can be set up so that output from one
process is taken as input into another process.  For the first
process, the pipe behaves like a file open for writing, so it
can use C library functions like {\tt fputs} and {\tt fprintf}.
For the second process, the pipe behaves like a file open for
reading, so it uses {\tt fgets} and {\tt fscanf}.

Network communication also uses the stream of bytes abstraction.
A UNIX socket is a data structure that represents a communication
channel between processes on different computers (usually).
Again,1721processes can read data from and write data to a socket using1722``file'' handling functions.17231724Reusing the file abstraction makes life easier for programmers, since1725they only have to learn one API (application program interface).1726It also makes programs more versatile, since a program intended to1727work with files can also work with data coming from pipes and other1728sources.17291730% TODO: gprof here?173117321733\chapter{More bits and bytes}17341735\section{Representing integers}17361737You probably know that computers represent numbers in1738base 2, also known as binary. For positive numbers, the binary1739representation is straightforward; for example, the representation1740for $5_{10}$ is $b101$.17411742For negative numbers, the most obvious representation uses1743a sign bit to indicate whether a number is positive or negative.1744But there is another representation, called ``two's complement''1745that is much more common because it is easier to work with1746in hardware.17471748To find the two's complement of a negative number, $-x$, find1749the binary representation of $x$, flip all the bits, and add 1.1750For example, to represent $-5_{10}$, start with the representation1751of $5_{10}$, which is $b0000 0101$ if we write the 8-bit version.1752Flipping all the bits and adding 1 yields $b1111 1011$.17531754In two's complement, the leftmost bit acts like a sign bit;1755it is 0 for positive numbers and 1 for negative numbers.17561757To convert from an 8-bit number to 16-bits, we have to add1758more 0's for a positive number and add 1's for a negative number.1759In effect, we have to copy the sign bit into the new bits.1760This process is called ``sign extension''.17611762In C all integer types are signed (able to represent positive and1763negative numbers) unless you declare them {\tt unsigned}. The1764difference, and the reason this declaration is important, is that1765operations on unsigned integers don't use sign extension.176617671768\section{Bitwise operators}17691770People learning C are sometimes confused1771about the bitwise operators \verb"&" and \verb"|". These1772operators treat integers as bit vectors and compute logical1773operations on corresponding bits.17741775For example, \verb"&" computes the AND operation, which yields17761 if both operands are 1, and 0 otherwise. Here is an example1777of \verb"&" applied to two 4-bit numbers:1778%1779\begin{verbatim}178011001781& 10101782----178310001784\end{verbatim}1785%1786In C, this means that the expression \verb"12 & 10" has the1787value 8.17881789Similarly, \verb"|" computes the OR operation, which yields17901 if either operand is 1, and 0 otherwise.1791%1792\begin{verbatim}179311001794| 10101795----179611101797\end{verbatim}1798%1799So the expression \verb"12 | 10" has the value 14.18001801Finally, \verb"^" computes the XOR operation, which yields18021 if either operand is 1, but not both.1803%1804\begin{verbatim}180511001806^ 10101807----180801101809\end{verbatim}1810%1811So the expression \verb"12 ^ 10" has the value 6.18121813Most commonly, \verb"&" is used to clear a set of bits from1814a bit vector, \verb"|" is used to set bits, and \verb"^"1815is used to flip, or ``toggle'' bits. 
Here are the details:18161817{\bf Clearing bits}: For any value $x$, $x \& 0$ is 0, and $x \& 1$ is $x$.1818So if you AND a vector with 3, it1819selects only the two rightmost bits, and sets the rest to 0.1820%1821\begin{verbatim}1822xxxx1823& 00111824----182500xx1826\end{verbatim}1827%1828In this context, the value 3 is called a ``mask'' because it1829selects some bits and masks the rest.18301831{\bf Setting bits}: Similarly, for any $x$, $x | 0$ is x, and $x | 1$ is $1$.1832So if you OR a vector with 3, it sets the rightmost1833bits, and leaves the rest alone:1834%1835\begin{verbatim}1836xxxx1837| 00111838----1839xx111840\end{verbatim}1841%1842{\bf Toggling bits}: Finally, if you XOR a vector with 3, it flips the1843rightmost bits and leaves the rest alone. As an exercise, see if you1844can compute the two's complement of 12 using \verb"^". Hint: what's1845the two's complement representation of -1?18461847% (12 ^ -1) + 118481849C also provides shift operators, {\tt <<} and {\tt >>}, which shift1850bits left and right. Each left shift doubles a number, so1851{\tt 5 << 1} is 10, and {\tt 5 << 2} is 20. Each right shift1852divides by two (rounding down), so {\tt 5 >> 1} is 2 and1853{\tt 2 >> 1} is 1.185418551856\section{Representing floating-point numbers}18571858Floating-point numbers are represented using the binary1859version of scientific notation. In decimal notation, large1860numbers are written as the product of a coefficient and 10 raised1861to an exponent. For example, the speed of light in m/s is1862approximately $2.998 \cdot 10^8$.18631864Most computers use the IEEE standard for floating-point1865arithmetic. The C type {\tt float} usually corresponds1866to the 32-bit IEEE standard; {\tt double} usually corresponds1867to the 64-bit standard.18681869In the 32-bit standard, the leftmost bit is the sign bit, $s$.1870The next 8 bits are the exponent, $q$, and the last 23 bits are1871the coefficient, $c$. The value of a floating-point number is1872%1873\[ (-1)^s c \cdot 2^q \]1874%1875Well, that's almost correct, but there's one more wrinkle.1876Floating-point numbers are usually normalized so that there is1877one digit before the point. For example, in base 10, we prefer1878$2.998 \cdot 10^8$ rather than $2998 \cdot 10^5$ or any other1879equivalent expression. In base 2, a normalized number always1880has the digit 1 before the binary point. Since the digit in1881this location is always 1, we can save space by leaving it1882out of the representation.18831884For example, the integer representation of $13_{10}$ is $b1101$.1885In floating point, that's $1.101 \cdot 2^3$, so the exponent1886is 3 and the part of the coefficient that would be stored1887is 101 (followed by 20 zeros).18881889Well, that's almost correct, but there's one more wrinkle.1890The exponent is stored with a ``bias''. In the 32-bit standard,1891the bias is 127, so the exponent 3 would be stored as 130.18921893To pack and unpack floating-point numbers in C, we can use a1894union and bitwise operations. 
Here's an example:1895%1896\begin{verbatim}1897union {1898float f;1899unsigned int u;1900} p;19011902p.f = -13.0;1903unsigned int sign = (p.u >> 31) & 1;1904unsigned int exp = (p.u >> 23) & 0xff;19051906unsigned int coef_mask = (1 << 23) - 1;1907unsigned int coef = p.u & coef_mask;19081909printf("%d\n", sign);1910printf("%d\n", exp);1911printf("0x%x\n", coef);1912\end{verbatim}1913%1914This code is in {\tt float.c} in the repository for this1915book (see Section~\ref{code}).19161917The union allows us to store a floating-point value using1918{\tt p.f} and then read it as an unsigned integer using1919{\tt p.u}.19201921To get the sign bit, we shift the bits to the right 311922places and then use a 1-bit mask to select only the1923rightmost bit.19241925To get the exponent, we shift the bits 23 places, then select the1926rightmost 8 bits (the hexadecimal value {\tt 0xff} has eight 1's).19271928To get the coefficient, we need to extract the 23 rightmost bits1929and ignore the rest. We do that by making a mask with 1s in the193023 rightmost places and 0s on the left. The easiest way to do that1931is by shifting 1 to the left by 23 places and then subtracting 1.19321933The output of this program is:1934%1935\begin{verbatim}19361193713019380x5000001939\end{verbatim}1940%1941As expected, the sign bit for a negative number is 1. The exponent1942is 130, including the bias. And the coefficient, which I printed in1943hexadecimal, is $101$ followed by 20 zeros.19441945As an exercise, try assembling or disassembling a {\tt double}, which1946uses the 64-bit standard. See1947\url{http://en.wikipedia.org/wiki/IEEE_floating_point}.194819491950\section{Unions and memory errors}19511952There are two common uses of C unions. One, which we saw in the1953previous section, is to access the binary representation of data.1954Another is to store heterogeneous data. For example, you could1955use a union to represent a number that might be an integer, float,1956complex, or rational number.19571958However, unions are error-prone. It is up to you, as the programmer,1959to keep track of what type of data is in the union; if you write1960a floating-point value and then interpret it as an integer, the result1961is usually nonsense.19621963Actually, the same thing can happen if you read a location in memory1964incorrectly. One way that can happen is if you read past the end of1965an array.19661967To see what happens, I'll start with a function that allocates an1968array on the stack and fills it with the numbers from 0 to 99.19691970\begin{verbatim}1971void f1() {1972int i;1973int array[100];19741975for (i=0; i<100; i++) {1976array[i] = i;1977}1978}1979\end{verbatim}19801981Next I'll define a function that creates a smaller array and1982deliberately accesses elements before the beginning and after1983the end:19841985\begin{verbatim}1986void f2() {1987int x = 17;1988int array[10];1989int y = 123;19901991printf("%d\n", array[-2]);1992printf("%d\n", array[-1]);1993printf("%d\n", array[10]);1994printf("%d\n", array[11]);1995}1996\end{verbatim}19971998If I call {\tt f1} and then {\tt f2}, I get these results:19992000\begin{verbatim}20011720021232003982004992005\end{verbatim}20062007The details here depend on the compiler, which arranges variables2008on the stack. From these results, we can infer that the2009compiler put {\tt x} and {\tt y} next to each other, ``below''2010the array (at a lower address). 
And when we read past the
array, it looks like we are getting values that were left on
the stack by the previous function call.

In this example, all of the variables are integers, so it is
relatively easy to figure out what is going on.  But in general
when you read beyond the bounds of an array, the values you
read might have any type.  For example, if I change {\tt f1}
to make an array of floats, the results are:

\begin{verbatim}
17
123
1120141312
1120272384
\end{verbatim}

The latter two values are what you get if you interpret a
floating-point value as an integer.  If you encountered this output
while debugging, you would have a hard time figuring out what's
going on.


\section{Representing strings}

Related issues sometimes come up with strings.  First, remember
that C strings are null-terminated.  When you allocate space
for a string, don't forget the extra byte at the end.

Also, the letters {\it and numbers} in C strings are
encoded in ASCII.  The ASCII codes for the digits ``0'' through ``9''
are 48 through 57, {\it not} 0 through 9.  The ASCII code 0 is the NUL
character that marks the end of a string.  And the ASCII codes 1
through 9 are special characters used in some communication protocols.
ASCII code 7 is a bell; on some terminals, printing it makes a sound.

The ASCII code for the letter ``A'' is 65; the code for
``a'' is 97.  Here are those codes in binary:

\begin{verbatim}
65 = b0100 0001
97 = b0110 0001
\end{verbatim}

A careful observer will notice that they differ by a single
bit.  And this pattern holds for the rest of the letters; the
sixth bit (counting from the right) acts as a ``case bit'', 0 for
upper-case letters and 1 for lower-case letters.

As an exercise, write a function that takes a string and converts
from lower-case to upper-case by flipping the sixth bit.  As a challenge,
you can make a faster version by reading the string 32 or 64 bits
at a time, rather than one character at a time.  This optimization
is made easier if the length of the string is a multiple of 4 or
8 bytes.

If you read past the end of a string, you are likely to see
strange characters.  Conversely, if you write a string and
then accidentally read it as an int or float, the results
will be hard to interpret.

For example, if you run:

\begin{verbatim}
char array[] = "allen";
float *p = (float *) array;
printf("%f\n", *p);
\end{verbatim}

You will find that the ASCII representation of the first 4 characters
of my name, interpreted as a single-precision floating-point number,
is 69779713878800585457664.

% TODO: assert here?
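
Here is a small example (hypothetical, not from the book's repository)
that demonstrates the difference between a digit character and its
numeric value, and the ``case bit'' described above:

\begin{verbatim}
#include <stdio.h>

int main(void)
{
    char c = '5';

    /* the character '5' is ASCII code 53, not the number 5 */
    printf("'%c' has ASCII code %d\n", c, c);
    printf("its numeric value is %d\n", c - '0');

    /* 'A' is 65 and 'a' is 97; the difference is the sixth bit,
       which has value 32 */
    printf("'a' - 'A' = %d\n", 'a' - 'A');

    return 0;
}
\end{verbatim}
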
\chapter{Memory management}

C provides 4 functions for dynamic memory allocation:

\begin{itemize}

\item {\tt malloc}, which takes an integer size, in bytes, and returns
a pointer to a newly-allocated chunk of memory with (at least) the
given size.  If it can't satisfy the request, it returns
the special pointer value NULL.

\item {\tt calloc}, which is the same as {\tt malloc} except that
it also clears the newly allocated chunk; that
is, it sets all bytes in the chunk to 0.

\item {\tt free}, which takes a pointer to a previously allocated
chunk and deallocates it; that is, it makes the space available for
future allocation.

\item {\tt realloc}, which takes a pointer to a previously allocated
chunk and a new size.  It allocates a chunk of memory with the new
size, copies data from the old chunk to the new, frees the old chunk,
and returns a pointer to the new chunk.

\end{itemize}

This API is notoriously error-prone and unforgiving.  Memory management
is one of the most challenging parts of designing large software systems,
which is why most modern languages provide higher-level memory
management features like garbage collection.


\section{Memory errors}

The C memory management API is a bit like Jasper Beardly, a minor
character on the animated television program {\it The Simpsons};
in a few episodes, he appears as a strict substitute teacher who
imposes corporal punishment --- a ``paddlin''' --- for all infractions.

Here are some of the things a program can do that deserve a paddling:

\begin{itemize}

\item If you access (read or write) any chunk that has not been
allocated, that's a paddling.

\item If you free an allocated chunk and then access it, that's
a paddling.

\item If you try to free a chunk that has not been allocated,
that's a paddling.

\item If you free the same chunk more than once, that's a paddling.

\item If you call {\tt realloc} with a chunk that was not allocated,
or was allocated and then freed, that's a paddling.

\end{itemize}

It might not sound difficult to follow these rules, but in a large
program a chunk of memory might be allocated in one part of the
program, used in several other parts, and freed in yet another
part.  So changes in one part of the program can require changes
in many other parts.

Also, there might be many aliases, or references to the same allocated
chunk, in different parts of the program.  The chunk should not be
freed until all references to the chunk are no longer in use.
Getting this right often requires careful analysis across all parts
of the program, which is difficult and contrary to fundamental
principles of good software engineering.

Ideally, every function that allocates memory should include, as part
of the documented interface, information about how that memory is supposed
to be freed.  Mature libraries often do this well, but in the real world,
software engineering practice often falls short of this ideal.

To make matters worse, memory errors can be difficult
to find because the symptoms are unpredictable.  For example:

\begin{itemize}

\item If you read a value from an unallocated chunk, the system
{\em might} detect the error, trigger a runtime error called a
``segmentation fault'', and stop the program.
Or, the program might read unallocated memory without detecting the error; in that case, the value it gets is whatever happened to be stored at the accessed location, which is unpredictable, and might be different each time the program runs.21682169\item If you write a value to an unallocated chunk, and don't get a segmentation fault, things are even worse. After you write a value to an invalid location, a long time might pass before it is read and causes problems. At that point it will be very difficult to find the source of the problem.21702171\end{itemize}21722173And things can be even worse than that! One of the most common2174problems with C-style memory management is that the data structures2175used to implement {\tt malloc} and {\tt free} (which we will see soon)2176are often stored along with the allocated chunks. So if you2177accidentally write past the end of a dynamically-allocated chunk, you2178are likely to mangle these data structures. The system usually won't2179detect the problem until later, when you call {\tt malloc} or2180{\tt free}, and those functions fail in some inscrutable way.21812182One conclusion you should draw from this is that safe memory2183management requires design and discipline. If you write a library2184or module that allocates memory, you should also provide an2185interface to free it, and memory management should be part of2186the API design from the beginning.21872188If you use a library that allocates memory, you should be disciplined2189in your use of the API. For example, if the library provides2190functions to allocate and deallocate storage, you should use those2191functions and not, for example, call {\tt free} on a chunk you did not2192{\tt malloc}. And you should avoid keeping multiple references to the2193same chunk in different parts of your program.21942195Often there is a trade-off between safe memory management and performance.2196For example, the most common source of memory errors is writing2197beyond the bounds of an array. The obvious remedy for this problem2198is bounds checking; that is, every access to the array should check2199whether the index is out of bounds. High-level libraries that provide2200array-like structures usually perform bounds checking. But C arrays2201and most low-level libraries do not.220222032204\section{Memory leaks}2205\label{leak}22062207There is one more memory error that may or may not deserve a paddling.2208If you allocate a chunk of memory and never free it, that's a ``memory2209leak''.22102211For some programs, memory leaks are ok. For example, if your program2212allocates memory, performs computations on it, and then exits, it is2213probably not necessary to free the allocated memory. When the program2214exits, all of its memory is deallocated by the operating system.2215Freeing memory immediately before exiting might feel more responsible,2216but it is mostly a waste of time.22172218But if a program runs for a long time and leaks memory, its total2219memory use will increase indefinitely. At that point, a few things2220might happen:22212222\begin{itemize}22232224\item At some point, the system runs out of physical memory. On2225systems without virtual memory, the next call to {\tt malloc} will2226fail, returning NULL.22272228\item On systems with virtual memory, the operating system can move2229another process's pages from memory to disk and then allocate2230more space to the leaking process. 
I explain this mechanism2231in Section~\ref{paging}.22322233\item There might be a limit on the amount of space a single2234process can allocate; beyond that, {\tt malloc} returns NULL.22352236\item Eventually, a process might fill its virtual address space (or2237the usable part). After that, there are no more addresses to2238allocate, so {\tt malloc} returns NULL.22392240\end{itemize}22412242If {\tt malloc} returns NULL, but you persist and access2243the chunk you think you allocated, you get a segmentation fault.2244For this reason, it is considered good style to check the result from2245{\tt malloc} before using it. One option is to add a condition like2246this after every {\tt malloc} call:22472248\begin{verbatim}2249void *p = malloc(size);2250if (p == NULL) {2251perror("malloc failed");2252exit(-1);2253}2254\end{verbatim}22552256{\tt perror} is declared in {\tt stdio.h}; it prints2257an error message and additional information about the last error2258that occurred.22592260{\tt exit}, which is declared in {\tt stdlib.h}, causes the process2261to terminate. The argument is a status code that indicates how2262the process terminated. By convention, status code 0 indicates normal2263termination and -1 indicates an error condition. Sometimes other2264codes are used to indicate different error conditions.22652266Error-checking code can be a nuisance, and it makes programs2267harder to read. You can mitigate these problems by wrapping library2268function calls and their error-checking code in your own2269functions. For example, here is a {\tt malloc} wrapper that checks2270the return value.22712272\begin{verbatim}2273void *check_malloc(int size)2274{2275void *p = malloc (size);2276if (p == NULL) {2277perror("malloc failed");2278exit(-1);2279}2280return p;2281}2282\end{verbatim}22832284Because memory management is so difficult, most large programs, like2285web browsers, leak memory. To see which programs on your system are2286using the most memory, you can use the UNIX utilities {\tt ps} and2287{\tt top}.228822892290% TODO: using Valgrind here?229122922293\section{Implementation}22942295When a process starts, the system allocates space for the text segment2296and statically allocated data, space for the stack, and space for the2297heap, which contains dynamically allocated data.22982299Not all programs allocate data dynamically, so the initial size of the2300heap might be small or zero. Initially the heap contains only one2301free chunk.23022303When {\tt malloc} is called, it checks whether it can find a free2304chunk that's big enough. If not, it has to request more memory2305from the system. The function that does that is {\tt sbrk},2306which sets the ``program break'', which you can think of as a pointer2307to the end of the heap.23082309When {\tt sbrk} is called, the OS allocates new pages of physical2310memory, updates the process's page table, and sets the program2311break.23122313In theory, a program could call {\tt sbrk} directly (without using2314{\tt malloc}) and manage the heap itself. But {\tt malloc} is easier2315to use and, for most memory-use patterns, it runs fast and uses memory2316efficiently.23172318To implement the memory management API (that is, the functions2319{\tt malloc}, {\tt free}, {\tt calloc}, and {\tt realloc}),2320most Linux systems use {\tt ptmalloc},2321which is based on {\tt dlmalloc}, written by Doug Lea. 
A short paper2322that describes key elements of the implementation is2323available at \url{http://gee.cs.oswego.edu/dl/html/malloc.html}.23242325For programmers, the most important elements to be aware of are:23262327\begin{itemize}23282329\item The run time of {\tt malloc} does not usually depend on the2330size of the chunk, but might depend on how many free chunks there2331are. {\tt free} is usually fast, regardless of the number of2332free chunks. Because {\tt calloc} clears every byte in the chunk,2333the run time depends on chunk size (as well as the number of free2334chunks).23352336{\tt realloc} is sometimes fast, if the new size is smaller than the2337current size, or if space is available to expand the existing chunk.2338If not, it has to copy data from the old chunk to the new; in that2339case, the run time depends on the size of the old chunk.23402341\item Boundary tags: When {\tt malloc} allocates a chunk, it adds2342space at the beginning and end to store information about the chunk,2343including its size and the state (allocated or free). These bits of2344data are called ``boundary tags''. Using these tags, {\tt malloc}2345can get from any chunk to the previous chunk and the next chunk in2346memory. In addition, free chunks are chained into a doubly-linked2347list; each free chunk contains pointers to the next and previous2348chunks in the ``free list''.23492350The boundary tags and free list pointers make up {\tt malloc}'s2351internal data structures. These data structures are interspersed with2352program data, so it is easy for a program error to damage them.23532354\item Space overhead: Boundary tags and free list pointers take up2355space. The minimum chunk size on most systems is 16 bytes. So for2356very small chunks, {\tt malloc} is not space efficient. If your2357program requires large numbers of small structures, it might be more2358efficient to allocate them in arrays.23592360\item Fragmentation: If you allocate and free chunks with varied2361sizes, the heap will tend to become fragmented. That is, the free2362space might be broken into many small pieces. Fragmentation wastes2363space; it also slows the program down by making memory caches less2364effective.23652366\item Binning and caching: The free list is sorted by size into bins,2367so when {\tt malloc} searches for a chunk with a particular size, it2368knows what bin to search in. If you free a chunk and then2369immediately allocate a chunk with the same size, {\tt malloc} will2370usually be fast.23712372\end{itemize}237323742375\chapter{Caching}23762377%TODO: move this model of computer hardware to Chapter 12378%TODO: talk about CPU modes, either here or in Chapter 123792380\section{How programs run}23812382In order to understand caching, you have to understand how computers2383execute programs. For a deep understanding of this topic, you should2384study computer architecture. My goal in this chapter is to provide2385a simple model of program execution.23862387When a program starts, the code (or text) is usually on a hard disk2388or solid state drive. The operating system creates a new process to2389run the program, then the ``loader''2390copies the text from storage into main memory and starts the program by2391calling {\tt main}.23922393While the program is running, most of its data is stored in main2394memory, but some of the data is in registers, which are2395small units of memory on the CPU. 
These registers include:

\begin{itemize}

\item The program counter, or PC, which contains the address (in
memory) of the next instruction in the program.

\item The instruction register, or IR, which contains the machine code
instruction currently executing.

\item The stack pointer, or SP, which contains the address of the
stack frame for the current function, which contains its parameters
and local variables.

\item General-purpose registers that hold the data the program is
currently working with.

\item A status register, or flag register, that contains information
about the current computation.  For example, the flag register
usually contains a bit that is set if the result of the previous
operation was zero.

\end{itemize}

When a program is running, the CPU executes the following steps,
called the ``instruction cycle'':

\begin{itemize}

\item Fetch: The next instruction is fetched from memory and stored
in the instruction register.

\item Decode: Part of the CPU, called the ``control unit'', decodes
the instruction and sends signals to the other parts of
the CPU.

\item Execute: Signals from the control unit cause the appropriate
computation to occur.

\end{itemize}

Most computers can execute a few hundred different instructions,
called the ``instruction set''.  But most instructions fall
into a few general categories:

\begin{itemize}

\item Load: Transfers a value from memory to a register.

\item Arithmetic/logic: Loads operands from registers, performs
a mathematical operation, and stores the result in a register.

\item Store: Transfers a value from a register to memory.

\item Jump/branch: Changes the program counter, causing the flow
of execution to jump to another location in the program.  Branches
are usually conditional, which means that they check a flag
in the flag register and jump only if it is set.

\end{itemize}

Some instruction sets, including the ubiquitous x86, provide
instructions that combine a load and an arithmetic operation.

During each instruction cycle, one instruction is read from the
program text.  In addition, about half of the instructions in a
typical program load or store data.  And therein
lies one of the fundamental problems of computer architecture: the
``memory bottleneck''.

In current computers, a typical core is capable of executing an
instruction in less than 1 ns.  But the time it takes to transfer
data to and from memory is about 100 ns.  If the CPU has to wait
100 ns to fetch the next instruction, and another 100 ns to load
data, it would complete instructions 200 times slower than what's
theoretically possible.  For many computations, memory is the
speed-limiting factor, not the CPU.


\section{Cache performance}

The solution to this problem, or at least a partial solution, is
caching.
A ``cache'' is a small, fast memory that is physically close
to the CPU, usually on the same chip.

%TODO: Clean up these paragraphs

Actually, current computers typically have several levels of cache:
the Level 1 cache, which is the smallest and fastest, might be
1--2 MiB with access times near 1 ns; the Level 2 cache might have
access times near 4 ns, and the Level 3 cache might take 16 ns.

When the CPU loads a value from memory, it stores a copy in the cache.
If the same value is loaded again, the CPU gets the cached copy
and doesn't have to wait for memory.

Eventually the cache gets full.  Then, in order to bring something
new in, we have to kick something out.  So if the CPU loads a value
and then loads it again much later, it might not be in cache any more.

The performance of many programs is limited by the effectiveness
of the cache.  If the instructions and data needed by the CPU are
usually in cache, the program can run close to the full speed of
the CPU.  If the CPU frequently needs data that are not in cache,
the program is limited by the speed of memory.

The cache ``hit rate'', $h$, is the fraction of memory accesses that
find data in cache; the ``miss rate'', $m$, is the fraction of memory
accesses that have to go to memory.  If the time to process a cache
hit is $T_h$ and the time for a cache miss is $T_m$, the average time
for each memory access is
%
\[ h T_h + m T_m \]
%
Equivalently, we could define the ``miss penalty'' as the extra
time to process a cache miss, $T_p = T_m - T_h$.  Then the average access
time is
%
\[ T_h + m T_p \]
%
When the miss rate is low, the average access time can be close to
$T_h$.  That is, the program can perform as if memory ran at
cache speeds.


\section{Locality}

When a program reads a byte for the first time, the cache usually
loads a ``block'' or ``line'' of data that includes the requested
byte and some of its neighbors.  If the program goes on to read one
of the neighbors, it will already be in cache.

As an example, suppose the block size is 64 B;
you read a string with length 64, and the first
byte of the string happens to fall at the beginning of a block.  When
you load the first byte, you incur a miss penalty, but
after that the rest of the string will be in cache.  After
reading the whole string, the hit rate will be 63/64, about 98\%.
If the string spans two blocks, you would incur 2 miss penalties.  But
even then the hit rate would be 62/64, or almost 97\%.  If you then
read the same string again, the hit rate would be 100\%.

On the other hand, if the program jumps around unpredictably,
reading data from scattered locations in memory, and seldom
accessing the same location twice, cache performance would be
poor.

The tendency of a program to use the same data more than once is
called ``temporal locality''.  The tendency to use data in nearby
locations is called ``spatial locality''.  Fortunately, many
programs naturally display both kinds of locality:

\begin{itemize}

\item Most programs contain blocks of code with no jumps or
branches.
Within these blocks, instructions run2541sequentially, so the access pattern has2542spatial locality.25432544\item In a loop, programs execute the same instructions many2545times, so the access pattern has temporal locality.25462547\item The result of one instruction is often used immediately as2548an operand of the next instruction, so the data access pattern2549has temporal locality.25502551\item When a program executes a function, its parameters and local2552variables are stored together on the stack; accessing these values2553has spatial locality.25542555\item One of the most common processing patterns is to read or write2556the elements of an array sequentially; this pattern also has2557spatial locality.25582559\end{itemize}25602561The next section explores the relationship2562between a program's access pattern and cache performance.256325642565\section{Measuring cache performance}25662567When I was a graduate student at U.C. Berkeley I was a teaching2568assistant for Computer Architecture with Brian Harvey. One of my2569favorite exercises involved a program that iterates through an array2570and measures the average time to read and write an element. By2571varying the size of the array, it is possible to infer the size2572of the cache, the block size, and some other attributes.25732574My modified version of this program is in the {\tt cache} directory2575of the repository for this2576book (see Section~\ref{code}).25772578The important part of the program is this loop:25792580\begin{verbatim}2581iters = 0;2582do {2583sec0 = get_seconds();25842585for (index = 0; index < limit; index += stride)2586array[index] = array[index] + 1;25872588iters = iters + 1;2589sec = sec + (get_seconds() - sec0);25902591} while (sec < 0.1);2592\end{verbatim}25932594The inner {\tt for} loop traverses the array. {\tt limit}2595determines how much of the array it traverses; {\tt stride}2596determines how many elements it skips over. For example, if2597{\tt limit} is 16 and {\tt stride} is 4, the loop would access2598elements 0, 4, 8, and 12.25992600{\tt sec} keeps track of the total CPU time used by the inner loop.2601The outer loop runs until {\tt sec} exceeds 0.1 seconds, which is2602long enough that we can compute the average time with sufficient2603precision.26042605\verb"get_seconds" uses the system call \verb"clock_gettime",2606converts to seconds, and returns the result as a {\tt double}:26072608\begin{verbatim}2609double get_seconds(){2610struct timespec ts;2611clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);2612return ts.tv_sec + ts.tv_nsec / 1e9;2613}2614\end{verbatim}26152616\begin{figure}2617% graph_cache_data.py2618\centerline{\includegraphics[width=3in]{figs/cache_data.pdf}}2619\caption{Average miss penalty as a function of array size and stride.}2620\label{cachedata}2621\end{figure}26222623To isolate the time to access the elements of the array,2624the program runs a second loop that is almost identical except2625that the inner loop doesn't touch the array; it always increments2626the same variable:26272628\begin{verbatim}2629iters2 = 0;2630do {2631sec0 = get_seconds();26322633for (index = 0; index < limit; index += stride)2634temp = temp + index;26352636iters2 = iters2 + 1;2637sec = sec - (get_seconds() - sec0);26382639} while (iters2 < iters);2640\end{verbatim}26412642The second loop runs the same number of iterations as the first.2643After each iteration, it {\em subtracts} the elapsed time from2644{\tt sec}. 
When the loop completes, {\tt sec} contains the total
time for all array accesses, minus the total time it took to increment
{\tt temp}.  This difference is the total miss penalty incurred by
all accesses.  Finally, we divide by the number of accesses to
get the average miss penalty per access, in ns:

\begin{verbatim}
sec * 1e9 / iters / limit * stride
\end{verbatim}

If you compile and run {\tt cache.c} you should see output like this:

\begin{verbatim}
Size: 4096 Stride: 8 read+write: 0.8633 ns
Size: 4096 Stride: 16 read+write: 0.7023 ns
Size: 4096 Stride: 32 read+write: 0.7105 ns
Size: 4096 Stride: 64 read+write: 0.7058 ns
\end{verbatim}

If you have Python and {\tt matplotlib} installed, you can use
\verb"graph_data.py" to graph the results.  Figure~\ref{cachedata}
shows the results when I ran it on a Dell Optiplex 7010.
Notice that the array size and stride are reported in
bytes, not number of array elements.

Take a minute to consider this graph, and see what you can infer
about the cache.  Here are some things to think about:

\begin{itemize}

\item The program reads through the array many times, so it has plenty
of temporal locality.  If the entire array fits in cache, we expect
the average miss penalty to be near 0.

\item When the stride is 4 bytes, we read every element of the array,
so the program has plenty of spatial locality.  If the block size is
big enough to contain 64 elements, for example, the hit rate would
be 63/64, even if the array does not fit in cache.

\item If the stride is equal to the block size (or greater), the
spatial locality is effectively zero, because each time we read a
block, we only access one element.  In that case we expect to see
the maximum miss penalty.

\end{itemize}

In summary, we expect good cache performance if the array is smaller
than the cache size {\em or} if the stride is smaller than the block
size.  Performance only degrades if the array is bigger than the
cache {\em and} the stride is large.

In Figure~\ref{cachedata}, cache performance is good, for all strides,
as long as the array is less than $2^{22}$ B.  We can infer that the
cache size is near 4 MiB; in fact, according to the specs, it is 3
MiB.

When the stride is 8, 16, or 32 B, cache performance is good.  At 64 B
it starts to degrade, and for larger strides the average miss
penalty is about 9 ns.  We can infer that the block size is near 128 B.

Many processors use ``multi-level caches'' that include a small,
fast cache and a bigger, slower cache.  In this example, it looks
like the miss penalty increases a little when the array size is bigger
than $2^{14}$ B, so it's possible that this processor also has a 16 KiB
cache with an access time less than 1 ns.


\section{Programming for cache performance}

Memory caching is implemented in hardware, so most of the time
programmers don't need to know much about it.  But if you know how
caches work, you can write programs that use them more effectively.

For example, if you are working with a large array, it might be
faster to traverse the array once, performing several operations with
each element, rather than traversing the array several times.

If you are working with a 2-D array, it might be stored as an array
of rows.  If you traverse through the elements, it would be faster
to go row-wise, with stride equal to the element size, rather
than column-wise, with stride equal to the row length.
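
For example, here is a sketch of the two traversal orders (a
hypothetical example, not part of the book's {\tt cache} program).
In C, a 2-D array is stored row by row, so the row-wise loop touches
contiguous memory and the column-wise loop jumps a whole row on
every access:

\begin{verbatim}
#define N 2000

static double a[N][N];

/* row-wise: the inner loop has stride sizeof(double), 8 B */
void add_rowwise(void)
{
    int i, j;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            a[i][j] += 1.0;
}

/* column-wise: the inner loop has stride N * 8 B, so there is
   little spatial locality */
void add_colwise(void)
{
    int i, j;
    for (j = 0; j < N; j++)
        for (i = 0; i < N; i++)
            a[i][j] += 1.0;
}
\end{verbatim}

On most hardware, the first version is noticeably faster for large
{\tt N}, for the reasons described in this chapter.
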
Linked data structures don't always exhibit spatial locality, because
the nodes aren't necessarily contiguous in memory.  But if you allocate
many nodes at the same time, they are usually co-located in the heap.
Or, even better, if you allocate an array of nodes all at once, you
know they will be contiguous.

Recursive strategies like mergesort often have good cache behavior
because they break big arrays into smaller pieces and then work
with the pieces.  Sometimes these algorithms can be tuned to take
advantage of cache behavior.

For applications where performance is critical, it is possible
to design algorithms tailored to the size of the cache, the block size,
and other hardware characteristics.  Algorithms like that are
called ``cache-aware''.  The obvious drawback of cache-aware
algorithms is that they are hardware-specific.


\section{The memory hierarchy}

At some point during this chapter, a question like the following
might have occurred to you: ``If caches are so much faster than
main memory, why not make a really big cache and forget about
memory?''

Without going too far into computer architecture, there are two
reasons: electronics and economics.  Caches are fast because they are
small and close to the CPU, which minimizes delays due to capacitance
and signal propagation.  If you make a cache big, it will be slower.

Also, caches take up space on the processor chip, and bigger chips are
more expensive.  Main memory is usually dynamic random-access memory
(DRAM), which uses only one transistor and one capacitor per bit, so
it is possible to pack more memory into the same amount of space.  But
this way of implementing memory is slower than the way caches are
implemented.

Also, main memory is usually packaged in a dual in-line memory module
(DIMM) that includes 16 or more chips.  Several small chips are cheaper
than one big one.

The trade-off between speed, size, and cost is the fundamental reason
for caching.  If there were one memory technology that was fast,
big, and cheap, we wouldn't need anything else.

The same principle applies to storage as well as memory.  Solid state
drives (SSD) are fast, but they are more expensive than hard drives
(HDD), so they tend to be smaller.  Tape drives are even slower than
hard drives, but they can store large amounts of data relatively
cheaply.

The following table shows typical access times, sizes, and
costs for each of these technologies.

\vspace{0.1in}
\begin{center}
\begin{tabular}{| l | l | l | l |}
\hline
Device & Access & Typical & Cost \\
       & time   & size    &      \\ \hline
Register & 0.5 ns & 256 B & ? \\ \hline
Cache & 1 ns & 2 MiB & ? \\ \hline
DRAM & 100 ns & 4 GiB & \$10 / GiB \\ \hline
SSD & 10 \mus & 100 GiB & \$1 / GiB \\ \hline
HDD & 5 ms & 500 GiB & \$0.25 / GiB \\ \hline
Tape & minutes & 1--2 TiB & \$0.02 / GiB \\ \hline
\end{tabular}
\end{center}
\vspace{0.1in}
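
As a rough sketch based only on the prices in the table, here is what
it would cost to store 1 TiB at each level:

\begin{verbatim}
#include <stdio.h>

int main(void)
{
    /* approximate cost per GiB, from the table above */
    double dram = 10.0, ssd = 1.0, hdd = 0.25, tape = 0.02;
    double gib = 1024;      /* 1 TiB = 1024 GiB */

    printf("DRAM: $%.0f\n", dram * gib);    /* about $10,000 */
    printf("SSD:  $%.0f\n", ssd * gib);     /* about $1000 */
    printf("HDD:  $%.0f\n", hdd * gib);     /* about $250 */
    printf("Tape: $%.0f\n", tape * gib);    /* about $20 */

    return 0;
}
\end{verbatim}
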
The number and size of registers depend on details of the
architecture.  Current computers have about 32 general-purpose
registers, each storing one ``word''.  On a 32-bit computer, a word
is 32 bits or 4 B.  On a 64-bit computer, a word is 64 bits or 8 B.
So the total size of the register file is 100--300 B.

The cost of registers and caches is hard to quantify.  They contribute
to the cost of the chips they are on, but consumers don't see that
cost directly.

For the other numbers in the table, I looked at the specifications for
typical hardware for sale from online computer hardware stores.  By
the time you read this, these numbers will be obsolete, but they give
you an idea of what the performance and cost gaps looked like at one
point in time.

These technologies make up the ``memory hierarchy'' (note that this
use of ``memory'' also includes storage).  Each
level of the hierarchy is bigger and slower than the one above it.
And in some sense, each level acts as a cache for the one below
it.  You can think of main memory as a cache for programs and data
that are stored permanently on SSDs and HDDs.  And if you are working
with very large datasets stored on tape, you could use hard drives
to cache one subset of the data at a time.


\section{Caching policy}

The memory hierarchy suggests a framework for thinking about
caching.  At every level of the hierarchy, we have to address
four fundamental questions of caching:

\begin{itemize}

\item Who moves data up and down the hierarchy?  At the top of the
hierarchy, register allocation is usually done by the compiler.
Hardware on the CPU handles the memory cache.  Users implicitly move
data from storage to memory when they execute programs and open
files.  But the operating system also moves data back and forth
between memory and storage.  At the bottom of the hierarchy,
administrators move data explicitly between disk and tape.

\item What gets moved?  In general, block sizes are small at the top
of the hierarchy and bigger at the bottom.  In a memory cache, a
typical block size is 128 B.  Pages in memory might be 4 KiB, but
when the operating system reads a file from disk, it might read 10s
or 100s of blocks at a time.

\item When does data get moved?  In the most basic cache, data gets
moved into cache when it is used for the first time.  But many
caches use some kind of ``prefetching'', meaning that data is
loaded before it is explicitly requested.  We have already seen
one form of prefetching: loading an entire block when only part of
it is requested.

\item Where in the cache does the data go?  When the cache is full, we
can't bring anything in without kicking something out.  Ideally,
we want to keep data that will be used again soon and replace data
that won't.

\end{itemize}

The answers to these questions make up the ``cache policy''.
Near the top of the hierarchy, cache policies tend to be simple
because they have to be fast and they are implemented in hardware.
Near the bottom of the hierarchy, there is more time to make decisions,
and well-designed policies can make a big difference.

Most cache policies are based on the principle that history repeats
itself; if we have information about the recent past, we can use it to
predict the immediate future.  For example, if a block of data has
been used recently, we expect it to be used again soon.  This
principle suggests a replacement policy called ``least recently
used,'' or LRU, which removes from the cache a block of data that
has not been used recently.
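
Hardware caches only approximate LRU, but as an illustration, here is
a minimal sketch (hypothetical, not drawn from any particular system)
of an LRU replacement policy that keeps one timestamp per cache slot:

\begin{verbatim}
#define NSLOTS 8

struct slot {
    int key;            /* which block is stored in this slot */
    int valid;          /* is the slot in use? */
    long last_used;     /* logical time of the most recent access */
};

static struct slot cache[NSLOTS];
static long now;

/* record an access to the block in slot i */
void touch(int i)
{
    cache[i].last_used = ++now;
}

/* choose a slot to evict: an empty slot if there is one,
   otherwise the least recently used slot */
int choose_victim(void)
{
    int i, victim = 0;

    for (i = 0; i < NSLOTS; i++) {
        if (!cache[i].valid)
            return i;
        if (cache[i].last_used < cache[victim].last_used)
            victim = i;
    }
    return victim;
}
\end{verbatim}
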
For more on this topic, see2869\url{http://en.wikipedia.org/wiki/Cache_algorithms}.287028712872\section{Paging}2873\label{paging}28742875In systems with virtual memory, the operating system can move2876pages back and forth between memory and storage. As I mentioned2877in Section~\ref{leak}, this mechanism is called ``paging'' or2878sometimes ``swapping''.28792880Here's how the process works:28812882\begin{enumerate}28832884\item Suppose Process A calls {\tt malloc} to allocate a chunk. If there2885is no free space in the heap with the requested size, {\tt malloc} calls2886{\tt sbrk} to ask the operating system for more memory.28872888\item If there is a free page in physical memory, the operating system2889adds it to the page table for Process A, creating a new range of valid2890virtual addresses.28912892\item If there are no free pages, the paging system chooses a ``victim2893page'' belonging to Process B. It copies the contents of the victim2894page from memory to disk, then it modifies the page table for Process2895B to indicate that this page is ``swapped out''.28962897\item Once the data from Process B is written, the page can be reallocated2898to Process A. To prevent Process A from reading Process B's data, the2899page should be cleared.29002901\item At this point the call to {\tt sbrk} can return, giving {\tt malloc}2902additional space in the heap. Then {\tt malloc} allocates the requested2903chunk and returns. Process A can resume.29042905\item When Process A completes, or is interrupted, the scheduler might2906allow Process B to resume. When Process B accesses a page that has been swapped out, the memory management unit notices that the page is ``invalid'' and causes an interrupt.29072908\item When the operating system handles the interrupt, it sees that2909the page is swapped out, so it transfers the page back from disk to2910memory.29112912\item Once the page is swapped in, Process B can resume.29132914\end{enumerate}29152916When paging works well, it can greatly improve the utilization of2917physical memory, allowing more processes to run in less space.2918Here's why:29192920\begin{itemize}29212922\item Most processes don't use all of their allocated memory. Many2923parts of the text segment are never executed, or execute once and2924never again. Those pages can be swapped out without causing any2925problems.29262927\item If a program leaks memory, it might leave allocated space behind2928and never access it again. By swapping those pages out, the2929operating system can effectively plug the leak.29302931\item On most systems, there are processes like daemons that sit idle2932most of the time and only occasionally ``wake up'' to respond to2933events. While they are idle, these processes can be swapped out.29342935\item A user might have many windows open, but only a few are active2936at a time. The inactive processes can be swapped out.29372938\item Also, there might be many processes running the same program.2939These processes can share the same text and static segments, avoiding the need to keep multiple copies in physical memory.29402941\end{itemize}29422943If you add up the total memory allocated to all processes, it can2944greatly exceed the size of physical memory, and yet the system can2945still behave well.29462947Up to a point.29482949When a process accesses a page that's swapped out, it has to get the2950data back from disk, which can take several milliseconds. The2951delay is often noticeable. 
If you leave a window idle for a long2952time and then switch back to it, it might start slowly,2953and you might hear the disk drive working while pages are2954swapped in.29552956Occasional delays like that might be acceptable, but if you have too2957many processes using too much space, they start to interfere with each2958other. When Process A runs, it evicts the pages Process B needs.2959Then when B runs, it evicts the pages A needs. When this happens,2960both processes slow to a crawl and the system can become unresponsive.2961This scenario is called ``thrashing''.29622963In theory, operating systems could avoid thrashing by detecting an2964increase in paging and blocking or killing processes until the system2965is responsive again. But as far as I can tell, most systems don't do2966this, or don't do it well; it is often left to users to limit their2967use of physical memory or try to recover when thrashing occurs.296829692970\chapter{Multitasking}29712972In many current systems, the CPU contains multiple cores, which means2973it can run several processes at the same time. In addition, each core2974is capable of ``multitasking'', which means it can switch from one2975process to another quickly, creating the illusion that many processes2976are running at the same time.29772978The part of the operating system that implements multitasking is2979the ``kernel''. In a nut or seed, the kernel is the innermost2980part, surrounded by a shell. In an operating system, the kernel2981is the lowest level of software, surrounded by several other2982layers, including an interface called a ``shell.'' Computer2983scientists love extended metaphors.29842985At its most basic, the kernel's job is to2986handle interrupts. An ``interrupt'' is an event that stops the2987normal instruction cycle and causes the flow of execution to jump to a2988special section of code called an ``interrupt handler''.29892990%TODO: put new vocab in bold and add glossaries29912992A {\bf hardware interrupt} is caused when a device sends a signal to the2993CPU. For example, a network interface might cause an interrupt when2994a packet of data arrives, or a disk drive might cause an interrupt2995when a data transfer is complete. Most systems also have timers that2996cause interrupts at regular intervals, or after an elapsed time.29972998A {\bf software interrupt} is caused by a running program. For example, if2999an instruction cannot complete for some reason, it might trigger an3000interrupt so the condition can be handled by the operating system.3001Some floating-point errors, like division by zero, are handled3002using interrupts.30033004When a program needs to access a hardware device,3005it makes a {\bf system call}, which is similar to a function call,3006except that instead of jumping to the beginning of the function,3007it executes a special instruction that triggers an interrupt, causing3008the flow of execution to jump to the kernel. The kernel reads the3009parameters of the system call, performs the requested operation,3010and then resumes the interrupted process.301130123013\section{Hardware state}30143015Handling interrupts requires cooperation between hardware and3016software. When an interrupt occurs, there might be several3017instructions running on the CPU, data stored in registers, and3018other {\bf hardware state}.30193020Usually the hardware is responsible for bringing the CPU3021to a consistent state; for example, every instruction should either3022complete or behave as if it never started. 
No instruction should3023be left half complete. Also, the hardware is responsible for3024saving the program counter (PC), so the kernel knows where to3025resume.30263027Then, usually, it is the responsibility of the interrupt handler3028to save the rest of the hardware state before it does anything that3029might modify it, and then restore the saved state before the interrupted3030process resumes.30313032Here is an outline of this sequence of events:30333034\begin{enumerate}30353036\item When the interrupt occurs, the hardware saves the program3037counter in a special register and jumps to the appropriate interrupt3038handler.30393040\item The interrupt handler stores the program counter and the3041status register in memory, along with the contents of any data3042registers it plans to use.30433044\item The interrupt handler runs whatever code is needed to handle3045the interrupt.30463047\item Then it restores the contents of the saved registers. Finally,3048it restores the program counter of the interrupted process, which3049has the effect of jumping back to the interrupted instruction.30503051\end{enumerate}30523053If this mechanism works correctly, there is generally no way for3054the interrupted process to know there was an interrupt, unless3055it detects the change in time between instructions.305630573058\section{Context switching}30593060Interrupt handlers can be fast because they don't have to save the3061entire hardware state; they only have to save registers they are3062planning to use.30633064But when an interrupt occurs, the kernel does not always resume the3065interrupted process. It has the option of switching to another3066process. This mechanism is called a ``context switch''.30673068In general, the kernel doesn't know which registers a process will3069use, so it has to save all of them. Also, when it switches to a new3070process, it might have to clear data stored in the memory management3071unit (see3072Section~\ref{address_translation}). And after the context switch, it3073might take some time for the new process to load data into the cache.3074For these reasons, context switches are relatively slow, on the order3075of thousands of cycles, or a few microseconds.30763077In a multi-tasking system, each process is allowed to run for a short3078period of time called a ``time slice'' or ``quantum''. During3079a context switch, the kernel sets a hardware timer that causes3080an interrupt at the end of the time slice. When the interrupt3081occurs, the kernel can switch to another process or allow the3082interrupted process to resume. The part of the operating system3083that makes this decision is the ``scheduler''.308430853086\section{The process life cycle}30873088When a process is created, the operating system allocates a3089data structure that contains information about the process, called3090a ``process control block'' or PCB. 
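The details differ from system to system (in Linux the corresponding
structure is called \verb"task_struct"), but a highly simplified PCB
might look like the following sketch. The field names here are chosen
for illustration only.

\begin{verbatim}
#include <stdint.h>

typedef enum { RUNNING, READY, BLOCKED, DONE } ProcessState;

typedef struct {
    int pid;                /* process id */
    ProcessState state;     /* see the list below */
    uintptr_t saved_pc;     /* program counter saved at the last interrupt */
    uintptr_t saved_sp;     /* saved stack pointer */
    uintptr_t page_table;   /* address space information */
    int exit_status;        /* filled in when the process is done */
} PCB;
\end{verbatim}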
Among other things, the3091PCB keeps track of the process state, which is one of:30923093\begin{itemize}30943095\item Running, if the process is currently running on a core.30963097\item Ready, if the process could be running, but isn't, usually because3098there are more runnable processes than cores.30993100\item Blocked, if the process cannot run because it is waiting for3101a future event like network communication or a disk read.31023103\item Done, if the process has completed, but has exit status3104information that has not been read yet.31053106\end{itemize}31073108Here are the events that cause a process to transition from one state to another:31093110\begin{itemize}31113112\item A process is created when the running program executes a system3113call like {\tt fork}. At the end of the system call, the new3114process is usually ready. Then the scheduler might resume the3115original process (the ``parent'') or start the new process (the3116``child'').31173118\item When a process is started or resumed by the scheduler, its state3119changes from ready to running.31203121\item When a process is interrupted and the scheduler chooses not3122to let it resume, its state changes from running to ready.31233124\item If a process executes a system call that cannot complete3125immediately, like a disk request, it becomes blocked3126and the scheduler usually chooses another process.31273128\item When an operation like a disk request completes, it causes an3129interrupt. The interrupt handler figures out which process was3130waiting for the request and switches its state from3131blocked to ready. Then the scheduler may or may not choose to3132resume the unblocked process.31333134\item When a process calls {\tt exit}, the interrupt handler stores3135the exit code in the PCB and changes the process's state to done.31363137\end{itemize}313831393140\section{Scheduling}31413142As we saw in Section~\ref{unixps} there might be hundreds of3143processes on a computer, but usually most of them are blocked. Most3144of the time, there are only a few processes that are ready or running.3145When an interrupt occurs, the scheduler decides which process to start3146or resume.31473148On a workstation or laptop, the primary goal of the scheduler is to3149minimize response time; that is, the computer should respond quickly3150to user actions. Response time is also important on a server, but in3151addition the scheduler might try to maximize throughput, which is the3152number of requests that complete per unit of time.31533154Usually the scheduler doesn't have much information about what3155processes are doing, so its decisions are based on a few3156heuristics:31573158\begin{itemize}31593160\item Processes might be limited by different resources. A process3161that does a lot of computation is probably CPU-bound, which means that3162its run time depends on how much CPU time it gets. A process that3163reads data from a network or disk might be I/O-bound, which means that3164it would run faster if data input and output went faster, but would not3165run faster with more CPU time. Finally, a process that interacts with3166the user is probably blocked, most of the time, waiting for user actions.31673168The operating system can sometimes classify processes based on their3169past behavior, and schedule them accordingly. For example, when an3170interactive process is unblocked, it should probably run immediately,3171because a user is probably waiting for a reply. 
On the other hand,3172a CPU-bound process that has been running for a long time might be3173less time-sensitive.31743175\item If a process is likely to run for a short time and then make3176a blocking request, it should probably run immediately, for two reasons:3177(1) if the request takes some time to complete, we should start it as soon3178as possible, and (2) it is better for a long-running process to wait3179for a short one, rather than the other way around.31803181As an analogy, suppose you are making an apple pie. The crust takes31825 minutes to prepare, but then it has to chill for half an hour. It takes318320 minutes to prepare the filling. If you prepare the crust first,3184you can prepare the filling while the crust is chilling, and you can3185finish the pie in 35 minutes. If you prepare the filling first, the3186process takes 55 minutes.31873188\end{itemize}31893190Most schedulers use some form of priority-based scheduling,3191where each process has a priority that can be adjusted up or down3192over time. When the scheduler runs, it chooses the runnable process3193with the highest priority.31943195Here are some of the factors that determine a process's priority:31963197\begin{itemize}31983199\item A process usually starts with a relatively high priority so it3200starts running quickly.32013202\item If a process makes a request and blocks before its time slice is3203complete, it is more likely to be interactive or I/O-bound, so its3204priority should go up.32053206\item If a process runs for an entire time slice, it is more likely to3207be long-running and CPU-bound, so its priority should go down.32083209\item If a task blocks for a long time and then becomes ready, it3210should get a priority boost so it can respond to whatever it was3211waiting for.32123213\item If process A is blocked waiting for process B, for example if3214they are connected by a pipe, the priority of process B should go3215up.32163217\item The system call {\tt nice} allows a process to decrease (but not3218increase) its own priority, allowing programmers to pass explicit3219information to the scheduler.32203221\end{itemize}32223223For most systems running normal workloads, scheduling algorithms3224don't have a substantial effect on performance. Simple scheduling3225policies are usually good enough.322632273228\section{Real-time scheduling}32293230However, for programs that interact with the real world, scheduling3231can be very important. For example, a program that reads data from3232sensors and controls motors might have to complete recurring tasks at3233some minimum frequency and react to external events with some maximum3234response time. These requirements are often expressed in terms of3235``tasks'' that must be completed before ``deadlines''.32363237Scheduling tasks to meet deadlines is called ``real-time3238scheduling''. 
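To make this concrete, here is a sketch of a periodic task on Linux;
it is illustrative only and not part of the code for this book. It
asks the kernel for a real-time priority (which usually requires
administrator privileges) and then uses an absolute deadline to run
once every 10~ms.

\begin{verbatim}
#include <sched.h>
#include <stdio.h>
#include <time.h>

void do_work(void)
{
    /* read sensors, update motors, ... */
}

int main()
{
    /* Request a real-time scheduling policy; fall back if it fails. */
    struct sched_param param = { .sched_priority = 50 };
    if (sched_setscheduler(0, SCHED_FIFO, &param) != 0) {
        perror("sched_setscheduler");
    }

    struct timespec next;
    clock_gettime(CLOCK_MONOTONIC, &next);

    while (1) {
        do_work();

        /* Sleep until the next absolute deadline, 10 ms from the last one. */
        next.tv_nsec += 10 * 1000 * 1000;
        if (next.tv_nsec >= 1000000000L) {
            next.tv_nsec -= 1000000000L;
            next.tv_sec += 1;
        }
        clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
    }
    return 0;
}
\end{verbatim}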
For some applications, a general-purpose operating
system like Linux can be modified to handle real-time scheduling.
These modifications might include:

\begin{itemize}

\item Providing richer APIs for controlling task priorities.

\item Modifying the scheduler to guarantee that the process with
highest priority runs within a fixed amount of time.

\item Reorganizing interrupt handlers to guarantee
a maximum completion time.

\item Modifying locks and other synchronization mechanisms (coming up
in the next chapter) to allow a high-priority task to preempt a
lower-priority task.

\item Choosing an implementation of dynamic memory allocation that
guarantees a maximum completion time.

\end{itemize}

For more demanding applications, especially in domains where real-time
response is a matter of life and death, ``real-time operating
systems'' provide specialized capabilities, often with much simpler
designs than general-purpose operating systems.

%TODO: kernel mode? signals? user and system time


\chapter{Threads}

When I mentioned threads in Section~\ref{unixps}, I said that a thread
is a kind of process. Now I will provide a more careful explanation.

When you create a process, the operating system creates a new address
space, which includes the text segment, static segment, and heap; it
also creates a new ``thread of execution'', which includes the program
counter and other hardware state, and the call stack.

The processes we have seen so far are ``single-threaded'', which means
that only one thread of execution runs in each address space. In this
chapter, you will learn about ``multi-threaded'' processes that have
multiple threads running in the same address space.

Within a single process, all threads share the same text segment, so
they run the same code. But different threads often run different parts
of the code.

And they share the same static segment, so if one thread changes a
global variable, other threads see the change. They also share the heap,
so threads can share dynamically-allocated chunks.

But each thread has its own stack, so threads can call functions without
interfering with each other. Usually threads don't access each
other's local variables (and sometimes they can't).

The example code for this chapter is in the repository for this book,
in a directory named {\tt counter}. For information on downloading
this code, see Section~\ref{code}.


\section{Using Pthreads}

The most popular threading standard used with C is POSIX Threads, or
Pthreads for short. The POSIX standard defines a thread model and an
interface for creating and controlling threads. Most versions of UNIX
provide an implementation of Pthreads.

Using Pthreads is like using most C libraries:

\begin{itemize}

\item You include header files at the beginning of your
program.

\item You write code that calls functions defined by Pthreads.

\item When you compile the program, you link it with the Pthread library.

\end{itemize}

For my examples, I include the following headers:

\begin{verbatim}
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <semaphore.h>
\end{verbatim}

The first two are standard; the third is for Pthreads and
the fourth is for semaphores.
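Before looking at the wrappers used in the rest of this chapter, it
might help to see a complete program. The following minimal example is
not from the book's repository; it creates one thread, which prints a
message, and then waits for it.

\begin{verbatim}
#include <stdio.h>
#include <pthread.h>

/* The function where the new thread begins execution. */
void *hello(void *arg)
{
    printf("hello from a child thread\n");
    return NULL;
}

int main()
{
    pthread_t thread;

    int n = pthread_create(&thread, NULL, hello, NULL);
    if (n != 0) {
        fprintf(stderr, "pthread_create failed\n");
        return 1;
    }

    pthread_join(thread, NULL);    /* wait for the child to finish */
    return 0;
}
\end{verbatim}

You can compile this example the same way as the examples below, as
long as you link with the Pthread library.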
To compile with the Pthread library in {\tt gcc}, you can use the
{\tt -l} option on the command line:

\begin{verbatim}
gcc -g -O2 -o array array.c -lpthread
\end{verbatim}

This compiles a source file named {\tt array.c} with debugging info
and optimization, links with the Pthread library, and generates an
executable named {\tt array}.


\section{Creating threads}

The Pthread function that creates threads is called \verb"pthread_create".
The following function shows how to use it:

\begin{verbatim}
pthread_t make_thread(void *(*entry)(void *), Shared *shared)
{
    int n;
    pthread_t thread;

    n = pthread_create(&thread, NULL, entry, (void *)shared);
    if (n != 0) {
        perror("pthread_create failed");
        exit(-1);
    }
    return thread;
}
\end{verbatim}

\verb"make_thread" is a wrapper I wrote to make
\verb"pthread_create" easier to use, and to provide error-checking.

%TODO: Define a newcommand like \py to make verb easier to use, and
% get rid of the \_ hockey sticks

The return type of \verb"make_thread" is \verb"pthread_t",
which you can think of as an id or ``handle'' for the new thread.

If {\tt pthread\_create} succeeds, it returns 0 and \verb"make_thread"
returns the handle of the new thread.
If an error occurs, {\tt pthread\_create}
returns an error code and \verb"make_thread" prints an error message
and exits.

The parameters of \verb"make_thread" take some
explaining. Starting with the second, {\tt Shared}
is a structure I defined to contain values shared between threads.
The following {\tt typedef} statement creates the new type:

\begin{verbatim}
typedef struct {
    int counter;
} Shared;
\end{verbatim}

In this case, the only shared variable is {\tt counter}.
{\tt make\_shared} allocates
space for a {\tt Shared} structure and initializes the contents:

\begin{verbatim}
Shared *make_shared()
{
    Shared *shared = check_malloc(sizeof (Shared));
    shared->counter = 0;
    return shared;
}
\end{verbatim}

Now that we have a shared data structure, let's get back to
\verb"make_thread".
The first parameter is a pointer to a function that takes
a {\tt void} pointer and returns a {\tt void} pointer. If the syntax
for declaring this type makes your eyes bleed, you are not alone.
Anyway, the purpose of this parameter is to specify the function where
the execution of the new thread will begin. By convention, this
function is named {\tt entry}:

\begin{verbatim}
void *entry(void *arg)
{
    Shared *shared = (Shared *) arg;
    child_code(shared);
    pthread_exit(NULL);
}
\end{verbatim}

The parameter of {\tt entry} has to be declared as a {\tt void}
pointer, but in this program we know that it is really a pointer to a
{\tt Shared} structure, so we can typecast it accordingly and then
pass it along to {\tt child\_code}, which does the real work.

As a simple example, \verb"child_code" prints the value of
the shared counter and increments it.

\begin{verbatim}
void child_code(Shared *shared)
{
    printf("counter = %d\n", shared->counter);
    shared->counter++;
}
\end{verbatim}

When {\tt child\_code} returns, {\tt entry} invokes
\verb"pthread_exit", which can be used to pass a value to the thread
that joins with this thread.
In this case, the child has nothing to
say, so we pass {\tt NULL}.

Finally, here is the code that creates the child threads:

\begin{verbatim}
int i;
pthread_t child[NUM_CHILDREN];

Shared *shared = make_shared(1000000);

for (i=0; i<NUM_CHILDREN; i++) {
    child[i] = make_thread(entry, shared);
}
\end{verbatim}

\verb"NUM_CHILDREN" is a compile-time constant that determines
the number of child threads. {\tt child} is an array of
thread handles.


\section{Joining threads}

When one thread wants to wait for another thread to complete,
it invokes {\tt pthread\_join}.
Here is my wrapper for {\tt pthread\_join}:

\begin{verbatim}
void join_thread(pthread_t thread)
{
    int ret = pthread_join(thread, NULL);
    if (ret != 0) {
        perror("pthread_join failed");
        exit(-1);
    }
}
\end{verbatim}

The parameter is the handle of the thread you want to wait for.
All the wrapper does is call {\tt pthread\_join} and check the
result.

Any thread can join any other thread, but in the most common pattern
the parent thread creates and joins all child threads.
Continuing the example from the previous section, here's the
code that waits on the children:

\begin{verbatim}
for (i=0; i<NUM_CHILDREN; i++) {
    join_thread(child[i]);
}
\end{verbatim}

This loop waits for the children one at a time in the order they
were created. There is no guarantee that the child threads complete
in that order, but this loop works correctly even if they don't. If one
of the children is late, the loop might have to wait, and other children
might complete in the meantime. But regardless, the loop exits
only when all children are done.

If you have downloaded the repository for this book (see
Section~\ref{code}), you'll find this example in
{\tt counter/counter.c}. You can compile and run it like this:

\begin{verbatim}
$ make counter
gcc -Wall counter.c -o counter -lpthread
$ ./counter
\end{verbatim}

When I ran it with 5 children, I got the following output:

\begin{verbatim}
counter = 0
counter = 0
counter = 1
counter = 0
counter = 3
\end{verbatim}

When you run it, you will probably get different results. And if
you run it again, you might get different results each time. What's
going on?


\section{Synchronization errors}

The problem with the previous program is that the children
access the shared variable, {\tt counter}, without synchronization,
so several threads can read the same value of {\tt counter} before
any threads increment it.

Here is a sequence of events that could explain the output in the
previous section:

\begin{verbatim}
Child A reads 0
Child B reads 0
Child C reads 0
Child A prints 0
Child B prints 0
Child A sets counter=1
Child D reads 1
Child D prints 1
Child C prints 0
Child B sets counter=2
Child C sets counter=3
Child E reads 3
Child E prints 3
Child D sets counter=4
Child E sets counter=5
\end{verbatim}

Each time you run the program, threads might be interrupted at different
points, or the scheduler might choose different threads to run, so
the sequence of events, and the results, will be different.

Suppose we want to impose some order.
For example, we might want3553each thread to read a different value of {\tt counter} and increment3554it, so that the value of {\tt counter} reflects the number of3555threads that have executed \verb"child_code".35563557To enforce that requirement, we can use a ``mutex'', which is3558an object that guarantees ``mutual exclusion'' for a block of code;3559that is, only one thread can execute the block at a time.35603561I have written a small module called {\tt mutex.c} that provides3562mutex objects. I'll show you how to use it first; then I'll explain3563how it works.35643565Here's a version of \verb"child_code" that uses a mutex to synchronize3566threads:35673568\begin{verbatim}3569void child_code(Shared *shared)3570{3571mutex_lock(shared->mutex);3572printf("counter = %d\n", shared->counter);3573shared->counter++;3574mutex_unlock(shared->mutex);3575}3576\end{verbatim}35773578Before any thread can access {\tt counter}, it has to ``lock''3579the mutex, which has the effect of barring all other threads.3580Suppose Thread A has locked the mutex and is in the3581middle of \verb"child_code". If Thread B arrives and3582executes \verb"mutex_lock", it blocks.35833584When Thread A is done, it executes \verb"mutex_unlock",3585which allows Thread B to proceed. In effect, the threads3586line up to execute \verb"child_code" one at a time, so they3587can't interfere with each other. When I run this code with35885 children, I get:35893590\begin{verbatim}3591counter = 03592counter = 13593counter = 23594counter = 33595counter = 43596\end{verbatim}35973598And that satisfies the requirements. In order for this solution to3599work, I have to add the Mutex to the Shared struct:36003601\begin{verbatim}3602typedef struct {3603int counter;3604Mutex *mutex;3605} Shared;3606\end{verbatim}36073608And initialize it in \verb"make_shared"36093610\begin{verbatim}3611Shared *make_shared(int end)3612{3613Shared *shared = check_malloc(sizeof(Shared));3614shared->counter = 0;3615shared->mutex = make_mutex(); //-- this line is new3616return shared;3617}3618\end{verbatim}36193620The code in this section is in \verb"counter_mutex.c".3621The definition of {\tt Mutex} is in {\tt mutex.c}, which I3622explain in the next section.3623362436253626\section{Mutex}36273628My definition of {\tt Mutex} is a wrapper for a type called3629\verb"pthread_mutex_t", which is defined in the POSIX threads API.36303631To create a POSIX mutex, you have to allocate space for a3632\verb"pthread_mutex_t" type and then call \verb"pthread_mutex_init".36333634One of the problems with this API is that \verb"pthread_mutex_t"3635behaves like a structure, so if you pass it as an argument, it makes a3636copy, which makes the mutex behave incorrectly. To avoid that, you have to3637pass \verb"pthread_mutex_t" by address.36383639My code makes it easier to get that right. 
It defines a3640type, {\tt Mutex}, which is just a more readable name for3641\verb"pthread_mutex_t":36423643\begin{verbatim}3644#include <pthread.h>36453646typedef pthread_mutex_t Mutex;3647\end{verbatim}36483649Then it defines \verb"make_mutex", which allocates space and3650initializes the mutex:36513652\begin{verbatim}3653Mutex *make_mutex()3654{3655Mutex *mutex = check_malloc(sizeof(Mutex));3656int n = pthread_mutex_init(mutex, NULL);3657if (n != 0) perror_exit("make_lock failed");3658return mutex;3659}3660\end{verbatim}36613662The return value is a pointer, which you can pass around as an3663argument without causing unwanted copying.36643665The functions to lock and unlock the mutex are simple wrappers3666for POSIX functions:36673668\begin{verbatim}3669void mutex_lock(Mutex *mutex)3670{3671int n = pthread_mutex_lock(mutex);3672if (n != 0) perror_exit("lock failed");3673}36743675void mutex_unlock(Mutex *mutex)3676{3677int n = pthread_mutex_unlock(mutex);3678if (n != 0) perror_exit("unlock failed");3679}3680\end{verbatim}36813682This code is in {\tt mutex.c} and the header file {\tt mutex.h}.368336843685\chapter{Condition variables}3686\label{csem}36873688Many simple synchronization problems can be solved using mutexes3689as shown in the previous chapter. In this chapter I introduce a3690bigger challenge, the well-known ``Producer-Consumer problem'', and3691a new tool to solve it, the condition variable.36923693\section{The work queue}3694\label{queue}36953696In some multi-threaded programs, threads are organized to perform3697different tasks. Often they communicate with each other using a queue,3698where some threads, called ``producers'', put data into the queue3699and other threads, called ``consumers'', take data out.37003701For example, in applications with a graphical user interface, there3702might be one thread that runs the GUI, responding to user events,3703and another thread that processes user requests. In that case,3704the GUI thread might put requests into a queue and the ``back end''3705thread might take requests out and process them.37063707To support this organization, we need a queue implementation that is3708``thread safe'', which means that both threads (or more than two) can3709access the queue at the same time. And we need to handle the special3710cases when the queue is empty and, if the size of the queue is3711bounded, when the queue is full.37123713I'll start with a simple queue that is not thread safe, then we'll see3714what goes wrong and fix it. The code for this example is in the3715repository for this book, in a folder called {\tt queue}. The file3716{\tt queue.c} contains a basic implementation of a circular buffer,3717which you can read about at3718\url{https://en.wikipedia.org/wiki/Circular_buffer}.37193720Here's the structure definition:37213722\begin{verbatim}3723typedef struct {3724int *array;3725int length;3726int next_in;3727int next_out;3728} Queue;3729\end{verbatim}37303731{\tt array} is the array that contains the elements of the queue.3732For this example the elements are ints, but more generally3733they would be structures that contain user events, items of work, etc.37343735{\tt length} is the length of the array. 
\verb"next_in" is an
index into the array that indicates where the next element should be
added; similarly, \verb"next_out" is the index of the next element
that should be removed.

\verb"make_queue" allocates space for this structure and initializes
the fields:

\begin{verbatim}
Queue *make_queue(int length)
{
    Queue *queue = (Queue *) malloc(sizeof(Queue));
    queue->length = length;
    queue->array = (int *) malloc(length * sizeof(int));
    queue->next_in = 0;
    queue->next_out = 0;
    return queue;
}
\end{verbatim}

The initial value for \verb"next_out" needs some explaining.
Since the queue is initially empty, there is no next element to
remove, so \verb"next_out" is invalid. Setting
\verb"next_out == next_in" is a special case that indicates
that the queue is empty, so we can write:

\begin{verbatim}
int queue_empty(Queue *queue)
{
    return (queue->next_in == queue->next_out);
}
\end{verbatim}

Now we can add elements to the queue using \verb"queue_push":

\begin{verbatim}
void queue_push(Queue *queue, int item) {
    if (queue_full(queue)) {
        perror_exit("queue is full");
    }

    queue->array[queue->next_in] = item;
    queue->next_in = queue_incr(queue, queue->next_in);
}
\end{verbatim}

If the queue is full, \verb"queue_push" prints an error message
and exits. I will explain \verb"queue_full" soon.

If the queue is not full, \verb"queue_push" inserts the new
element and then increments \verb"next_in" using \verb"queue_incr":

\begin{verbatim}
int queue_incr(Queue *queue, int i)
{
    return (i+1) % queue->length;
}
\end{verbatim}

When the index, {\tt i}, gets to the end of the array, it wraps around
to 0. And that's where we run into a tricky part. If we keep adding
elements to the queue, eventually \verb"next_in" wraps around and catches
up with \verb"next_out". But if \verb"next_in == next_out", we would
incorrectly conclude that the queue was empty.

To avoid that, we define another special case to indicate that the
queue is full:

\begin{verbatim}
int queue_full(Queue *queue)
{
    return (queue_incr(queue, queue->next_in) == queue->next_out);
}
\end{verbatim}

If incrementing \verb"next_in" lands on \verb"next_out", that means
we can't add another element without making the queue seem empty. So
we stop one element before the ``end'' (keeping in mind that the end of
the queue can be anywhere, not necessarily the end of the array).

Now we can write \verb"queue_pop", which removes and returns the next
element from the queue:

\begin{verbatim}
int queue_pop(Queue *queue) {
    if (queue_empty(queue)) {
        perror_exit("queue is empty");
    }

    int item = queue->array[queue->next_out];
    queue->next_out = queue_incr(queue, queue->next_out);
    return item;
}
\end{verbatim}

If you try to pop from an empty queue, \verb"queue_pop" prints
an error message and exits.


\section{Producers and consumers}
\label{prodcon}

Now let's make some threads to access this queue.
Here's the
producer code:

\begin{verbatim}
void *producer_entry(void *arg) {
    Shared *shared = (Shared *) arg;

    for (int i=0; i<QUEUE_LENGTH-1; i++) {
        printf("adding item %d\n", i);
        queue_push(shared->queue, i);
    }
    pthread_exit(NULL);
}
\end{verbatim}

Here's the consumer code:

\begin{verbatim}
void *consumer_entry(void *arg) {
    int item;
    Shared *shared = (Shared *) arg;

    for (int i=0; i<QUEUE_LENGTH-1; i++) {
        item = queue_pop(shared->queue);
        printf("consuming item %d\n", item);
    }
    pthread_exit(NULL);
}
\end{verbatim}

Here's the parent code that starts the threads and waits for them:

\begin{verbatim}
pthread_t child[NUM_CHILDREN];

Shared *shared = make_shared();

child[0] = make_thread(producer_entry, shared);
child[1] = make_thread(consumer_entry, shared);

for (int i=0; i<NUM_CHILDREN; i++) {
    join_thread(child[i]);
}
\end{verbatim}

And finally here's the shared structure that contains the queue:

\begin{verbatim}
typedef struct {
    Queue *queue;
} Shared;

Shared *make_shared()
{
    Shared *shared = check_malloc(sizeof(Shared));
    shared->queue = make_queue(QUEUE_LENGTH);
    return shared;
}
\end{verbatim}

The code we have so far is a good starting place, but it has
several problems:

\begin{itemize}

\item Access to the queue is not thread safe. Different threads
could access {\tt array}, \verb"next_in", and \verb"next_out"
at the same time and leave the queue in a broken, ``inconsistent''
state.

\item If the consumer is scheduled first, it finds the queue empty,
prints an error message, and exits. We would rather have the consumer
block until the queue is not empty. Similarly, we would like the
producer to block if the queue is full.

\end{itemize}

In the next section, we solve the first problem with a {\tt Mutex}.
In the following section, we solve the second problem with condition
variables.


\section{Mutual exclusion}

We can make the queue thread safe with a mutex. This version
of the code is in \verb"queue_mutex.c".

First we add a {\tt Mutex} pointer to the queue structure:

\begin{verbatim}
typedef struct {
    int *array;
    int length;
    int next_in;
    int next_out;
    Mutex *mutex;    //-- this line is new
} Queue;
\end{verbatim}

And initialize the {\tt Mutex} in \verb"make_queue":

\begin{verbatim}
Queue *make_queue(int length) {
    Queue *queue = (Queue *) malloc(sizeof(Queue));
    queue->length = length;
    queue->array = (int *) malloc(length * sizeof(int));
    queue->next_in = 0;
    queue->next_out = 0;
    queue->mutex = make_mutex();    //-- new
    return queue;
}
\end{verbatim}

Next we add synchronization code to \verb"queue_push":

\begin{verbatim}
void queue_push(Queue *queue, int item) {
    mutex_lock(queue->mutex);    //-- new
    if (queue_full(queue)) {
        mutex_unlock(queue->mutex);    //-- new
        perror_exit("queue is full");
    }

    queue->array[queue->next_in] = item;
    queue->next_in = queue_incr(queue, queue->next_in);
    mutex_unlock(queue->mutex);    //-- new
}
\end{verbatim}

Before checking whether the queue is full, we have to lock
the {\tt Mutex}.
If the queue is full, we have to unlock
the {\tt Mutex} before exiting; otherwise the thread would leave
it locked and no other threads could proceed.

The synchronization code for \verb"queue_pop" is similar:

\begin{verbatim}
int queue_pop(Queue *queue) {
    mutex_lock(queue->mutex);
    if (queue_empty(queue)) {
        mutex_unlock(queue->mutex);
        perror_exit("queue is empty");
    }

    int item = queue->array[queue->next_out];
    queue->next_out = queue_incr(queue, queue->next_out);
    mutex_unlock(queue->mutex);
    return item;
}
\end{verbatim}

Note that the other {\tt Queue} functions, \verb"queue_full",
\verb"queue_empty", and \verb"queue_incr", do not try to lock
the mutex. Any thread that calls these functions is required to
lock the mutex first; this requirement is part of the documented
interface for these functions.

With this additional code, the queue is thread safe; if you run it, you
should not see any synchronization errors. But it is likely
that the consumer will exit at some point because the queue is
empty, or the producer will exit because the queue is full,
or both.

The next step is to add condition variables.


\section{Condition variables}

A condition variable is a data structure associated with a condition;
it allows threads to block until the condition becomes true. For
example, \verb"queue_pop" might want to check whether the queue is
empty and, if so, wait for a condition like ``queue not empty''.

Similarly, \verb"queue_push" might want to check whether the queue is
full and, if so, block until it is not full.

I'll handle the first condition here, and you will have a chance to
handle the second condition as an exercise.

First we add a condition variable to the {\tt Queue} structure:

\begin{verbatim}
typedef struct {
    int *array;
    int length;
    int next_in;
    int next_out;
    Mutex *mutex;
    Cond *nonempty;    //-- new
} Queue;
\end{verbatim}

And initialize it in \verb"make_queue":

\begin{verbatim}
Queue *make_queue(int length)
{
    Queue *queue = (Queue *) malloc(sizeof(Queue));
    queue->length = length;
    queue->array = (int *) malloc(length * sizeof(int));
    queue->next_in = 0;
    queue->next_out = 0;
    queue->mutex = make_mutex();
    queue->nonempty = make_cond();    //-- new
    return queue;
}
\end{verbatim}

Now in \verb"queue_pop", if we find the queue empty, we don't
exit; instead we use the condition variable to block:

\begin{verbatim}
int queue_pop(Queue *queue) {
    mutex_lock(queue->mutex);
    while (queue_empty(queue)) {
        cond_wait(queue->nonempty, queue->mutex);    //-- new
    }

    int item = queue->array[queue->next_out];
    queue->next_out = queue_incr(queue, queue->next_out);
    mutex_unlock(queue->mutex);
    return item;
}
\end{verbatim}

\verb"cond_wait" is complicated, so let's take it slow.
The first argument is the condition variable; in this case,
the condition we are waiting for is ``queue not empty''. The second
argument is the mutex that protects the queue.

When the thread that locked the mutex calls \verb"cond_wait", it
unlocks the mutex and then blocks. This is important.
If
\verb"cond_wait" did not unlock the mutex before blocking, no
other thread would be able to access the queue, no more items
could be added, and the queue would always be empty.

So while the consumer is blocked on {\tt nonempty}, the producer can
run. Let's see what happens when the producer runs \verb"queue_push":

\begin{verbatim}
void queue_push(Queue *queue, int item) {
    mutex_lock(queue->mutex);
    if (queue_full(queue)) {
        mutex_unlock(queue->mutex);
        perror_exit("queue is full");
    }
    queue->array[queue->next_in] = item;
    queue->next_in = queue_incr(queue, queue->next_in);
    mutex_unlock(queue->mutex);
    cond_signal(queue->nonempty);    //-- new
}
\end{verbatim}

Just as before, \verb"queue_push" locks the {\tt Mutex} and checks
whether the queue is full. Assuming it is not, \verb"queue_push" adds
a new element to the queue and then unlocks the {\tt Mutex}.

But before returning, it does one more thing: it ``signals'' the
condition variable {\tt nonempty}.

Signalling a condition variable usually indicates that the condition is
true. If there are no threads waiting
on the condition variable, the signal has no effect.

If there are threads waiting on the condition variable, one of them
gets unblocked and resumes execution of \verb"cond_wait". But before
the awakened thread can return from \verb"cond_wait", it has
to wait for and lock the {\tt Mutex} again.

Now go back to \verb"queue_pop" and see what happens when the thread
returns from \verb"cond_wait". It loops back to the top of the while
loop and checks the condition again. I'll explain why in just a
second, but for now let's assume that the condition is true; that is,
the queue is not empty.

When the consumer thread exits the while loop, we know two things: (1)
the condition is true, so there is at least one item in the queue, and
(2) the {\tt Mutex} is locked, so it is safe to access the queue.

After removing an item, \verb"queue_pop" unlocks the mutex
and returns.

In the next section I'll show you how my {\tt Cond} code works, but first I
want to answer two frequently asked questions:

\begin{itemize}

\item Why is \verb"cond_wait" inside a while loop rather than an if
statement; that is, why do we have to check the condition again after
returning from \verb"cond_wait"?

The primary reason you have to re-check the condition is the possibility
of an intercepted signal. Suppose Thread A is waiting on {\tt nonempty}.
Thread B adds an item to the queue and signals {\tt nonempty}. Thread
A wakes up and tries to lock the mutex, but before it gets the chance,
Evil Thread C swoops in, locks the mutex, pops the item from the
queue, and unlocks the mutex. Now the queue is empty again, but
Thread A is not blocked any more. Thread A could lock the mutex and
return from \verb"cond_wait". If Thread A does not check the condition
again, it would try to pop an element from an empty queue, and probably
cause an error.

\item The other question that comes up when people learn about condition
variables is ``How does the condition variable know what condition it
is associated with?''

This question is understandable because there is no explicit connection
between a {\tt Cond} structure and the condition it relates to.
The4146connection is implicit in the way it is used.41474148Here's one way to think of it: the condition associated with a Cond4149is the thing that is false when you call \verb"cond_wait" and true4150when you call \verb"cond_signal".41514152\end{itemize}41534154Because threads have to check the condition when they return from4155\verb"cond_wait", it is not strictly necessary to call \verb"cond_signal"4156only when the condition is true. If you have reason to think the4157condition {\em might} be true, you could call \verb"cond_signal" as4158a suggestion that now is a good time to check.415941604161\section{Condition variable implementation}41624163The Cond structure I used in the previous section is a wrapper4164for a type called \verb"pthread_cond_t", which is defined in the POSIX4165threads API. It is very similar to Mutex, which is a wrapper for4166\verb"pthread_mutex_t". Both wrappers are defined in {\tt utils.c} and4167{\tt utils.h}.41684169Here's the typedef:41704171\begin{verbatim}4172typedef pthread_cond_t Cond;4173\end{verbatim}41744175\verb"make_cond" allocates space, initializes the condition variable,4176and returns a pointer:41774178\begin{verbatim}4179Cond *make_cond() {4180Cond *cond = check_malloc(sizeof(Cond));4181int n = pthread_cond_init(cond, NULL);4182if (n != 0) perror_exit("make_cond failed");41834184return cond;4185}4186\end{verbatim}41874188And here are the wrappers for \verb"cond_wait" and \verb"cond_signal".41894190\begin{verbatim}4191void cond_wait(Cond *cond, Mutex *mutex) {4192int n = pthread_cond_wait(cond, mutex);4193if (n != 0) perror_exit("cond_wait failed");4194}41954196void cond_signal(Cond *cond) {4197int n = pthread_cond_signal(cond);4198if (n != 0) perror_exit("cond_signal failed");4199}4200\end{verbatim}42014202At this point there should be nothing too surprising there.4203420442054206\chapter{Semaphores in C}42074208Semaphores are a good way to learn about synchronization, but4209they are not as widely used, in practice, as mutexes and4210condition variables.42114212Nevertheless, there are some synchronization problems that can be4213solved simply with semaphores, yielding solutions that are more4214demonstrably correct.42154216This chapter presents a C API for working with semaphores and4217my code for making it easier to work with. And it presents4218a final challenge: can you write an implementation of a semaphore4219using mutexes and condition variables?42204221The code for this chapter is in directory {\tt semaphore} in the4222repository for this book (see Section~\ref{code}).422342244225\section{POSIX Semaphores}42264227A semaphore is a data structure used to help threads work together4228without interfering with each other.42294230The POSIX standard specifies an interface for semaphores;4231it is not part of Pthreads, but most UNIXes4232that implement Pthreads also provide semaphores.42334234POSIX semaphores have type {\tt sem\_t}.4235As usual, I put a wrapper around {\tt sem\_t}4236to make it safer and easier to use. 
My wrapper is in {\tt sem.h}:42374238\begin{verbatim}4239typedef sem_t Semaphore;42404241Semaphore *make_semaphore(int value);4242void semaphore_wait(Semaphore *sem);4243void semaphore_signal(Semaphore *sem);4244\end{verbatim}42454246{\tt Semaphore} is a synonym for \verb"sem_t", but I find it more4247readable, and the capital letter reminds me to treat it like an4248object and pass it by pointer.42494250The implementation of these functions is in {\tt sem.c}:42514252\begin{verbatim}4253Semaphore *make_semaphore(int value)4254{4255Semaphore *sem = check_malloc(sizeof(Semaphore));4256int n = sem_init(sem, 0, value);4257if (n != 0) perror_exit("sem_init failed");4258return sem;4259}4260\end{verbatim}42614262{\tt make\_semaphore} takes the initial value of the semaphore4263as a parameter. It allocates space for a Semaphore, initializes4264it, and returns a pointer to {\tt Semaphore}.42654266{\tt sem\_init} returns 0 if it succeeds and -1 if anything goes4267wrong. One nice thing about using wrapper functions is that you can4268encapsulate the error-checking code, which makes the code that uses4269these functions more readable.42704271Here is the implementation of \verb"semaphore_wait":42724273\begin{verbatim}4274void semaphore_wait(Semaphore *sem)4275{4276int n = sem_wait(sem);4277if (n != 0) perror_exit("sem_wait failed");4278}4279\end{verbatim}42804281And here is \verb"semaphore_signal":42824283\begin{verbatim}4284void semaphore_signal(Semaphore *sem)4285{4286int n = sem_post(sem);4287if (n != 0) perror_exit("sem_post failed");4288}4289\end{verbatim}42904291I prefer to call this operation ``signal'' rather than ``post'',4292although both terms are common.42934294Here's an example that shows how to use a semaphore as a mutex:42954296\begin{verbatim}4297Semaphore *mutex = make_semaphore(1);42984299semaphore_wait(mutex);4300// protected code goes here4301semaphore_signal(mutex);4302\end{verbatim}43034304When you use a semaphore as a mutex, you usually4305initialize it to 1 to indicate4306that the mutex is unlocked; that is, one thread can4307pass the semaphore without blocking.43084309Here I am using the variable name {\tt mutex} to indicate that4310the semaphore is being used as a mutex. 
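A semaphore initialized to 0 can also be used for something a mutex is
not designed to do: making one thread wait until another thread
announces an event. The following sketch is not from the book's
repository; it assumes it is compiled and linked with the {\tt sem.h}
wrappers described above (and the utility code they depend on).

\begin{verbatim}
#include <stdio.h>
#include <pthread.h>
#include "sem.h"

Semaphore *ready;    /* initialized to 0: the event has not happened yet */

void *child(void *arg)
{
    printf("child: setting things up\n");
    semaphore_signal(ready);     /* announce that setup is done */
    return NULL;
}

int main()
{
    ready = make_semaphore(0);

    pthread_t thread;
    pthread_create(&thread, NULL, child, NULL);

    semaphore_wait(ready);       /* blocks until the child signals */
    printf("parent: the child is ready\n");

    pthread_join(thread, NULL);
    return 0;
}
\end{verbatim}

Initializing the semaphore to 0 means the parent blocks until the
child has signalled at least once.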
But remember that the behavior4311of a semaphore is not the same as a Pthread mutex.431243134314\section{Producers and consumers with semaphores}43154316Using these semaphore wrapper functions, we can4317write a solution to the Producer-Consumer problem from4318Section~\ref{prodcon}.4319The code in this section is in \verb"queue_sem.c".43204321%TODO: Get this code into ExercisesInC/examples/queue43224323Here's the new definition of {\tt Queue}, replacing the mutex4324and condition variables with semaphores:43254326\begin{verbatim}4327typedef struct {4328int *array;4329int length;4330int next_in;4331int next_out;4332Semaphore *mutex; //-- new4333Semaphore *items; //-- new4334Semaphore *spaces; //-- new4335} Queue;4336\end{verbatim}43374338And here's the new version of \verb"make_queue":43394340\begin{verbatim}4341Queue *make_queue(int length)4342{4343Queue *queue = (Queue *) malloc(sizeof(Queue));4344queue->length = length;4345queue->array = (int *) malloc(length * sizeof(int));4346queue->next_in = 0;4347queue->next_out = 0;4348queue->mutex = make_semaphore(1);4349queue->items = make_semaphore(0);4350queue->spaces = make_semaphore(length-1);4351return queue;4352}4353\end{verbatim}43544355{\tt mutex} is used to guarantee exclusive access to the queue;4356the initial value is 1, so the mutex is4357initially unlocked.43584359{\tt items} is the number of items in the queue, which is also the number4360of consumer threads that can execute \verb"queue_pop" without blocking.4361Initially there are no items in the queue.43624363{\tt spaces} is the number of empty spaces in the queue, which is the4364number of producer threads that can execute \verb"queue_push" without4365blocking. Initially the number of spaces is the capacity of the queue,4366which is {\tt length-1}, as explained in Section~\ref{queue}.43674368Here is the new version of \verb"queue_push", which is run by4369producer threads:43704371\begin{verbatim}4372void queue_push(Queue *queue, int item) {4373semaphore_wait(queue->spaces);4374semaphore_wait(queue->mutex);43754376queue->array[queue->next_in] = item;4377queue->next_in = queue_incr(queue, queue->next_in);43784379semaphore_signal(queue->mutex);4380semaphore_signal(queue->items);4381}4382\end{verbatim}43834384Notice that \verb"queue_push" doesn't have to call4385\verb"queue_full" any more; instead, the semaphore keeps track of4386how many spaces are available and blocks producers if the queue4387is full.43884389Here is the new version of \verb"queue_pop":43904391\begin{verbatim}4392int queue_pop(Queue *queue) {4393semaphore_wait(queue->items);4394semaphore_wait(queue->mutex);43954396int item = queue->array[queue->next_out];4397queue->next_out = queue_incr(queue, queue->next_out);43984399semaphore_signal(queue->mutex);4400semaphore_signal(queue->spaces);44014402return item;4403}4404\end{verbatim}44054406This solution is explained, using pseudo-code, in Chapter 4 of4407{\it The Little Book of Semaphores}.44084409Using the code in the repo for this book, you should be able to compile4410and run this solution. You should be able to run:44114412\begin{verbatim}4413$ make queue_sem4414$ ./queue_sem4415\end{verbatim}4416441744184419\section{Make your own semaphores}4420\label{makeyourown}44214422Any problem that can be solved with semaphores can also be solved4423with condition variables and mutexes. 
One way to prove that's true
is to implement a semaphore using condition variables and mutexes.

Before you go on, you might want to try this as an exercise: write
functions that implement the semaphore API in {\tt sem.h}
using condition variables and mutexes. You can put your
solution in {\tt mysem.c} and {\tt mysem.h} in the repo for this book,
and you'll find my solution in \verb"mysem_soln.c" and
\verb"mysem_soln.h".

If you have trouble getting started, you can use the following
structure definition, from my solution, as a hint:

\begin{verbatim}
typedef struct {
    int value, wakeups;
    Mutex *mutex;
    Cond *cond;
} Semaphore;
\end{verbatim}

% TODO: Include Property 3 (it's in LBoS).

{\tt value} is the value of the semaphore. {\tt wakeups} counts
the number of pending signals; that is, the number of threads
that have been woken but have not yet resumed execution. The reason
for wakeups is to make sure that our semaphores have
Property 3, described in {\it The Little Book of Semaphores}.

{\tt mutex} provides exclusive access to {\tt value} and
{\tt wakeups}; {\tt cond} is the condition variable threads
wait on if they wait on the semaphore.

Here is the initialization code for this structure:

\begin{verbatim}
Semaphore *make_semaphore(int value)
{
    Semaphore *semaphore = check_malloc(sizeof(Semaphore));
    semaphore->value = value;
    semaphore->wakeups = 0;
    semaphore->mutex = make_mutex();
    semaphore->cond = make_cond();
    return semaphore;
}
\end{verbatim}


\subsection{Semaphore implementation}

Here is my implementation of semaphores using POSIX mutexes
and condition variables:

\begin{verbatim}
void semaphore_wait(Semaphore *semaphore)
{
    mutex_lock(semaphore->mutex);
    semaphore->value--;

    if (semaphore->value < 0) {
        do {
            cond_wait(semaphore->cond, semaphore->mutex);
        } while (semaphore->wakeups < 1);
        semaphore->wakeups--;
    }
    mutex_unlock(semaphore->mutex);
}
\end{verbatim}

When a thread waits on the semaphore, it has to lock the mutex
before it decrements {\tt value}. If the value of the semaphore
becomes negative, the thread blocks until a ``wakeup'' is
available. While it is blocked, the mutex is unlocked, so another
thread can signal.

Here is the code for \verb"semaphore_signal":

\begin{verbatim}
void semaphore_signal(Semaphore *semaphore)
{
    mutex_lock(semaphore->mutex);
    semaphore->value++;

    if (semaphore->value <= 0) {
        semaphore->wakeups++;
        cond_signal(semaphore->cond);
    }
    mutex_unlock(semaphore->mutex);
}
\end{verbatim}

Again, a thread has to lock the mutex before it increments
{\tt value}. If the semaphore was negative, that means threads
are waiting, so the signalling thread increments {\tt wakeups} and
signals the condition variable.

At this point one of the waiting threads might wake up, but the
mutex is still locked until the signalling thread unlocks it.

At that point, one of the waiting threads returns from \verb"cond_wait"
and checks whether a wakeup is still available. If not, it
loops and waits on the condition variable again. If so, it
decrements {\tt wakeups}, unlocks the mutex, and exits.

One thing about this solution that might not be obvious is the use of
a {\tt do...while} loop. Can you figure out why it is not a
more conventional {\tt while} loop?
What would go wrong?

The problem is that with a {\tt while} loop this implementation would
not have Property 3. It would be possible for a thread to signal and
then run around and catch its own signal.

With the {\tt do...while} loop, it is guaranteed\footnote{Well,
almost. It turns out that a well-timed spurious wakeup (see
\url{http://en.wikipedia.org/wiki/Spurious_wakeup}) can violate this
guarantee.} that when a thread signals, one of the waiting threads
will get the signal, even if the signalling thread runs around and
gets the mutex before one of the waiting threads resumes.

\end{document}