silo/masstree/doc/spec.tex

   1 % -*- tex-command: "pdflatex" -*-
   2 \documentclass[11pt]{article}
   3 \usepackage{times,mathptmx,amsmath,amssymb,graphicx,float}
   4 \usepackage[tmargin=1in,hmargin=.75in,height=9in,columnsep=2pc,letterpaper]{geometry}
   5 \usepackage[T1]{fontenc}
   6 \makeatletter
   7 \newcommand{\Keyset}{\ensuremath{\mathcal K}}
   8 \newcommand{\Valueset}{\ensuremath{\mathcal V}}
   9 \newcommand{\None}{\textsc{None}}
  10 \newcommand{\NONE}{\textsc{none}}
  11 \newcommand{\FALSE}{\textsc{false}}
  12 \newcommand{\TRUE}{\textsc{true}}
  13 \newcommand{\Bool}{\textsc{Bool}}
  14 \newcommand{\Nat}{\ensuremath{\mathbb N}}
  15 \newcommand{\sysGet}{\textit{Get}}
  16 \newcommand{\sysget}{\textit{get}}
  17 \newcommand{\sysgets}{\textit{get\/}s}
  18 \newcommand{\sysPut}{\textit{Put}}
  19 \newcommand{\sysput}{\textit{put}}
  20 \newcommand{\sysputs}{\textit{put\/}s}
  21 \newcommand{\sysScan}{\textit{Scan}}
  22 \newcommand{\sysscan}{\textit{scan}}
  23 \newcommand{\Inv}[1]{\ensuremath{\textrm{inv}(#1)}}
  24 \newcommand{\Res}[1]{\ensuremath{\textrm{res}(#1)}}
  25 \newcommand{\Precedes}{<}
  26 \newcommand{\NotPrecedes}{\not <}
  27 \newcommand{\Overlaps}{\approx}
  28 \newcommand{\PrecedesK}[1]{\ensuremath{\mathrel{<_{#1}}}}
  29 \newcommand{\PrecedesEqK}[1]{\ensuremath{\mathrel{\leq_{#1}}}}
  30 \newcommand{\V}[1]{\textit{#1}}
  31 \newcommand{\N}[1]{\text{\upshape{#1}}}
  32 \newcommand{\NIL}{\textsc{nil}}
  33 \newcommand{\NOTFOUND}{\textsc{notfound}}
  34 \newcommand{\VALUE}{\textsc{value}}
  35 \newcommand{\LAYER}{\textsc{layer}}
  36 \newcommand{\MARK}{\textsc{mark}}
  37 \newcommand{\INSERTING}{\textsc{inserting}}
  38 \newcommand{\REMOVING}{\textsc{removing}}
  39 \newcommand{\readnow}[1]{\ensuremath{{}^\circ#1}}
  40 \newcommand{\fencebase}{\text{- - - - - - - - -}}
  41 \newcommand{\rcufencebase}{\text{* * * * * * * * *}}
  42 \newcommand{\fence}{\ensuremath{\fencebase}}
  43 %\newcommand{\acquirefence}{\ensuremath{\fencebase_{\textsc{acquire}}}}
  44 %\newcommand{\releasefence}{\ensuremath{\fencebase_{\textsc{release}}}}
  45 \let\acquirefence\fence
  46 \let\releasefence\fence
  47 \newcommand{\rcufence}{\ensuremath{\rcufencebase_{\textsc{rcu
  48         fence}}}}
  49 \newcommand{\substring}[3]{\ensuremath{#1[#2\mathbin:#3]}}
  50 \newcommand{\Returns}[1]{\ensuremath{\rightarrow\text{#1}}}
  51 \newcommand{\Bplus}{B\(^+\)}
  52 \newcommand{\Blink}{B\(^{\text{link}}\)}
  53 \newcommand{\LEAFWIDTH}{\ensuremath{W_\text{leaf}}}
  54 \newcommand{\INTERIORWIDTH}{\ensuremath{W_\text{interior}}}
  55 \newcommand{\CX}[1]{\small{\textit{// #1}}}
  56 \newcommand{\C}[1]{\unskip\qquad\CX{#1}}
  57 \newcommand{\ITEM}[1]{\textsc{#1}}
  58 \newcommand{\AT}[2]{\ensuremath{#1_{@#2}}}
  59 \newenvironment{programtabbing}%
  60   {\begingroup\parskip\z@\begin{tabbing}\quad\qquad~~\= \quad~~\= \quad~~\= \quad~~\=\kill}%
  61   {\unskip\end{tabbing}\endgroup}
  62 \gdef\programtabskip{\leavevmode\raise1.5ex\hbox{\strut}}
  63 \gdef\_{\char`\_}
  64 \frenchspacing
  65 \makeatother
  66 \begin{document}
  67
  68 \section{Definitions}
  69
  70 A Masstree maps \emph{keys} to \emph{values}. Keys are variable-length strings. Values are arbitrary.
  71 Let \Keyset\ be the set of keys and \Valueset\ the set of values.
  72
  73 Masstree's operations are \sysput, \sysget, and \sysscan.
  74
  75 \begin{itemize}
  76 \item \(\sysput : \Keyset \times \Valueset \mapsto \None\)
  77 \item \(\sysget : \Keyset \mapsto (\Valueset \cup \None)\)
  78 \item \(\sysscan : \Keyset \times (\Keyset \times \Valueset \mapsto \Bool) \mapsto \None\)
  79 \end{itemize}
  80
  81 \noindent%
  82 \sysPut\
  83 stores a value at a given key. \sysGet\ returns the value for a given
  84 key (or \NONE\ if there is no value). \sysScan\ fetches multiple keys
  85 and values. It takes an initial key $k_0$ and a function object $f$.
  86 The function object is called for every key $k \geq k_0$ in the
  87 Masstree, once per key, in increasing order by key. Its arguments are
  88 a key and the corresponding value. We expect the scan function will
  89 save copies of the keys and values in the object, or count them, or
  90 whatever is appropriate, and that scan's caller will extract those
  91 values once the scan returns. The scan stops when the Masstree
  92 runs out of values or the function object returns false, whichever
  93 happens first.
  94
  95 Masstree can support other operations, such as remove and a compare-and-swap-style atomic put, but these three suffice to discuss correctness.
  96
  97
  98 \section{Correctness}
  99
 100 Masstree is a concurrent data structure. This means that multiple threads access Masstree simultaneously, complicating notions of correctness. Roughly speaking, Masstree is correct if every \sysget\ returns the most recent \sysput\ for that key, but when \sysgets\ and \sysputs\ overlap this gets a little complicated.
 101
 102 Multiple \emph{threads} submit \emph{invocations} to Masstree and
 103 receive \emph{responses}. An example invocation is \(\sysput(k,v)\): put
 104 value $v$ at key $k$. The pair of an invocation and its response is
 105 called an \emph{action}. An invocation or response is
 106 called a \emph{half-action}.
 107
 108 We assume a \sysget\ or \sysput\ response contains \emph{the
 109 relevant \sysput\ action}. For \sysget, this is the \sysput\ action that
 110 stored the corresponding value (or \NONE\ if there was no previous
 111 \sysput\ for that key). For \sysput, this is the immediately preceding
 112 \sysput\ action for that key. This differs from the methods' normal
 113 signatures, but simplifies the model. The response to a \sysscan\ action
 114 is a \emph{set} of \sysput\ actions. Here's an example sequence of
 115 actions:
 116
 117 \begin{flushleft}
 118 \begin{tabular}{@{}rll}
 119 & Invocation & Response \\
 120 $a_1$ & $\sysget(k)$ & \NONE \\
 121 $a_2$ & $\sysput(k, v_2)$ & \NONE \\
 122 $a_3$ & $\sysput(k, v_3)$ & $a_2$ \\
 123 $a_4$ & $\sysget(k)$ & $a_3$ \\
 124 $a_5$ & $\sysput(k+1, v_5)$ & \NONE \\
 125 $a_6$ & $\sysscan(k, \dots)$ & $\{a_3,a_5\}$ \\
 126 \end{tabular}
 127 \end{flushleft}
 128
 129 We assume that all half-actions are sequenced into a total order, which
 130 we might call global time. In this time order, every invocation precedes
 131 its corresponding response---$\Inv{a} < \Res{a}$ for all
 132 $a$---though other half-actions may intervene between them.
 133
 134 The total order on half-actions imposes a \emph{partial} order on
 135 actions. Given actions $a_1$ and $a_2$, we say that $a_1$
 136 \emph{precedes} $a_2$, written $a_1 \Precedes a_2$, if $a_1$'s
 137 response is before $a_2$'s invocation in the global time order:
 138 $\Res{a_1} < \Inv{a_2}$. We say that $a_1$ \emph{overlaps} $a_2$,
 139 written $a_1 \Overlaps a_2$, if $a_1 \NotPrecedes a_2$ and $a_2
 140 \NotPrecedes a_1$.
 141
 142 Masstree is correct if there exist per-key total orders
 143 \(\PrecedesK{k}\), where \(\PrecedesK{k}\) for key $k$ contains the set
 144 of \sysput\ actions on $k$, so that the following properties are
 145 satisfied.
 146
 147 \begin{enumerate}
 148 \item Every \sysput\ action on $k$ is in \(\PrecedesK{k}\).
 149 \item The \(\PrecedesK{k}\) order agrees with the partial order on actions. That is, if $a_1 \Precedes a_2$ and $a_1$ and $a_2$ are \sysput\ actions on $k$, then $a_1 \PrecedesK{k} a_2$.
 150 \item The \(\PrecedesK{k}\) order agrees with \sysput\ responses. That is, assuming $\mathord{\PrecedesK{k}} = [a_0, a_1, a_2, \dots]$, then $\Res{a_0} = \NONE$, $\Res{a_1} = a_0$, $\Res{a_2} = a_1$, and so forth.
 151 \item Given \(a\), a \sysget\ action on $k$ with response \NONE, no
 152   \(\sysput(k, \cdot)\) actions precede \(a\) in the partial order on actions.
 153 \item Given \(a\), a \sysget\ action on $k$ with response $b \neq \NONE$, $b$ is a \sysput\ on $k$ that either overlaps with $a$ or immediately precedes $a$ in $\PrecedesK{k}$. That is, $\Res{a} \NotPrecedes \Inv{b}$, and there is no $x$ with $b \PrecedesK{k} x \Precedes a$.
 154 \item Given \(a_1 \Precedes a_2\), two \sysget\ actions on the same $k$ that originate \emph{from the same thread}, we must have \(\Res{a_1} \PrecedesEqK{k} \Res{a_2}\). That is, no thread sees ``time'' go backwards.
 155 \item Given \(a\), a \sysscan\ action starting from $k$, a corresponding set of \sysget\ operations, returning the same results as the \sysscan, would also follow these rules. (As an example, if \(b = \sysput(k, \cdot) \in \Res{a}\), then $b$ must be a \sysput\ on $k$ that either overlaps with $a$ or immediately precedes $a$ in $\PrecedesK{k}$. A key $x$ in the scan range with no corresponding response is like a \sysget\ action that returned \NONE: no \(\sysput(x, \cdot)\) actions can precede \(a\).)
 156 \end{enumerate}
 157
 158
 159 \section{Data structures}
 160
 161 Here's a picture of a Masstree layer.
 162
 163 \begin{figure}[H]
 164 \includegraphics[scale=.8]{examples_1}
 165 \caption{Example Masstree layer.}
 166 \end{figure}
 167
 168 \noindent
 169 Leaves are on the bottom row, interior nodes above. The letters are
 170 keys.
 171
 172 Leaves are connected in a doubly-linked list (dotted arrows); this
 173 facilitates scans and reverse scans and speeds up some other operations.
 174 (See \Blink\ trees.) A leaf \(n\) contains \(n.\V{size}\) keys and
 175 values, where \(0 \leq n.\V{size} \leq \LEAFWIDTH = 15\).
 176
 177 A leaf \(n\) is responsible for all keys \(k\) in the range
 178 \(\N{lowkey}(n) \leq k < \N{highkey}(n)\). Each leaf stores its lowkey
 179 explicitly, and that lowkey \emph{never changes}. The highkey is not
 180 stored in the leaf---it equals the \emph{next} leaf's lowkey---and can
 181 change over time. If leaf \(n\) splits, then \(\N{highkey}(n)\)
 182 decreases; if the leaf following \(n\) is deleted, then
 183 \(\N{highkey}(n)\) increases to cover the deleted leaf's key range. The
 184 layer's leftmost leaf has lowkey \(-\infty\).
 185
 186 Interior nodes form a B$^+$ tree for fast access to data stored in
 187 leaves. Interior nodes are \emph{not} linked
 188 together.\footnote{\Blink\ trees also connect interior nodes in linked
 189 lists, but in our tests this didn't speed anything up, and it would
 190 complicate remove.}
 191
 192 An interior node \(n\) contains \(n.\V{size}\) keys, where \(0 \leq
 193 n.\V{size} \leq \INTERIORWIDTH = 15\). A node with \(n.\V{size}\) keys
 194 contains exactly \(n.\V{size} + 1\) child pointers. Keys are stored in a
 195 \(n.\V{kslice}\) array (see \S\ref{s:keys} for more on key slices).
 196 Values for key \(k\) are found in one of the underlying children:
 197 %
 198 \[
 199 \text{The child containing \(k\)} =
 200 \begin{cases}
 201 n.\V{child}[0] & \text{if \(k < n.\V{kslice}[0]\),} \\
 202 n.\V{child}[1] & \text{if \(n.\V{kslice}[0] \leq k < n.\V{kslice}[1]\),} \\
 203 \vdots \\
 204 n.\V{child}[n.\V{size}] & \text{if \(k \geq n.\V{kslice}[n.\V{size}-1]\).}
 205 \end{cases}
 206 \]
 207 %
 208 If \(n.\V{size} = 0\), then all values are found in \(n.\V{child}[0]\).
 209
 210 Although each interior node is responsible for a range of keys, this
 211 range (unlike with leaves) is \emph{implicit}, depending on keys higher
 212 in the tree. For example, in the figure, interior node (1) is
 213 responsible for the key range \([\textsc{j}, \infty)\), but bounds
 214 \textsc{j} and \(\infty\) are implicit from context: if the root node
 215 contained different keys, (1) might be responsible for
 216 \([\textsc{e},\textsc{q})\) or some other range. This means that we
 217 can't always tell from local information whether a key belongs in an
 218 interior node. This has consequences for the remove operation, as we
 219 will see.
 220
 221
 222 \section{Keys}
 223 \label{s:keys}
 224
 225 Masstree \emph{keys} are variable-length strings ordered
 226 lexicographically.
 227 %
 228 Masstree divides these strings into eight-byte \emph{slices}; the
 229 B-tree structures that make up a Masstree store these slices as keys.
 230 Slice comparison produces the same result as lexicographic comparison
 231 of the underlying keys.
 232
 233 Masstree code uses objects of type ``key.''
 234 %
 235 A key is basically a string decomposed into three parts, a prefix, a
 236 slice, and a suffix.
 237 %
 238 The \N{shift\_key} function, which is called when walking from one
 239 layer of Masstree to the next, shifts the current slice into the
 240 prefix, and shifts the next eight bytes of the suffix into the slice.
 241
 242 If \(s\) is a string, let \(\substring{s}{a}{b}\) represent the
 243 substring of \(s\) starting at index \(a\) and ending just before
 244 index \(b\).
 245 %
 246 Thus, \(\substring{s}{0}{s.\V{length}} = s\), and
 247 \(\substring{s}{i}{i}\) is empty for any \(i\).
 248 %
 249 The substring operation constrains \(a\) and \(b\) to the appropriate
 250 bounds:
 251 %
 252 \[ \substring{s}{a}{b} =
 253 \substring{s}{\max(a,0)}{\min(b,s.\V{length})}. \]
 254
 255 Then the key type's methods are as follows.
 256
 257 \begin{flushleft}
 258 \begin{tabular}{@{}p{.5\hsize}@{}p{.5\hsize}@{}}
 259 \begin{programtabbing}
 260 \textbf{class key}: \\
 261 \> representation: string \(s\), int \(p\) \\
 262 \programtabskip
 263 \> constructor(string \(s\)): \\
 264 \>\> return \(\N{key}\{s = s, p = 0\}\) \\
 265 \programtabskip
 266 \> slice(key \(k\)) \Returns{string}: \\
 267 \>\> return \(\substring{k.s}{k.p}{k.p+8}\) \\
 268 \programtabskip
 269 \> suffix(key \(k\)) \Returns{string}: \\
 270 \>\> return \(\substring{k.s}{k.p+8}{k.s.\V{length}}\) \\
 271 \programtabskip
 272 \> length(key \(k\)) \Returns{int}: \\
 273 \>\> return \(k.s.\V{length} - k.p\)
 274 \end{programtabbing}
 275 &
 276 \begin{programtabbing}
 277 \\
 278 \\
 279 \programtabskip
 280 \> prefix(key \(k\)) \Returns{string}: \\
 281 \>\> return \(\substring{k.s}{0}{k.p}\) \\
 282 \programtabskip
 283 \> shift\_key(key \(k\)) \Returns{key}: \\
 284 \>\> // precondition: \(k.\V{length} > 8\) \\
 285 \>\> return \(\N{key}\{s = k.s, p = k.p + 8\}\) \\
 286 \programtabskip
 287 \> reset\_key(key \(k\)) \Returns{key}: \\
 288 \>\> // undoes all prior shift\_keys \\
 289 \>\> return \(\N{key}\{s = k.s, p = 0\}\)
 290 \end{programtabbing}
 291 \end{tabular}
 292 \end{flushleft}
 293
 294 Masstree leaves store key slices, key lengths, and key suffixes. These
 295 are stored in separate arrays: the key at position \(\pi\) has slice
 296 \(n.\V{kslice}[\pi]\), length \(n.\V{klength}[\pi]\), and suffix
 297 \(n.\V{ksuffix}[\pi]\). (It is important to store slices and lengths
 298 inline, in the node, to allow prefetching. Suffixes, which can have
 299 arbitrary length, must be stored out of line.) Multiple keys with the
 300 same slice and \emph{different} suffixes are stored in a new layer.
 301
 302 Interior nodes, unlike leaves, store \emph{only} key slices. (This
 303 speeds up the code, simplifies suffix memory management, and reduces
 304 memory usage.) As a consequence, all keys with slice \(k\) \emph{must}
 305 be stored in the same leaf. This choice constrains split.
 306
 307
 308 \section{Memory model}
 309
 310 We write memory fences as rows of dashes\ \fence; on x86 (TSO) machines,
 311 these fences only affect the compiler's code generation---no machine
 312 instructions are required.
 313
 314 We assume that, except as constrained by fences, the compiler may
 315 arbitrarily cache and reorder reads from and writes to node data. For
 316 example, the following code might read \(n.\V{version}\) once or twice,
 317 and might write \(n.\V{kslice}[0]\) once or twice:
 318
 319 \begin{programtabbing}
 320 \(a \gets n.\V{version}\) \\
 321 \(++n.\V{kslice}[0]\) \\
 322 \(b \gets n.\V{version}\) \\
 323 \(n.\V{kslice}[0] \gets 0\)
 324 \end{programtabbing}
 325
 326 \noindent
 327 However, this code \emph{must} read \(n.\V{version}\) twice, and write
 328 \(n.\V{kslice}[0]\) twice:
 329
 330 \begin{programtabbing}
 331 \(a \gets n.\V{version}\) \\
 332 \(++n.\V{kslice}[0]\) \\
 333 \fence \\
 334 \(b \gets n.\V{version}\) \\
 335 \(n.\V{kslice}[0] \gets 0\)
 336 \end{programtabbing}
 337
 338 We occasionally use a superscript O notation, as in
 339 \(\readnow{n.\V{version}}\), to indicate that a value is
 340 read from main memory at that point (i.e., no previously cached
 341 value allowed). Some unmarked reads must also complete from main
 342 memory, specifically those after fences.
 343
 344 All reads and writes of individual variables are assumed to be atomic.
 345 We also assume one atomic compare-and-swap operation, cmpxchg, which has
 346 the following effect (but in one atomic step):
 347
 348 \begin{programtabbing}
 349 \textbf{cmpxchg}(address \V{value}, value \V{expected}, value
 350 \V{desired}) \Returns{bool}: \\
 351 \> if \(\readnow{\V{value}} = \V{expected}\): \\
 352 \>\> \(\V{value} \gets \V{desired}\);~ return \TRUE \\
 353 \> else: \\
 354 \>\> return \FALSE
 355 \end{programtabbing}
 356
 357 \section{Versions and locking}
 358
 359 Each node has a 32- or 64-bit \emph{version}.\footnote{64-bit versions
 360 offer greater protection against overflow. A 32-bit version could wrap,
 361 causing unexpected reader behavior, if a reader blocked mid-computation
 362 for \(2^{22}\) inserts.} The version is located at the same place in
 363 leaf and interior nodes---it's the only common aspect of layout between
 364 node types. Versions can be manipulated with compare-and-swap operations
 365 and can be assigned atomically. They have the following fields:
 366
 367 \begin{flushleft}
 368 \begin{tabular}{@{}lll}
 369 \V{locked} & 1 bit & whether node is locked \\
 370 \V{inserting} & 1 bit & whether node has been dirtied for insertion \\
 371 \V{splitting} & 1 bit & whether node has been dirtied for splitting \\
 372 && Together, \V{inserting} and \V{splitting} are the \emph{dirty bits}. \\
 373 \V{vinsert} & 6 or 16 bits & insert version counter \\
 374 \V{vsplit} & 19 or 33 bits & split version counter \\
 375 && \V{vinsert} overflows into \V{vsplit}; \V{vsplit} is a circular
 376 counter. \\
 377 \V{deleted} & 1 bit & whether node has been deleted \\
 378 \V{isroot} & 1 bit & whether node is root of its layer \\
 379 \V{isleaf} & 1 bit & whether node is a leaf \\
 380 \end{tabular}
 381 \end{flushleft}
 382
 383 The dirty bits alert readers of potentially unstable states that require
 384 retry. Masstree updates generally involve multiple writes that should be
 385 considered atomic (for example, inserting a new child into an interior
 386 node, or updating the various parts of a key in a leaf). In some cases
 387 Masstree can arrange to ``publish'' an update atomically,
 388 but this is not always possible. Thus,
 389 before changing a node in a dangerous way, Masstree writers must mark
 390 the node as dirty by changing its version. Masstree readers execute
 391 \emph{read transactions} that are \emph{validated} using versions. First
 392 they snapshot the node's version, then they snapshot the relevant
 393 portion of its contents, and finally they compare the node's new version
 394 to its snapshot. If the versions differ in more than the \V{locked}
 395 bit (that is, if \(v \oplus \readnow{n.\V{version}} > \V{locked}\)),
 396 then the node changed in some potentially invalidating way during the
 397 read operation, and the reader must retry the read transaction.
 398
 399 Masstree's lock and unlock functions are as follows. Note that they
 400 contain fences.
 401
 402 \begin{figure}[H]
 403 \begin{programtabbing}
 404 \textbf{lock}(node $n$): \\
 405 \> $v \gets n.\V{version}$ \\
 406 \> while $v.\V{locked}$ or $!\text{cmpxchg}(n.\V{version}, v, v + \V{locked})$: \\
 407 \>\> $v \gets \readnow{n.\V{version}}$ \\
 408 \> \acquirefence \\
 409 \programtabskip
 410 %
 411 \textbf{unlock}(node $n$): \C{precondition: \(n\) is locked} \\
 412 \> $v \gets n.\V{version}$ \\
 413 \> if $v.\V{inserting}$: \\
 414 \>\> $++v.\V{vinsert}$ \C{may overflow into \V{vsplit}} \\
 415 \> else if $v.\V{splitting}$: \\
 416 \>\> $++v.\V{vsplit}$;~ $v.\V{isroot} \gets 0$ \C{the true root has never split} \\
 417 \> $v.\{\V{locked},\V{inserting},\V{splitting}\} \gets 0$ \\
 418 \> \releasefence \\
 419 \> $n.\V{version} \gets v$
 420 \end{programtabbing}
 421 \end{figure}
 422
 423 A read transaction must first wait for its node to reach a stable
 424 state, so read transactions start with \N{stable\_version}, a form of
 425 ``lock'' that doesn't actually involve locking.
 426
 427 \begin{figure}[H]
 428 \begin{programtabbing}
 429 \textbf{stable\_version}(node $n$): \\
 430 \> $v \gets {n.\V{version}}$ \\
 431 \> while $v.\V{inserting}$ or $v.\V{splitting}$: \\
 432 \>\> $v \gets \readnow{n.\V{version}}$ \\
 433 \> \acquirefence \\
 434 \> return $v$
 435 \end{programtabbing}
 436 \end{figure}
 437
 438 Nodes contain \V{parent} pointers for traversing up the tree. The
 439 \N{locked\_parent} and \N{true\_root} functions traverse these pointers.
 440 \N{locked\_parent} must retry because a node's parent can spontaneously
 441 change, even while the node is locked, if its parent interior node
 442 splits. The \N{true\_root} function traverses parent pointers until it
 443 finds the root of a tree.
 444
 445 \begin{figure}[H]
 446 \begin{programtabbing}
 447 \textbf{locked\_parent}(node $n$): \C{precondition: \(n\) is locked} \\
 448 retry:~
 449 \> $p \gets n.\V{parent}$ \\
 450 \> if $p \neq \NIL$: \\
 451 \>\> lock($p$) \\
 452 \>\> if $p \neq \readnow{n.\V{parent}}$: \C{parent changed underneath us} \\
 453 \>\>\> unlock($p$);~ goto retry \\
 454 \> return $p$ \\
 455 \programtabskip
 456 %
 457 \textbf{true\_root}(node $n$): \\
 458 \> \(p \gets n.\V{parent}\) \\
 459 \> while \(p \neq \NIL\): \\
 460 \>\> \(n \gets p\);~ \(p \gets n.\V{parent}\) \\
 461 \> return \(n\)
 462 \end{programtabbing}
 463 \end{figure}
 464
 465
 466 \section{Invariants}
 467
 468 We write \(\AT{x}{t}\) for the value of \(x\) at time \(t\).
 469 Times \(t\) and \(t'\) are \emph{RCU-overlapping} if a
 470 single reader could be active over some interval including both \(t\)
 471 and \(t'\).
 472
 473 \paragraph{Root reachability}
 474
 475 Assume that at time \(t\), the true root of a layer is node \(n\), and
 476 \(n\) is unlocked. Let \(x\) be the root of that layer as observed by
 477 a concurrent reader active at time \(t\). Then either \(x=n\), or
 478 \(x.\V{isroot} = \FALSE\) and \(n\) is reachable from \(x\) by a chain
 479 of \V{parent} pointers.
 480
 481 \paragraph{Parent validity}
 482
 483 If \(n\) is reachable, then \(n.\V{parent}\) is either another
 484 reachable node or \NIL.
 485
 486 \paragraph{Key stability}
 487
 488 Let \(n\) be a reachable leaf, and consider two times \(t<t'\) where
 489 \(n.\V{version}\) is the same at both times, and not dirty. Consider a
 490 slot \(\pi\) reachable via \(\AT{n.\V{permutation}}{t}\). Then
 491 slot \(\pi\) is also reachable via \(\AT{n.\V{permutation}}{t'}\), and
 492 all key information in slot \(\pi\) is unchanged from the corresponding values at time
 493 \(t\). (That is, \(\AT{n.\V{kslice}[\pi]}{t} =
 494 \AT{n.\V{kslice}[\pi]}{t'}\), \(\AT{n.\V{ksuffix}[\pi]}{t} =
 495 \AT{n.\V{ksuffix}[\pi]}{t'}\),  and \(\AT{n.\V{klength}[\pi]}{t} =
 496 \AT{n.\V{klength}[\pi]}{t'}\).)
 497
 498 \paragraph{Interior node validity}
 499
 500 Assume that \(n\) is a reachable interior node with
 501 \(\AT{n.\V{size}}{t} = s\). Let \(t'>t\) be an RCU-overlapping time with
 502 \(t\), and let \(x = \AT{n.\V{child}[i]}{t'}\) for some \(i\) with \(0
 503 \leq i \leq s\). Then either \(x = \NIL\), or \(x\) is a pointer to a
 504 valid (though not necessarily reachable) node.
 505
 506 \paragraph{Cleanliness}
 507
 508 If node \(n\) is dirty at time \(t\), then \(n\) is locked at time \(t\).
 509
 510 \paragraph{Locking discipline}
 511
 512 If a Masstree interface function (such as get, put, or remove) locks a
 513 node, then it unlocks that node before returning to the user.
 514
 515 Node locks are obtained sequentially \emph{down} from layer to layer, but
 516 within a layer, they are obtained \emph{up} from leaves toward the root.
 517
 518 \paragraph{Leaf linkage}
 519
 520 At all times, all of a tree's active nodes are reachable from the
 521 leftmost leaf by \V{next} pointers.
 522
 523 Say that \(n\) and \(n'\) are active nodes in the same tree, with
 524 \(\N{lowkey}(n) < \N{lowkey}(n')\). Then either \(n\) is reachable from
 525 \(n'\) by \V{prev} pointers, or \(n\) is dirty (\V{splitting}).
 526
 527 \section{Transaction types}
 528
 529 Leaf values are accessed in read transactions validated by version.
 530
 531 Node modifications use transactions protected by locks.
 532
 533 Internal nodes are accessed during descent using hand-over-hand
 534 validation by version.
 535
 536 Obtaining a locked pointer to a parent node requires hand-over-hand
 537 locking with validation.
 538
 539 \section{Get}
 540
 541 Get first descends to the leaf responsible for a key,
 542 then finds the key in that leaf. This proceeds a layer at a time.
 543
 544 Two related procedures, \N{descend} and \N{advance}, find within a layer
 545 the leaf node responsible for key $k$. \N{Descend} is more fundamental:
 546 every lookup involves one or more calls to \N{descend}. \N{Advance} is
 547 used occasionally to fix lookups that happened concurrently with splits.
 548
 549 Both procedures return a leaf node $n$ and corresponding version $v$.
 550 The version is never dirty (it was obtained by \N{stable\_version}).
 551 Either version $v$ of leaf $n$ is responsible for $k$, or
 552 $v.\V{deleted}$ is true.
 553 However, neither \N{descend} nor \N{advance} lock nodes or perform any
 554 writes to shared memory, so $n$ may not \emph{remain} responsible for
 555 $k$. $n$ could split, moving responsibility for $k$ to the right; or $n$
 556 could be deleted, moving responsibility for $k$ to the left. Users of
 557 \N{descend} must detect these situations using read transactions
 558 validated by a check of $n.\V{version}$. If $n.\V{version}$ indicates a
 559 split, the user should call \N{advance} to find the leaf truly
 560 responsible for $k$; if $n.\V{version}$ indicates a deletion, the user
 561 must call \N{descend} again.
 562
 563 \begin{figure}[H]
 564 \begin{programtabbing}
 565 \textbf{descend}(node \V{root}, key $k$) \Returns{\(\langle
 566   \N{leaf \V{n}, version \V{v}}\rangle\)}: \\
 567 \CX{postcondition: either \(v.\V{deleted}\), or version \V{v} of leaf
 568 \V{n} is responsible for key $k$: \(\N{lowkey}(n) \leq k.\V{slice} <
 569 \N{highkey}(n_{@v})\)} \\
 570 retry:
 571 \> $n \gets \V{root}$;~ $v \gets \text{stable\_version}(n)$ \\
 572 \> if \(!v.\V{isroot}\): \\
 573 \>\> \(n \gets \text{true\_root}(n)\);~ goto retry \\
 574 descend:~
 575 if $v.\V{isleaf}$: \\
 576 \>\> return $\langle n, v\rangle$ \\
 577 \> $n_c \gets \text{child of \(n\) containing \(k\)}$ \\
 578 \> if $n_c = \NIL$: \C{retry from root if concurrent remove reshaped tree} \\
 579 \>\> goto retry \\
 580 \> $v_c \gets \text{stable\_version}(n_c)$ \C{hand-over-hand validation} \\
 581 \> \fence \\
 582 \> if $v \oplus \readnow{n.\V{version}} > \V{locked}$: \C{$n$ changed, must retry} \\
 583 \>\> $\V{oldv} \gets v$;~ $v \gets \text{stable\_version}(n)$ \\
 584 \>\> if $v.\V{vsplit} \neq \V{oldv}.\V{vsplit}$: \C{retry from root when internal node splits} \\
 585 \>\>\> goto retry \\
 586 \> else: \\
 587 \>\> $n \gets n_c$;~ $v \gets v_c$ \\
 588 \> goto descend
 589 \end{programtabbing}
 590 \label{fig:nlookup}
 591 \end{figure}
 592
 593 \begin{figure}[H]
 594 \begin{programtabbing}
 595 \textbf{advance}(leaf $n$, key $k$) \Returns{\(\langle
 596   \N{leaf $n'$, version $v$}\rangle\)}: \\
 597 \CX{precondition: \(\N{lowkey}(n) \leq k.\V{slice}\)} \\
 598 \CX{postcondition: either $v.\V{deleted}$ (caller should rerun \N{descend}), or version $v$ of leaf $n'$ is responsible for $k$} \\
 599 \> $v \gets \text{stable\_version}(n)$;~ $\V{next} \gets n.\V{next}$ \\
 600 \> while $!v.\V{deleted}$ and $\V{next} \neq \NIL$ and
 601         $k \geq \text{lowkey}(\V{next})$: \\
 602 \>\> $n \gets \V{next}$;~ $v \gets \text{stable\_version}(n)$;~
 603         $\V{next} \gets n.\V{next}$ \\
 604 \> return $\langle n, v \rangle$
 605 \end{programtabbing}
 606 \end{figure}
 607
 608
 609 \begin{figure}[H]
 610 \begin{programtabbing}
 611 \textbf{find\_key}(leaf $n$, key $k$, permutation \V{perm})
 612 \Returns{\(\langle \N{int \V{i}}, \N{int \(\pi\)} \rangle\)}: \\
 613 \CX{Returns the insertion index and position of \(k\) in \(n\),
 614   considering only key slice and length.} \\
 615 \CX{postcondition: \(0 \leq i \leq \V{perm}.\V{size}\)} \\
 616 \CX{postcondition: \(\pi = \NONE\) iff \(k\) is not contained in \(n\)} \\
 617 \> $i \gets 0$;~ $\V{klength} = \min(k.\V{length}, 9)$ \\
 618 \> while $i < \V{perm}.\V{size}$: \C{linear search; we use binary search in practice} \\
 619 \>\> $\pi \gets \V{perm}.\V{pos}[i]$ \\
 620 \>\> if $n.\V{kslice}[\pi] < k.\V{slice}$ or ($n.\V{kslice}[\pi] =
 621 k.\V{slice}$ and $n.\V{klength}[\pi] < \V{klength}$): \\
 622 \>\>\> $++i$ \\
 623 \>\> else if $n.\V{kslice}[\pi] = k.\V{slice}$ and $n.\V{klength}[\pi] =
 624 \V{klength}$: \\
 625 \>\>\> return $\langle i, \pi \rangle$ \\
 626 \>\> else: \\
 627 \>\>\> break \\
 628 \> return $\langle i, \NONE \rangle$
 629 \end{programtabbing}
 630 \end{figure}
 631
 632 \begin{figure}[H]
 633 \begin{programtabbing}
 634 \textbf{get}(node \V{root}, key $k$): \\
 635 descend:~
 636  $\langle n, v \rangle \gets \text{descend}(\V{root}, k)$ \\
 637 retry:~
 638 \> if $v.\V{deleted}$: \\
 639 \>\> goto descend \\
 640 \> \(\langle i, \pi \rangle \gets \text{find\_key}(n, k,
 641 n.\V{permutation})\) \\
 642 \> if \(\pi \neq \NONE\): \\
 643 \>\> \(t \gets n.\V{type}[\pi]\);~
 644     \(\V{suffix} \gets n.\V{ksuffix}[\pi]\);~
 645     \(\V{value} \gets n.\V{value}[\pi]\) \\
 646 \> \fence \\
 647 \> if \(v \oplus \readnow{n.\V{version}} > \V{locked}\): \\
 648 \>\> $\langle n, v \rangle \gets \text{advance}(n, k)$ \\
 649 \>\> goto retry \\
 650 \> else if \(\pi \neq \NONE\) and \(t = \LAYER\): \\
 651 \>\> \(root \gets \V{value}\);~ \(k \gets \N{shift\_key}(k)\) \\
 652 \>\> goto descend \\
 653 \> else if \(\pi = \NONE\) or \(\V{suffix} \neq k.\V{suffix}\): \\
 654 \>\> return \NONE \\
 655 \> else: \\
 656 \>\> return \V{value}
 657 \end{programtabbing}
 658 \caption{Find the value for a key.}
 659 \label{fig:nget}
 660 \end{figure}
 661
 662
 663 \section{Put}
 664
 665 Put and remove, unlike get, must lock the leaf containing a key. The
 666 \N{descend\_and\_lock} function performs a combined traversal and lock.
 667 Unlike \N{descend}, \N{descend\_and\_lock} descends across multiple
 668 layers. It returns the unique node that would contain the value for
 669 \(k\) if \(k\) were in the Masstree.
 670
 671 A natural implementation of \N{descend\_and\_lock} might lock a
 672 leaf node per layer. When encountering a pointer to a lower layer,
 673 \N{descend\_and\_lock} would unlock the leaf from the higher layer and
 674 then recurse into the lower. This, however, would perform far more
 675 locking operations than required. A goal of Masstree is to perform no
 676 memory writes for common-case tree traversal. Therefore,
 677 \N{descend\_and\_lock} includes an optimization that, in the common
 678 case, recurses into a lower layer without obtaining a lock.
 679
 680 \N{Descend\_and\_lock} also performs a tree cleanup task. Consider a
 681 full lower-layer tree (linked from a higher layer, which we display
 682 above the dashed line):
 683
 684 \begin{figure}[H]
 685 \includegraphics[scale=.8]{insert1_1}
 686 \end{figure}
 687
 688 \noindent%
 689 Inserting a new item into this lower layer will cause a split. A new
 690 leaf node is created,
 691
 692 \begin{figure}[H]
 693 \includegraphics[scale=.8]{insert1_2}
 694 \end{figure}
 695
 696 \noindent%
 697 then a new internal node
 698
 699 \begin{figure}[H]
 700 \includegraphics[scale=.8]{insert1_3}
 701 \end{figure}
 702
 703 \noindent%
 704 and finally a new root.
 705
 706 \begin{figure}[H]
 707 \includegraphics[scale=.8]{insert1_4}
 708 \end{figure}
 709
 710 \noindent%
 711 But the higher layer connects to the \emph{old} root, not the new root.
 712 Correcting this would more work: modifications to the higher layer
 713 require locking the higher-layer node. We choose to delay that work.
 714 There's no \emph{correctness} problem with leaving the tree this
 715 way---thanks to the root reachability invariant, the new root will be
 716 found by traversing parent pointers from the old. So we leave the tree
 717 this way until the next put or remove, which will retarget the higher
 718 layer's pointer.
 719
 720 \begin{figure}[H]
 721 \includegraphics[scale=.8]{insert1_5}
 722 \end{figure}
 723
 724
 725 \begin{figure}[H]
 726 \begin{programtabbing}
 727 \textbf{descend\_and\_lock}(node \V{toproot}, key \(k\))
 728   \Returns{\(\langle \N{leaf \V{n}}, \N{int \V{i}}, \N{int
 729       \(\pi\)}, \N{key \(k'\)}\rangle\)}: \\
 730 \CX{Find, lock, and return the node into which \(k\) would be inserted.} \\
 731 \CX{postcondition: \(k' = \N{shift\_key}^m(k)\) for some \(m \geq 0\)}\\
 732 \CX{postcondition: \(n\) is locked and \(n\) is responsible for \(k'\)}\\
 733 \CX{postcondition: \(i\) is the insertion index for \(k'\) in \(n\)}\\
 734 \CX{postcondition: \(\pi = \NONE\) iff \(k'\) is not present in \(n\)}\\
 735 retrytop:~
 736    $\V{root} \gets \V{toproot}$;~ \(k \gets \N{reset\_key}(k)\) \\
 737 descend:~  $\langle n, v \rangle \gets \text{descend}(\V{root}, k)$ \\
 738 \> if $k$ has suffix: \C{optimization: avoid some unnecessary traversal locks} \\
 739 \>\> \(x \gets \text{try\_descend\_layer}(n, v, k)\) \\
 740 \>\> if \(x \neq \NONE\): \\
 741 \>\>\> \(\V{root} \gets x\);~ \(k \gets \N{shift\_key}(k)\) \\
 742 \>\>\> goto descend \\
 743 %
 744 retry:~
 745 \> \(\N{lock}(n)\);~ \(v' \gets n.\V{version}\) \\
 746 \> if \(v'.\V{deleted}\): \\
 747 \>\> unlock(\(n\));~ goto descend \\
 748 \> else if \(n.\V{deletedlayer}\): \C{an entire layer has been
 749   deleted, start over}\\
 750 \>\> unlock(\(n\));~ goto retrytop \\
 751 \> else if \(v'.\V{vsplit} \neq v.\V{vsplit}\) and \(n.\V{next} \neq
 752 \NIL\) and \(k \geq \N{lowkey}(n.\V{next})\): \\
 753 \>\> \(\V{next} \gets n.\V{next}\);~ unlock(\(n\)) \\
 754 \>\> \(\langle n, v \rangle \gets \N{advance}(\V{next}, k)\) \\
 755 \>\> goto retry \\
 756 \> else: \\
 757 \>\> \(\langle i, \pi \rangle \gets \N{find\_key}(n, k, n.\V{permutation})\)
 758 \\
 759 \>\> if \(\pi \neq \NONE\) and \(n.\V{type}[\pi] = \LAYER\): \\
 760 \>\>\> \(\V{root} \gets n.\V{value}[\pi]\);~ \(k \gets \N{shift\_key}(k)\) \\
 761 \>\>\> if \(!\V{root}.\V{isroot}\): \\
 762 \>\>\>\> \(\V{root} \gets n.\V{value}[\pi] \gets \N{true\_root}(\V{root})\) \\
 763 \>\>\> unlock(\(n\));~ goto descend \\
 764 \>\> else: \\
 765 \>\>\> return \(\langle n, i, \pi, k \rangle\) \\
 766 \programtabskip
 767 %
 768 \textbf{try\_descend\_layer}(node \(n\), version \(v\), key \(k\)): \\
 769 \> \(\langle i, \pi \rangle \gets \text{find\_key}(n, k, n.\V{permutation})\) \\
 770 \> if \(\pi \neq \NONE\): \\
 771 \>\> \(t \gets n.\V{type}[\pi]\) \\
 772 \>\> \fence \\
 773 \>\> \(\V{value} \gets n.\V{value}[\pi]\) \\
 774 \>\> \fence \\
 775 \>\> if \(!v.\V{deleted}\) and \(t = \LAYER\) and \(v \oplus \readnow{n.\V{version}} \leq \V{locked}\)
 776 and \(\V{value}.\V{isroot}\): \\
 777 \>\>\> return \V{value} \\
 778 \> return \NONE
 779 \end{programtabbing}
 780 \caption{Lock and return the leaf that would contain a key.}
 781 \end{figure}
 782
 783 \begin{figure}[H]
 784 \begin{programtabbing}
 785 \textbf{put}(node \V{root}, key \(k\), value \(\V{value}\)): \\
 786 \> \(\langle n, i, \pi, k \rangle \gets \N{descend\_and\_lock}(\V{toproot}, k)\) \\
 787 retry:~
 788 \> if \(\pi \neq \NONE\) and \(n.\V{ksuffix}[\pi] = k.\V{suffix}\): \\
 789 \>\> fork \(\N{free\_value}(n.\V{value}[\pi])\) \\
 790 \>\> \(n.\V{value}[\pi] \gets \V{value}\) \\
 791 \>\> unlock(\(n\));~ return \\
 792 \> else if \(\pi \neq \NONE\): \\
 793 \>\> \(n \gets \N{new\_layer}(n, \pi)\);~ \(k \gets \N{shift\_key}(k)\) \\
 794 \>\> \(\langle i, \pi \rangle \gets \N{find\_key}(n, k,
 795 n.\V{permutation})\) \\
 796 \>\> goto retry \\
 797 \> if \(n.\V{modstate} \neq \INSERTING\): \\
 798 \>\> \(n.\V{modstate} \gets \INSERTING\) \\
 799 \>\> \(n.\V{version}.\V{inserting} \gets 1\) \\
 800 \>\> \fence \\
 801 \> if \(n.\V{size} < \LEAFWIDTH\): \\
 802 \>\> \(\pi \gets \text{an unused position from \(n.\V{permutation}\)}\) \\
 803 \>\> \(n.\V{kslice}[\pi] \gets k.\V{slice}\) \\
 804 \>\> \(n.\V{ksuffix}[\pi] \gets k.\V{suffix}\) \\
 805 \>\> \(n.\V{klength}[\pi] \gets \min(k.\V{length}, 9)\) \\
 806 \>\> \(n.\V{type}[\pi] \gets \VALUE\) \\
 807 \>\> \(n.\V{value}[\pi] \gets \V{value}\) \\
 808 \>\> \fence \\
 809 \>\> \(n.\V{permutation} \gets \text{modified permutation with \(\pi\) at
 810   position \(i\)}\) \\
 811 \>\> unlock(\(n\));~ return \\
 812 \> \(\N{split}(n, k, \V{value})\) \\
 813 \programtabskip
 814 %
 815 \textbf{new\_layer}(leaf \(n\), int \(\pi\)): \\
 816 \> \(\V{suffix} = n.\V{ksuffix}[\pi]\);~ \(\V{value} \gets
 817 n.\V{value}[\pi]\) \\
 818 \> \(n' \gets \text{new locked leaf containing only \V{suffix} and
 819   \V{value}}\) \\
 820 \> \(n.\V{version}.\V{inserting} \gets 1\) \\
 821 \> \fence \\
 822 \> \(n.\V{value}[\pi] \gets n'\) \\
 823 \> \(n.\V{ksuffix}[\pi] \gets \text{empty string}\) \\
 824 %\> \fence \\
 825 \> \(n.\V{type}[\pi] \gets \LAYER\) \\
 826 \> unlock(\(n\)) \\
 827 \> return \(n'\)
 828 \end{programtabbing}
 829 \caption{Put implementation (except for split).}
 830 \end{figure}
 831
 832
 833 \begin{figure}[H]
 834 \begin{programtabbing}
 835 \textbf{split}(node $n$, key $k$, value \V{value}): \C{precondition: $n$ locked} \\
 836 \> $n' \gets \text{new empty leaf}$ \\
 837 \> \(n'.\V{version} \gets n.\V{version}\) \C{\(n'\) is initially locked} \\
 838 \> \(n'.\V{version}.\V{splitting} \gets 1\) \\
 839 \> copy the larger keys and values from \(n\) to \(n'\) \\
 840 \> assign \(n'.\V{permutation}\) appropriately \\
 841 \> \fence \\
 842 \> \(\N{link\_split\_leaf}(n, n')\) \\
 843 \> \(n.\V{version}.\V{splitting} \gets 1\) \\
 844 \> \fence \\
 845 \> assign \(n.\V{permutation}\) to contain only those keys not copied to
 846 \(n'\) \\
 847 \> insert \(k\) and \V{value} into \(n\) or \(n'\) as appropriate \\
 848 ascend:~
 849 \> $p \gets \text{locked\_parent}(n)$ \C{hand-over-hand locking} \\
 850 \> if $p = \NIL$: \C{$n$ was old root} \\
 851 \>\> \(p \gets \text{new interior node}\) \\
 852 \>\> insert \(n\), \(n'\) into \(p\) \\
 853 \>\> \(p.\V{isroot} \gets 1\);~ \(p.\V{parent} \gets \NIL\) \\
 854 \>\> \(n'.\V{parent} \gets p\) \\
 855 \>\> \fence \\
 856 \>\> \(n.\V{parent} \gets p\) \\
 857 \>\> unlock($n$);~ unlock($n'$);~ return \\
 858 \> else if $p$ is not full: \\
 859 \>\> $p.\V{version}.\V{inserting} \gets 1$ \\
 860 \>\> \fence \\
 861 \>\> \(\N{unlock}(n)\) \\
 862 \>\> shift $p$ keys and children to include $n'$ \\
 863 \>\> \(n'.\V{parent} \gets p\) \\
 864 \>\> \fence \\
 865 \>\> \(++p.\V{size}\) \\
 866 \>\> unlock($n$);~ unlock($n'$);~ unlock($p$);~ return \\
 867 \> else: \\
 868 \>\> \(p.\V{version}.\V{splitting} \gets 1\) \\
 869 \>\> \fence \\
 870 \>\> \(\N{unlock}(n)\) \\
 871 \>\> \(p' \gets \text{new interior node}\) \\
 872 \>\> \(p'.\V{version} \gets p.\V{version}\) \\
 873 \>\> \fence \\
 874 \>\> move larger keys from \(p\) into \(p'\), inserting \(n'\) where it belongs  (and setting \(n'.\V{parent}\)) \\
 875 \>\> for all \(c \in \text{children of \(p'\)}\): \\
 876 \>\>\> \(c.\V{parent} \gets p'\) \\
 877 \>\> unlock($n'$);~ $n \gets p$;~ $n' \gets p'$;~ goto ascend
 878 \end{programtabbing}
 879 \caption{Split a leaf and insert a key.}
 880 \label{fig:nsplit}
 881 \end{figure}
 882
 883 \begin{figure}[H]
 884 \begin{programtabbing}
 885 \textbf{link\_split\_leaf}(leaf \(n\), leaf \(n'\)): \\
 886 \> \(n'.\V{prev} \gets n\) \\
 887 \> \(\V{next} \gets n.\V{next}\) \\
 888 \> while \(\V{next} \neq \NIL\) and
 889 (\V{next} has a \MARK\ or \(!cmpxchg(n.\V{next}, \V{next},
 890 \V{next} + \MARK)\)): \\
 891 \>\> \(\V{next} \gets \readnow{n.\V{next}}\) \\
 892 \> \(n'.\V{next} \gets \V{next}\) \\
 893 \> if \(\V{next} \neq \NIL\): \\
 894 \>\> \(\V{next}.\V{prev} \gets n'\) \\
 895 \> \fence \\
 896 \> \(n.\V{next} \gets n'\)
 897 \end{programtabbing}
 898 \end{figure}
 899
 900 \section{Remove}
 901
 902 The most complex Masstree operation is remove. It's complex because of
 903 the wide range of tree changes it can cause.
 904
 905 \subsection{Removing an item}
 906
 907 We start with a simple example tree.
 908
 909 \begin{figure}[H]
 910 \includegraphics[scale=.8]{remove1_1}
 911 \end{figure}
 912
 913 Removing a single item---say, \ITEM{e}---is relatively simple: all code
 914 is in two short functions, \N{remove} and \N{remove\_item}. We first use
 915 \N{descend\_and\_lock} to locate and lock the leaf containing the item, then pop
 916 the item out of the relevant leaf's permutation.
 917
 918 \begin{figure}[H]
 919 \includegraphics[scale=.8]{remove1_2}
 920 \end{figure}
 921
 922 \noindent
 923 The heavy circle on the leaf's upper left indicates a lock. The item's
 924 value is reclaimed by \N{free\_value}, but only after concurrent readers
 925 have completed.
 926
 927 Note that \N{remove} does not actually modify the leaf's state for the
 928 item, namely \(n.\V{kslice}[\pi]\), \(n.\V{klength}[\pi]\),
 929 \(n.\V{ksuffix}[\pi]\), \(n.\V{type}[\pi]\), and \(n.\V{value}[\pi]\)!
 930 This allows concurrent readers who saw the \emph{old} permutation to
 931 extract a correct value for the item, namely the old value. This is
 932 correct---an overlapping reader can return the old value. Alternatively,
 933 \N{remove} could dirty the node, but this would force readers to retry,
 934 something we aim to avoid.
 935
 936 However, there is a wrinkle. We must ensure that a change in the node's keys
 937 can be detected by examining the node's version---specifically, by examining
 938 the version and the current node size. Thus, the sequence of operations
 939 ``insert(\textsc{x}); remove(\textsc{x})'' should change the node's version
 940 number. To this end, we keep track of a per-node modification state, called
 941 \(n.\V{modstate}\). This is either \INSERTING, if we are currently in the
 942 midst of inserting keys into the node, or \REMOVING, if we're currently
 943 removing keys from the node. Switching between modification states requires an
 944 explicit change to the node's version number.
 945
 946 \subsection{Removing a leaf}
 947
 948 A series of removes can empty out a leaf entirely. Empty nodes would
 949 waste memory and slow down lookup, so they are removed from the tree.
 950
 951 The set of leaves in a Masstree are collectively responsible for a full
 952 range of keys. When a leaf is removed, responsibility for its key range
 953 shifts to another leaf. Since \N{lowkey} values never change,
 954 responsibility always passes to the left, to the node's predecessor in
 955 the doubly linked list. Remove must ensure that concurrent operations
 956 reach this predecessor. This requires remove to update both the leaf
 957 linked list, which is used during scans and some puts, and the
 958 \Bplus\ tree structure.
 959
 960 As an exception, \N{remove} always preserves the leftmost leaf in a
 961 tree. This simplifies many operations and is logically required by our
 962 invariants: \N{lowkey} values never change, but deleting the leftmost
 963 node would effectively require setting its successor's \N{lowkey} to
 964 $-\infty$. The leftmost leaf can be deleted, but only when the entire
 965 subtree is empty.
 966
 967 We demonstrate by removing all the elements in \ITEM{def}.
 968
 969 Masstree first marks the node's version as \V{deleted}, which we show
 970 with a white X. Any operation that reaches a \V{deleted} node will retry
 971 from the root. This is important because the next steps will unlink the
 972 deleted node from the tree, and unlinked nodes no longer receive updates
 973 about other changes to the tree's structure. This means there is no
 974 reliable way to get back on track from a deleted node. Other operations
 975 will retry in a loop until the deleted leaf is fully unlinked. Also, a
 976 callback is registered to free the leaf's memory once concurrent readers
 977 have completed.
 978
 979 We next remove the node from the leaf linked list. Logical ``locks''
 980 protect against neighbors' concurrent removes and splits, but to prevent
 981 deadlock, these per-node locks are different from the main per-node
 982 locks. The lock is a mark bit stolen from node \V{next} pointers. The mark
 983 bit in \(n.\V{next}\) protects the setting of \(n.\V{next}\), and the
 984 setting of \(n.\V{next}.\V{prev}\).
 985
 986 The \N{unlink\_leaf} function contains the code. (The code for inserting
 987 new leaves during split is in \N{link\_split\_leaf}.) First, the
 988 \(\ITEM{def}.\V{next}\) pointer is locked with a mark, shown in black
 989 below. This lock protects both \(\ITEM{def}.\V{next}\) and
 990 \(\ITEM{ghi}.\V{prev}\), so \ITEM{ghi} now cannot remove itself from the
 991 list until \ITEM{def} is unlinked.
 992
 993 \begin{figure}[H]
 994 \includegraphics[scale=.8]{remove1_3}
 995 \end{figure}
 996
 997 \noindent%
 998 But removing \ITEM{def} requires modifying, and therefore locking,
 999 \(\ITEM{def}.\V{prev}.\V{next} = \ITEM{abc}.\V{next}\) as well. This
1000 modification requires a retry loop, in case \ITEM{abc} splits or is
1001 removed during the operation.
1002
1003 \begin{figure}[H]
1004 \includegraphics[scale=.8]{remove1_31}
1005 \end{figure}
1006
1007 \noindent%
1008 With \(\ITEM{def}.\V{prev}.\V{next}\) and \(\ITEM{def}.\V{next}\) both
1009 locked, the unlink is easy. First, \(\ITEM{ghi}.\V{prev}\) is modified,
1010
1011 \begin{figure}[H]
1012 \includegraphics[scale=.8]{remove1_4}
1013 \end{figure}
1014
1015 \noindent%
1016 and then, after a fence, \(\ITEM{abc}.\V{next}\). A single assignment
1017 both unlocks and changes the pointer.
1018
1019 \begin{figure}[H]
1020 \includegraphics[scale=.8]{remove1_5}
1021 \end{figure}
1022
1023 \noindent%
1024 This algorithm resembles some algorithms for lock-free
1025 doubly-linked-list insertion and removal, but it is a bit simpler. This
1026 is because Masstree leaf insertion is constrained---it only happens
1027 during split. As a result, insertion immediately after \(n\) cannot
1028 proceed concurrently with removal of \(n\).
1029
1030 The leaf linked list is correct, but the \Bplus\ tree index is not.
1031 Masstree traverses to and locks the leaf's parent, then releases the
1032 lock on the leaf. It dirties the interior node, searches for the deleted
1033 leaf's \N{lowkey} (here, \(\N{lowkey}(\ITEM{def}) = \ITEM{d}\)), and removes
1034 the corresponding key and child pointer, creating this state.
1035
1036 \begin{figure}[H]
1037 \includegraphics[scale=.8]{remove1_6}
1038 \end{figure}
1039
1040 And this completes the removal of \ITEM{def}.
1041
1042 \subsection{Collapsing trivial interior nodes}
1043
1044 Now say \ITEM{ghi} is deleted. This process starts out as before.
1045
1046 \begin{figure}[H]
1047 \includegraphics[scale=.8]{remove1_7}
1048 \end{figure}
1049
1050 \noindent%
1051 But when \(\N{lowkey}(\ITEM{ghi}) = \ITEM{g}\) is removed from its
1052 parent interior node, that interior node becomes \emph{trivial}: it has
1053 no keys and only one child. Trivial interior nodes are redundant,
1054 wasting space and traversal time.
1055
1056 \begin{figure}[H]
1057 \includegraphics[scale=.8]{remove1_8}
1058 \end{figure}
1059
1060 Masstree detects trivial nodes and deletes them in the \N{collapse}
1061 function. \N{Collapse} marks trivial nodes' versions as \V{deleted} and
1062 frees them after the usual reader delay. It also walks up the tree, with
1063 hand-over-hand locking, and adjusts parent nodes' pointers to hop over
1064 the deleted trivial node.
1065
1066 \begin{figure}[H]
1067 \includegraphics[scale=.8]{remove1_9}
1068 \end{figure}
1069
1070 \noindent%
1071 This deletion procedure leaves the tree funnily shaped: leaves are at
1072 different heights. It's this property that prevents us from storing
1073 links on interior nodes.
1074
1075 The \N{collapse} function won't ever delete the root node. A trivial
1076 root requires more work to fix, as we'll see.
1077
1078 \subsection{Reshaping the tree}
1079
1080 We return to the original example and, this time, remove leaf \ITEM{jkl}.
1081
1082 \begin{figure}[H]
1083 \includegraphics[scale=.8]{remove2_1}
1084 \end{figure}
1085
1086 Now, \ITEM{jkl} is the \emph{first} child of its parent. This requires
1087 special handling. To see why, note that
1088 the usual removal procedure would create this state:
1089
1090 \begin{figure}[H]
1091 \includegraphics[scale=.8]{remove2_2}
1092 \end{figure}
1093
1094 \noindent%
1095 Now consider a \N{put} operation on key \ITEM{j}. This \N{put} should
1096 reach leaf \ITEM{ghi}, which has inherited responsibility for
1097 the \ITEM{jkl} range. But the \Bplus\ tree structure will direct the
1098 \N{put} to leaf \ITEM{mno}!
1099
1100 Although such mistakes might be detectable at the leaves
1101 using \N{lowkey} comparisons, we prefer to avoid the problem
1102 in a different way. When
1103 a subtree's leftmost leaf is deleted, Masstree detects the issue and
1104 switches to a special \N{reshape} procedure. This procedure first sets
1105 the subtree's leftmost child pointer to \(\NIL\), causing concurrent
1106 gets and puts that would hit that node to retry from the root.
1107
1108 \begin{figure}[H]
1109 \includegraphics[scale=.8]{remove2_3}
1110 \end{figure}
1111
1112 The procedure then works up the tree using hand-over-hand locking. At
1113 each step it replaces the deleted lowkey, here \ITEM{j}, with its
1114 logical successor, here \ITEM{m}. In this tree, that takes one step, but
1115 more complicated trees might require multiple replacements.
1116
1117 \begin{figure}[H]
1118 \includegraphics[scale=.8]{remove2_4}
1119 \end{figure}
1120
1121 This tree is valid and sends no puts down the wrong path. Although an
1122 interior node has a redundant key---the right-hand interior node
1123 contains key \ITEM{m}, although it is responsible for no keys less
1124 than \ITEM{m}---we don't bother to clean it up yet.
1125
1126 \subsection{Root trimming}
1127
1128 When the \ITEM{mno} leaf is deleted,
1129 the entire right-hand subtree becomes redundant.
1130
1131 \begin{figure}[H]
1132 \includegraphics[scale=.8]{remove2_5}
1133 \end{figure}
1134
1135 After removing the \ITEM{mno} reference, \N{remove\_leaf} can tell that
1136 the right-hand interior node is redundant: its single child pointer is
1137 \NIL. The interior node can therefore be deleted
1138
1139 \begin{figure}[H]
1140 \includegraphics[scale=.8]{remove2_6}
1141 \end{figure}
1142
1143 \noindent%
1144 and this continues
1145 up the tree.
1146
1147 \begin{figure}[H]
1148 \includegraphics[scale=.8]{remove2_7}
1149 \end{figure}
1150
1151 \noindent%
1152 But now the \emph{root} node is trivial, having only a single child. How
1153 can we fix this? The root node may be referenced by
1154 a leaf node in a higher \emph{tree layer}, as shown here:
1155
1156 \begin{figure}[H]
1157 \includegraphics[scale=.8]{remove2_8}
1158 \end{figure}
1159
1160 \noindent%
1161 (Everything above the dashed line is in the next-higher layer.) We
1162 cannot delete the root node until this pointer is switched to its child.
1163 And any change to that higher layer's values requires locking the
1164 corresponding leaf.
1165
1166 We solve this problem by registering a callback to trim the layer,
1167 called \N{gclayer}. This function locks the
1168 relevant leaf in the higher layer
1169
1170 \begin{figure}[H]
1171 \includegraphics[scale=.8]{remove2_9}
1172 \end{figure}
1173
1174 \noindent%
1175 locks the lower layer's root
1176
1177 \begin{figure}[H]
1178 \includegraphics[scale=.8]{remove2_11}
1179 \end{figure}
1180
1181 \noindent%
1182 and then marks its single child as the new root, switching the higher
1183 layer's value pointer to it.
1184
1185 \begin{figure}[H]
1186 \includegraphics[scale=.8]{remove2_12}
1187 \end{figure}
1188
1189 The original root is now garbage and can be reclaimed.
1190
1191 But what about concurrent readers? The root reachability invariant
1192 requires that the true root of a layer always be reachable from any
1193 prior root. We therefore set the \emph{old} root's parent pointer to
1194 point to the \emph{new} root. This can cause an apparent anomaly: an
1195 internal node can have a leaf as its parent!
1196
1197 \subsection{Deleting a layer}
1198
1199 After the last item in a layer is removed, Masstree can remove the
1200 entire layer. This starts out like any gclayer call: the relevant leaf
1201 in the higher layer is locked
1202
1203 \begin{figure}[H]
1204 \includegraphics[scale=.8]{remove2_15}
1205 \end{figure}
1206
1207 \noindent
1208 and so is the empty root in the lower layer.
1209
1210 \begin{figure}[H]
1211 \includegraphics[scale=.8]{remove2_16}
1212 \end{figure}
1213
1214 \noindent%
1215 But the empty root is not deleted in the conventional way. We mark it
1216 not as deleted, but as a \emph{deleted layer}, using
1217 a flag that's checked only by concurrent \emph{writers}. Why?
1218 Normally, a lookup encountering a deleted node retries from the root of
1219 the current layer. Marking a layer's only node as deleted would cause a
1220 retry loop! Instead, concurrent readers that reach a deleted layer see a
1221 normal empty leaf and return ``not found.'' (This is correct: the
1222 deleted layer contains no keys.) Concurrent writers, however, must not
1223 write into a deleted layer, lest their updates be lost. On encountering
1224 a deleted layer, writers retry from the layer-0 root.
1225
1226 \begin{figure}[H]
1227 \begin{programtabbing}
1228 \textbf{remove}(node \V{toproot}, key \(k\)): \\
1229 \> \(\langle n, i, \pi, k \rangle \gets \N{descend\_and\_lock}(\V{toproot}, k)\) \\
1230 \> if \(\pi = \NONE\) or \(n.\V{ksuffix}[\pi] \neq k.\V{suffix}\): \\
1231 \>\> return \FALSE \\
1232 \> \(\N{remove\_item}(n, i, \pi, k, \V{toproot})\) \\
1233 \> return \TRUE
1234 \end{programtabbing}
1235 \end{figure}
1236
1237 \begin{figure}[H]
1238 \begin{programtabbing}
1239 \textbf{remove\_item}(leaf \(n\), int \(i\), int \(\pi\), key \(k\), node
1240 \V{toproot}): \\
1241 \> if \(n.\V{modstate} = \INSERTING\): \\
1242 \>\> \(n.\V{modstate} \gets \REMOVING\) \\
1243 \>\> \(n.\V{version}.\V{inserting} \gets 1 \) \\
1244 \>\> \fence \\
1245 \> \(n.\V{permutation} \gets \N{permremove}(n.\V{permutation}, i)\) \\
1246 \> fork \(\N{free\_value}(n.\V{value}[\pi])\) \\
1247 \> if \(n.\V{size} \neq 0\): \\
1248 \>\> \(\N{unlock}(n)\);~ return \\
1249 \> \(\N{remove\_leaf}(n, \V{toproot}, k.\V{layerprefix})\)
1250 \end{programtabbing}
1251 \end{figure}
1252
1253 \begin{figure}[H]
1254 \begin{programtabbing}
1255 \textbf{remove\_leaf}(leaf \V{n}, node \V{toproot}, key \V{layerkey}): \\
1256 \> if \(n.\V{prev} = \NIL\): \\
1257 \>\> if \(n.\V{next} = \NIL\) and \(\V{layerkey}\) is not empty: \\
1258 \>\>\> fork \(\N{gclayer}(\V{toproot}, \V{layerkey})\) \\
1259 \>\> unlock(\(n\)) \\
1260 \>\> return \\
1261 \> \(n.\V{version}.\V{deleted} \gets 1\);~ fork \(\N{free\_node}(n)\) \\
1262 \> \(\N{unlink\_leaf}(n)\) \\
1263 \> \(k \gets \N{lowkey}(n)\) \\
1264 ascend:~
1265 \> \(p \gets \N{locked\_parent}(n)\);~ \(\N{unlock}(n)\) \\
1266 \> \(i \gets \text{position of \V{k} in \V{p}}\) \\
1267 \> if \(i \neq 0\): \\
1268 \>\> \(p.\V{version}.\V{inserting} \gets 1\) \\
1269 \>\> \fence \\
1270 \>\> remove the element at index \(i - 1\), shifting others down \\
1271 \>\> if \(i \neq 1\) or \(p.\V{child}[0] \neq \NIL\): \\
1272 \>\>\> \(\N{collapse}(p, k, \V{toproot}, \V{layerkey})\) \\
1273 \>\>\> return \\
1274 \> if \(p.\V{size} = 0\): \\
1275 \>\> \(p.\V{version}.\V{deleted} \gets 1\);~ fork \(\N{free\_node}(p)\) \\
1276 \> else: \\
1277 \>\> \(\N{reshape}(p, k, \V{toproot}, \V{layerkey})\) \\
1278 \>\> return \\
1279 \> \(n \gets p\);~ goto ascend
1280 \end{programtabbing}
1281 \caption{Remove a node.}
1282 \end{figure}
1283
1284 \begin{figure}[H]
1285 \begin{programtabbing}
1286 \textbf{unlink\_leaf}(leaf \(n\)): \\
1287 \> \(\V{next} \gets n.\V{next}\) \\
1288 \> while \(\V{next} \neq \NIL\) and \(!\N{cmpxchg}(n.\V{next}, \V{next},
1289                 \V{next} + \MARK)\): \\
1290 \>\> \(\V{next} \gets \readnow{n.\V{next}}\) \\
1291 \> \(\V{prev} \gets n.\V{prev}\) \\
1292 \> while \(!\N{cmpxchg}(\V{prev}.\V{next}, n, n + \MARK)\): \\
1293 \>\> \(\V{prev} \gets \readnow{n.\V{prev}}\) \\
1294 \> if \(\V{next} \neq \NIL\): \\
1295 \>\> \(\V{next}.\V{prev} \gets \V{prev}\) \\
1296 \> \fence \\
1297 \> \(\V{prev}.\V{next} \gets \V{next}\)
1298 \end{programtabbing}
1299 \end{figure}
1300
1301 \begin{figure}[H]
1302 \begin{programtabbing}
1303 \textbf{reshape}(interiornode \(n\), key \V{k}, node \V{toproot}, key \V{layerkey}): \\
1304 \> \(n.\V{child}[0] \gets \NIL\) \\
1305 \> \(\V{patchkey} \gets n.\V{kslice}[0]\) \\
1306  ascend:~
1307 \> \(p \gets \N{locked\_parent}(n)\);~ \(\N{unlock}(n)\) \\
1308 \> \(i \gets \text{position of \V{k} in \V{p}}\) \\
1309 \> if \(i \neq 0\): \\
1310 \>\> \(p.\V{version}.\V{inserting} \gets 1\) \\
1311 \>\> \fence \\
1312 \>\> \(p.\V{kslice}[i - 1] \gets \V{patchkey}\) \\
1313 \>\> if \(i \neq 1\) or \(p.\V{child}[0] \neq \NIL\): \\
1314 \>\>\> \(\N{collapse}(p, k, \V{toproot}, \V{layerkey})\) \\
1315 \>\>\> return \\
1316 \> \(n \gets p\);~ goto ascend
1317 \end{programtabbing}
1318 \end{figure}
1319
1320 \begin{figure}[H]
1321 \begin{programtabbing}
1322 \textbf{collapse}(interiornode \(n\), key \(k\), node \V{toproot},
1323 key \V{layerkey}): \\
1324 ascend:~
1325 \> if \(n.\V{size} \neq 0\): \\
1326 \>\> \(\N{unlock}(n)\);~ return \\
1327 \> \(p \gets \N{locked\_parent}(n)\) \\
1328 \> if \(p = \NIL\): \\
1329 \>\> if \V{layerkey} is not empty: \\
1330 \>\>\> fork gclayer(\V{toproot}, \V{layerkey}) \\
1331 \>\> \(\N{unlock}(n)\);~ return \\
1332 \> \(i \gets \text{position of \(k\) in \(p\)}\) \\
1333 \> \(p.\V{child}[i] \gets n.\V{child}[0]\) \\
1334 \> \(n.\V{child}[0].\V{parent} \gets p\) \\
1335 \> \(n.\V{version}.\V{deleted} \gets 1\);~ fork \(\N{free\_node}(n)\) \\
1336 \> \(\N{unlock}(n)\);~ \(n \gets p\);~ goto ascend
1337 \end{programtabbing}
1338 \end{figure}
1339
1340 \begin{figure}[H]
1341 \begin{programtabbing}
1342 \textbf{gclayer}(node \V{toproot}, key \V{layerkey}): \\
1343 \> \rcufence \\
1344 \> \(\langle n, i, \pi, \N{\_} \rangle \gets \N{descend\_and\_lock}(\V{toproot}, \V{layerkey})\) \\
1345 \> if \(\pi = \NONE\) or \(n.\V{type}[\pi] \neq \LAYER\): \\
1346 \>\> \(\N{unlock}(n)\);~ return \\
1347 retry:~
1348 \> \(\V{layer} \gets n.\V{value}[\pi]\) \\
1349 \> if \(!\V{layer}.\V{isroot}\): \\
1350 \>\> \(\V{layer} \gets n.\V{value}[\pi] \gets \N{true\_root}(\V{layer})\) \\
1351 \> if \(\V{layer}.\V{size} > 0\) and \(\V{layer}.\V{isroot}\): \\
1352 \>\> \(\N{unlock}(n)\);~ return \\
1353 \> \(\N{lock}(\V{layer})\) \\
1354 \> if \(!\V{layer}.\V{isroot}\) and \(\V{layer}.\V{parent} = \NIL\): \\
1355 \>\> \(\V{layer}.\V{isroot} \gets 1\) \\
1356 \> if \(\V{layer}.\V{size} > 0\) or \(!\V{layer}.\V{isroot}\): \\
1357 \>\> \(\N{unlock}(\V{layer})\);~ \(\N{unlock}(\V{n})\);~ return \\
1358 \> if \(\V{layer}.\V{isleaf}\): \\
1359 \>\> \(\V{layer}.\V{deletedlayer} \gets 1\) \\
1360 \>\> \(\N{unlock}(\V{layer})\) \\
1361 \>\> \(\N{remove\_item}(n, i, \pi, \V{layerkey}, \V{toproot})\) \\
1362 \>\> return \\
1363 \> else: \\
1364 \>\> \(\V{child} \gets \V{layer}.\V{child}[0]\) \\
1365 \>\> \(\V{child}.\V{parent} \gets \NIL\) \\
1366 \>\> \(n.\V{value}[\pi] \gets \V{child}\) \\
1367 \>\> \(\V{layer}.\V{isroot} \gets 0\) \\
1368 \>\> \(\V{layer}.\V{parent} \gets \V{child}\) \\
1369 \>\> \(\N{unlock}(\V{layer})\) \\
1370 \>\> fork \(\N{free\_node}(\V{layer})\) \\
1371 \>\> goto retry
1372 \end{programtabbing}
1373 \caption{Remove a twig.}
1374 \end{figure}
1375
1376 \section{Other optimizations and left-off properties}
1377
1378 In the implementation, \(\N{lowkey}(n)\) is stored in
1379 \(n.\V{kslice}[0]\).
1380 %
1381 Thus, the code must ensure that any values stored in position 0
1382 have this slice.
1383 %
1384 This isn't a problem unless the value in position 0 is removed.
1385 %
1386 To guard against inappropriate reuse, the \N{insert} procedure
1387 ensures that position 0 is only reused by a key with the right slice.
1388
1389 In the implementation, \(n.\V{type}\) is stored implicitly; its values
1390 are inferred from \(n.\V{klength}\).
1391
1392
1393 node\_ts
1394
1395 \end{document}