%!PS-Adobe-3.0 %%Creator: groff version 1.18.1 %%CreationDate: Tue Jul 8 15:17:02 2003 %%DocumentNeededResources: font Times-Bold %%+ font Times-Roman %%+ font Times-Italic %%+ font Courier %%DocumentSuppliedResources: file metadata.eps %%+ file snapdeadlk.eps %%+ file extattr.eps %%+ procset grops 1.18 1 %%Pages: 12 %%PageOrder: Ascend %%Orientation: Portrait %%EndComments %%BeginProlog %%BeginResource: procset grops 1.18 1 /setpacking where{ pop currentpacking true setpacking }if /grops 120 dict dup begin /SC 32 def /A/show load def /B{0 SC 3 -1 roll widthshow}bind def /C{0 exch ashow}bind def /D{0 exch 0 SC 5 2 roll awidthshow}bind def /E{0 rmoveto show}bind def /F{0 rmoveto 0 SC 3 -1 roll widthshow}bind def /G{0 rmoveto 0 exch ashow}bind def /H{0 rmoveto 0 exch 0 SC 5 2 roll awidthshow}bind def /I{0 exch rmoveto show}bind def /J{0 exch rmoveto 0 SC 3 -1 roll widthshow}bind def /K{0 exch rmoveto 0 exch ashow}bind def /L{0 exch rmoveto 0 exch 0 SC 5 2 roll awidthshow}bind def /M{rmoveto show}bind def /N{rmoveto 0 SC 3 -1 roll widthshow}bind def /O{rmoveto 0 exch ashow}bind def /P{rmoveto 0 exch 0 SC 5 2 roll awidthshow}bind def /Q{moveto show}bind def /R{moveto 0 SC 3 -1 roll widthshow}bind def /S{moveto 0 exch ashow}bind def /T{moveto 0 exch 0 SC 5 2 roll awidthshow}bind def /SF{ findfont exch [exch dup 0 exch 0 exch neg 0 0]makefont dup setfont [exch/setfont cvx]cvx bind def }bind def /MF{ findfont [5 2 roll 0 3 1 roll neg 0 0]makefont dup setfont [exch/setfont cvx]cvx bind def }bind def /level0 0 def /RES 0 def /PL 0 def /LS 0 def /MANUAL{ statusdict begin/manualfeed true store end }bind def /PLG{ gsave newpath clippath pathbbox grestore exch pop add exch pop }bind def /BP{ /level0 save def 1 setlinecap 1 setlinejoin 72 RES div dup scale LS{ 90 rotate }{ 0 PL translate }ifelse 1 -1 scale }bind def /EP{ level0 restore showpage }bind def /DA{ newpath arcn stroke }bind def /SN{ transform .25 sub exch .25 sub exch round .25 add exch round .25 add exch itransform }bind def /DL{ SN moveto SN lineto stroke }bind def /DC{ newpath 0 360 arc closepath }bind def /TM matrix def /DE{ TM currentmatrix pop translate scale newpath 0 0 .5 0 360 arc closepath TM setmatrix }bind def /RC/rcurveto load def /RL/rlineto load def /ST/stroke load def /MT/moveto load def /CL/closepath load def /Fr{ setrgbcolor fill }bind def /Fk{ setcmykcolor fill }bind def /Fg{ setgray fill }bind def /FL/fill load def /LW/setlinewidth load def /Cr/setrgbcolor load def /Ck/setcmykcolor load def /Cg/setgray load def /RE{ findfont dup maxlength 1 index/FontName known not{1 add}if dict begin { 1 index/FID ne{def}{pop pop}ifelse }forall /Encoding exch def dup/FontName exch def currentdict end definefont pop }bind def /DEFS 0 def /EBEGIN{ moveto DEFS begin }bind def /EEND/end load def /CNT 0 def /level1 0 def /PBEGIN{ /level1 save def translate div 3 1 roll div exch scale neg exch neg exch translate 0 setgray 0 setlinecap 1 setlinewidth 0 setlinejoin 10 setmiterlimit []0 setdash /setstrokeadjust where{ pop false setstrokeadjust }if /setoverprint where{ pop false setoverprint }if newpath /CNT countdictstack def userdict begin /showpage{}def }bind def /PEND{ clear countdictstack CNT sub{end}repeat level1 restore }bind def end def /setpacking where{ pop setpacking }if %%EndResource %%IncludeResource: font Times-Bold %%IncludeResource: font Times-Roman %%IncludeResource: font Times-Italic %%IncludeResource: font Courier grops begin/DEFS 1 dict def DEFS begin/u{.001 mul}bind def end/RES 72 def/PL 792 def/LS false def/ENC0[/asciicircum/asciitilde/Scaron/Zcaron /scaron/zcaron/Ydieresis/trademark/quotesingle/Euro/.notdef/.notdef /.notdef/.notdef/.notdef/.notdef/.notdef/.notdef/.notdef/.notdef/.notdef /.notdef/.notdef/.notdef/.notdef/.notdef/.notdef/.notdef/.notdef/.notdef /.notdef/.notdef/space/exclam/quotedbl/numbersign/dollar/percent /ampersand/quoteright/parenleft/parenright/asterisk/plus/comma/hyphen /period/slash/zero/one/two/three/four/five/six/seven/eight/nine/colon /semicolon/less/equal/greater/question/at/A/B/C/D/E/F/G/H/I/J/K/L/M/N/O /P/Q/R/S/T/U/V/W/X/Y/Z/bracketleft/backslash/bracketright/circumflex /underscore/quoteleft/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o/p/q/r/s/t/u/v/w/x/y /z/braceleft/bar/braceright/tilde/.notdef/quotesinglbase/guillemotleft /guillemotright/bullet/florin/fraction/perthousand/dagger/daggerdbl /endash/emdash/ff/fi/fl/ffi/ffl/dotlessi/dotlessj/grave/hungarumlaut /dotaccent/breve/caron/ring/ogonek/quotedblleft/quotedblright/oe/lslash /quotedblbase/OE/Lslash/.notdef/exclamdown/cent/sterling/currency/yen /brokenbar/section/dieresis/copyright/ordfeminine/guilsinglleft /logicalnot/minus/registered/macron/degree/plusminus/twosuperior /threesuperior/acute/mu/paragraph/periodcentered/cedilla/onesuperior /ordmasculine/guilsinglright/onequarter/onehalf/threequarters /questiondown/Agrave/Aacute/Acircumflex/Atilde/Adieresis/Aring/AE /Ccedilla/Egrave/Eacute/Ecircumflex/Edieresis/Igrave/Iacute/Icircumflex /Idieresis/Eth/Ntilde/Ograve/Oacute/Ocircumflex/Otilde/Odieresis /multiply/Oslash/Ugrave/Uacute/Ucircumflex/Udieresis/Yacute/Thorn /germandbls/agrave/aacute/acircumflex/atilde/adieresis/aring/ae/ccedilla /egrave/eacute/ecircumflex/edieresis/igrave/iacute/icircumflex/idieresis /eth/ntilde/ograve/oacute/ocircumflex/otilde/odieresis/divide/oslash /ugrave/uacute/ucircumflex/udieresis/yacute/thorn/ydieresis]def /Courier@0 ENC0/Courier RE/Times-Italic@0 ENC0/Times-Italic RE /Times-Roman@0 ENC0/Times-Roman RE/Times-Bold@0 ENC0/Times-Bold RE %%EndProlog %%Page: 1 1 %%BeginPageSetup BP %%EndPageSetup /F0 14/Times-Bold@0 SF .525(Enhancements to the F)111 123 R .524 (ast Filesystem T)-.35 F 4.024(oS)-1.288 G .524(upport Multi-T)-4.024 F (erabyte)-1.288 E(Storage Systems)257.203 138 Q/F1 12/Times-Roman@0 SF (Marshall Kirk McK)245.766 162 Q(usick)-.18 E/F2 12/Times-Italic@0 SF -.24(Au)251.112 180 S(thor and Consultant).24 E/F3 12/Times-Bold@0 SF (Abstract)283.674 222 Q/F4 10/Times-Roman@0 SF .607 (This paper describes a ne)72 261.6 R 3.108(wv)-.25 G .608 (ersion of the f)-3.258 F .608(ast \214lesystem,)-.1 F/F5 9 /Times-Roman@0 SF(UFS2)3.108 E F4 3.108(,d)C .608 (esigned to run on multi-terabyte storage systems.)-3.108 F 1.036(It gi) 72 273.6 R -.15(ve)-.25 G 3.536(st).15 G 1.036(he moti)-3.536 F -.25(va) -.25 G 1.036(tion behind coming up with a ne).25 F 3.536(wo)-.25 G 1.035 (n-disk format rather than trying to continue enhancing the)-3.536 F -.15(ex)72 285.6 S .426(isting f).15 F .426(ast-\214lesystem format.)-.1 F .426(It describes the ne)5.426 F 2.926(wf)-.25 G .426 (eatures and capabilities in)-2.926 F F5(UFS2)2.926 E F4 .426 (including e)2.926 F .426(xtended attrib)-.15 F(utes,)-.2 E(ne)72 297.6 Q 3.967(wa)-.25 G 1.467(nd higher resolution time stamps, dynamically a\ llocated inodes, and an e)-3.967 F 1.467(xpanded boot block area.)-.15 F 1.466(It also)6.466 F .006 (describes the features and capabilities that were considered b)72 309.6 R .006(ut rejected gi)-.2 F .006(ving the reasons for their rejection.) -.25 F(Ne)5.006 E .006(xt it)-.15 F(co)72 321.6 Q -.15(ve)-.15 G .297 (rs changes that were made to the soft update code to support the ne).15 F 2.797(wc)-.25 G .297(apabilities and to enable it to w)-2.797 F .297 (ork more)-.1 F .352(smoothly with e)72 333.6 R .352 (xisting \214lesystems.)-.15 F .352(The paper co)5.352 F -.15(ve)-.15 G .353(rs enhancements made to support li).15 F .653 -.15(ve d)-.25 H .353 (umps and changes made).15 F .168(to \214lesystem snapshots needed to a) 72 345.6 R -.2(vo)-.2 G .167(id deadlocks and to enable them to w).2 F .167(ork ef)-.1 F .167(\214ciently with multi-terabyte \214lesys-)-.25 F 3.57(tems. Similarly)72 357.6 R 3.57(,i)-.65 G 3.57(td)-3.57 G 1.071(es\ cribes changes that needed to be made to the \214lesystem check program\ to w)-3.57 F 1.071(ork with lar)-.1 F(ge)-.18 E 3.407 (\214lesystems. The)72 369.6 R .907(paper gi)3.407 F -.15(ve)-.25 G 3.407(ss).15 G .907 (ome comments about performance, and decribes areas for future w)-3.407 F .906(ork including an)-.1 F -.15(ex)72 381.6 S 1.02 (tent-based allocation mechanism and inde).15 F -.15(xe)-.15 G 3.521(dd) .15 G 1.021(irectory structures.)-3.521 F 1.021 (The paper concludes with current status and)6.021 F -.2(av)72 393.6 S (ailability of)-.05 E F5(UFS2)2.5 E F4(.)A/F6 10/Times-Bold@0 SF(1.)72 442.2 Q F3(Backgr)5 E(ound and Intr)-.216 E(oduction)-.216 E F4 -.35(Tr) 72 457.8 S(aditionally).35 E 3.105(,t)-.65 G(he)-3.105 E F5(BSD)3.105 E F4 -.1(fa)3.105 G .605(st \214lesystem \(which we shall).1 F .235 (refer to in this paper as)72 469.8 R F5(UFS1)2.735 E F4 5.47(\)[)C(McK) -5.47 E .235(usick et al, 1996;)-.15 F(McK)72 481.8 Q 1.867(usick, Jo) -.15 F 4.367(ye)-.1 G 4.366(ta)-4.367 G 1.866(l, 1984] and its deri) -4.366 F -.25(va)-.25 G(ti).25 E -.15(ve)-.25 G 4.366(sh).15 G -2.25 -.2 (av e)-4.366 H .599 (used 32-bit pointers to reference the blocks used by a)72 493.8 R .371 (\214le on the disk.)72 505.8 R(The)5.371 E F5(UFS1)2.87 E F4 .37 (\214lesystem w)2.87 F .37(as designed in)-.1 F 3.021(the early 1980')72 517.8 R 5.521(sw)-.55 G 3.021(hen the lar)-5.521 F 3.022 (gest disks were 330)-.18 F(me)72 529.8 Q -.05(ga)-.15 G 3.872 (bytes. There).05 F -.1(wa)3.872 G 3.871(sd).1 G 1.371 (ebate at the time whether it)-3.871 F -.1(wa)72 541.8 S 6.623(sw).1 G 4.123(orth squandering 32-bits per block pointer)-6.723 F 3.136 (rather than using the 24-bit block pointers of the)72 553.8 R 1.357 (\214lesystem that it replaced.)72 565.8 R 1.357 (Luckily the futurist vie)6.357 F(w)-.25 E(pre)72 577.8 Q -.25(va)-.25 G 1.751(iled and the design used 32-bit block pointers.).25 F(Ov)72 589.8 Q .618(er the twenty years that it has been deplo)-.15 F .618(yed, stor) -.1 F(-)-.2 E 2.057(age systems ha)72 601.8 R 2.357 -.15(ve g)-.2 H(ro) .15 E 2.057(wn to hold o)-.25 F -.15(ve)-.15 G 4.556(rat).15 G 2.056 (erabyte of)-4.556 F 3.545(data. Depending)72 613.8 R 1.046 (on the block size con\214guration, the)3.545 F .144 (32-bit block pointers of)72 625.8 R F5(UFS1)2.644 E F4 .144 (run out of space in the 1)2.644 F 1.65(to 4 terabyte range.)72 637.8 R 1.65(While some stop-g)6.65 F 1.65(ap measures)-.05 F .738 (can be used to e)72 649.8 R .738(xtend the maximum size storage sys-) -.15 F .392(tems supported by)72 661.8 R F5(UFS1)2.892 E F4 2.893(,b)C 2.893(y2)-2.893 G .393(002 it became clear that)-2.893 F 1.758 (the only long-term solution w)72 673.8 R 1.757(as to use 64-bit block) -.1 F 2.983(pointers. Thus,)72 685.8 R .484(we decided to b)2.983 F .484 (uild a ne)-.2 F 2.984<778c>-.25 G(lesystem,)-2.984 E F5(UFS2)72 697.8 Q F4 2.5(,t)C(hat w)-2.5 E(ould use 64-bit block pointers.)-.1 E 3.474 -.8 (We c)351.4 439.2 T 1.874(onsidered the alternati).8 F -.15(ve)-.25 G 4.374(sb).15 G 1.874(etween trying)-4.374 F 2.802(to mak)326.4 451.2 R 5.302(ei)-.1 G 2.802(ncremental changes to the e)-5.302 F(xisting)-.15 E F5(UFS1)5.302 E F4 2.018(\214lesystem v)326.4 463.2 R 2.018 (ersus importing another e)-.15 F 2.018(xisting \214lesys-)-.15 F 1.187 (tem such as)326.4 475.2 R F5(XFS)3.687 E F4([Sweene)3.687 E 3.687(ye) -.15 G 3.687(ta)-3.687 G 1.187(l, 1996], or ReiserFS)-3.687 F([Reiser) 326.4 487.2 Q 5.263(,2)-.4 G 5.263(001]. W)-5.263 F 5.263(ea)-.8 G 2.763 (lso considered writing a ne)-5.263 F(w)-.25 E 1.099 (\214lesystem from scratch so that we could tak)326.4 499.2 R 3.599(ea) -.1 G(dv)-3.599 E(an-)-.25 E .276 (tage of recent \214lesystem research and e)326.4 511.2 R 2.775 (xperience. W)-.15 F(e)-.8 E 1.015(chose to e)326.4 523.2 R 1.015 (xtend the)-.15 F F5(UFS1)3.515 E F4 1.016 (\214lesystem as this approach)3.516 F(allo)326.4 535.2 Q 1.702 (wed us to reuse most of the e)-.25 F(xisting)-.15 E F5(UFS1)4.201 E F4 (code)4.201 E 4.678(base. The)326.4 547.2 R 2.178 (bene\214ts of this decision were that)4.678 F F5(UFS2)4.678 E F4 -.1 (wa)326.4 559.2 S 2.853(sd).1 G -2.15 -.25(ev e)-2.853 H .353 (loped and deplo).25 F .353(yed quickly)-.1 F 2.853(,i)-.65 G 2.853(tb) -2.853 G .352(ecame stable)-2.853 F .641(and reliable rapidly)326.4 571.2 R 3.141(,a)-.65 G .641(nd the same code base could be)-3.141 F 1.589(used to support both)326.4 583.2 R F5(UFS1)4.089 E F4(and)4.088 E F5(UFS2)4.088 E F4 1.588(\214lesystem for)4.088 F(-)-.2 E 2.61(mats. Ov) 326.4 595.2 R .11(er 90 percent of the code base is shared, thus)-.15 F -.2(bu)326.4 607.2 S 4.525<678c>.2 G -.15(xe)-4.525 G 4.525(sa).15 G 2.025(nd feature or performance enhancements)-4.525 F (usually apply to both \214lesystem formats.)326.4 619.2 Q .299 (Sections 2, 3, and 4 discuss the)351.4 634.8 R F5(UFS2)2.8 E F4 (\214lesystem)2.8 E 4.933(itself. Sections)326.4 646.8 R 4.933(5a)4.933 G 2.433(nd 6 discuss enhancements that)-4.933 F .17 (were made during the de)326.4 658.8 R -.15(ve)-.25 G .171(lopment of) .15 F F5(UFS2)2.671 E F4 -.2(bu)2.671 G 2.671(tw).2 G(hich)-2.671 E 3.359(transfer o)326.4 670.8 R -.15(ve)-.15 G 5.859(rt).15 G(o)-5.859 E F5(UFS1)5.858 E F4 3.358(as well.)5.858 F 3.358(Sections 7 and 8)8.358 F .532(describe ho)326.4 682.8 R 3.033(ww)-.25 G 3.033(eo)-3.033 G -.15 (ve)-3.183 G .533(rcame problems of scale brought).15 F 1.121 (on by the enormous size of \214lesystems supported by)326.4 694.8 R F5 (UFS2)326.4 706.8 Q F4 6.576(.T)C 1.576 (he last three sections conclude with discus-)-6.576 F .587 (sions of performance, future w)326.4 718.8 R .587 (ork, and current status.)-.1 F 0 Cg EP %%Page: 2 2 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Bold@0 SF(2.)72 87 Q/F1 12/Times-Bold@0 SF(The)5 E/F2 11 /Times-Bold@0 SF(UFS2)3 E F1(Filesystem)3 E/F3 10/Times-Roman@0 SF 2.246 (The on-disk inodes used by)72 102.6 R/F4 9/Times-Roman@0 SF(UFS1)4.747 E F3 2.247(are 128-bytes in)4.747 F .534(size and ha)72 114.6 R .834 -.15(ve o)-.2 H .533(nly tw).15 F 3.033(ou)-.1 G .533 (nused 32-bit \214elds.)-3.033 F .533(It w)5.533 F(ould)-.1 E 2.508 (not be possible to con)72 126.6 R -.15(ve)-.4 G 2.509 (rt to 64-bit block pointers).15 F 1.006 (without reducing the number of direct block pointers)72 138.6 R 2.648 (from twelv)72 150.6 R 5.148(et)-.15 G 5.148<6f8c>-5.148 G -.15(ve) -5.148 G 7.649(.D).15 G 2.649(oing so w)-7.649 F 2.649 (ould dramatically)-.1 F 2.042(increase the amount of w)72 162.6 R 2.041 (asted space as only direct)-.1 F 1.048 (block pointers can reference fragments.)72 174.6 R 1.049(So, the only) 6.049 F .458(viable alternati)72 186.6 R .758 -.15(ve i)-.25 H 2.958(st) .15 G 2.958(oi)-2.958 G .458(ncrease the size of the on-disk)-2.958 F (inode to 256 bytes.)72 198.6 Q 1.917 (Once one is committed to changing to a ne)97 214.2 R(w)-.25 E .502 (on-disk format for the inodes, it is possible to include)72 226.2 R 2.982(other inode-related changes that were not possible)72 238.2 R 1.52 (within the constraints of the old inodes.)72 250.2 R 1.52(While it is) 6.52 F 2.226(tempting to thro)72 262.2 R 4.726(wi)-.25 G 4.726(ne)-4.726 G -.15(ve)-4.976 G 2.226(rything that has e).15 F -.15(ve)-.25 G 4.727 (rb).15 G(een)-4.727 E .568(suggested o)72 274.2 R -.15(ve)-.15 G 3.068 (rt).15 G .568(he last twenty years, we feel that it is)-3.068 F .992 (best to limit the addition of ne)72 286.2 R 3.492(wc)-.25 G .993 (apabilities to those)-3.492 F .042(that are lik)72 298.2 R .042 (ely to ha)-.1 F .342 -.15(ve a c)-.2 H .042(lear bene\214t.).15 F(Ev) 5.042 E .042(ery ne)-.15 F 2.542(wa)-.25 G(ddi-)-2.542 E 1.105 (tion adds comple)72 310.2 R 1.106(xity which has a cost both in main-) -.15 F 2.425(tainability and performance.)72 322.2 R 2.425 (Obscure or little used)7.425 F 3.167 (features may add conditional checks in frequently)72 334.2 R -.15(exe) 72 346.2 S 1.611(cuted code paths such as read and write slo).15 F(wing) -.25 E(do)72 358.2 Q .063(wn the o)-.25 F -.15(ve)-.15 G .064 (rall performance of the \214lesystem e).15 F -.15(ve)-.25 G 2.564(ni) .15 G(f)-2.564 E(the)72 370.2 Q 2.5(ya)-.15 G(re not used.)-2.5 E 1.849 (Although we decided to come up with a ne)97 385.8 R(w)-.25 E .467 (on-disk inode format, we chose not to change the for)72 397.8 R(-)-.2 E .182(mat of the superblock, the c)72 409.8 R .182 (ylinder group maps, or the)-.15 F 5.798(directories. Additional)72 421.8 R 3.298(information needed for the)5.798 F F4(UFS2)72 433.8 Q F3 3.264(superblock and c)5.764 F 3.263(ylinder groups is stored in)-.15 F 3.662(spare \214elds of the)72 445.8 R F4(UFS1)6.162 E F3 3.662 (superblock and c)6.162 F(ylinder)-.15 E 4.142(groups. Maintaining)72 457.8 R 1.641(the same format for these data)4.142 F 2.107 (structures allo)72 469.8 R 2.108(ws a single code base to be used for) -.25 F(both)72 481.8 Q F4(UFS1)5.082 E F3(and)5.081 E F4(UFS2)5.081 E F3 7.581(.B)C 2.581(ecause the only dif)-7.581 F(ference)-.25 E 1.224 (between the tw)72 493.8 R 3.724<6f8c>-.1 G 1.225 (lesystems is in the format of their)-3.724 F .855 (inodes, code can dereference pointers to superblocks,)72 505.8 R -.15 (cy)72 517.8 S .513 (linder groups, and directory entries without need of).15 F 1.84 (checking what type of \214lesystem is being accessed.)72 529.8 R 2.671 -.8(To m)72 541.8 T 1.071 (inimize conditional checking of code that refer).8 F(-)-.2 E .258 (ences inodes, the on-disk inode is con)72 553.8 R -.15(ve)-.4 G .257 (rted to a com-).15 F 2.202 (mon in-core format when the inode is \214rst read in)72 565.8 R 1.206 (from the disk, and con)72 577.8 R -.15(ve)-.4 G 1.205 (rted back to its on-disk for).15 F(-)-.2 E 1.147 (mat when it is written back.)72 589.8 R 1.147(The ef)6.147 F 1.147 (fect of this deci-)-.25 F .606 (sion is that there are only nine out of se)72 601.8 R -.15(ve)-.25 G .606(ral hundred).15 F 1.584(routines that are speci\214c to)72 613.8 R F4(UFS1)4.085 E F3 -.15(ve)4.085 G(rsus).15 E F4(UFS2)4.085 E F3 6.585 (.T)C(he)-6.585 E 1.114(bene\214t of ha)72 625.8 R 1.113 (ving a single code base for both \214lesys-)-.2 F 1.641 (tems is that it dramatically reduces the maintenance)72 637.8 R 4.378 (cost. Outside)72 649.8 R 1.877 (of the nine \214lesystem format speci\214c)4.377 F 1.848 (functions, \214xing a b)72 661.8 R 1.848(ug in the code \214x)-.2 F 1.848(es it for both)-.15 F 1.958(\214lesystem types.)72 673.8 R 4.457 (Ac)6.957 G 1.957(ommon code base also means)-4.457 F 2.605 (that as the symmetric multiprocessing support gets)72 685.8 R .337 (added, it only needs to be done once for the)72 697.8 R F4(UFS)2.836 E F3 -.1(fa)2.836 G(m-).1 E(ily of \214lesystems.)72 709.8 Q .427 (Although we still use the same data structure to)351.4 84 R 2.404 (describe c)326.4 96 R 2.403 (ylinder groups, the practical de\214nition of)-.15 F .763 (them has changed.)326.4 108 R .764(In the era of)5.764 F F4(UFS1)3.264 E F3 3.264(,t)C .764(he \214lesystem)-3.264 F 3.379 (could get an accurate vie)326.4 120 R 5.879(wo)-.25 G 5.879(ft)-5.879 G 3.378(he disk geometry)-5.879 F .472(including the c)326.4 132 R .473 (ylinder and track boundaries and could)-.15 F 2.493 (accurately compute the rotational location of e)326.4 144 R -.15(ve) -.25 G(ry).15 E(sector)326.4 156 Q 5.87(.M)-.55 G .87 (odern disks hide this information pro)-5.87 F(viding)-.15 E 3.586 (\214ctitious numbers of blocks per track, tracks per)326.4 168 R -.15 (cy)326.4 180 S(linder).15 E 4.321(,a)-.4 G 1.821(nd c)-4.321 F 1.821 (ylinders per disk.)-.15 F 1.822(Indeed, in modern)6.822 F F4(RAID)326.4 192 Q F3 3.595(arrays, the `)6.095 F(`disk')-.74 E 6.094('t)-.74 G 3.594 (hat is presented to the)-6.094 F 1.006 (\214lesystem may really be composed from a collection)326.4 204 R .882 (of disks in the)326.4 216 R F4(RAID)3.382 E F3(array)3.382 E 5.881(.W) -.65 G .881(hile some research has)-5.881 F 1.621 (been done to \214gure out the true geometry of a disk)326.4 228 R ([Grif)326.4 240 Q .45 (\214n et al, 2002; Lumb et al, 2002; Schindler et al,)-.25 F 4.158 (2002], the comple)326.4 252 R 4.158(xity of using such information)-.15 F(ef)326.4 264 Q(fecti)-.25 E -.15(ve)-.25 G 1.547(ly is quite high.).15 F 1.547(Modern disks ha)6.547 F 1.847 -.15(ve g)-.2 H(reater).15 E 1.123 (numbers of sectors per track on the outer part of the)326.4 276 R 1.761 (disk than the inner part which mak)326.4 288 R 1.761(es calculation of) -.1 F .843(the rotational position of an)326.4 300 R 3.344(yg)-.15 G -2.15 -.25(iv e)-3.344 H 3.344(ns).25 G .844(ector quite com-)-3.344 F (ple)326.4 312 Q 3.092(xt)-.15 G 3.092(oc)-3.092 G 3.092(alculate. So,) -3.092 F(for)3.092 E F4(UFS2)3.091 E F3 3.091(,w)C 3.091(ed)-3.091 G .591(ecided to get rid)-3.091 F 1.843 (of all the rotational layout code found in)326.4 324 R F4(UFS1)4.343 E F3(and)4.343 E 1.319 (simply assume that laying out \214les with numerically)326.4 336 R .248 (close block numbers \(sequential being vie)326.4 348 R .249 (wed as opti-)-.25 F 3.182(mal\) w)326.4 360 R 3.182(ould gi)-.1 F 3.482 -.15(ve t)-.25 H 3.181(he best performance.).15 F 3.181(Thus, the)8.181 F -.15(cy)326.4 372 S 1.109(linder group structure is retained in).15 F F4(UFS2)3.609 E F3 3.61(,b)C 1.11(ut it is)-3.81 F 1.969 (used only as a con)326.4 384 R -.15(ve)-.4 G 1.969(nient w).15 F 1.968 (ay to manage logically)-.1 F 2.275(close groups of blocks.)326.4 396 R 2.276(The rotational layout code)7.276 F .715(had been disabled in)326.4 408 R F4(UFS1)3.215 E F3 .714(since the late 1980s, so as)3.214 F .439 (part of the code base cleanup it w)326.4 420 R .439(as remo)-.1 F -.15 (ve)-.15 G 2.939(de).15 G(ntirely)-2.939 E(.)-.65 E(The)351.4 435.6 Q F4 (UFS1)2.669 E F3 .169(\214lesystem uses 32-bit inode numbers.)2.669 F .181(While it is v)326.4 447.6 R .181 (ery tempting to increase these inode num-)-.15 F 1.334 (bers to 64 bits in)326.4 459.6 R F4(UFS2)3.833 E F3 3.833(,d)C 1.333 (oing so w)-3.833 F 1.333(ould require that)-.1 F 2.204 (the directory format be changed.)326.4 471.6 R 2.205(There is a lot of) 7.204 F .705(code that w)326.4 483.6 R .704 (orks directly on directory entries.)-.1 F(Chang-)5.704 E 3.761 (ing directory formats w)326.4 495.6 R 3.762(ould entail creating man) -.1 F(y)-.15 E 5.536(more \214lesystem speci\214c functions which w) 326.4 507.6 R(ould)-.1 E 3.436(increase the comple)326.4 519.6 R 3.436 (xity and maintainability issues)-.15 F .877(with the code.)326.4 531.6 R .876(Furthermore, the current)5.877 F F4(API)3.376 E F3 3.376(sf)C .876(or ref-)-3.376 F 1.638 (erencing directory entries use 32-bit inode numbers.)326.4 543.6 R .545 (So, e)326.4 555.6 R -.15(ve)-.25 G 3.045(ni).15 G 3.044(ft)-3.045 G .544(he underlying \214lesystem supported 64-bit)-3.044 F .273 (inode numbers, the)326.4 567.6 R 2.774(yc)-.15 G .274 (ould not currently be made visi-)-2.774 F 1.853 (ble to user applications.)326.4 579.6 R 1.852 (In the short term, applica-)6.853 F 2.049 (tions are not running into the four billion \214les-per)326.4 591.6 R (-)-.2 E .763(\214lesystem limit that 32-bit inode numbers impose.)326.4 603.6 R(If)5.762 E .646(we assume that the gro)326.4 615.6 R .646 (wth rate in the number of \214les)-.25 F .397(per \214lesystem o)326.4 627.6 R -.15(ve)-.15 G 2.896(rt).15 G .396 (he last twenty years will continue)-2.896 F 2.233 (at the same rate, we estimate that the 32-bit inode)326.4 639.6 R .738 (number should be suf)326.4 651.6 R .737 (\214cient for another ten to twenty)-.25 F 3.653(years. Ho)326.4 663.6 R(we)-.25 E -.15(ve)-.25 G 1.953 -.4(r, t).15 H 1.154 (he limit will be reached before the).4 F 1.39(64-bit block limit of) 326.4 675.6 R F4(UFS2)3.89 E F3 1.39(is reached.)3.89 F 1.389(So, the) 6.39 F F4(UFS2)3.889 E F3 2.461(\214lesystem has reserv)326.4 687.6 R 2.461(ed a \215ag in the superblock to)-.15 F .779 (indicate that it is a \214lesystem with 64-bit inode num-)326.4 699.6 R 5.237(bers. When)326.4 711.6 R 2.737(the time comes to be)5.237 F 2.738 (gin using 64-bit)-.15 F 0 Cg EP %%Page: 3 3 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Roman@0 SF .508 (inode numbers, the \215ag can be turned on and the ne)72 84 R(w)-.25 E .186(directory format can be used.)72 96 R -.25(Ke)5.186 G .187 (rnels that predate the).25 F 1.48 (introduction of 64-bit inode numbers check this \215ag)72 108 R 1.865 (and will kno)72 120 R 4.365(wt)-.25 G 1.865(hat the)-4.365 F 4.365(yc) -.15 G 1.865(annot mount such \214lesys-)-4.365 F(tems.)72 132 Q 4.006 (Another change that w)97 147.6 R 4.006(as contemplated w)-.1 F(as)-.1 E 1.108(changing to a more comple)72 159.6 R 3.609(xd)-.15 G 1.109 (irectory structure such)-3.609 F 1.197 (as one that uses B-trees to speed up access for lar)72 171.6 R(ge)-.18 E 4.161(directories. This)72 183.6 R 1.661(technique is used in man) 4.161 F 4.162(ym)-.15 G(odern)-4.162 E 1.904(\214lesystems such as)72 195.6 R/F1 9/Times-Roman@0 SF(XFS)4.403 E F0([Sweene)4.403 E 4.403(ye) -.15 G 4.403(ta)-4.403 G 1.903(l, 1996],)-4.403 F F1(JFS)4.403 E F0 2.21 ([Best & Kleikamp, 2003], ReiserFS [Reiser)72 207.6 R 4.71(,2)-.4 G (001],)-4.71 E 2.525(and in later v)72 219.6 R 2.525 (ersions of Ext2 [Phillips, 2001].)-.15 F -.8(We)7.525 G .061 (decided not to mak)72 231.6 R 2.561(et)-.1 G .061 (he change at this time for se)-2.561 F -.15(ve)-.25 G(ral).15 E 2.964 (reasons. First,)72 243.6 R .464(we had limited time and resources and) 2.964 F 1.286(we w)72 255.6 R 1.286(anted to get something w)-.1 F 1.287 (orking and stable that)-.1 F .954 (could be used in the time frame of Free)72 267.6 R F1(BSD)A F0 3.453 (5.0. By)3.453 F -.1(ke)72 279.6 S 1.66 (eping the same directory format, we were able to).1 F .029 (reuse all the directory code from)72 291.6 R F1(UFS1)2.528 E F0 2.528 (,d)C .028(id not ha)-2.528 F .328 -.15(ve t)-.2 H(o).15 E 3.182 (change numerous \214lesystem utilities to understand)72 303.6 R .227 (and maintain a ne)72 315.6 R 2.727(wd)-.25 G .226 (irectory format, and were able to)-2.727 F 1.956 (produce a stable and reliable \214lesystem in the time)72 327.6 R 1.453 (frame a)72 339.6 R -.25(va)-.2 G 1.453(ilable to us.).25 F 1.452 (The other reason that we felt)6.452 F .656(that we could retain the e) 72 351.6 R .657(xisting directory structure is)-.15 F 2.371 (because of the dynamic directory hashing that w)72 363.6 R(as)-.1 E 2.629(added to Free)72 375.6 R F1(BSD)A F0([Do)5.129 E 2.629 (wse & Malone, 2002].)-.25 F(The)7.63 E .783 (dynamic directory hashing retro\214ts a directory inde)72 387.6 R(x-) -.15 E .739(ing system to)72 399.6 R F1(UFS)3.239 E F0 5.739(.T)C 3.239 (oa)-6.539 G -.2(vo)-3.439 G .74(id repeated linear searches).2 F 2.78 (of lar)72 411.6 R 2.78(ge directories, the dynamic directory hashing) -.18 F -.2(bu)72 423.6 S .141 (ilds a hash table of directory entries on the \215y when).2 F .203 (the directory is \214rst accessed.)72 435.6 R .202(This table a)5.202 F -.2(vo)-.2 G .202(ids direc-).2 F 5.387 (tory scans on subsequent lookups, creates, and)72 447.6 R 4.368 (deletes. Unlik)72 459.6 R 4.368<658c>-.1 G 1.868 (lesystems originally designed with)-4.368 F(lar)72 471.6 Q 1.156 (ge directories in mind, these indices are not sa)-.18 F -.15(ve)-.2 G (d).15 E 1.516(on disk and so the system is backw)72 483.6 R 1.516 (ards compatible.)-.1 F 2.058(The ef)72 495.6 R 2.059 (fect of the dynamic directory hashing is that)-.25 F(lar)72 507.6 Q 1.836(ge directories in)-.18 F F1(UFS)4.336 E F0 1.835 (cause minimal performance)4.336 F(problems.)72 519.6 Q(Borro)97 535.2 Q 4.13(wing the technique used by the Ext2)-.25 F .075 (\214lesystem a \215ag w)72 547.2 R .075 (as also added to indicate that an on-)-.1 F 2.818(disk inde)72 559.2 R 2.818(xing structure is supported for directories)-.15 F 1.815 ([Phillips, 2001].)72 571.2 R 1.815 (This \215ag is unconditionally turned)6.815 F(of)72 583.2 Q 5.01(fb) -.25 G 5.01(yt)-5.01 G 2.51(he e)-5.01 F 2.51(xisting implementation of) -.15 F F1(UFS)5.011 E F0 7.511(.I)C 5.011(nt)-7.511 G(he)-5.011 E 1.284 (future, if an implementation of an on-disk directory-)72 595.2 R(inde) 72 607.2 Q 1.103(xing structure is added, the implementations that)-.15 F .754(support it will not turn the \215ag of)72 619.2 R 3.253(f. Inde) -.25 F(x-supporting)-.15 E -.1(ke)72 631.2 S .122 (rnels will maintain the indices and lea).1 F .422 -.15(ve t)-.2 H .123 (he \215ag on.).15 F 1.813(If an old non-inde)72 643.2 R 1.813 (x-supporting k)-.15 F 1.812(ernel is run, it will)-.1 F 1.478(turn of) 72 655.2 R 3.978(ft)-.25 G 1.478 (he \215ag so that when the \214lesystem is once)-3.978 F(ag)72 667.2 Q .337(ain run under a ne)-.05 F 2.837(wk)-.25 G .337(ernel, the ne)-2.937 F 2.837(wk)-.25 G .337(ernel will dis-)-2.937 F(co)72 679.2 Q -.15(ve) -.15 G 3.978(rt).15 G 1.479(hat the inde)-3.978 F 1.479 (xing \215ag has been turned of)-.15 F 3.979(fa)-.25 G(nd)-3.979 E .089 (will kno)72 691.2 R 2.589(wt)-.25 G .088 (hat the indices may be out date and ha)-2.589 F .388 -.15(ve t)-.2 H(o) .15 E 1.148(be reb)72 703.2 R 1.149(uilt before being used.)-.2 F 1.149 (The only constraint on)6.149 F 1.083 (an implementation of the indices is that the)72 715.2 R 3.583(yh)-.15 G -2.25 -.2(av e)-3.583 H(to)3.783 E 1.405 (be an auxiliary data structure that references the old)326.4 84 R (linear directory format.)326.4 96 Q/F2 10/Times-Bold@0 SF(3.)326.4 123 Q/F3 12/Times-Bold@0 SF(Extended Attrib)5 E(utes)-.24 E F0 5.251(Am) 326.4 138.6 S 2.751(ajor addition in)-5.251 F F1(UFS2)5.251 E F0 2.75 (is support for e)5.25 F(xtended)-.15 E(attrib)326.4 150.6 Q 2.851 (utes. Extended)-.2 F(attrib)2.851 E .352(utes are a piece of auxiliary) -.2 F .352(data storage associated with an inode that can be used)326.4 162.6 R 1.401(to store auxiliary data that is separate from the con-) 326.4 174.6 R .69(tents of the \214le.)326.4 186.6 R .69 (The idea is similar to the concept of)5.69 F .22 (data forks used in the Apple \214lesystem [Apple, 2003].)326.4 198.6 R 1.552(By inte)326.4 210.6 R 1.552(grating the e)-.15 F 1.551 (xtended attrib)-.15 F 1.551(utes into the inode)-.2 F .311 (itself, it is possible to pro)326.4 222.6 R .311(vide the same inte) -.15 F .312(grity guar)-.15 F(-)-.2 E 1.317 (antees as are made for the contents of the \214le itself.)326.4 234.6 R (Speci\214cally)326.4 246.6 Q 3.29(,t)-.65 G .79 (he successful completion of an `)-3.29 F(`fsync')-.74 E(')-.74 E 2.358 (system call ensures that the \214le data, the e)326.4 258.6 R(xtended) -.15 E(attrib)326.4 270.6 Q 2.987 (utes, and all names and paths leading to the)-.2 F (names of the \214le are in stable store.)326.4 282.6 Q 2.702 (The current implementation has space in the)351.4 298.2 R .43 (inode to store up to tw)326.4 310.2 R 2.93(ob)-.1 G .43(locks of e) -2.93 F .43(xtended attrib)-.15 F(utes.)-.2 E 1.172(The ne)326.4 322.2 R (w)-.25 E F1(UFS2)3.672 E F0 1.171 (inode format had room for up to \214v)3.672 F(e)-.15 E 4.781 (additional 64-bit pointers.)326.4 334.2 R 4.782(Thus, the number of) 9.782 F -.15(ex)326.4 346.2 S .172(tended attrib).15 F .171 (ute blocks could ha)-.2 F .471 -.15(ve b)-.2 H .171(een in the range) .15 F .115(of one to \214v)326.4 358.2 R 2.615(eb)-.15 G 2.615(locks. W) -2.615 F 2.615(ec)-.8 G .115(hose to allocate tw)-2.615 F 2.615(ob)-.1 G (locks)-2.615 E .8(to the e)326.4 370.2 R .8(xtended attrib)-.15 F .8 (utes and to lea)-.2 F 1.1 -.15(ve t)-.2 H .8(he other three).15 F .902 (as spares for future use.)326.4 382.2 R .902(By ha)5.902 F .902 (ving tw)-.2 F .902(o, all the code)-.1 F 1.151 (had to be prepared to deal with an array of pointers,)326.4 394.2 R 1.696(thus if the number got e)326.4 406.2 R 1.697 (xpanded into the remaining)-.15 F 1.422(spares in the future the e) 326.4 418.2 R 1.421(xisting implementation will)-.15 F -.1(wo)326.4 430.2 S .086(rk without change.).1 F .086(By sa)5.086 F .086 (ving three spares, we pro-)-.2 F .954 (vided a reasonable amount of space for future needs.)326.4 442.2 R 1.069(And, if the decision to allo)326.4 454.2 R 3.569(wo)-.25 G 1.069 (nly tw)-3.569 F 3.569(ob)-.1 G 1.069(locks pro)-3.569 F -.15(ve)-.15 G (s).15 E .264(to be too little space, one or more of the spares can be) 326.4 466.2 R 1.332(used to e)326.4 478.2 R 1.332 (xpand the size of the e)-.15 F 1.333(xtended attrib)-.15 F 1.333 (utes in)-.2 F 1.062(the future.)326.4 490.2 R 1.061(If v)6.062 F 1.061 (astly more e)-.25 F 1.061(xtended attrib)-.15 F 1.061(ute space is)-.2 F .429(needed, one of the spares could be used as an indirect)326.4 502.2 R(pointer to e)326.4 514.2 Q(xtended attrib)-.15 E (ute data blocks.)-.2 E 0 0 218.4 227 -23.091 24 326.4 558.891 PBEGIN %%BeginDocument: extattr.eps %%Title: stdin %%Creator: fig2dev Version 3.2 Patchlevel 3d %%CreationDate: Fri May 9 23:05:05 2003 %%For: mckusick@beastie.mckusick.com (Kirk McKusick) %%BoundingBox: 0 0 227 24 %%Magnification: 1.0000 %%EndComments /$F2psDict 200 dict def $F2psDict begin $F2psDict /mtrx matrix put /col-1 {0 setgray} bind def /col0 {0.000 0.000 0.000 srgb} bind def /col1 {0.000 0.000 1.000 srgb} bind def /col2 {0.000 1.000 0.000 srgb} bind def /col3 {0.000 1.000 1.000 srgb} bind def /col4 {1.000 0.000 0.000 srgb} bind def /col5 {1.000 0.000 1.000 srgb} bind def /col6 {1.000 1.000 0.000 srgb} bind def /col7 {1.000 1.000 1.000 srgb} bind def /col8 {0.000 0.000 0.560 srgb} bind def /col9 {0.000 0.000 0.690 srgb} bind def /col10 {0.000 0.000 0.820 srgb} bind def /col11 {0.530 0.810 1.000 srgb} bind def /col12 {0.000 0.560 0.000 srgb} bind def /col13 {0.000 0.690 0.000 srgb} bind def /col14 {0.000 0.820 0.000 srgb} bind def /col15 {0.000 0.560 0.560 srgb} bind def /col16 {0.000 0.690 0.690 srgb} bind def /col17 {0.000 0.820 0.820 srgb} bind def /col18 {0.560 0.000 0.000 srgb} bind def /col19 {0.690 0.000 0.000 srgb} bind def /col20 {0.820 0.000 0.000 srgb} bind def /col21 {0.560 0.000 0.560 srgb} bind def /col22 {0.690 0.000 0.690 srgb} bind def /col23 {0.820 0.000 0.820 srgb} bind def /col24 {0.500 0.190 0.000 srgb} bind def /col25 {0.630 0.250 0.000 srgb} bind def /col26 {0.750 0.380 0.000 srgb} bind def /col27 {1.000 0.500 0.500 srgb} bind def /col28 {1.000 0.630 0.630 srgb} bind def /col29 {1.000 0.750 0.750 srgb} bind def /col30 {1.000 0.880 0.880 srgb} bind def /col31 {1.000 0.840 0.000 srgb} bind def end save newpath 0 24 moveto 0 0 lineto 227 0 lineto 227 24 lineto closepath clip newpath -57.8 59.2 translate 1 -1 scale /cp {closepath} bind def /ef {eofill} bind def /gr {grestore} bind def /gs {gsave} bind def /sa {save} bind def /rs {restore} bind def /l {lineto} bind def /m {moveto} bind def /rm {rmoveto} bind def /n {newpath} bind def /s {stroke} bind def /sh {show} bind def /slc {setlinecap} bind def /slj {setlinejoin} bind def /slw {setlinewidth} bind def /srgb {setrgbcolor} bind def /rot {rotate} bind def /sc {scale} bind def /sd {setdash} bind def /ff {findfont} bind def /sf {setfont} bind def /scf {scalefont} bind def /sw {stringwidth} bind def /tr {translate} bind def /tnt {dup dup currentrgbcolor 4 -2 roll dup 1 exch sub 3 -1 roll mul add 4 -2 roll dup 1 exch sub 3 -1 roll mul add 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb} bind def /shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul 4 -2 roll mul srgb} bind def /$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def /$F2psEnd {$F2psEnteredState restore end} def $F2psBegin 10 setmiterlimit 0.06000 0.06000 sc % % Fig objects follow % /Times-Roman ff 150.00 scf sf 2175 900 m gs 1 -1 sc (pad len) dup sw pop 2 div neg 0 rm col0 sh gr % Polyline 7.500 slw n 2475 600 m 2475 975 l gs col0 s gr % Polyline n 3075 600 m 3075 975 l gs col0 s gr % Polyline n 3525 600 m 3525 975 l gs col0 s gr % Polyline n 3675 600 m 3675 975 l gs col0 s gr % Polyline n 4575 600 m 4575 975 l gs col0 s gr % Polyline n 1425 600 m 1425 975 l gs col0 s gr % Polyline n 975 600 m 4725 600 l 4725 975 l 975 975 l cp gs col0 s gr % Polyline n 3600 600 m 3525 675 l gs col0 s gr % Polyline n 3675 600 m 3525 750 l gs col0 s gr % Polyline n 3675 675 m 3525 825 l gs col0 s gr % Polyline n 3675 750 m 3525 900 l gs col0 s gr % Polyline n 3675 825 m 3525 975 l gs col0 s gr % Polyline n 3675 900 m 3600 975 l gs col0 s gr % Polyline n 4650 600 m 4575 675 l gs col0 s gr % Polyline n 4725 600 m 4575 750 l gs col0 s gr % Polyline n 4725 675 m 4575 825 l gs col0 s gr % Polyline n 4725 750 m 4575 900 l gs col0 s gr % Polyline n 4725 825 m 4575 975 l gs col0 s gr % Polyline n 4725 900 m 4650 975 l gs col0 s gr /Times-Roman ff 150.00 scf sf 1650 750 m gs 1 -1 sc (name) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 1650 900 m gs 1 -1 sc (space) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 2775 750 m gs 1 -1 sc (name) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 2775 900 m gs 1 -1 sc (length) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 3300 825 m gs 1 -1 sc (name) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 4125 825 m gs 1 -1 sc () dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 1200 825 m gs 1 -1 sc (length) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 2175 750 m gs 1 -1 sc (content) dup sw pop 2 div neg 0 rm col0 sh gr % Polyline n 1875 600 m 1875 975 l gs col0 s gr $F2psEnd rs %%EndDocument end PEND F2(Figur)354.79 576.891 Q 2.5(e1)-.18 G F0(:)-2.5 E/F4 10 /Times-Italic@0 SF -1.05(Fo)2.5 G(rmat of Extended Attrib)1.05 E(utes) -.2 E F0 .063(Figure 1 sho)351.4 600.891 R .062 (ws the format used for the e)-.25 F(xtended)-.15 E(attrib)326.4 612.891 Q 3.885(utes. The)-.2 F 1.385(header of each attrib)3.885 F 1.385 (ute has a 4-byte)-.2 F 1.801 (length, 1-byte name space class, 1-byte content pad)326.4 624.891 R 1.401(length, 1-byte name length, and name.)326.4 636.891 R 1.402 (The name is)6.401 F .782 (padded so that the contents start on an 8-byte bound-)326.4 648.891 R (ary)326.4 660.891 Q 5.268(.T)-.65 G .268 (he contents are padded to the size sho)-5.268 F .268(wn by the)-.25 F -.74(``)326.4 672.891 S 1.216(content pad length').74 F 3.716<278c>-.74 G 3.716(eld. Applications)-3.716 F 1.215(that do not)3.716 F 1.332 (understand the name space or name can simply skip)326.4 684.891 R -.15 (ove)326.4 696.891 S 4.549(rt).15 G 2.049(he unkno)-4.549 F 2.049 (wn attrib)-.25 F 2.049(ute by adding the length to)-.2 F 3.352 (their current position to get to the ne)326.4 708.891 R 3.353 (xt attrib)-.15 F(ute.)-.2 E .55(Thus, man)326.4 720.891 R 3.05(yd)-.15 G(if)-3.05 E .55(ferent applications can share the usage)-.25 F 0 Cg EP %%Page: 4 4 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Roman@0 SF 1.902(of the e)72 84 R 1.902(xtended attrib)-.15 F 1.902(ute space, e)-.2 F -.15(ve)-.25 G 4.402(ni).15 G 4.402(ft)-4.402 G(he)-4.402 E 4.403(yd)-.15 G 4.403(on)-4.403 G(ot)-4.403 E (understand each other')72 96 Q 2.5(sd)-.55 G(ata types.)-2.5 E 4.77 (The \214rst of tw)97 111.6 R 7.27(oi)-.1 G 4.77(nitial uses for e)-7.27 F(xtended)-.15 E(attrib)72 123.6 Q 1.39 (utes is to support access control lists, generally)-.2 F .839 (referred to as)72 135.6 R/F1 9/Times-Roman@0 SF -.36(AC)3.339 G(L).36 E F0 3.339(s. An)B F1 -.36(AC)3.339 G(L).36 E F0 .839 (replaces the group per)3.339 F(-)-.2 E .033 (missions for a \214le with a more speci\214c list of the users)72 147.6 R .898(that are permitted to access the \214les along with a list)72 159.6 R 1.377(of the permissions that the)72 171.6 R 3.878(ya)-.15 G 1.378(re granted.)-3.878 F 1.378(These per)6.378 F(-)-.2 E 4.737 (missions include the traditional read, write, and)72 183.6 R -.15(exe) 72 195.6 S 1.085(cute permissions along with other properties such).15 F 2.912(as the right to rename or delete the \214le [Rhodes,)72 207.6 R (2003].)72 219.6 Q 3.658(Earlier implementations of)97 235.2 R F1 -.36 (AC)6.158 G(L).36 E F0 6.158(sw)C 3.658(ere done)-6.158 F 2.523 (with a single auxiliary \214le per \214lesystem that w)72 247.2 R(as) -.1 E(inde)72 259.2 Q -.15(xe)-.15 G 4.735(db).15 G 4.735(yt)-4.735 G 2.236(he inode number and had a small and)-4.735 F<8c78>72 271.2 Q 1.993 (ed sized area to store the)-.15 F F1 -.36(AC)4.492 G(L).36 E F0 4.492 (permissions. The)4.492 F 1.752(size w)72 283.2 R 1.752(as small to k) -.1 F 1.752(eep the size of the auxiliary \214le)-.1 F .746 (reasonable since it had to ha)72 295.2 R 1.045 -.15(ve s)-.2 H .745 (pace for e).15 F -.15(ve)-.25 G .745(ry possi-).15 F .441 (ble inode in the \214lesystem.)72 307.2 R .441(There were tw)5.441 F 2.942(op)-.1 G(roblems)-2.942 E .585(with this implementation.)72 319.2 R .585(The \214x)5.585 F .585(ed size of the space)-.15 F 1.29 (per inode to store the)72 331.2 R F1 -.36(AC)3.79 G(L).36 E F0 1.29 (information meant that it)3.79 F -.1(wa)72 343.2 S 3.419(sn).1 G .919 (ot possible to gi)-3.419 F 1.219 -.15(ve a)-.25 H .919 (ccess to long lists of users.).15 F .275(The second problem w)72 355.2 R .276(as that it w)-.1 F .276(as dif)-.1 F .276(\214cult to atomi-)-.25 F 1.189(cally commit changes to the)72 367.2 R F1 -.36(AC)3.689 G(L).36 E F0 1.189(list for a \214le since)3.689 F .079 (an update requires that both the \214le inode and the)72 379.2 R F1 -.36(AC)2.579 G(L).36 E F0 .613(\214le be written to ha)72 391.2 R .912 -.15(ve t)-.2 H .612(he update tak).15 F 3.112(ee)-.1 G -.25(ff)-3.112 G .612(ect [W).25 F(atson,)-.8 E(2000].)72 403.2 Q .012 (Both problems with the auxiliary \214le implemen-)97 418.8 R .953 (tation of)72 430.8 R F1 -.36(AC)3.453 G(L).36 E F0 3.452(sa)C .952 (re \214x)-3.452 F .952(ed by storing the)-.15 F F1 -.36(AC)3.452 G(L) .36 E F0(informa-)3.452 E .752(tion directly in the e)72 442.8 R (xtended-attrib)-.15 E .753(ute data area of the)-.2 F 5.682 (inode. Because)72 454.8 R 3.181(of the lar)5.682 F 3.181 (ge size of the e)-.18 F(xtended)-.15 E(attrib)72 466.8 Q .249 (ute data area \(a minimum of 8 kilobytes and typ-)-.2 F .339 (ically 32 kilobytes\), long lists of)72 478.8 R F1 -.36(AC)2.839 G(L) .36 E F0 .338(information can)2.838 F 4.758(be easily stored.)72 490.8 R 4.759(Space used to store e)9.758 F(xtended)-.15 E(attrib)72 502.8 Q .794(ute information is proportional to the number of)-.2 F 2.347 (inodes with e)72 514.8 R 2.347(xtended attrib)-.15 F 2.348 (utes and the size of the)-.2 F F1 -.36(AC)72 526.8 S(L).36 E F0 1.329 (lists that the)3.829 F 3.829(yu)-.15 G 3.829(se. Atomic)-3.829 F 1.329 (update of the infor)3.829 F(-)-.2 E 2.371 (mation is much easier since writing the inode will)72 538.8 R 1.196 (update the inode attrib)72 550.8 R 1.196 (utes and the set of data that it)-.2 F 2.855 (references including the e)72 562.8 R 2.855(xtended attrib)-.15 F 2.855 (utes in one)-.2 F .925(disk operation.)72 574.8 R .925(While it w)5.925 F .925(ould be possible to update)-.1 F 2.192 (the old auxiliary \214le on e)72 586.8 R -.15(ve)-.25 G 2.193(ry `).15 F(`fsync')-.74 E 4.693('s)-.74 G 2.193(ystem call)-4.693 F .601 (done on the \214lesystem, the cost of doing so w)72 598.8 R .601 (ould be)-.1 F(prohibiti)72 610.8 Q -.15(ve)-.25 G 6.89(.H).15 G 1.89 (ere, the k)-6.89 F 1.89(ernel kno)-.1 F 1.89(ws if the e)-.25 F (xtended)-.15 E(attrib)72 622.8 Q .364 (ute data block for an inode is dirty and can write)-.2 F 2.429 (just that data block during an `)72 634.8 R(`fsync')-.74 E 4.929('c) -.74 G 2.429(all on the)-4.929 F(inode.)72 646.8 Q 2.327 (The second use for e)97 662.4 R 2.327(xtended attrib)-.15 F 2.327 (utes is for)-.2 F .38(data labeling.)72 674.4 R .38 (Data labels are used to pro)5.38 F .38(vide permis-)-.15 F .158 (sions for mandatory access controls \()72 686.4 R F1(MA)A(C)-.36 E F0 2.658(s\). The)B -.1(ke)2.658 G -.2(r-).1 G 1.136(nel pro)72 698.4 R 1.137(vides a)-.15 F F1(MA)3.637 E(C)-.36 E F0(frame)3.637 E -.1(wo)-.25 G 1.137(rk that permits dynami-).1 F 2.072 (cally introduced system-security modules to modify)72 710.4 R .646 (system security functionality)326.4 84 R 5.647(.T)-.65 G .647 (his frame)-5.647 F -.1(wo)-.25 G .647(rk can be).1 F 2.403 (used to support a v)326.4 96 R 2.402(ariety of ne)-.25 F 4.902(ws)-.25 G 2.402(ecurity services,)-4.902 F .298 (including traditional labeled mandatory access control)326.4 108 R 5.009(models. The)326.4 120 R(frame)5.009 E -.1(wo)-.25 G 2.509(rk pro) .1 F 2.508(vides a series of entry)-.15 F .289 (points which is called by code supporting v)326.4 132 R .289(arious k) -.25 F(er)-.1 E(-)-.2 E .334 (nel services, especially with respects to access control)326.4 144 R .82(points and object creation.)326.4 156 R .82(The frame)5.82 F -.1(wo) -.25 G .82(rk then calls).1 F .805(out to security modules to of)326.4 168 R .805(fer them the opportunity)-.25 F 4.108 (to modify security beha)326.4 180 R 4.108(vior at those)-.2 F F1(MA) 6.609 E(C)-.36 E F0(entry)6.609 E 3.478(points. Thus,)326.4 192 R .977 (the \214lesystem does not codify ho)3.478 F 3.477(wt)-.25 G(he)-3.477 E .166(labels are used or enforced.)326.4 204 R .167 (It simply stores the labels)5.167 F 1.055 (associated with the inode and produces them when a)326.4 216 R .817 (security modules needs to query them to mak)326.4 228 R 3.317(eap)-.1 G (er)-3.317 E(-)-.2 E(mission check [W)326.4 240 Q(atson, 2001; W)-.8 E (atson et al, 2003].)-.8 E 4.617 -.8(We c)351.4 255.6 T 3.017 (onsidered storing symbolic links in the).8 F -.15(ex)326.4 267.6 S 1.405(tended attrib).15 F 1.405(ute area.)-.2 F 3.005 -.8(We c)6.405 H 1.406(hose not to do this for).8 F 1.734(three reasons.)326.4 279.6 R 1.734(First, the time to access an e)6.734 F(xtended)-.15 E .302 (storage block is the same as the time to access a re)326.4 291.6 R(gu-) -.15 E 2.078(lar data block.)326.4 303.6 R 2.077 (Second, since symbolic links rarely)7.077 F(ha)326.4 315.6 Q 1.465 -.15 (ve a)-.2 H 1.465 -.15(ny ex).15 H 1.165(tended attrib).15 F 1.165 (utes, there w)-.2 F 1.165(ould be no sa)-.1 F(v-)-.2 E 1.135 (ings in storage since a \214lesystem fragment w)326.4 327.6 R 1.135 (ould be)-.1 F 1.183(needed whether it w)326.4 339.6 R 1.183 (as stored in a re)-.1 F 1.184(gular data block)-.15 F 2.498(or in an e) 326.4 351.6 R 2.498(xtended storage block.)-.15 F 2.497 (Third, if it were)7.498 F 2.87(stored in the e)326.4 363.6 R 2.87 (xtended storage area, it w)-.15 F 2.87(ould tak)-.1 F(e)-.1 E (more time to tra)326.4 375.6 Q -.15(ve)-.2 G(rse do).15 E (wn the attrib)-.25 E(ute list to \214nd it.)-.2 E/F2 10/Times-Bold@0 SF (4.)326.4 402.6 Q/F3 12/Times-Bold@0 SF(New Filesystem Capabilities)5 E F0(Se)326.4 418.2 Q -.15(ve)-.25 G 3.562(ral other impro).15 F -.15(ve) -.15 G 3.562(ments were made when the).15 F(enlar)326.4 430.2 Q .221 (ged inode format w)-.18 F .221(as created.)-.1 F 1.821 -.8(We d)5.221 H .222(ecided to get).8 F .615 (an early jump on the year 2038 problem \(speci\214cally)326.4 442.2 R (,)-.65 E -.45(Tu)326.4 454.2 S 4.627(eJ).45 G 2.128 (an 19 03:14:08 2038 GMT which could be a)-4.627 F 1.808(really ugly w) 326.4 466.2 R 1.808(ay to usher in my 84th birthday\).)-.1 F -.8(We) 6.807 G -.15(ex)326.4 478.2 S 5.503 (panded the time \214elds \(which hold seconds-).15 F .41 (since-1970\) for access, modi\214cation, and inode-modi-)326.4 490.2 R 2.994(\214cation times from 32-bits to 64-bits.)326.4 502.2 R 2.994 (At plus or)7.994 F .163 (minus 136 billion years that should carry us from well)326.4 514.2 R 1.883(before the uni)326.4 526.2 R -.15(ve)-.25 G 1.884(rse w).15 F 1.884(as created until long after our)-.1 F 1.804(Sun has b)326.4 538.2 R 1.804(urned itself out.)-.2 F 3.404 -.8(We l)6.804 H 1.803 (eft the nanoseconds).8 F .437 (\214elds for these times at 32-bits as we did not feel that)326.4 550.2 R 1.412(added resolution w)326.4 562.2 R 1.411 (as going to be useful in the fore-)-.1 F .985(seeable future.)326.4 574.2 R 2.586 -.8(We c)5.985 H .986(onsidered e).8 F .986 (xpanding the time to)-.15 F .358(only 48-bits.)326.4 586.2 R 1.958 -.8 (We c)5.358 H .357(hose to go to 64-bits as 64-bits is a).8 F(nati)326.4 598.2 Q 1.283 -.15(ve s)-.25 H .984 (ize that can be easily manipulated with e).15 F(xist-)-.15 E 3.644 (ing and lik)326.4 610.2 R 3.643(ely future architectures.)-.1 F 3.643 (Using 48-bits)8.643 F -.1(wo)326.4 622.2 S 1.888(uld ha).1 F 2.188 -.15 (ve r)-.2 H 1.888(equired an e).15 F 1.889(xtra unpacking or packing) -.15 F 1.998(step each time the \214eld w)326.4 634.2 R 1.998 (as read or written.)-.1 F(Also,)6.997 E 2.73 (going to 64-bits ensures enough bits for all lik)326.4 646.2 R(ely)-.1 E(measured time so will not ha)326.4 658.2 Q .3 -.15(ve t)-.2 H 2.5(ob) .15 G 2.5(ee)-2.5 G(nlar)-2.5 E(ged.)-.18 E 2.23 (At the same time we also added a ne)351.4 673.8 R 4.73(wt)-.25 G(ime) -4.73 E 1.891 (\214eld \(also 64-bit\) to hold the birth time \(also com-)326.4 685.8 R .906(monly called the creation time\) of the \214le.)326.4 697.8 R .905(The birth)5.905 F .638 (time is set when the inode is \214rst allocated and is not)326.4 709.8 R .642(changed thereafter)326.4 721.8 R 5.642(.I)-.55 G 3.141(th)-5.642 G .641(as been added to the structure)-3.141 F 0 Cg EP %%Page: 5 5 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Roman@0 SF .336(returned by the `)72 84 R(`stat')-.74 E 2.836('s)-.74 G .336(ystem call so that applications)-2.836 F .007 (can determine its v)72 96 R .006(alue and so that archi)-.25 F .006 (ving programs)-.25 F .458(such as)72 108 R/F1 10/Times-Bold@0 SF(dump) 2.958 E F0(,)A F1(tar)2.958 E F0 2.958(,a)C(nd)-2.958 E F1(pax)2.958 E F0 .458(can sa)2.958 F .758 -.15(ve t)-.2 H .458(his v).15 F .458 (alue along)-.25 F .407(with the other \214le times.)72 120 R .407 (The birth time w)5.407 F .407(as added to)-.1 F 5.645(ap)72 132 S(re) -5.645 E 3.145(viously spare \214eld in the `)-.25 F(`stat')-.74 E 5.645 ('s)-.74 G 3.145(ystem call)-5.645 F 3.128 (structure so that the size of the structure did not)72 144 R 3.746 (change. Thus,)72 156 R 1.246(old v)3.746 F 1.246 (ersions of programs that use the)-.15 F -.74(``)72 168 S(stat').74 E 2.5('c)-.74 G(all continue to w)-2.5 E(ork.)-.1 E 5.219 -.8(To d)97 183.6 T 3.619(ate, only the).8 F F1(dump)6.119 E F0 3.618 (program has been)6.118 F 1.348(changed to sa)72 195.6 R 1.649 -.15 (ve t)-.2 H 1.349(he birth time v).15 F 3.849(alue. This)-.25 F(ne)3.849 E 3.849(wv)-.25 G(er)-3.999 E(-)-.2 E 1.174(sion of)72 207.6 R F1(dump) 3.673 E F0 1.173(which can dump both)3.673 F/F2 9/Times-Roman@0 SF(UFS1) 3.673 E F0(and)3.673 E F2(UFS2)3.673 E F0 1.188 (\214lesystems, creates a ne)72 219.6 R 3.689(wd)-.25 G 1.189 (ump format which is not)-3.689 F 1.877(readable by older v)72 231.6 R 1.877(ersions of)-.15 F F1 -.18(re)4.377 G(stor).18 E(e)-.18 E F0 6.877 (.T)C 1.877(he updated)-6.877 F -.15(ve)72 243.6 S .912(rsion of).15 F F1 -.18(re)3.412 G(stor).18 E(e)-.18 E F0 .913 (can identify and restore from both)3.412 F .872(old and ne)72 255.6 R 3.371(wd)-.25 G .871(ump formats.)-3.371 F .871 (The birth times are only)5.871 F -.2(av)72 267.6 S (ailable and setable from the ne)-.05 E 2.5(wd)-.25 G(ump format.)-2.5 E 1.774(The `)97 283.2 R(`utimes')-.74 E 4.274('s)-.74 G 1.774 (ystem call sets the access and)-4.274 F .061 (modi\214cation times of a \214le to a speci\214ed set of v)72 295.2 R (alues.)-.25 E 1.584(It is used primarily by archi)72 307.2 R 1.884 -.15 (ve r)-.25 H(etrie).15 E -.25(va)-.25 G 4.084(lp).25 G 1.584(rograms to) -4.084 F 1.59(set ne)72 319.2 R 1.59(wly e)-.25 F 1.59 (xtracted \214les times back to those associ-)-.15 F .7 (ated with the \214le in the archi)72 331.2 R -.15(ve)-.25 G 5.7(.W).15 G .7(ith the addition of)-6.1 F .254(birth time, we added a ne)72 343.2 R 2.754(ws)-.25 G .254(ystem call that allo)-2.754 F .254(ws the)-.25 F .326(setting of access, modi\214cation, and birth times.)72 355.2 R(Ho) 5.326 E(w-)-.25 E -2.15 -.25(ev e)72 367.2 T 1.686 -.4(r, w).25 H 3.386 (er).4 G .886(ealized that man)-3.386 F 3.386(ye)-.15 G .885 (xisting applications will)-3.536 F .821(not be changed to use the ne)72 379.2 R 3.321(w`)-.25 G(`utimes')-4.061 E 3.321('s)-.74 G .822 (ystem call.)-3.321 F .009(The result will be that the \214les that the) 72 391.2 R 2.509(yr)-.15 G(etrie)-2.509 E -.15(ve)-.25 G 2.509(df).15 G (rom)-2.509 E(archi)72 403.2 Q -.15(ve)-.25 G 3.89(sw).15 G 1.39(ill ha) -3.89 F 1.69 -.15(ve a n)-.2 H -.25(ew).15 G 1.39 (er birth time than access or).25 F(modi\214cation times.)72 415.2 Q 1.868 -.8(To p)97 430.8 T(ro).8 E .267 (vide a sensible birth time for applications)-.15 F 4.132(that are una) 72 442.8 R -.1(wa)-.15 G 4.133(re of the birth time attrib).1 F 4.133 (ute, we)-.2 F .406(changed the semantics of the `)72 454.8 R(`utimes') -.74 E 2.906('s)-.74 G .406(ystem call so)-2.906 F .897 (that if the birth time w)72 466.8 R .897(as ne)-.1 F .897 (wer than the v)-.25 F .898(alue of the)-.25 F 1.19 (modi\214cation time that it w)72 478.8 R 1.19 (as setting, it sets the birth)-.1 F 1.52 (time to the same time as the modi\214cation time.)72 490.8 R(An)6.52 E .216(application that is a)72 502.8 R -.1(wa)-.15 G .215 (re of the birth time attrib).1 F .215(ute can)-.2 F 1.458 (set both the birth time and the modi\214cation time by)72 514.8 R 1.295 (doing tw)72 526.8 R 3.795(oc)-.1 G 1.295(alls to `)-3.795 F(`utimes') -.74 E 3.795('. First)-.74 F 1.295(it calls `)3.795 F(`utimes')-.74 E(') -.74 E .232(with a modi\214cation time equal to the sa)72 538.8 R -.15 (ve)-.2 G 2.733(db).15 G .233(irth time,)-2.733 F .638(then it calls `) 72 550.8 R(`utimes')-.74 E 3.138('as)-.74 G .638 (econd time with a modi\214ca-)-3.138 F .296 (tion time equal to the \(presumably ne)72 562.8 R .297(wer\) sa)-.25 F -.15(ve)-.2 G 2.797(dm).15 G(od-)-2.797 E 1.068(i\214cation time.)72 574.8 R -.15(Fo)6.068 G 3.568<728c>.15 G 1.067 (lesystems that do not store birth)-3.568 F 3 (times, the second call will o)72 586.8 R -.15(ve)-.15 G 3 (rwrite the \214rst call).15 F .948(resulting in the same v)72 598.8 R .947(alues for access and modi\214ca-)-.25 F .828(tion times as the)72 610.8 R 3.329(yw)-.15 G .829(ould ha)-3.429 F 1.129 -.15(ve p)-.2 H(re) .15 E .829(viously gotten.)-.25 F -.15(Fo)5.829 G(r).15 E .668 (\214lesystems that support birth time, it will be properly)72 622.8 R 5.29(set. And)72 634.8 R 2.79(most happily for the application writers,) 5.29 F(the)72 646.8 Q 3.475(yw)-.15 G .975(ill not ha)-3.475 F 1.275 -.15(ve t)-.2 H 3.475(oc).15 G .975(onditionally compile the name)-3.475 F .075(of `)72 658.8 R(`utimes')-.74 E 2.575('f)-.74 G(or)-2.575 E F2 (BSD)2.575 E F0 .075(and non-)2.575 F F2(BSD)A F0 2.575(systems. The) 2.575 F 2.575(yj)-.15 G(ust)-2.575 E 1.499 (write their applications to call the standard interf)72 670.8 R(ace)-.1 E .931(twice kno)72 682.8 R .931 (wing that the right thing will happen on all)-.25 F 1.61 (systems and \214lesystems.)72 694.8 R -.15(Fo)6.61 G 4.11(rt).15 G 1.61 (hose applications that)-4.11 F -.25(va)72 706.8 S 1.092(lue speed of e) .25 F -.15(xe)-.15 G 1.092(cution o).15 F -.15(ve)-.15 G 3.593(rp).15 G 1.093(ortability can use the)-3.593 F(ne)72 718.8 Q 3.82(wv)-.25 G 1.32 (ersion of the `)-3.97 F(`utimes')-.74 E 3.82('s)-.74 G 1.32 (ystem call that allo)-3.82 F(ws)-.25 E(all time v)326.4 84 Q (alues to be set with one call.)-.25 E .203 (Another incremental change to the inode format)351.4 99.6 R -.1(wa) 326.4 111.6 S 4.66(st).1 G 4.66(os)-4.66 G 2.16 (plit the \215ags \214eld into tw)-4.66 F 4.66(os)-.1 G 2.16 (eparate 32-bit)-4.66 F .482 (\214elds, one for \215ags that can be set by applications \(as)326.4 123.6 R(in)326.4 135.6 Q F2(UFS1)3.383 E F0 3.383(\)a)C .883(nd a ne) -3.383 F 3.383<778c>-.25 G .882(eld for \215ags maintained strictly) -3.383 F 2.379(by the k)326.4 147.6 R 4.879(ernel. An)-.1 F -.15(ex) 4.879 G 2.379(ample of a k).15 F 2.379(ernel \215ag is the)-.1 F F2(SN) 326.4 159.6 Q(APSHO)-.315 E(T)-.36 E F0 1.315 (\215ag used to label a \214le as being a snap-)3.816 F 4.299 (shot. Another)326.4 171.6 R -.1(ke)4.299 G 1.8(rnel-only \215ag is).1 F F2(OP)4.3 E -1.35 -.495(AQ U)-.828 H(E).495 E F0 1.8(which is)4.3 F 3.205(used by the union \214lesystem to mark a directory)326.4 183.6 R .14(which should not mak)326.4 195.6 R 2.64(et)-.1 G .14(he layers belo) -2.64 F 2.64(wi)-.25 G 2.64(tv)-2.64 G 2.64(isible. By)-2.64 F(mo)326.4 207.6 Q 1.694(ving these k)-.15 F 1.694 (ernel \215ags into a separate \214eld, the)-.1 F(y)-.15 E 1.496 (will not be accidentally set or cleared by a nai)326.4 219.6 R 1.796 -.15(ve o)-.25 H(r).15 E(malicious application.)326.4 231.6 Q F1(4.1.) 326.4 258.6 Q/F3 12/Times-Bold@0 SF(Dynamic Inodes)5 E F0 4.194 (One of the common complaints about the)326.4 274.2 R F2(UFS1)6.693 E F0 1.706(\214lesystem is that it preallocates all its inodes at the)326.4 286.2 R 2.373(time that the \214lesystem is created.)326.4 298.2 R -.15 (Fo)7.373 G 4.872<728c>.15 G(lesystems)-4.872 E 1.014 (with millions of \214les, the initialization of the \214lesys-)326.4 310.2 R .972(tem can tak)326.4 322.2 R 3.472(es)-.1 G -2.15 -.25(ev e) -3.472 H .971(ral hours.).25 F(Additionally)5.971 E 3.471(,t)-.65 G .971 (he \214lesys-)-3.471 F 3.387(tem creation program,)326.4 334.2 R F1 (newfs)5.887 E F0 5.887(,h)C 3.387(ad to assume that)-5.887 F -2.15 -.25 (ev e)326.4 346.2 T .368(ry \214lesystem w).25 F .368 (ould be \214lled with man)-.1 F 2.867(ys)-.15 G .367(mall \214les) -2.867 F .228(and allocate a lot more inodes than were lik)326.4 358.2 R .228(ely to e)-.1 F -.15(ve)-.25 G(r).15 E 1.269(be used.)326.4 370.2 R 1.269(If a)6.269 F F2(UFS1)3.769 E F0 1.268 (\214lesystem uses up all its inodes,)3.768 F 2.149(the only w)326.4 382.2 R 2.149(ay to get more is to dump, reb)-.1 F 2.149(uild, and)-.2 F 1.446(restore the \214lesystem.)326.4 394.2 R(The)6.446 E F2(UFS2)3.946 E F0 1.445(\214lesystem resolv)3.945 F(es)-.15 E 1.448 (these problems by dynamically allocating its inodes.)326.4 406.2 R 2.518(The usual implementation of dynamically allocated)326.4 418.2 R 2.612(inodes requires a separate \214lesystem data structure)326.4 430.2 R .908(\(typically referred to as the inode \214le\) that tracks the) 326.4 442.2 R 1.484(current set of inodes.)326.4 454.2 R 1.485 (The management and mainte-)6.484 F 1.462(nance of this e)326.4 466.2 R 1.461(xtra data structure adds o)-.15 F -.15(ve)-.15 G 1.461(rhead and) .15 F(comple)326.4 478.2 Q(xity and often de)-.15 E(grades performance.) -.15 E 2.357 -.8(To a)351.4 493.8 T -.2(vo).6 G .757(id these costs,).2 F F2(UFS2)3.258 E F0 .758(preallocates a range)3.258 F .138 (of inode numbers and a set of blocks for each c)326.4 505.8 R(ylinder) -.15 E 2.531(group. Initially)326.4 517.8 R .031(each c)2.531 F .031 (ylinder group has a single block)-.15 F 2.017 (of inodes allocated \(a typical block holds 32 or 64)326.4 529.8 R 4.254(inodes\). When)326.4 541.8 R 1.754(the block \214lls up, the ne) 4.254 F 1.755(xt block of)-.15 F 1.09 (inodes in the set is allocated and initialized.)326.4 553.8 R 1.09 (The set)6.09 F 1.509 (of blocks that may be allocated to inodes is held as)326.4 565.8 R 1.236(part of the free-space reserv)326.4 577.8 R 3.736(eu)-.15 G 1.235 (ntil all other space in)-3.736 F 1.213(the \214lesystem is allocated.) 326.4 589.8 R 1.214(Only then can it be used)6.213 F(for \214le data.) 326.4 601.8 Q .532 (In theory a \214lesystem could \214ll using up all the)351.4 617.4 R .557(blocks set aside for inodes.)326.4 629.4 R .558(Later after lar) 5.558 F .558(ge \214les had)-.18 F .558(been remo)326.4 641.4 R -.15(ve) -.15 G 3.058(da).15 G .558(nd man)-3.058 F 3.058(ys)-.15 G .557 (mall \214les created to replace)-3.058 F 1.431 (them, the \214lesystem might \214nd itself unable to allo-)326.4 653.4 R .046(cated the needed inodes because all the space set aside)326.4 665.4 R 1.169(for inodes w)326.4 677.4 R 1.169(as still in use.)-.1 F 1.169(Here, it w)6.169 F 1.169(ould be neces-)-.1 F 1.168 (sary to reallocate e)326.4 689.4 R 1.168(xisting \214les to mo)-.15 F 1.468 -.15(ve t)-.15 H 1.167(hem to ne).15 F(w)-.25 E .165 (locations outside of the inode area.)326.4 701.4 R .166 (Such code has not)5.166 F 5.011 (been written as we do not anticipate that this)326.4 713.4 R 0 Cg EP %%Page: 6 6 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Roman@0 SF 3.556 (condition will arise in practice as the free space)72 84 R(reserv)72 96 Q 4.82(eu)-.15 G 2.32(sed on most \214lesystems \(8%\) e)-4.82 F 2.32 (xceeds the)-.15 F 1.207 (amount of space needed for inodes \(typically 2-6%\).)72 108 R 2.094 (On these systems only a process running with root)72 120 R(pri)72 132 Q (vile)-.25 E 2.278(ges w)-.15 F 2.278(ould e)-.1 F -.15(ve)-.25 G 4.779 (rb).15 G 4.779(ea)-4.779 G 2.279(ble to allocate the inode)-4.779 F 4.979(blocks. Should)72 144 R 2.479(the code pro)4.979 F 2.778 -.15 (ve n)-.15 H 2.478(ecessary in actual).15 F .84 (use, it can be written at that time.)72 156 R .84(Until it is written,) 5.84 F 1.503(\214lesystems hitting this condition will return an `)72 168 R(`out)-.74 E(of inodes')72 180 Q 2.5('e)-.74 G (rror on attempts to create ne)-2.5 E 2.5<778c>-.25 G(les.)-2.5 E .53 (One of the side bene\214ts of dynamically allocat-)97 195.6 R .978 (ing inodes is that the time to create a ne)72 207.6 R 3.478<778c>-.25 G (lesystem)-3.478 E(in)72 219.6 Q/F1 9/Times-Roman@0 SF(UFS2)3.264 E F0 .765(is about 1 percent of the time that it tak)3.264 F .765(es in)-.1 F F1(UFS1)72 231.6 Q F0 5.528(.A)C .527(\214lesystem that w)-2.501 F .527 (ould tak)-.1 F 3.027(eo)-.1 G .527(ne hour to b)-3.027 F(uild)-.2 E .357(in a)72 243.6 R F1(UFS1)2.857 E F0 .357(format can be b)2.857 F .357(uilt in under a minute in the)-.2 F F1(UFS2)72 255.6 Q F0 5.029 (format. While)5.03 F 2.529(\214lesystem creations are not a)5.029 F 2.07(common operation, ha)72 267.6 R 2.07(ving them b)-.2 F 2.07 (uild quickly does)-.2 F 2.068 (matter to the system administrators that ha)72 279.6 R 2.367 -.15(ve t) -.2 H 4.567(od).15 G(o)-4.567 E(such tasks with some re)72 291.6 Q (gularity)-.15 E(.)-.65 E 2.792 (The cost of dynamically allocating inodes is)97 307.2 R .87(one e)72 319.2 R .87(xtra disk write for e)-.15 F -.15(ve)-.25 G .87(ry 64 ne).15 F 3.37(wi)-.25 G .87(nodes that are)-3.37 F 3.445(created. Although)72 331.2 R .945(this cost is quite lo)3.445 F 3.445(wc)-.25 G .945 (ompared to)-3.445 F .47(the other costs of creating 64 ne)72 343.2 R 2.97<778c>-.25 G .47(les, some systems)-2.97 F 2.346 (administrators might w)72 355.2 R 2.347(ant to preallocate more than) -.1 F 2.72(the minimal number of inodes.)72 367.2 R 2.72 (If such a demand)7.72 F 1.453(arises, it w)72 379.2 R 1.453 (ould be tri)-.1 F 1.454(vial to add a \215ag to the)-.25 F/F2 10 /Times-Bold@0 SF(newfs)3.954 E F0 1.72 (program to preallocate additional inodes at the time)72 391.2 R (that the \214lesystem is created.)72 403.2 Q F2(4.2.)72 430.2 Q/F3 12 /Times-Bold@0 SF(Boot Blocks)5 E F0(The)72 445.8 Q F1(UFS1)3.864 E F0 1.364(\214lesystem reserv)3.864 F 1.364(ed an 8 kilobyte space at)-.15 F .522(the be)72 457.8 R .522 (ginning of the \214lesystem in which to put a boot)-.15 F 2.652 (block. While)72 469.8 R .153(this space seemed huge compared to the) 2.652 F 2.733(1k)72 481.8 S .233(ilobyte book block that it replaced, o) -2.733 F -.15(ve)-.15 G 2.733(rt).15 G .233(ime it has)-2.733 F 1.358 (gotten increasingly dif)72 493.8 R 1.359 (\214cult to cram the needed boot)-.25 F 2.752(code into this space.)72 505.8 R 2.751(Consequently we decided to)7.751 F(re)72 517.8 Q (visit the boot block size in)-.25 E F1(UFS2)2.5 E F0(.)A 1.461 (The boot code has a list of locations to check)97 533.4 R .787 (for boot blocks.)72 545.4 R 3.287(Ab)5.787 G .787 (oot block can be de\214ned to start)-3.287 F 1.279(at an)72 557.4 R 3.779(y8k)-.15 G 1.279(ilobyte boundary)-3.779 F 6.279(.W)-.65 G 3.779 (es)-7.079 G 1.279(et up an initial list)-3.779 F .356 (with four possible boot block sizes: none, 8 kilobytes,)72 569.4 R 1.157(64 kilobytes, and 256 kilobytes.)72 581.4 R 1.158 (Each of these loca-)6.158 F 1.578(tions w)72 593.4 R 1.577 (as selected for a particular purpose.)-.1 F(Filesys-)6.577 E .897 (tems other than the root \214lesystem do not need to be)72 605.4 R .844 (bootable, so can use a boot block size of zero.)72 617.4 R(Also,)5.844 E 1.398(\214lesystems on tin)72 629.4 R 3.899(ym)-.15 G 1.399 (edia that need e)-3.899 F -.15(ve)-.25 G 1.399(ry block that).15 F(the) 72 641.4 Q 3.468(yc)-.15 G .968(an get such as \215opp)-3.468 F 3.467 (yd)-.1 G .967(isks can use a zero size)-3.467 F .032(boot block.)72 653.4 R -.15(Fo)5.033 G 2.533(ra).15 G .033 (rchitectures with simple boot blocks,)-2.533 F 2.871(the traditional)72 665.4 R F1(UFS1)5.371 E F0 5.371(8k)5.371 G 2.87 (ilobyte boot block can be)-5.371 F 4.592(used. More)72 677.4 R 2.093 (typically the 64 kilobyte boot block is)4.592 F .257(used \(for e)72 689.4 R .257(xample on the)-.15 F F1(PC)2.757 E F0 .257 (architecture with its need)2.757 F 1.111 (to support booting from a myriad of b)72 701.4 R 1.111(usses and disk) -.2 F(dri)72 713.4 Q -.15(ve)-.25 G(rs\).).15 E 3.032 -.8(We a)351.4 84 T 1.431(dded the 256 kilobyte boot block in case).8 F 1.334 (some architecture or application needs to set aside a)326.4 96 R 4.063 (particularly lar)326.4 108 R 4.063(ge boot area.)-.18 F 4.063 (While this w)9.063 F 4.062(as not)-.1 F .314(strictly necessary as ne) 326.4 120 R 2.814(ws)-.25 G .314(izes can be added to the list)-2.814 F .336(at an)326.4 132 R 2.836(yt)-.15 G .336(ime, it can tak)-2.836 F 2.835(eal)-.1 G .335(ong time before the updated)-2.835 F .293 (list gets propag)326.4 144 R .293 (ated to all the boot programs and load-)-.05 F .487(ers out on the e) 326.4 156 R .487(xisting systems.)-.15 F .486(By adding the option)5.487 F 1.81(for a huge boot area no)326.4 168 R 3.11 -.65(w, w)-.25 H 4.31 (ec).65 G 1.81(an ensure it will be)-4.31 F .03(readily a)326.4 180 R -.25(va)-.2 G .03(ilable should it be needed on short notice in).25 F (the future.)326.4 192 Q .15(One of the une)351.4 207.6 R .15 (xpected side ef)-.15 F .15(fects of using a 64)-.25 F 1.841 (kilobyte boot block for)326.4 219.6 R F1(UFS2)4.341 E F0 1.84 (is that if the partition)4.34 F 3.908(had pre)326.4 231.6 R 3.908 (viously had a)-.25 F F1(UFS1)6.408 E F0 3.908(\214lesystem on it, the) 6.408 F .303(superblock for the former)326.4 243.6 R F1(UFS1)2.803 E F0 .303(\214lesystem may not be)2.803 F -.15(ove)326.4 255.6 S 4.196 (rwritten. If).15 F 1.696(an old v)4.196 F 1.697(ersion of)-.15 F F2 (fsck)4.197 E F0 1.697(that does not)4.197 F 1.629(\214rst look for a) 326.4 267.6 R F1(UFS2)4.129 E F0 1.628 (\214lesystem is run and \214nds the)4.128 F F1(UFS1)326.4 279.6 Q F0 1.127(superblock, it can incorrectly try to reb)3.626 F 1.127(uild the) -.2 F F1(UFS1)326.4 291.6 Q F0 .49(\214lesystem destro)2.99 F .49 (ying the)-.1 F F1(UFS2)2.99 E F0 .489(\214lesystem in the)2.99 F 5.895 (process. So,)326.4 303.6 R 3.396(when b)5.895 F(uilding)-.2 E F1(UFS2) 5.896 E F0 3.396(\214lesystems, the)5.896 F F2(newfs)326.4 315.6 Q F0 3.319(utility looks for old)5.82 F F1(UFS1)5.819 E F0 3.319 (superblocks and)5.819 F(zeros them out.)326.4 327.6 Q F2(5.)326.4 354.6 Q F3 4.911(Changes and Enhancements to Soft)9.911 F(Updates)326.4 369.6 Q F0 -.35(Tr)326.4 385.2 S(aditionally).35 E 4.414<2c8c>-.65 G 1.914 (lesystem consistenc)-4.414 F 4.414(yh)-.15 G 1.914(as been main-)-4.414 F 3.163(tained across system f)326.4 397.2 R 3.163 (ailures either by using syn-)-.1 F 4.634 (chronous writes to sequence dependent metadata)326.4 409.2 R .37 (updates or by using write-ahead logging to atomically)326.4 421.2 R 2.443(group them [Seltzer et al, 2000].)326.4 433.2 R 2.442 (Soft updates, an)7.442 F(alternati)326.4 445.2 Q 1.448 -.15(ve t)-.25 H 3.648(ot).15 G 1.148(hese approaches, is an implementation)-3.648 F 1.317(mechanism that tracks and enforces metadata update)326.4 457.2 R 1.01(dependencies to ensure that the disk image is al)326.4 469.2 R -.1 (wa)-.1 G(ys).1 E -.1(ke)326.4 481.2 S 1.015(pt consistent.).1 F 1.015 (The use of soft updates ob)6.015 F 1.015(viates the)-.15 F .106 (need for a separate log or for most synchronous writes)326.4 493.2 R ([McK)326.4 505.2 Q(usick & Ganger)-.15 E 2.5(,1)-.4 G(999].)-2.5 E 1.929(The addition of e)351.4 520.8 R 1.929(xtended attrib)-.15 F 1.928 (ute data to the)-.2 F .727 (inode required that the soft updates code be e)326.4 532.8 R(xtended) -.15 E .772(so that it could ensure the inte)326.4 544.8 R .772 (grity of these ne)-.15 F 3.272(wd)-.25 G(ata)-3.272 E 2.536(blocks. As) 326.4 556.8 R .036(with the \214le data blocks, it ensures that the) 2.536 F -.15(ex)326.4 568.8 S 1.311 (tended data blocks and the bitmaps that sho).15 F 3.811(wt)-.25 G(hat) -3.811 E(the)326.4 580.8 Q 4.82(ya)-.15 G 2.32 (re in use are written to disk before the)-4.82 F 4.82(ya)-.15 G(re) -4.82 E 1.609(claimed by the inode.)326.4 592.8 R 1.609 (Soft updates also ensure that)6.609 F(an)326.4 604.8 Q 4.211(yu)-.15 G 1.711(pdated e)-4.211 F 1.711(xtended attrib)-.15 F 1.712 (ute data is committed to)-.2 F(disk as part of an `)326.4 616.8 Q (`fsync')-.74 E 2.5('o)-.74 G 2.5(ft)-2.5 G(he \214le.)-2.5 E -1 -.8 (Tw o)351.4 632.4 T .723(important enhancements were made to the)4.024 F -.15(ex)326.4 644.4 S .216(isting soft updates implementation.).15 F .216(These enhance-)5.216 F .262(ments were initially made for)326.4 656.4 R F1(UFS2)2.762 E F0 -.2(bu)2.761 G 2.761(tb).2 G .261 (ecause of the)-2.761 F 1.239(shared code base with)326.4 668.4 R F1 (UFS1)3.739 E F0 1.239(were tri)3.739 F 1.24(vially inte)-.25 F(grated) -.15 E(to w)326.4 680.4 Q(ork with)-.1 E F1(UFS1)2.5 E F0 (\214lesystems as well.)2.5 E .604(When a \214le is remo)351.4 696 R -.15(ve)-.15 G 3.104(do).15 G 3.104(na)-3.104 G .603 (\214lesystem running)-.001 F .395(with soft updates, the remo)326.4 708 R -.25(va)-.15 G 2.895(la).25 G .395(ppears to happen v)-2.895 F(ery) -.15 E(quickly)326.4 720 Q 5.727(,b)-.65 G 3.226(ut the process of remo) -5.927 F 3.226(ving the \214le and)-.15 F 0 Cg EP %%Page: 7 7 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Roman@0 SF .267 (returning its blocks to the free list may tak)72 84 R 2.768(eu)-.1 G 2.768(pt)-2.768 G 2.768(os)-2.768 G -.25(ev)-2.768 G(-).25 E .381 (eral minutes.)72 96 R .381(Prior to)5.381 F/F1 9/Times-Roman@0 SF(UFS2) 2.881 E F0 2.88(,t)C .38(he space held by the \214le)-2.88 F 1.545 (did not sho)72 108 R 4.045(wu)-.25 G 4.045(pi)-4.045 G 4.046(nt)-4.045 G 1.546(he \214lesystem statistics until the)-4.046 F(remo)72 120 Q -.25 (va)-.15 G 3.362(lo).25 G 3.362(ft)-3.362 G .861 (he \214le had been completed.)-3.362 F .861(Thus, appli-)5.861 F .051 (cations that clean up disk space such as the ne)72 132 R .051(ws e)-.25 F(xpi-)-.15 E 3.272(ration program w)72 144 R 3.272(ould often v)-.1 F 3.271(astly o)-.25 F -.15(ve)-.15 G 3.271(rshoot their).15 F 2.84 (goal. The)72 156 R 2.84(yw)-.15 G .34(ork by remo)-2.94 F .34 (ving \214les and then checking)-.15 F 1.122 (to see if enough free space has sho)72 168 R 1.122(wed up.)-.25 F (Because)6.122 E .353(of the time lag in ha)72 180 R .353 (ving the free space recorded, the)-.2 F(y)-.15 E -.1(wo)72 192 S .004 (uld remo).1 F .304 -.15(ve f)-.15 H .004(ar too man).05 F 2.504<798c> -.15 G 2.504(les. T)-2.504 F 2.503(or)-.8 G(esolv)-2.503 E 2.503(ep)-.15 G(roblems)-2.503 E 2.074(of this sort, the soft updates code no)72 204 R 4.575(wm)-.25 G 2.075(aintains a)-4.575 F .299(counter that k)72 216 R .299(eeps track of the amount of space that is)-.1 F .778 (held by the \214les that it is in the process of remo)72 228 R(ving.) -.15 E 1.216(This counter of pending space is added to the actual)72 240 R 1.477(amount of free space as reported by the k)72 252 R 1.477 (ernel \(and)-.1 F .906(thus by utilities lik)72 264 R(e)-.1 E/F2 10 /Times-Bold@0 SF(df)3.406 E F0 3.406(\). The)B .906 (result of this change is)3.406 F 6.595 (that free space appears immediately after the)72 276 R -.74(``)72 288 S (unlink').74 E 2.592('s)-.74 G .091(ystem call returns or the)-2.592 F F2(rm)2.591 E F0 .091(utility \214nishes.)2.591 F 1.326 (The second and related change to soft updates)97 303.6 R 3.262 (has to do with a)72 315.6 R -.2(vo)-.2 G 3.261(iding f).2 F 3.261 (alse out-of-space errors.)-.1 F 3.345 (When running with soft updates on a nearly full)72 327.6 R .833 (\214lesystem with high turno)72 339.6 R -.15(ve)-.15 G 3.333(rr).15 G .833(ate \(for e)-3.333 F .832(xample when)-.15 F .757 (installing a whole ne)72 351.6 R 3.257(ws)-.25 G .757 (et of binaries on a root parti-)-3.257 F 1.019 (tion\), the \214lesystem can return a \214lesystem full error)72 363.6 R -2.15 -.25(ev e)72 375.6 T 3.134(nt).25 G .634 (hough it reports that it has plenty of free space.)-3.134 F 3.35 (The \214lesystem full message happens because soft)72 387.6 R 1.526 (updates has not managed to free the space from the)72 399.6 R .957 (old binaries in time for it to be a)72 411.6 R -.25(va)-.2 G .956 (ilable for the ne).25 F(w)-.25 E(binaries.)72 423.6 Q 1.177 (The initial attempt to correct this problem w)97 439.2 R(as)-.1 E 2.555 (to simply ha)72 451.2 R 2.855 -.15(ve t)-.2 H 2.555 (he process that wished to allocate).15 F 1.159(space w)72 463.2 R 1.159 (ait for the free space to sho)-.1 F 3.659(wu)-.25 G 3.659(p. The)-3.659 F(prob-)3.659 E .749(lem with this approach is that it often had to w)72 475.2 R .749(ait for)-.1 F .594(up to a minute.)72 487.2 R .595 (In addition to making the application)5.594 F .865 (seem intolerably slo)72 499.2 R 2.165 -.65(w, i)-.25 H 3.365(tu).65 G .865(sually held a lock)-3.365 F .865(ed vnode)-.1 F 1.656 (which could cause other applications to get block)72 511.2 R(ed)-.1 E -.1(wa)72 523.2 S .291(iting for it to become a).1 F -.25(va)-.2 G .291 (ilable \(often referred to as).25 F 4.084(al)72 535.2 S 1.584 (ock race to the root of the \214lesystem\).)-4.084 F(Although)6.585 E 1.699(the condition w)72 547.2 R 1.699(ould clear in a minute or tw)-.1 F 1.698(o, users)-.1 F 1.065 (often assumed that their system had hung and w)72 559.2 R(ould)-.1 E (reboot.)72 571.2 Q 3.779 -.8(To r)97 586.8 T 2.178 (emedy this problem, the solution de).8 F(vised)-.25 E(for)72 598.8 Q F1 (UFS2)2.749 E F0 .249(is to co-opt the process that w)2.749 F .25 (ould otherwise)-.1 F 2.038(be block)72 610.8 R 2.038 (ed and put it to w)-.1 F 2.037(ork helping soft updates)-.1 F .6 (process the \214les to be freed.)72 622.8 R .6(The more processes try-) 5.6 F .194(ing to allocate space, the more help that is a)72 634.8 R -.25(va)-.2 G .194(ilable to).25 F .164(soft updates and the f)72 646.8 R .165(aster free blocks be)-.1 F .165(gin to appear)-.15 F(.)-.55 E 1.016(Usually in under one second enough space sho)72 658.8 R 1.016 (ws up)-.25 F .537 (that the processes can return to their original task and)72 670.8 R 1.643(proceed to completion.)72 682.8 R 1.643(The ef)6.643 F 1.642 (fect of this change is)-.25 F .368(that soft updates can no)72 694.8 R 2.868(wb)-.25 G 2.868(eu)-2.868 G .368(sed on small nearly full)-2.868 F (\214lesystems with high turno)72 706.8 Q -.15(ve)-.15 G -.55(r.).15 G F2(6.)326.4 87 Q/F3 12/Times-Bold@0 SF(Enhancements f)5 E(or Li)-.3 E .24 -.12(ve D)-.12 H(umps).12 E F0 3.653<418c>326.4 102.6 S 1.152 (lesystem snapshot is a frozen image of a \214lesys-)-3.653 F .357 (tem at a gi)326.4 114.6 R -.15(ve)-.25 G 2.858(ni).15 G .358 (nstant in time.)-2.858 F .358(Snapshots support se)5.358 F(v-)-.25 E .076(eral important features: the ability to pro)326.4 126.6 R .075 (vide back-ups)-.15 F 1.237(of the \214lesystem at se)326.4 138.6 R -.15 (ve)-.25 G 1.238(ral times during the day).15 F 3.738(,t)-.65 G(he) -3.738 E 1.76(ability to do reliable dumps of li)326.4 150.6 R 2.06 -.15 (ve \214)-.25 H 1.76(lesystems, and).15 F 2.263 (the ability to run a \214lesystem check program on a)326.4 162.6 R (acti)326.4 174.6 Q 4.695 -.15(ve s)-.25 H 4.394 (ystem to reclaim lost blocks and inodes).15 F([McK)326.4 186.6 Q (usick, 2002].)-.15 E -.4(Wi)351.4 202.2 S 2.928(th the adv).4 F 2.928 (ent of \214lesystem snapshots, the)-.15 F F2(dump)326.4 214.2 Q F0 .062 (program has been enhanced to safely dump li)2.563 F -.15(ve)-.25 G 4.39 (\214lesystems. When)326.4 226.2 R(gi)4.39 E -.15(ve)-.25 G 4.39(nt).15 G(he)-4.39 E F1(-L)4.391 E F0(\215ag,)4.391 E F2(dump)4.391 E F0 -.15 (ve)4.391 G(ri\214es).15 E 1.191(that it is being ask)326.4 238.2 R 1.191(ed to dump a mounted \214lesystem,)-.1 F .483(then tak)326.4 250.2 R .483(es a snapshot of the \214lesystem and dumps the)-.1 F 3.283 (snapshot instead of on the li)326.4 262.2 R 3.583 -.15(ve \214)-.25 H 5.782(lesystem. When).15 F F2(dump)326.4 274.2 Q F0 (completes, it releases the snapshot.)2.5 E 2.115 (The initial implementation of li)351.4 289.8 R 2.415 -.15(ve d)-.25 H 2.115(umps had).15 F(the)326.4 301.8 Q F2(dump)3.182 E F0 .681 (program do the `)3.182 F(`mount')-.74 E 3.181('s)-.74 G .681 (ystem call itself)-3.181 F .971(to tak)326.4 313.8 R 3.471(et)-.1 G .971(he snapshot.)-3.471 F(Ho)5.971 E(we)-.25 E -.15(ve)-.25 G 1.772 -.4 (r, m).15 H .972(ost systems require).4 F .872(root pri)326.4 325.8 R (vile)-.25 E .871(ge to use the `)-.15 F(`mount')-.74 E 3.371('s)-.74 G .871(ystem call.)-3.371 F(Since)5.871 E .306 (dumps are often done by the)326.4 337.8 R/F4 10/Times-Italic@0 SF(oper) 2.807 E(ator)-.15 E F0 .307(user rather than)2.807 F F4 -.45(ro)326.4 349.8 S(ot).45 E F0 2.5(,a)C 2.5(na)-2.5 G(ttempt to tak)-2.5 E 2.5(eas) -.1 G(napshot will f)-2.5 E(ail.)-.1 E 4.492 -.8(To g)351.4 365.4 T 2.891(et around this problem, a ne).8 F 5.391(ws)-.25 G(et-user)-5.391 E (-)-.2 E(identi\214er)326.4 377.4 Q F4 -.45(ro)12.906 G(ot).45 E F0 10.406(program w)12.906 F 10.406(as written called)-.1 F F2(mksnap_ffs) 326.4 389.4 Q F0 7.962(.T)C(he)-7.962 E F2(mksnap_ffs)5.462 E F0 2.962 (command creates a)5.462 F .581(snapshot with a gi)326.4 401.4 R -.15 (ve)-.25 G 3.081(nn).15 G .581(ame on a speci\214ed \214lesystem.)-3.081 F .079(The snapshot \214le must be contained within the \214lesys-)326.4 413.4 R 1.517(tem being snapshotted.)326.4 425.4 R 1.517(The group o) 6.517 F 1.517(wnership of the)-.25 F 1.548(\214le is set to)326.4 437.4 R F4(oper)4.048 E(ator)-.15 E F0 4.048(;t)C 1.548(he o)-4.048 F 1.548 (wner of the \214le remains)-.25 F F4 -.45(ro)326.4 449.4 S(ot).45 E F0 5.999(.T)C .999(he mode of the snapshot is set to be readable)-5.999 F (by the o)326.4 461.4 Q(wner or members of the)-.25 E F4(oper)2.5 E (ator)-.15 E F0(group.)2.5 E(The)351.4 477 Q F2(dump)4.426 E F0 1.926 (program no)4.426 F 4.426(wi)-.25 G -1.9 -.4(nv o)-4.426 H -.1(ke).4 G (s).1 E F2(mksnap_ffs)4.426 E F0 1.71 (to create the snapshot rather than trying to create it)326.4 489 R (directly)326.4 501 Q 5.092(.T)-.65 G .091(he result is that an)-5.092 F .091(yone with)-.15 F F4(oper)2.591 E(ator)-.15 E F0(pri)2.591 E(vi-) -.25 E(le)326.4 513 Q 2.677(ges can no)-.15 F 5.177(wr)-.25 G 2.677 (eliably tak)-5.177 F 5.177(el)-.1 G -2.15 -.25(iv e)-5.177 H 5.178 (dumps. Allo)5.427 F(wing)-.25 E F4(oper)326.4 525 Q(ator)-.15 E F0 1.141(group access to the snapshot does not open)3.642 F(an)326.4 537 Q 2.733(yn)-.15 G .733 -.25(ew s)-2.733 H .233(ecurity holes since the ra) .25 F 2.733(wd)-.15 G .233(isk is also read-)-2.733 F .493 (able by members of the)326.4 549 R F4(oper)2.992 E(ator)-.15 E F0 .492 (group \(for the bene-)2.992 F .458(\214t of traditional)326.4 561 R F2 (dump)2.959 E F0 2.959(\). Thus,)B .459(the information that is)2.959 F -.2(av)326.4 573 S .349 (ailable in the snapshot can also be accessed directly)-.05 F (through the disk de)326.4 585 Q(vice.)-.25 E F2(7.)326.4 612 Q F3(Lar)5 E(ge Filesystem Snapshots)-.12 E F0 .373 (Creating and using a snapshot requires random access)326.4 627.6 R 3.762(to the snapshot \214le.)326.4 639.6 R 3.761 (The creation of a snapshot)8.761 F .391 (requires the inspection and cop)326.4 651.6 R .391(ying of all the c) -.1 F(ylinder)-.15 E .312(group maps.)326.4 663.6 R .311 (Once in operation, e)5.312 F -.15(ve)-.25 G .311(ry write operation).15 F .775(to the \214lesystem must check whether the block being)326.4 675.6 R 4.022(written needs to be copied.)326.4 687.6 R 4.021 (The information on)9.021 F 1.462 (whether a blocks needs to be copied is contained in)326.4 699.6 R 5.338 (the snapshot \214le metadata \(its indirect blocks\).)326.4 711.6 R 0 Cg EP %%Page: 8 8 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Roman@0 SF(Ideally)72 84 Q 3.417(,t)-.65 G .917 (his metadata w)-3.417 F .918(ould be resident in the k)-.1 F(ernel)-.1 E 1.639(memory throughout the lifetime of the snapshot.)72 96 R(In)6.638 E(Free)72 108 Q/F1 9/Times-Roman@0 SF(BSD)A F0 3.275(,t)C .775 (he entire ph)-3.275 F .775(ysical memory on the machine)-.05 F .743 (can be used to cache \214le data pages if the memory is)72 120 R 2.488 (not needed for other purposes.)72 132 R(Unfortunately)7.489 E 4.989(,d) -.65 G(ata)-4.989 E 2.444 (pages associated with disks can only be cached in)72 144 R .405 (pages mapped into the k)72 156 R .406(ernel ph)-.1 F .406 (ysical memory)-.05 F 5.406(.O)-.65 G(nly)-5.406 E .537(about 10 me)72 168 R -.05(ga)-.15 G .536(bytes of k).05 F .536 (ernel memory is dedicated to)-.1 F .685(such purposes.)72 180 R .685 (Assuming that we allo)5.685 F 3.186(wu)-.25 G 3.186(pt)-3.186 G 3.186 (oh)-3.186 G .686(alf of)-3.186 F 2.681(this space to be used for an)72 192 R 5.181(ys)-.15 G 2.681(ingle snapshot, the)-5.181 F(lar)72 204 Q 1.146(gest snapshot whose metadata that we can hold in)-.18 F 1.319 (memory is 11 me)72 216 R -.05(ga)-.15 G 3.819(bytes. W).05 F 1.319 (ithout help, such a tin)-.4 F(y)-.15 E .668(cache w)72 228 R .669 (ould be hopeless in trying to support a multi-)-.1 F (terabyte snapshot.)72 240 Q .89(In an ef)97 255.6 R .89 (fort to support multi-terabyte snapshots)-.25 F .61(with the tin)72 267.6 R 3.11(ym)-.15 G .61(etadata cache a)-3.11 F -.25(va)-.2 G .61 (ilable, it is necessary).25 F 1.226(to observ)72 279.6 R 3.726(et)-.15 G 1.226(he access patterns on typical \214lesystems.)-3.726 F .534 (The snapshot is only consulted for \214les that are being)72 291.6 R 3.784(written. The)72 303.6 R 1.283(\214lesystem is or)3.784 F -.05(ga) -.18 G 1.283(nized around c).05 F(ylinder)-.15 E 3.069 (groups which maps small contiguous areas of the)72 315.6 R 3.857 (disk. W)72 327.6 R 1.357(ithin a directory)-.4 F 3.856(,t)-.65 G 1.356 (he \214lesystem tries to allo-)-3.856 F 3.021 (cate all the inodes and \214les in the same c)72 339.6 R(ylinder)-.15 E 5.214(group. When)72 351.6 R(mo)5.214 E 2.714 (ving between directories dif)-.15 F(ferent)-.25 E -.15(cy)72 363.6 S 4.161(linder groups are usually inspected.).15 F 4.162(Thus, the)9.162 F 5.176(widely random beha)72 375.6 R 5.176(vior occurs from mo)-.2 F -.15 (ve)-.15 G(ment).15 E 2.12(between c)72 387.6 R 2.12(ylinder groups.) -.15 F 2.12(Once \214le writing acti)7.12 F(vity)-.25 E 3.637 (settles do)72 399.6 R 3.636(wn into a c)-.25 F 3.636 (ylinder group, only a small)-.15 F 1.521 (amount of snapshot metadata needs to be consulted.)72 411.6 R 1.718 (That metadata will easily \214t in e)72 423.6 R -.15(ve)-.25 G 4.218 (nt).15 G 1.717(he tin)-4.218 F 4.217(yk)-.15 G(ernel)-4.317 E .206 (metadata cache.)72 435.6 R .206(So, the need is to \214nd a w)5.206 F .207(ay to a)-.1 F -.2(vo)-.2 G(id).2 E 2.199 (thrashing the cache when mo)72 447.6 R 2.198(ving between c)-.15 F (ylinder)-.15 E(groups.)72 459.6 Q 2.86(The technique used to a)97 475.2 R -.2(vo)-.2 G 2.86(id thrashing when).2 F(mo)72 487.2 Q 2.225 (ving between c)-.15 F 2.225(ylinder groups is to b)-.15 F 2.225 (uild a look)-.2 F 1.339 (aside table of all the blocks that were copied during)72 499.2 R .915 (the time that the snapshot w)72 511.2 R .914(as made.)-.1 F .914 (This table lists)5.914 F 1.481 (the blocks associated with all the snapshot metadata)72 523.2 R .135 (blocks, the c)72 535.2 R .135 (ylinder groups maps, the super block, and)-.15 F 1.634 (blocks that contain acti)72 547.2 R 1.934 -.15(ve i)-.25 H 4.134 (nodes. When).15 F 4.135(ac)4.135 G(op)-4.135 E(y-on-)-.1 E .827 (write f)72 559.2 R .826 (ault occurs for a block, the \214rst step is to con-)-.1 F .663 (sult this table.)72 571.2 R .664 (If the block is found in the table, then)5.663 F 1.582 (no further searching needs to be done in an)72 583.2 R 4.082(yo)-.15 G 4.082(ft)-4.082 G(he)-4.082 E 2.51(snapshots. If)72 595.2 R .01 (the block is not found, then the metadata)2.51 F .148(of each acti)72 607.2 R .448 -.15(ve s)-.25 H .148 (napshot on the \214lesystem must be con-).15 F 1.408 (sulted to see if a cop)72 619.2 R 3.908(yi)-.1 G 3.908(sn)-3.908 G 3.908(eeded. This)-3.908 F 1.408(table lookup)3.908 F(sa)72 631.2 Q -.15 (ve)-.2 G 3.807(st).15 G 1.307(ime as it not only a)-3.807 F -.2(vo)-.2 G 1.306(ids f).2 F 1.306(aulting in metadata)-.1 F .201 (for widely scattered blocks, b)72 643.2 R .201(ut it also a)-.2 F -.2 (vo)-.2 G .201(ids the need).2 F(to consult potentially man)72 655.2 Q 2.5(ys)-.15 G(napshots.)-2.5 E 5.664 (Another problem with snapshots on lar)97 670.8 R(ge)-.18 E 1.528 (\214lesystems is that the)72 682.8 R 4.028(ya)-.15 G(ggra)-4.028 E -.25 (va)-.2 G 1.529(ted e).25 F 1.529(xisting deadlock)-.15 F 3.35 (problems. When)72 694.8 R .85(there are multiple snapshots associ-)3.35 F 1.007(ated with a \214lesystem, the)72 706.8 R 3.507(ya)-.15 G 1.007 (re k)-3.507 F 1.007(ept in a list ordered)-.1 F .797 (from oldest to youngest.)72 718.8 R .797(When a cop)5.797 F .797 (y-on-write f)-.1 F(ault)-.1 E 0 0 198 198 -73 73 336.6 163 PBEGIN %%BeginDocument: snapdeadlk.eps %%Title: stdin %%Creator: fig2dev Version 3.2 Patchlevel 3d %%CreationDate: Fri May 9 23:05:05 2003 %%For: mckusick@beastie.mckusick.com (Kirk McKusick) %%BoundingBox: 0 0 198 73 %%Magnification: 1.0000 %%EndComments /$F2psDict 200 dict def $F2psDict begin $F2psDict /mtrx matrix put /col-1 {0 setgray} bind def /col0 {0.000 0.000 0.000 srgb} bind def /col1 {0.000 0.000 1.000 srgb} bind def /col2 {0.000 1.000 0.000 srgb} bind def /col3 {0.000 1.000 1.000 srgb} bind def /col4 {1.000 0.000 0.000 srgb} bind def /col5 {1.000 0.000 1.000 srgb} bind def /col6 {1.000 1.000 0.000 srgb} bind def /col7 {1.000 1.000 1.000 srgb} bind def /col8 {0.000 0.000 0.560 srgb} bind def /col9 {0.000 0.000 0.690 srgb} bind def /col10 {0.000 0.000 0.820 srgb} bind def /col11 {0.530 0.810 1.000 srgb} bind def /col12 {0.000 0.560 0.000 srgb} bind def /col13 {0.000 0.690 0.000 srgb} bind def /col14 {0.000 0.820 0.000 srgb} bind def /col15 {0.000 0.560 0.560 srgb} bind def /col16 {0.000 0.690 0.690 srgb} bind def /col17 {0.000 0.820 0.820 srgb} bind def /col18 {0.560 0.000 0.000 srgb} bind def /col19 {0.690 0.000 0.000 srgb} bind def /col20 {0.820 0.000 0.000 srgb} bind def /col21 {0.560 0.000 0.560 srgb} bind def /col22 {0.690 0.000 0.690 srgb} bind def /col23 {0.820 0.000 0.820 srgb} bind def /col24 {0.500 0.190 0.000 srgb} bind def /col25 {0.630 0.250 0.000 srgb} bind def /col26 {0.750 0.380 0.000 srgb} bind def /col27 {1.000 0.500 0.500 srgb} bind def /col28 {1.000 0.630 0.630 srgb} bind def /col29 {1.000 0.750 0.750 srgb} bind def /col30 {1.000 0.880 0.880 srgb} bind def /col31 {1.000 0.840 0.000 srgb} bind def end save newpath 0 73 moveto 0 0 lineto 198 0 lineto 198 73 lineto closepath clip newpath -20.5 90.0 translate 1 -1 scale /cp {closepath} bind def /ef {eofill} bind def /gr {grestore} bind def /gs {gsave} bind def /sa {save} bind def /rs {restore} bind def /l {lineto} bind def /m {moveto} bind def /rm {rmoveto} bind def /n {newpath} bind def /s {stroke} bind def /sh {show} bind def /slc {setlinecap} bind def /slj {setlinejoin} bind def /slw {setlinewidth} bind def /srgb {setrgbcolor} bind def /rot {rotate} bind def /sc {scale} bind def /sd {setdash} bind def /ff {findfont} bind def /sf {setfont} bind def /scf {scalefont} bind def /sw {stringwidth} bind def /tr {translate} bind def /tnt {dup dup currentrgbcolor 4 -2 roll dup 1 exch sub 3 -1 roll mul add 4 -2 roll dup 1 exch sub 3 -1 roll mul add 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb} bind def /shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul 4 -2 roll mul srgb} bind def /$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def /$F2psEnd {$F2psEnteredState restore end} def $F2psBegin 10 setmiterlimit 0.06000 0.06000 sc % % Fig objects follow % /Times-Roman ff 150.00 scf sf 2925 1500 m gs 1 -1 sc (to check write) dup sw pop 2 div neg 0 rm col0 sh gr % Polyline 7.500 slw n 2325 300 m 3525 300 l 3525 975 l 2325 975 l cp gs col0 s gr /Times-Roman ff 150.00 scf sf 1050 675 m gs 1 -1 sc (snap1) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 2925 675 m gs 1 -1 sc (snap2) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 1050 1200 m gs 1 -1 sc (locked by process A) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 1050 1350 m gs 1 -1 sc (waiting for snap2 lock) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 1050 1500 m gs 1 -1 sc (to check write) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 2925 1200 m gs 1 -1 sc (locked by process B) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman ff 150.00 scf sf 2925 1350 m gs 1 -1 sc (waiting for snap1 lock) dup sw pop 2 div neg 0 rm col0 sh gr % Polyline n 450 300 m 1650 300 l 1650 975 l 450 975 l cp gs col0 s gr $F2psEnd rs %%EndDocument end PEND/F2 10/Times-Bold@0 SF(Figur)359.13 181 Q 2.5(e2)-.18 G F0(:) -2.5 E/F3 10/Times-Italic@0 SF(Snapshot deadloc)2.5 E 2.5(ks)-.2 G (cenario)-2.5 E F0 3.714(occurs, the list is tra)326.4 205 R -.15(ve)-.2 G 3.715(rsed letting each snapshot).15 F .859(decide if it needs to mak) 326.4 217 R 3.358(eac)-.1 G(op)-3.358 E 3.358(yo)-.1 G 3.358(ft)-3.358 G .858(he block that is)-3.358 F 1.375(about to be written.)326.4 229 R (Originally)6.376 E 3.876(,e)-.65 G 1.376(ach snapshot inode)-3.876 F 1.782(had its o)326.4 241 R 1.781(wn lock.)-.25 F 4.281(Ad)6.781 G 1.781 (eadlock could occur between)-4.281 F(tw)326.4 253 Q 2.985(op)-.1 G .485 (rocesses each trying to do a write.)-2.985 F .486(Consider the)5.486 F -.15(ex)326.4 265 S 2.224(ample in Fig. 2.).15 F 2.223(It sho)7.223 F 2.223(ws a \214lesystem with tw)-.25 F(o)-.1 E 1.758 (snapshots, snap1 and snap2.)326.4 277 R 1.759(Process A holds snap-) 6.759 F .649(shot 1 lock)326.4 289 R .649 (ed and process B holds snapshot 2 lock)-.1 F(ed.)-.1 E .776 (Both snap1 and snap2 ha)326.4 301 R 1.077 -.15(ve d)-.2 H .777 (ecided that the).15 F 3.277(yn)-.15 G .777(eed to)-3.277 F 1.33 (allocate a ne)326.4 313 R 3.829(wb)-.25 G 1.329 (lock in which to hold a cop)-3.829 F 3.829(yo)-.1 G 3.829(ft)-3.829 G (he)-3.829 E 2.094(block being written by the process that holds them) 326.4 325 R(lock)326.4 337 Q 3.938(ed. The)-.1 F 1.438 (writing of the ne)3.938 F 3.938(wb)-.25 G 1.437(lock in snapshot 1) -3.938 F .255(will cause the k)326.4 349 R .256 (ernel running in the conte)-.1 F .256(xt of process)-.15 F 3.018(At) 326.4 361 S 3.018(os)-3.018 G .518 (can the list of snapshots which will get block)-3.018 F(ed)-.1 E .936 (at snapshot 2 because it is held lock)326.4 373 R .936 (ed by process B.)-.1 F .038(Meanwhile, the writing of the ne)326.4 385 R 2.538(wb)-.25 G .037(lock in snapshot 2)-2.538 F .255 (will cause the k)326.4 397 R .256(ernel running in the conte)-.1 F .256 (xt of process)-.15 F 3.073(Bt)326.4 409 S 3.073(os)-3.073 G .573 (can the list of snapshots which will get block)-3.073 F(ed)-.1 E (at snapshot 1 because it is held lock)326.4 421 Q(ed by process A.)-.1 E 2.353(The resolution to the deadlock problem is to)351.4 436.6 R .398 (allocate a single lock that is used for all the snapshots)326.4 448.6 R .656(on a \214lesystem.)326.4 460.6 R .657(When a ne)5.656 F 3.157(ws) -.25 G .657(napshot is created, the)-3.157 F -.1(ke)326.4 472.6 S 1.523 (rnel checks whether there are an).1 F 4.023(yo)-.15 G 1.522 (ther snapshots)-4.023 F .096(on the \214lesystem.)326.4 484.6 R .097 (If there are, the per)5.096 F .097(-\214le lock associ-)-.2 F 3.03 (ated with the ne)326.4 496.6 R 5.53(ws)-.25 G 3.03 (napshot inode is released and)-5.53 F 1.645 (replaced with the lock used for the other snapshots.)326.4 508.6 R -.4 (Wi)326.4 520.6 S .299 (th only a single lock, the access to the snapshots as).4 F 2.815(aw) 326.4 532.6 S .315(hole are serialized.)-2.815 F .315 (Thus, in Fig. 2, process B will)5.315 F .639 (hold the lock for all the snapshots and will be able to)326.4 544.6 R (mak)326.4 556.6 Q 3.048(et)-.1 G .549 (he necessary checks and updates while process)-3.048 F 2.923(Aw)326.4 568.6 S .422(ill be held w)-2.923 F 2.922(aiting. Once)-.1 F .422 (process B completes its)2.922 F 1.475 (scan, process A will be able to get access to all the)326.4 580.6 R .288(snapshots and will be able to run successfully to com-)326.4 592.6 R 5.491(pletion. Because)326.4 604.6 R 2.992 (of the added serialization of the)5.491 F 1.208 (snapshot lookups, the look-aside table described ear)326.4 616.6 R(-) -.2 E .93(lier is important to ensure reasonable performance of)326.4 628.6 R 3.352(snapshots. In)326.4 640.6 R -.05(ga)3.352 G .851 (thering statistics on our running sys-).05 F 3.369 (tems, we found that the look-aside table resolv)326.4 652.6 R(es)-.15 E 2.765(nearly half of the snapshot cop)326.4 664.6 R 2.765 (y-on-write lookups.)-.1 F 2.212 (Thus, we found that the look-aside table k)326.4 676.6 R 2.213 (eeps the)-.1 F .445 (contention for the snapshot lock to a reasonable le)326.4 688.6 R -.15 (ve)-.25 G(l.).15 E 0 Cg EP %%Page: 9 9 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Bold@0 SF(8.)72 87 Q/F1 12/Times-Bold@0 SF (Running Fsck on Lar)5 E(ge Filesystems)-.12 E/F2 10/Times-Roman@0 SF -.35(Tr)72 102.6 S(aditionally).35 E 4.113(,a)-.65 G 1.613 (fter an unclean system shutdo)-4.113 F 1.614(wn, the)-.25 F .015 (\214lesystem check program,)72 114.6 R F0(fsck)2.515 E F2 2.515(,h)C .014(as had to be run o)-2.515 F -.15(ve)-.15 G(r).15 E 1.955 (all inodes in a \214lesystem to ascertain which inodes)72 126.6 R .511 (and blocks are in use and to correct the bitmaps.)72 138.6 R(The)5.511 E .573(current implementation of soft updates guarantees the)72 150.6 R (consistenc)72 162.6 Q 3.79(yo)-.15 G 3.79(fa)-3.79 G 1.29 (ll \214lesystem resources, including the)-3.79 F .65 (inode and block bitmaps.)72 174.6 R -.4(Wi)5.65 G .65 (th soft updates, the only).4 F(inconsistenc)72 186.6 Q 3.369(yt)-.15 G .869(hat can arise in the \214lesystem \(barring)-3.369 F(softw)72 198.6 Q .758(are b)-.1 F .759(ugs and media f)-.2 F .759 (ailures\) is that some unref-)-.1 F 2.305 (erenced blocks may not appear in the bitmaps and)72 210.6 R .213 (some inodes may ha)72 222.6 R .513 -.15(ve t)-.2 H 2.713(oh).15 G -2.25 -.2(av e)-2.713 H -.15(ove)2.913 G .214(rly high link counts).15 F 2.527 (reduced. Thus,)72 234.6 R .027(it is completely safe to be)2.527 F .026 (gin using the)-.15 F 2.79 (\214lesystem after a crash without \214rst running)72 246.6 R F0(fsck) 5.29 E F2(.)A(Ho)72 258.6 Q(we)-.25 E -.15(ve)-.25 G 3.585 -.4(r, s).15 H 2.784(ome \214lesystem space may be lost after).4 F 1.166(each crash.) 72 270.6 R 1.166(Thus, there is a v)6.166 F 1.166(ersion of)-.15 F F0 (fsck)3.666 E F2 1.166(that can)3.666 F .949 (run in the background on an acti)72 282.6 R 1.249 -.15(ve \214)-.25 H .949(lesystem to \214nd).15 F 2.151(and reco)72 294.6 R -.15(ve)-.15 G 4.651(ra).15 G 2.451 -.15(ny l)-4.651 H 2.151 (ost blocks and adjust inodes with).15 F -.15(ove)72 306.6 S .911 (rly high link counts.).15 F 3.411(As)5.911 G .911(pecial case of the o) -3.411 F -.15(ve)-.15 G(rly).15 E 1.397 (high link count is one that should be zero.)72 318.6 R 1.397(Such an) 6.397 F .085(inode will be freed as part of reducing its link count to) 72 330.6 R 2.618(zero. This)72 342.6 R -.05(ga)2.618 G .119 (rbage collection task is less dif).05 F .119(\214cult than)-.25 F .535 (it might at \214rst appear)72 354.6 R 3.035(,s)-.4 G .535(ince this v) -3.035 F .535(ersion of)-.15 F F0(fsck)3.035 E F2(only)3.035 E .123 (needs to identify resources that are not in use and can-)72 366.6 R 1.958(not be allocated or accessed by the running system)72 378.6 R ([McK)72 390.6 Q(usick & Ganger)-.15 E 2.5(,1)-.4 G(999].)-2.5 E -.4(Wi) 97 406.2 S 5.958(th the addition of snapshots, the task).4 F 1.792 (becomes simple, requiring only minor modi\214cations)72 418.2 R 4.657 (to the standard)72 430.2 R F0(fsck)7.157 E F2 9.657(.W)C 4.657 (hen run in background)-9.657 F .752(cleanup mode,)72 442.2 R F0(fsck) 3.251 E F2 .751(starts by taking a snapshot of the)3.251 F 2.353 (\214lesystem to be check)72 454.2 R(ed.)-.1 E F0(Fsck)7.354 E F2 2.354 (then runs o)4.854 F -.15(ve)-.15 G 4.854(rt).15 G(he)-4.854 E .382 (snapshot \214lesystem image doing its usual calculations)72 466.2 R .288(just as in its normal operation.)72 478.2 R .288 (The only other change)5.288 F .317 (comes at the end of its run, when it w)72 490.2 R .316 (ants to write out)-.1 F .76(the updated v)72 502.2 R .76 (ersions of the bitmaps.)-.15 F .76(Here, the modi-)5.76 F(\214ed)72 514.2 Q F0(fsck)3.704 E F2(tak)3.704 E 1.204 (es the set of blocks that it \214nds were in)-.1 F 1.372 (use at the time of the snapshot and remo)72 526.2 R -.15(ve)-.15 G 3.872(st).15 G 1.372(his set)-3.872 F .573(from the set mark)72 538.2 R .572(ed as in use at the time of the snap-)-.1 F 1.18(shot\212the dif)72 550.2 R 1.18(ference is the set of lost blocks.)-.25 F 1.18(It also)6.18 F .967(constructs the list of inodes whose counts need to be)72 562.2 R (adjusted.)72 574.2 Q F0(Fsck)5.932 E F2 .932(then uses a ne)3.432 F 3.432(ws)-.25 G .933(ystem call to notify)-3.432 F 1.642 (the \214lesystem of the identi\214ed lost blocks so that it)72 586.2 R 1.049(can replace them in its bitmaps.)72 598.2 R 1.049(It also gi)6.049 F -.15(ve)-.25 G 3.549(st).15 G 1.049(he set)-3.549 F .102 (of inodes whose link counts need to be adjusted; those)72 610.2 R 1.278 (inodes whose link count is reduced to zero are trun-)72 622.2 R .398 (cated to zero length and freed.)72 634.2 R(When)5.397 E F0(fsck)2.897 E F2(completes,)2.897 E(it releases its snapshot [McK)72 646.2 Q (usick, 2002].)-.15 E 1.925(As \214lesystems ha)97 661.8 R 2.226 -.15 (ve g)-.2 H 1.926(otten bigger the time to).15 F 2.882 (run either a fore)72 673.8 R 2.881(ground or a background)-.15 F F0 (fsck)5.381 E F2(has)5.381 E .349(increased to multiple hours.)72 685.8 R .349(Being able to run)5.349 F F0(fsck)2.849 E F2(in)2.849 E 2.739 (background has lar)72 697.8 R 2.738(gely mitig)-.18 F 2.738 (ated the running time)-.05 F 2.31(issue because it allo)72 709.8 R 2.31 (ws normal system operation to)-.25 F(proceed in parallel.)72 721.8 Q 2.73(Another problem with running)351.4 84 R F0(fsck)5.23 E F2 2.73 (on lar)5.23 F(ge)-.18 E .317 (\214lesystems is that the memory that it consumes gro)326.4 96 R(ws) -.25 E 2.788(in proportional to the size of the \214lesystem being)326.4 108 R(check)326.4 120 Q 4.075(ed. The)-.1 F 1.576 (main consumption of memory is four)4.075 F 1.264(bytes per re)326.4 132 R 1.263(gular inode, 40 to 50 bytes per directory)-.15 F 1.985 (inode, and one bit per \214lesystem data block.)326.4 144 R 1.986(On a) 6.986 F(typical)326.4 156 Q/F3 9/Times-Roman@0 SF(UFS2)2.938 E F2 .438 (\214lesystem with 16 kilobyte blocks and 2)2.938 F 2.378 (kilobyte fragments, the data-block map requires 64)326.4 168 R(me)326.4 180 Q -.05(ga)-.15 G 3.8(bytes of memory per terabyte of \214lesystem.) .05 F(Because)326.4 192 Q F3(UFS2)3.512 E F2 1.012 (does not preallocate inodes, b)3.512 F 1.012(ut rather)-.2 F 1.388 (allocates the inodes as the)326.4 204 R 3.887(ya)-.15 G 1.387 (re needed, the memory)-3.887 F 1.092 (required is dependent on the number of \214les that are)326.4 216 R (created in the \214lesystem.)326.4 228 Q .4 LW 566.39 242.1 326.4 242.1 DL F0(fsck)480.555 251.6 Q F2(maximum)23.055 E 20.545(Files Dirs)393.205 263.6 R 12.79(memory checkable)19.58 F 12.5(Filesystem per)331.4 275.6 R 12.5(Tb per)2.5 F 15.975(Tb per)2.5 F 15.975(Tb \214lesystem)2.5 F 47.085(/usr 93M)331.4 293.6 R 19.3(15M 1200K)19.99 F(3Tb)32.78 E(/juk) 331.4 305.6 Q 22.185(ebox 243K)-.1 F 30.97(18K 66K)21.66 F(60Tb)27.78 E 566.39 310.1 326.4 310.1 DL 566.39 242.1 566.39 310.1 DL 326.4 242.1 326.4 310.1 DL F0 -.92(Ta)326.4 323.6 S .792(ble 1).92 F F2(:)A/F4 10 /Times-Italic@0 SF .791(Maximum \214lesystem sizes c)3.292 F(hec)-.15 E .791(kable by)-.2 F F0(fsck)3.291 E F4(on a 32-bit ar)388.41 335.6 Q -.15(ch)-.37 G(itectur).15 E(e)-.37 E F2 .567 (The number of \214les and directories in a \214lesys-)351.4 359.6 R .392(tem mak)326.4 371.6 R 2.891(eah)-.1 G .391(uge dif)-2.891 F .391 (ference in the amount of memory)-.25 F 1.014(required by)326.4 383.6 R F0(fsck)3.514 E F2 6.014(.T)C 1.014(able 1 sho)-6.814 F 1.014(ws the tw) -.25 F 3.514(oe)-.1 G 1.014(nds of the)-3.514 F 5.949(spectrum. At)326.4 395.6 R 3.448(one end is a typical Free)5.948 F F3(BSD)A F0(/usr)5.948 E F2 .561(\214lesystem assuming that it gre)326.4 407.6 R 3.061(wa)-.25 G 3.061(ti)-3.061 G .561(ts current \214le and)-3.061 F 1.46 (directory mix to \214ll a 1 terabyte disk.)326.4 419.6 R 1.46 (The memory)6.46 F .286(footprint of)326.4 431.6 R F0(fsck)2.786 E F2 .287(is dominated by the memory to man-)2.786 F 1.694 (age the inodes and w)326.4 443.6 R 1.694 (ould require the entire address)-.1 F .4 (space of a 32-bit processor for a \214lesystem of about 3)326.4 455.6 R 3.308(terabytes. At)326.4 467.6 R .808(the other e)3.308 F .807 (xtreme is the author')-.15 F(s)-.55 E F0(/juk)3.307 E(e-)-.1 E(box) 326.4 479.6 Q F2 .436(\214lesystem assuming that it gre)2.936 F 2.937 (wa)-.25 G 2.937(ti)-2.937 G .437(ts current \214le)-2.937 F .62 (and directory mix to \214ll a 1 terabyte disk.)326.4 491.6 R .62 (The mem-)5.62 F 1.088(ory footprint of)326.4 503.6 R F0(fsck)3.589 E F2 1.089(is dominated by the memory to)3.589 F 1.208 (manage the data blocks and w)326.4 515.6 R 1.207 (ould require the entire)-.1 F .414 (address space of a 32-bit processor for a \214lesystem of)326.4 527.6 R 1.835(about 60 terabytes.)326.4 539.6 R 1.835(My e)6.835 F 1.835 (xpectation is that as disks)-.15 F 1.072(get lar)326.4 551.6 R(ger)-.18 E 3.572(,t)-.4 G(he)-3.572 E 3.572(yw)-.15 G 1.072 (ill tend to be \214lled with lar)-3.572 F 1.072(ger \214les)-.18 F .037 (of audio and video.)326.4 563.6 R .037(Thus, in practice)5.037 F F0 (fsck)2.537 E F2 .037(will run out)2.537 F 1.404 (of space on 32-bit architectures at about 30 terabyte)326.4 575.6 R 3.74(\214lesystems. Hopefully)326.4 587.6 R 1.24 (by the time that such \214lesys-)3.74 F 2.841(tems are common, the) 326.4 599.6 R 5.341(yw)-.15 G 2.841(ill be running on 64-bit)-5.341 F (architectures.)326.4 611.6 Q 1.072(In the e)351.4 627.2 R -.15(ve)-.25 G 1.071(nt that).15 F F0(fsck)3.571 E F2 1.071(hits the memory limit of) 3.571 F .248(32-bit architectures, Julian Elischer has suggested that) 326.4 639.2 R 3.406(one solution is to implement an `)326.4 651.2 R(`of) -.74 E 3.405(\215ine, non-in-)-.25 F(place')326.4 663.2 Q 3.666('v)-.74 G 1.166(ersion of)-3.816 F F0(fsck)3.666 E F2 1.166 (using all those techniques we)3.666 F 2.103 (learned in CS101 relating to mag-tape mer)326.4 675.2 R 2.102 (ge sorts.)-.18 F F0(Fsck)326.4 687.2 Q F2 -.1(wo)3.87 G 1.37(uld ha).1 F 1.67 -.15(ve t)-.2 H 3.87(oh).15 G -2.25 -.2(av e)-3.87 H 3.87(as)4.07 G 1.37(mall \(20 gig)-3.87 F 1.37(abyte\) disk)-.05 F 1.624 (partition set aside to hold w)326.4 699.2 R 1.623 (orking \214les, to which it)-.1 F -.1(wo)326.4 711.2 S .98 (uld write \214les of records detailing block numbers,).1 F 0 Cg EP %%Page: 10 10 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Roman@0 SF 3.537(etc. Then)72 84 R 1.037(it w)3.537 F 1.037 (ould do mer)-.1 F 1.037(ge or block sorts on those)-.18 F .068 (\214les to get them in v)72 96 R .068 (arious orders \(depending on \214elds)-.25 F 1.067(in the records\).)72 108 R/F1 10/Times-Bold@0 SF(Fsck)6.066 E F0 -.1(wo)3.566 G 1.066 (uld then recombine them to).1 F 2.593 (\214nd such things as multiple referenced blocks and)72 120 R 2.348 (other \214le inconsistencies.)72 132 R 2.348(It w)7.348 F 2.347 (ould be slo)-.1 F 3.647 -.65(w, b)-.25 H 2.347(ut at).45 F 1.438 (least it could be used to check a 100 terabyte array)72 144 R(,)-.65 E 1.759(where the in-memory v)72 156 R 1.759(ersion w)-.15 F 1.758 (ould need a process)-.1 F 1.06(VM space of 13 Gig)72 168 R 1.06 (abytes which is clearly impossi-)-.05 F(ble on the 32-bit)72 180 Q/F2 9 /Times-Roman@0 SF(PC)2.5 E F0(.)A 2.456(Journalling \214lesystems pro)97 195.6 R 2.456(vide a much f)-.15 F(aster)-.1 E 2.812(state reco)72 207.6 R -.15(ve)-.15 G 2.812(ry than).15 F F1(fsck)5.312 E F0 7.812(.F)C 2.813 (or this reason, there is)-7.962 F 3.42(ongoing w)72 219.6 R 3.42 (ork to pro)-.1 F 3.42(vide a journalling option for)-.15 F F2(UFS2)72 231.6 Q F0 5.969(.H)C -.25(ow)-5.969 G -2.15 -.25(ev e).25 H 1.769 -.4 (r, e).25 H -.15(ve).15 G 3.469(nj).15 G .969 (ournalling \214lesystems need to)-3.469 F(ha)72 243.6 Q 1.438 -.15 (ve a \214)-.2 H 1.138(lesystem reco).15 F -.15(ve)-.15 G 1.137 (ry program such as).15 F F1(fsck)3.637 E F0 6.137(.I)C(n)-6.137 E 1.026 (the e)72 255.6 R -.15(ve)-.25 G 1.026(nt of media or softw).15 F 1.026 (are f)-.1 F 1.027(ailure, the \214lesystem)-.1 F 1.443 (can be damaged in w)72 267.6 R 1.442 (ays that the journal cannot \214x.)-.1 F .625 (Thus, the size of the reco)72 279.6 R -.15(ve)-.15 G .625 (ry program is an issue for).15 F .472(all \214lesystems.)72 291.6 R .472(Indeed, the f)5.472 F .472(act that)-.1 F F2(UFS)2.971 E F0 .471 (needs to use)2.971 F F1(fsck)72 303.6 Q F0 .917 (in its general operation ensures that)3.416 F F1(fsck)3.417 E F0 .917 (is k)3.417 F(ept)-.1 E .65(in good w)72 315.6 R .65 (orking order and is kno)-.1 F .65(wn to w)-.25 F .65(ork e)-.1 F -.15 (ve)-.25 G 3.15(no).15 G(n)-3.15 E -.15(ve)72 327.6 S(ry lar).15 E (ge \214lesystems.)-.18 E F1(9.)72 354.6 Q/F3 12/Times-Bold@0 SF -.24 (Pe)5 G(rf).24 E(ormance)-.3 E F0 .363(The performance of)72 370.2 R F2 (UFS2)2.864 E F0 .364(is nearly identical to that of)2.864 F F2(UFS1)72 382.2 Q F0 6.842(.T)C 1.842(his similarity in performance is hardly sur) -6.842 F(-)-.2 E 2.661(prising since the tw)72 394.2 R 5.161<6f8c>-.1 G 2.661(lesystem share most of the)-5.161 F 2.584 (same code base and use the same allocation algo-)72 406.2 R 5.526 (rithms. The)72 418.2 R 3.026(purpose of)5.526 F F2(UFS2)5.526 E F0 -.1 (wa)5.526 G 5.526(sn).1 G 3.026(ot to try and)-5.526 F(impro)72 430.2 Q .643 -.15(ve o)-.15 H 2.843(nt).15 G .343(he performance of)-2.843 F F2 (UFS1)2.843 E F0 .343(which is already)2.843 F .163 (within 80-95% of the bandwidth of the disk.)72 442.2 R .164(Rather it) 5.164 F -.1(wa)72 454.2 S 3.598(st).1 G 3.597(os)-3.598 G 1.097 (upport multi-terabyte \214lesystems and to pro-)-3.597 F .14(vide ne)72 466.2 R 2.64(wc)-.25 G .14(apabilities such as e)-2.64 F .14 (xtended attrib)-.15 F .14(utes with-)-.2 F .43(out losing performance.) 72 478.2 R .43(It has been successful in that)5.43 F(goal.)72 490.2 Q F1 (10.)72 517.2 Q F3(Futur)5 E 3(eW)-.216 G(ork)-3.9 E 0 0 218.4 251 -105.284 121 72 644.085 PBEGIN %%BeginDocument: metadata.eps %%Title: stdin %%Creator: fig2dev Version 3.2 Patchlevel 3d %%CreationDate: Fri May 9 23:05:05 2003 %%For: mckusick@beastie.mckusick.com (Kirk McKusick) %%BoundingBox: 0 0 251 121 %%Magnification: 1.0000 %%EndComments /$F2psDict 200 dict def $F2psDict begin $F2psDict /mtrx matrix put /col-1 {0 setgray} bind def /col0 {0.000 0.000 0.000 srgb} bind def /col1 {0.000 0.000 1.000 srgb} bind def /col2 {0.000 1.000 0.000 srgb} bind def /col3 {0.000 1.000 1.000 srgb} bind def /col4 {1.000 0.000 0.000 srgb} bind def /col5 {1.000 0.000 1.000 srgb} bind def /col6 {1.000 1.000 0.000 srgb} bind def /col7 {1.000 1.000 1.000 srgb} bind def /col8 {0.000 0.000 0.560 srgb} bind def /col9 {0.000 0.000 0.690 srgb} bind def /col10 {0.000 0.000 0.820 srgb} bind def /col11 {0.530 0.810 1.000 srgb} bind def /col12 {0.000 0.560 0.000 srgb} bind def /col13 {0.000 0.690 0.000 srgb} bind def /col14 {0.000 0.820 0.000 srgb} bind def /col15 {0.000 0.560 0.560 srgb} bind def /col16 {0.000 0.690 0.690 srgb} bind def /col17 {0.000 0.820 0.820 srgb} bind def /col18 {0.560 0.000 0.000 srgb} bind def /col19 {0.690 0.000 0.000 srgb} bind def /col20 {0.820 0.000 0.000 srgb} bind def /col21 {0.560 0.000 0.560 srgb} bind def /col22 {0.690 0.000 0.690 srgb} bind def /col23 {0.820 0.000 0.820 srgb} bind def /col24 {0.500 0.190 0.000 srgb} bind def /col25 {0.630 0.250 0.000 srgb} bind def /col26 {0.750 0.380 0.000 srgb} bind def /col27 {1.000 0.500 0.500 srgb} bind def /col28 {1.000 0.630 0.630 srgb} bind def /col29 {1.000 0.750 0.750 srgb} bind def /col30 {1.000 0.880 0.880 srgb} bind def /col31 {1.000 0.840 0.000 srgb} bind def end save newpath 0 121 moveto 0 0 lineto 251 0 lineto 251 121 lineto closepath clip newpath -47.0 150.9 translate 1 -1 scale /cp {closepath} bind def /ef {eofill} bind def /gr {grestore} bind def /gs {gsave} bind def /sa {save} bind def /rs {restore} bind def /l {lineto} bind def /m {moveto} bind def /rm {rmoveto} bind def /n {newpath} bind def /s {stroke} bind def /sh {show} bind def /slc {setlinecap} bind def /slj {setlinejoin} bind def /slw {setlinewidth} bind def /srgb {setrgbcolor} bind def /rot {rotate} bind def /sc {scale} bind def /sd {setdash} bind def /ff {findfont} bind def /sf {setfont} bind def /scf {scalefont} bind def /sw {stringwidth} bind def /tr {translate} bind def /tnt {dup dup currentrgbcolor 4 -2 roll dup 1 exch sub 3 -1 roll mul add 4 -2 roll dup 1 exch sub 3 -1 roll mul add 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb} bind def /shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul 4 -2 roll mul srgb} bind def /reencdict 12 dict def /ReEncode { reencdict begin /newcodesandnames exch def /newfontname exch def /basefontname exch def /basefontdict basefontname findfont def /newfont basefontdict maxlength dict def basefontdict { exch dup /FID ne { dup /Encoding eq { exch dup length array copy newfont 3 1 roll put } { exch newfont 3 1 roll put } ifelse } { pop pop } ifelse } forall newfont /FontName newfontname put newcodesandnames aload pop 128 1 255 { newfont /Encoding get exch /.notdef put } for newcodesandnames length 2 idiv { newfont /Encoding get 3 1 roll put } repeat newfontname newfont definefont pop end } def /isovec [ 8#055 /minus 8#200 /grave 8#201 /acute 8#202 /circumflex 8#203 /tilde 8#204 /macron 8#205 /breve 8#206 /dotaccent 8#207 /dieresis 8#210 /ring 8#211 /cedilla 8#212 /hungarumlaut 8#213 /ogonek 8#214 /caron 8#220 /dotlessi 8#230 /oe 8#231 /OE 8#240 /space 8#241 /exclamdown 8#242 /cent 8#243 /sterling 8#244 /currency 8#245 /yen 8#246 /brokenbar 8#247 /section 8#250 /dieresis 8#251 /copyright 8#252 /ordfeminine 8#253 /guillemotleft 8#254 /logicalnot 8#255 /hyphen 8#256 /registered 8#257 /macron 8#260 /degree 8#261 /plusminus 8#262 /twosuperior 8#263 /threesuperior 8#264 /acute 8#265 /mu 8#266 /paragraph 8#267 /periodcentered 8#270 /cedilla 8#271 /onesuperior 8#272 /ordmasculine 8#273 /guillemotright 8#274 /onequarter 8#275 /onehalf 8#276 /threequarters 8#277 /questiondown 8#300 /Agrave 8#301 /Aacute 8#302 /Acircumflex 8#303 /Atilde 8#304 /Adieresis 8#305 /Aring 8#306 /AE 8#307 /Ccedilla 8#310 /Egrave 8#311 /Eacute 8#312 /Ecircumflex 8#313 /Edieresis 8#314 /Igrave 8#315 /Iacute 8#316 /Icircumflex 8#317 /Idieresis 8#320 /Eth 8#321 /Ntilde 8#322 /Ograve 8#323 /Oacute 8#324 /Ocircumflex 8#325 /Otilde 8#326 /Odieresis 8#327 /multiply 8#330 /Oslash 8#331 /Ugrave 8#332 /Uacute 8#333 /Ucircumflex 8#334 /Udieresis 8#335 /Yacute 8#336 /Thorn 8#337 /germandbls 8#340 /agrave 8#341 /aacute 8#342 /acircumflex 8#343 /atilde 8#344 /adieresis 8#345 /aring 8#346 /ae 8#347 /ccedilla 8#350 /egrave 8#351 /eacute 8#352 /ecircumflex 8#353 /edieresis 8#354 /igrave 8#355 /iacute 8#356 /icircumflex 8#357 /idieresis 8#360 /eth 8#361 /ntilde 8#362 /ograve 8#363 /oacute 8#364 /ocircumflex 8#365 /otilde 8#366 /odieresis 8#367 /divide 8#370 /oslash 8#371 /ugrave 8#372 /uacute 8#373 /ucircumflex 8#374 /udieresis 8#375 /yacute 8#376 /thorn 8#377 /ydieresis] def /Times-Roman /Times-Roman-iso isovec ReEncode /Times-Bold /Times-Bold-iso isovec ReEncode /$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def /$F2psEnd {$F2psEnteredState restore end} def $F2psBegin 10 setmiterlimit 0.06000 0.06000 sc % % Fig objects follow % /Times-Roman-iso ff 150.00 scf sf 2850 1725 m gs 1 -1 sc (\(b\) - traditional extent encoding) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Bold-iso ff 270.00 scf sf 2190 2175 m gs 1 -1 sc (...) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Bold-iso ff 270.00 scf sf 3705 1410 m gs 1 -1 sc (...) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Bold-iso ff 270.00 scf sf 1995 1410 m gs 1 -1 sc (...) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Bold-iso ff 270.00 scf sf 3465 2160 m gs 1 -1 sc (...) dup sw pop 2 div neg 0 rm col0 sh gr % Polyline 7.500 slw n 1050 525 m 1050 750 l gs col0 s gr % Polyline n 1275 525 m 1275 750 l gs col0 s gr % Polyline n 1500 525 m 1500 750 l gs col0 s gr % Polyline n 1725 525 m 1725 750 l gs col0 s gr % Polyline n 2175 525 m 2175 750 l gs col0 s gr % Polyline n 2400 525 m 2400 750 l gs col0 s gr % Polyline n 2625 525 m 2625 750 l gs col0 s gr % Polyline n 2850 525 m 2850 750 l gs col0 s gr % Polyline n 3075 525 m 3075 750 l gs col0 s gr % Polyline n 3300 525 m 3300 750 l gs col0 s gr % Polyline n 3750 525 m 3750 750 l gs col0 s gr % Polyline n 3975 525 m 3975 750 l gs col0 s gr % Polyline n 4200 525 m 4200 750 l gs col0 s gr % Polyline n 4425 525 m 4425 750 l gs col0 s gr % Polyline n 4650 525 m 4650 750 l gs col0 s gr % Polyline n 3525 525 m 3525 750 l gs col0 s gr % Polyline n 1950 525 m 1950 750 l gs col0 s gr % Polyline n 2175 1275 m 2175 1500 l gs col0 s gr % Polyline n 2625 1275 m 2625 1500 l gs col0 s gr % Polyline n 3075 1275 m 3075 1500 l gs col0 s gr % Polyline n 3525 1275 m 3525 1500 l gs col0 s gr % Polyline n 900 750 m 4800 750 l gs col0 s gr % Polyline n 900 525 m 4800 525 l gs col0 s gr % Polyline n 2025 1275 m 3675 1275 l gs col0 s gr % Polyline n 2625 2025 m 2625 2250 l gs col0 s gr % Polyline n 2850 2025 m 2850 2250 l gs col0 s gr % Polyline n 3075 2025 m 3075 2250 l gs col0 s gr % Polyline n 2400 2025 m 2400 2250 l gs col0 s gr % Polyline n 3300 2025 m 3300 2250 l gs col0 s gr % Polyline n 2250 2250 m 3450 2250 l gs col0 s gr % Polyline n 2250 2025 m 3450 2025 l gs col0 s gr % Polyline n 2025 1500 m 3675 1500 l gs col0 s gr % Polyline [60] 0 sd n 2400 1275 m 2400 1500 l gs col0 s gr [] 0 sd % Polyline [60] 0 sd n 2850 1275 m 2850 1500 l gs col0 s gr [] 0 sd % Polyline [60] 0 sd n 3300 1275 m 3300 1500 l gs col0 s gr [] 0 sd /Times-Roman-iso ff 150.00 scf sf 1185 690 m gs 1 -1 sc (12) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 1395 690 m gs 1 -1 sc (13) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 1620 690 m gs 1 -1 sc (14) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 1860 690 m gs 1 -1 sc (15) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2070 690 m gs 1 -1 sc (24) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2310 690 m gs 1 -1 sc (25) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2535 690 m gs 1 -1 sc (26) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2760 690 m gs 1 -1 sc (27) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2985 690 m gs 1 -1 sc (32) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 3195 690 m gs 1 -1 sc (33) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 3435 690 m gs 1 -1 sc (34) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 3645 690 m gs 1 -1 sc (35) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 3885 690 m gs 1 -1 sc (36) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 4110 690 m gs 1 -1 sc (37) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 4560 690 m gs 1 -1 sc (39) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 4335 690 m gs 1 -1 sc (38) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Bold-iso ff 270.00 scf sf 900 660 m gs 1 -1 sc (...) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2970 1440 m gs 1 -1 sc (24) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 3435 1440 m gs 1 -1 sc (32) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2520 1440 m gs 1 -1 sc (12) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2295 1440 m gs 1 -1 sc (4) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2745 1440 m gs 1 -1 sc (4) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 3195 1440 m gs 1 -1 sc (8) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2850 975 m gs 1 -1 sc (\(a\) - traditional encoding) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2520 2190 m gs 1 -1 sc (12) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2985 2190 m gs 1 -1 sc (32) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 3210 2190 m gs 1 -1 sc (36) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2760 2190 m gs 1 -1 sc (24) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Roman-iso ff 150.00 scf sf 2850 2475 m gs 1 -1 sc (\(c\) - hybrid extent encoding) dup sw pop 2 div neg 0 rm col0 sh gr /Times-Bold-iso ff 270.00 scf sf 4845 660 m gs 1 -1 sc (...) dup sw pop 2 div neg 0 rm col0 sh gr $F2psEnd rs %%EndDocument end PEND F1(Figur)79.305 662.085 Q 2.5(e3)-.18 G F0(:)-2.5 E/F4 10 /Times-Italic@0 SF(Alternative \214le metadata r)2.5 E(epr)-.37 E (esentations)-.37 E F0 -.4(Wi)72 686.085 S .2 (th the addition of dynamic block reallocation in the).4 F .162 (early 1990s [Seltzer & Smith, 1996], the)72 698.085 R F2(UFS1)2.661 E F0(\214lesys-)2.661 E 5.356 (tem has had the ability to allocate most \214les)72 710.085 R 1.085 (contiguously on the disk.)326.4 84 R 1.084(The metadata describing a) 6.084 F(lar)326.4 96 Q .616 (ge \214le consists of indirect blocks with long runs of)-.18 F 1.742 (sequential block numbers, see Fig. 3-\(a\).)326.4 108 R -.15(Fo)6.741 G 4.241(rq).15 G(uick)-4.241 E .216(access while a \214le is acti)326.4 120 R -.15(ve)-.25 G 2.716(,t).15 G .216(he k)-2.716 F .217 (ernel tries to k)-.1 F .217(eep all)-.1 F .788(of a \214le')326.4 132 R 3.287(sm)-.55 G .787(etadata in memory)-3.287 F 5.787(.W)-.65 G(ith) -6.187 E F2(UFS2)3.287 E F0 .787(the space)3.287 F 1.01 (required to hold the metadata for a \214le is doubled as)326.4 144 R -2.15 -.25(ev e)326.4 156 T .47(ry block pointer gro).25 F .47 (ws from 32-bits to 64-bits.)-.25 F -.8(To)5.47 G(pro)326.4 168 Q .67 (vide a more compact representation, man)-.15 F 3.17<798c>-.15 G(lesys-) -3.17 E 2.544(tems use an e)326.4 180 R 2.543 (xtent-based representation.)-.15 F 5.043(At)7.543 G(ypical)-5.043 E -.15(ex)326.4 192 S 1.34 (tent-based representation uses pairs of block num-).15 F .362 (bers and lengths.)326.4 204 R .361 (Figure 3-\(b\) represents the same set)5.361 F .978 (of block number as Fig. 3-\(a\) in an e)326.4 216 R .978 (xtent-based for)-.15 F(-)-.2 E 2.858(mat. Pro)326.4 228 R .358 (vided that the \214le can be laid out nearly con-)-.15 F(tiguously) 326.4 240 Q 2.998(,t)-.65 G .498(his representation pro)-2.998 F .498 (vides a v)-.15 F .499(ery compact)-.15 F 5.89(description. Ho)326.4 252 R(we)-.25 E -.15(ve)-.25 G 4.19 -.4(r, r).15 H 3.39(andomly or slo).4 F 3.39(wly written)-.25 F .099(\214les can end up with man)326.4 264 R 2.599(yn)-.15 G .099(on-contiguous block allo-)-2.599 F 4.462 (cations which will produce a representation that)326.4 276 R .548 (requires more space than the one used by)326.4 288 R F2(UFS1)3.049 E F0 5.549(.T)C(his)-5.549 E 4.205(representation also has the dra)326.4 300 R 4.204(wback that it can)-.15 F 1.4 (require a lot of computation to do random access to)326.4 312 R .659 (the \214le since the block number needs to be computed)326.4 324 R .798 (by adding up the sizes starting from the be)326.4 336 R .798 (ginning of)-.15 F(the \214le until the desired seek of)326.4 348 Q (fset is reached.)-.25 E 1.699 -.8(To g)351.4 363.6 T .099 (ain most of the ef).75 F .099(\214ciencies of e)-.25 F .098 (xtends with-)-.15 F .86(out the random access inef)326.4 375.6 R (\214ciencies,)-.25 E F2(UFS2)3.361 E F0 .861(has added)3.361 F 3.222 <618c>326.4 387.6 S .722(eld to the inode that will allo)-3.222 F 3.222 (wt)-.25 G .721(hat inode to use a)-3.222 F(lar)326.4 399.6 Q 1.975 (ger block size.)-.18 F 1.976(Small, slo)6.976 F 1.976(wly gro)-.25 F 1.976(wing, or sparse)-.25 F .64(\214les set this v)326.4 411.6 R .64 (alue to the re)-.25 F .64(gular \214lesystem block size)-.15 F .39 (and represent their data in the traditional w)326.4 423.6 R .39(ay sho) -.1 F 2.89(wi)-.25 G(n)-2.89 E 2.269(Fig. 3-\(a\).)326.4 435.6 R(Ho) 7.269 E(we)-.25 E -.15(ve)-.25 G 3.069 -.4(r, w).15 H 2.268 (hen the \214lesystem detects a).4 F(lar)326.4 447.6 Q .092 (ge dense \214le, it can set this inode-block-size \214eld to)-.18 F 2.98(av)326.4 459.6 S .48(alue tw)-3.23 F 2.98(ot)-.1 G 2.98(os)-2.98 G .48(ixteen times the \214lesystem block size.)-2.98 F .917 (Figure 3-\(c\) represents the same set of block number)326.4 471.6 R .258(as Fig. 3-\(a\) with the inode-block-size \214eld set to four)326.4 483.6 R 1.877(times the \214lesystem block size.)326.4 495.6 R 1.877 (Each block pointer)6.877 F 1.711 (references a piece of disk storage that is four times)326.4 507.6 R (lar)326.4 519.6 Q 2.738 (ger which reduces the metadata storage require-)-.18 F 1.105 (ment by 75 percent.)326.4 531.6 R 1.105(Since e)6.105 F -.15(ve)-.25 G 1.105(ry block pointer other).15 F 1.82 (than possibly the last one references an equal sized)326.4 543.6 R .425 (block, computation of random access of)326.4 555.6 R .425 (fsets is just as)-.25 F -.1(fa)326.4 567.6 S 1.894 (st as in the traditional metadata representation.).1 F(It)6.895 E .335 (also cannot de)326.4 579.6 R .335(grade to a lar)-.15 F .335 (ger representation than the)-.18 F (traditional metadata representation.)326.4 591.6 Q 1.907(The dra)351.4 607.2 R 1.908(wback to this approach is that once a)-.15 F .777 (\214le has committed to using a lar)326.4 619.2 R .777 (ger block size, it can)-.18 F .719(only utilize blocks of that size.) 326.4 631.2 R .719(If the \214lesystem runs)5.719 F .41 (out of big blocks then the \214le can no longer gro)326.4 643.2 R 2.91 (wa)-.25 G(nd)-2.91 E .128(either the application will get an `)326.4 655.2 R(`out-of-space')-.74 E 2.629('e)-.74 G(rror)-2.629 E(,)-.4 E .413 (or the \214lesystem has to recreate the metadata with the)326.4 667.2 R .913(standard \214lesystem block size.)326.4 679.2 R .914 (My current plan is to)5.914 F .783 (write the code to recreate the metadata.)326.4 691.2 R .782 (While recre-)5.782 F .039 (ating the metadata usually will cause a long pause, W)326.4 703.2 R(e) -.8 E -.15(ex)326.4 715.2 S 3.639 (pect that condition to be quite rare and not a).15 F 0 Cg EP %%Page: 11 11 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Roman@0 SF(noticeable problem in actual use.)72 84 Q/F1 10 /Times-Bold@0 SF(11.)72 111 Q/F2 12/Times-Bold@0 SF(Curr)5 E(ent Status) -.216 E F0(The)72 126.6 Q/F3 9/Times-Roman@0 SF(UFS2)3.433 E F0 .933 (\214lesystem w)3.433 F .933(as de)-.1 F -.15(ve)-.25 G .934 (loped for the FreeBSD).15 F .165 (Project by the author under contract to Netw)72 138.6 R .165(ork Asso-) -.1 F .477(ciates Laboratories, the Security Research Di)72 150.6 R .477 (vision of)-.25 F(Netw)72 162.6 Q 4.05(ork Associates, Inc. under D)-.1 F(ARP)-.4 E(A/SP)-.92 E -.9(AW)-.92 G(AR)-.3 E 2.72 (contract N66001-01-C-8035 \("CBOSS"\), as part of)72 174.6 R 2.924 (the D)72 186.6 R(ARP)-.4 E 5.424(AC)-.92 G(HA)-5.424 E 2.923 (TS research program.)-1.11 F 2.923(Under the)7.923 F 1.242 (terms of that contract, the softw)72 198.6 R 1.243 (are must be released)-.1 F 1.574(under a Berk)72 210.6 R(ele)-.1 E 1.574(y-style cop)-.15 F 4.074(yright. The)-.1 F F3(UFS2)4.074 E F0 (\214lesys-)4.074 E .391(tem w)72 222.6 R .392 (as written in 2002 and \214rst released in Free)-.1 F F3(BSD)A F0 3.295 (5.0. Extensi)72 234.6 R 1.095 -.15(ve u)-.25 H .795 (ser feedback in that release has been).15 F 2.143 (helpful in shaking out latent short-comings particu-)72 246.6 R 2.087 (larly in the ability of)72 258.6 R F3(UFS2)4.587 E F0 2.087 (to smoothly handle the)4.587 F .649 (really big \214lesystems for which it w)72 270.6 R .649(as designed.) -.1 F(The)5.649 E 1.259 (biggest current limitation is that the disk labels used)72 282.6 R 2.199(in Free)72 294.6 R F3(BSD)A F0 2.199 (5.0 can only describe 2 terabyte disks.)4.699 F 2.456 -.8(We a)72 306.6 T .856(re hoping that the ne).8 F 3.356(wl)-.25 G(ar)-3.356 E .856 (ger disk labels will be)-.18 F -.2(av)72 318.6 S (ailable by the time Free)-.05 E F3(BSD)A F0(5.1 is released.)2.5 E F1 (12.)72 361.2 Q F2(Refer)5 E(ences)-.216 E F0(Apple, 2003.)72 370.8 Q 5.015(Apple, \231Mac OS X Essentials, Chapter 9)97 382.8 R 7.87 (Filesystem, Section 12 Resource F)97 394.8 R(orks,)-.15 E<9a>-.7 E/F4 10/Times-Italic@0 SF(https://de)97 406.8 Q(veloper)-.15 E(.apple)-1.11 E (.com/tec)-.15 E(hpubs/macosx/)-.15 E(Essentials/SystemOvervie)97 418.8 Q(w/F)-.15 E(ileSystem/c)-.45 E(hapter)-.15 E(_9_section_12.html)97 430.8 Q F0(\(2003\).)2.5 E(Best & Kleikamp, 2003.)72 446.4 Q 2.27 (S. Best & D. Kleikamp, \231Ho)97 458.4 R 4.77(wt)-.25 G 2.27 (he Journaled)-4.77 F 6.376(File System handles the on-disk layout,)97 470.4 R<9a>-.7 E F4(https://www-106.ibm.com/de)97 482.4 Q (veloperworks/linux/)-.15 E(libr)97 494.4 Q(ary/l-jfslayout/)-.15 E F0 (\(2003\).)2.5 E(Do)72 510 Q(wse & Malone, 2002.)-.25 E 3.159(I. Do)97 522 R 3.158(wse & D. Malone, \231Recent Filesystem)-.25 F .246 (Optimizations on FreeBSD,)97 534 R<9a>-.7 E F4(Pr)2.746 E .246 (oceedings of the)-.45 F -1.77 -.55(Fr e)97 546 T .705(enix T).55 F -.15 (ra)-.55 G 1.105 -.2(ck a).15 H 3.204(tt).2 G .704 (he 2002 Usenix Annual T)-3.204 F(ec)-.92 E(h-)-.15 E(nical Confer)97 558 Q(ence)-.37 E(,)-.1 E F0(p. 245\255258 \(June 2002\).)2.5 E(Grif)72 573.6 Q(\214n et al, 2002.)-.25 E .627(J. L. Grif)97 585.6 R .628 (\214n, J. Schindler)-.25 F 3.128(,S)-.4 G 3.128(.W)-3.128 G 3.128(.S) -4.048 G(chlosser)-3.128 E 3.128(,J)-.4 G 3.128(.S)-3.128 G(.)-3.128 E (Buc)97 597.6 Q 2.535 -.65(y, & G)-.15 H 3.735(.R).65 G 3.735(.G)-3.735 G(anger)-3.735 E 3.735<2c99>-.4 G -.35(Ti)-3.735 G 1.235 (ming-accurate Stor).35 F(-)-.2 E 4.36(age Emulation,)97 609.6 R<9a>-.7 E F4(Pr)6.86 E 4.36(oceedings of the Usenix)-.45 F(Confer)97 621.6 Q 1.818(ence on F)-.37 F 1.818(ile and Stor)-.45 F 2.018 -.1(age T)-.15 H (ec)-.82 E(hnolo)-.15 E(gies,)-.1 E F0(p. 75\25588 \(January 2002\).)97 633.6 Q(Lumb et al, 2002.)72 649.2 Q 2.956(C. R. Lumb, J. Schindler)97 661.2 R 5.456(,&G)-.4 G 5.457(.R)-5.456 G 5.457(.G)-5.457 G(anger)-5.457 E(,)-.4 E 8.91(\231Freeblock Scheduling Outside of Disk)97 673.2 R (Firmw)97 685.2 Q(are,)-.1 E<9a>-.7 E F4(Pr)4.032 E 1.532 (oceedings of the Usenix Confer)-.45 F(-)-.2 E 4.417(ence on F)97 697.2 R 4.417(ile and Stor)-.45 F 4.616 -.1(age T)-.15 H(ec)-.82 E(hnolo)-.15 E(gies,)-.1 E F0(p.)6.916 E(275\255288 \(January 2002\).)97 709.2 Q(McK) 326.4 84 Q(usick, 2002.)-.15 E 3.556(M. McK)351.4 96 R 3.557 (usick, \231Running Fsck in the Back-)-.15 F(ground,)351.4 108 Q<9a>-.7 E F4(Pr)6.858 E 4.358(oceedings of the BSDCon 2002)-.45 F(Confer)351.4 120 Q(ence)-.37 E(,)-.1 E F0(p. 55\25564 \(February 2002\).)2.5 E(McK) 326.4 135.6 Q(usick et al, 1996.)-.15 E .367(M. McK)351.4 147.6 R .368 (usick, K. Bostic, M. Karels, & J. Quar)-.15 F(-)-.2 E(terman,,)351.4 159.6 Q F4 .755(The Design and Implementation of the)3.255 F 1.974 (4.4BSD Oper)351.4 171.6 R 1.974(ating System,)-.15 F F0 1.974 (p. 269\255271, Addi-)4.474 F .53(son W)351.4 183.6 R(esle)-.8 E 3.03 (yP)-.15 G .53(ublishing Compan)-3.03 F 1.83 -.65(y, R)-.15 H .53 (eading, MA).65 F(\(1996\).)351.4 195.6 Q(McK)326.4 211.2 Q (usick & Ganger)-.15 E 2.5(,1)-.4 G(999.)-2.5 E 1.441(M. McK)351.4 223.2 R 1.441(usick & G. Ganger)-.15 F 3.942<2c99>-.4 G 1.442(Soft Updates: A) -3.942 F -.7(Te)351.4 235.2 S 2.555 (chnique for Eliminating Most Synchronous).7 F 1.386(Writes in the F) 351.4 247.2 R 1.387(ast Filesystem,)-.15 F<9a>-.7 E F4(Pr)3.887 E 1.387 (oceedings of)-.45 F 2.059(the F)351.4 259.2 R -.37(re)-.55 G 2.059 (enix T).37 F -.15(ra)-.55 G 2.459 -.2(ck a).15 H 4.559(tt).2 G 2.058 (he 1999 Usenix Annual)-4.559 F -.92(Te)351.4 271.2 S -.15(ch).92 G (nical Confer).15 E(ence)-.37 E(,)-.1 E F0(p. 1\25517 \(June 1999\).)2.5 E(McK)326.4 286.8 Q(usick et al, 1984.)-.15 E 1.53(M. McK)351.4 298.8 R 1.53(usick, W)-.15 F 4.03(.J)-.92 G -.1(oy)-4.03 G 4.03(,S)-.55 G 4.03 (.L)-4.03 G(ef)-4.03 E(\215er)-.25 E 4.03(,&R)-.4 G 4.03(.F)-4.03 G (abry)-4.18 E(,)-.65 E 2.229 -.8(\231A F)351.4 310.8 T .629 (ast File System for UNIX,).65 F<9a>-.7 E F4 -.3(AC)3.128 G 3.128(MT).3 G -.15(ra)-3.678 G(nsac-).15 E 2.021(tions on Computer Systems,)351.4 322.8 R F0 2.022(2, 3, p. 181\255197)4.521 F(\(August 1984\).)351.4 334.8 Q(Phillips, 2001.)326.4 350.4 Q .949(D. Phillips, \231)351.4 362.4 R 3.449(AD)-.8 G .949(irectory Inde)-3.449 F 3.448(xf)-.15 G .948 (or Ext2,)-3.448 F<9a>-.7 E F4(Pr)3.448 E(o-)-.45 E 4.161 (ceedings of the Usenix F)351.4 374.4 R 4.162(ifth Annual Linux)-.45 F (Showcase and Confer)351.4 386.4 Q(ence)-.37 E F0(\(No)2.5 E -.15(ve) -.15 G(mber 2001\).).15 E(Reiser)326.4 402 Q 2.5(,2)-.4 G(001.)-2.5 E 9.348(H. Reiser)351.4 414 R 11.848<2c99>-.4 G 9.348 (The Reiser File System,)-11.848 F<9a>-.7 E F4(https://www)351.4 426 Q (.namesys.com/r)-.74 E(es_whol.shtml)-.37 E F0(\(Jan-)10.9 E (uary 2001\).)351.4 438 Q(Rhodes, 2003.)326.4 453.6 Q 4.59 -.74(T. R) 351.4 465.6 T 3.11(hodes, \231FreeBSD Handbook, Chapter 3,).74 F 1.053 (Section 3.3 File System Access Control Lists,)351.4 477.6 R<9a>-.7 E F4 (https://www)351.4 489.6 Q(.F)-.74 E -.37(re)-.55 G(eBSD.or).37 E (g/doc/en_US.ISO8859-1/)-.37 E(books/handbook/fs-acl.html)351.4 501.6 Q F0(\(2003\).)2.5 E(Schindler et al, 2002.)326.4 517.2 Q .571 (J. Schindler)351.4 529.2 R 3.071(,J)-.4 G 3.071(.L)-3.071 G 3.071(.G) -3.071 G(rif)-3.071 E .571(\214n, C. R. Lumb, & G. R.)-.25 F(Ganger) 351.4 541.2 Q 10.194<2c99>-.4 G -.35(Tr)-10.194 G 7.693 (ack-aligned Extents: Matching).35 F 1.16(Access P)351.4 553.2 R 1.16 (atterns to Disk Dri)-.15 F 1.46 -.15(ve C)-.25 H(haracteristics,).15 E <9a>-.7 E F4(Pr)351.4 565.2 Q 1.617(oceedings of the Usenix Confer)-.45 F 1.616(ence on F)-.37 F(ile)-.45 E .582(and Stor)351.4 577.2 R .782 -.1 (age T)-.15 H(ec)-.82 E(hnolo)-.15 E(gies,)-.1 E F0 .582 (p. 259\255274 \(January)3.082 F(2002\).)351.4 589.2 Q (Seltzer et al, 2000.)326.4 604.8 Q .268(M. Seltzer)351.4 616.8 R 2.767 (,G)-.4 G 2.767(.G)-2.767 G(anger)-2.767 E 2.767(,M)-.4 G 2.767(.M) -2.767 G(cK)-2.767 E .267(usick, K. Smith,)-.15 F 1.26 (C. Soules, & C. Stein, \231Journaling v)351.4 628.8 R 1.26(ersus Soft) -.15 F .163(Updates: Asynchronous Meta-data Protection in)351.4 640.8 R 2.446(File Systems,)351.4 652.8 R<9a>-.7 E F4(Pr)4.946 E 2.447 (oceedings of the San Die)-.45 F(go)-.4 E(Usenix Confer)351.4 664.8 Q (ence)-.37 E(,)-.1 E F0(p. 71\25584 \(June 2000\).)2.5 E (Seltzer & Smith, 1996.)326.4 680.4 Q .698(M. Seltzer & K. Smith, \231) 351.4 692.4 R 3.197(AC)-.8 G .697(omparison of FFS)-3.197 F 2.345 (Disk Allocation Algorithms,)351.4 704.4 R<9a>-.7 E F4 -.55(Wi)4.845 G 2.345(nter USENIX).55 F(Confer)351.4 716.4 Q(ence)-.37 E(,)-.1 E F0 (p. 15\25525 \(January 1996\).)2.5 E 0 Cg EP %%Page: 12 12 %%BeginPageSetup BP %%EndPageSetup /F0 10/Times-Roman@0 SF(Sweene)72 84 Q 2.5(ye)-.15 G 2.5(ta)-2.5 G (l, 1996.)-2.5 E .539(A. Sweene)97 96 R 1.839 -.65(y, D)-.15 H 3.039(.D) .65 G .538(oucette, C. Anderson, W)-3.039 F 3.038(.H)-.92 G(u,)-3.038 E 1.874(M. Nishimoto, & G. Peck, \231Scalability in the)97 108 R 3.212 (XFS File System,)97 120 R<9a>-.7 E/F1 10/Times-Italic@0 SF(Pr)5.712 E 3.211(oceedings of the 1996)-.45 F 1.92(Usenix Annual T)97 132 R(ec)-.92 E 1.92(hnical Confer)-.15 F(ence)-.37 E(,)-.1 E F0 1.92(p. 1\25514)4.42 F(\(January 1996\).)97 144 Q -.8(Wa)72 159.6 S(tson, 2000.).8 E 1.058 (R. W)97 171.6 R 1.057(atson, \231Introducing Supporting Infrastruc-)-.8 F 2.016(ture for T)97 183.6 R 2.017(rusted Operating System Support in) -.35 F(FreeBSD,)97 195.6 Q<9a>-.7 E F1(Pr)5.08 E 2.58 (oceedings of the BSDCon 2000)-.45 F(Confer)97 207.6 Q(ence)-.37 E F0 (\(September 2000\).)2.5 E -.8(Wa)72 223.2 S(tson, 2001.).8 E 6.227 (R. W)97 235.2 R 6.227(atson, \231T)-.8 F 6.228(rustedBSD: Adding T)-.35 F(rusted)-.35 E 1.692(Operating System Features to FreeBSD,)97 247.2 R <9a>-.7 E F1(Pr)4.192 E(o-)-.45 E 4.043(ceedings of the F)97 259.2 R -.37(re)-.55 G 4.043(enix T).37 F -.15(ra)-.55 G 4.443 -.2(ck a).15 H 6.543(tt).2 G 4.043(he 2001)-6.543 F 5.245(Usenix Annual T)97 271.2 R (ec)-.92 E 5.245(hnical Confer)-.15 F(ence)-.37 E F0(\(June)7.745 E (2001\).)97 283.2 Q -.8(Wa)72 298.8 S(tson et al, 2003.).8 E .885(R. W) 97 310.8 R .885(atson, W)-.8 F 3.385(.M)-.92 G .885(orrison, C. V)-3.385 F .885(ance, & B. Feld-)-1.11 F 5.783(man, \231The T)97 322.8 R 5.782 (rustedBSD MA)-.35 F 8.282(CF)-.4 G(rame)-8.282 E -.1(wo)-.25 G(rk:).1 E .792(Extensible K)97 334.8 R .792(ernel Access Control for FreeBSD)-.25 F(5.0,)97 346.8 Q<9a>-.7 E F1(Pr)4.645 E 2.144(oceedings of the F)-.45 F -.37(re)-.55 G 2.144(enix T).37 F -.15(ra)-.55 G 2.544 -.2(ck a).15 H 4.644(tt).2 G(he)-4.644 E 5.66(2003 Usenix Annual T)97 358.8 R(ec)-.92 E 5.66(hnical Confer)-.15 F(ence)-.37 E F0(\(June 2003\).)97 370.8 Q/F2 10 /Times-Bold@0 SF(13.)72 397.8 Q/F3 12/Times-Bold@0 SF(Biograph)5 E(y) -.18 E F0(Dr)72 413.4 Q 2.338(.M)-.55 G -.163(arshall Kirk McK)-2.338 F -.163(usick writes books and articles,)-.15 F 2.634 (consults, and teaches classes on UNIX- and BSD-)72 425.4 R .3 (related subjects.)72 437.4 R .3(While at the Uni)5.091 F -.15(ve)-.25 G .3(rsity of California).15 F .988(at Berk)72 449.4 R(ele)-.1 E 2.288 -.65(y, h)-.15 H 3.488(ei).65 G .989(mplemented the 4.2BSD f)-3.488 F .989(ast \214lesys-)-.1 F 1.032(tem, and w)72 461.4 R 1.031 (as the Research Computer Scientist at the)-.1 F(Berk)72 473.4 Q(ele)-.1 E 2.846(yC)-.15 G .346(omputer Systems Research Group \(CSRG\))-2.846 F -.15(ove)72 485.4 S 2.404(rseeing the de).15 F -.15(ve)-.25 G 2.403 (lopment and release of 4.3BSD).15 F 1.533(and 4.4BSD.)72 497.4 R 1.534 (His particular areas of interest are the)6.324 F 1.315 (virtual-memory system and the \214lesystem.)72 509.4 R 1.314(One day) 6.106 F(,)-.65 E 1.186(he hopes to see them mer)72 521.4 R 1.186 (ged seamlessly)-.18 F 5.977(.H)-.65 G 3.687(ee)-5.977 G(arned)-3.687 E 3.146(his under)72 533.4 R 3.146(graduate de)-.18 F 3.146 (gree in Electrical Engineering)-.15 F .537(from Cornell Uni)72 545.4 R -.15(ve)-.25 G(rsity).15 E 3.037(,a)-.65 G .538(nd did his graduate w) -3.037 F .538(ork at)-.1 F 2.742(the Uni)72 557.4 R -.15(ve)-.25 G 2.742 (rsity of California at Berk).15 F(ele)-.1 E 4.041 -.65(y, w)-.15 H 2.741(here he).65 F(recei)72 569.4 Q -.15(ve)-.25 G 4.926(dM).15 G 2.426 (asters de)-4.926 F 2.427(grees in Computer Science and)-.15 F 3.142 (Business Administration, and a doctoral de)72 581.4 R 3.141(gree in) -.15 F 3.019(Computer Science.)72 593.4 R 3.02 (He is president of the Usenix)7.811 F -.209 (Association, and is a member of A)72 605.4 R -.209(CM and IEEE.)-.4 F 1.653(In his spare time, he enjo)97 621 R 1.653(ys swimming, scuba)-.1 F (di)72 633 Q 1.295(ving, and wine collecting.)-.25 F 1.296 (The wine is stored in a)6.086 F .762 (specially constructed wine cellar \(accessible from the)72 645 R 2.127 (web at https://www)72 657 R 2.128(.mckusick.com/~mckusick/\) in the)-.65 F 1.586(basement of the house that he shares with Eric All-)72 669 R .852(man, his domestic partner of 24-and-some-odd years.)72 681 R -1.1 (Yo)72 693 S 9.472(uc)1.1 G 6.971(an contact him via email at)-9.472 F /F4 10/Courier@0 SF()72 705 Q F0(.)A 0 Cg EP %%Trailer end %%EOF