composefs/erofs/
writer.rs

1//! EROFS image generation and writing functionality.
2//!
3//! This module provides functionality to generate EROFS filesystem images
4//! from composefs tree structures, handling inode layout, directory blocks,
5//! and metadata serialization.
6
7use std::{
8    cell::RefCell,
9    collections::{BTreeMap, HashMap},
10    mem::size_of,
11    os::unix::ffi::OsStrExt,
12    rc::Rc,
13};
14
15use log::trace;
16use xxhash_rust::xxh32::xxh32;
17use zerocopy::{Immutable, IntoBytes};
18
19use crate::{
20    erofs::{composefs::OverlayMetacopy, format, reader::round_up},
21    fsverity::FsVerityHashValue,
22    tree,
23};
24
25#[derive(Clone, Copy, Debug)]
26enum Offset {
27    Header,
28    Superblock,
29    Inode,
30    XAttr,
31    Block,
32    End,
33}
34
35trait Output {
36    fn note_offset(&mut self, offset_type: Offset);
37    fn get(&self, offset_type: Offset, idx: usize) -> usize;
38    fn write(&mut self, data: &[u8]);
39    fn pad(&mut self, alignment: usize);
40    fn len(&self) -> usize;
41
42    fn get_div(&self, offset_type: Offset, idx: usize, div: usize) -> usize {
43        let offset = self.get(offset_type, idx);
44        assert_eq!(offset % div, 0);
45        offset / div
46    }
47
48    fn get_nid(&self, idx: usize) -> u64 {
49        self.get_div(Offset::Inode, idx, 32) as u64
50    }
51
52    fn get_xattr(&self, idx: usize) -> u32 {
53        self.get_div(Offset::XAttr, idx, 4).try_into().unwrap()
54    }
55
56    fn write_struct(&mut self, st: impl IntoBytes + Immutable) {
57        self.write(st.as_bytes());
58    }
59}
60
61#[derive(PartialOrd, PartialEq, Eq, Ord, Clone)]
62struct XAttr {
63    prefix: u8,
64    suffix: Box<[u8]>,
65    value: Box<[u8]>,
66}
67
68#[derive(Clone, Default)]
69struct InodeXAttrs {
70    shared: Vec<usize>,
71    local: Vec<XAttr>,
72    filter: u32,
73}
74
75#[derive(Debug)]
76struct DirEnt<'a> {
77    name: &'a [u8],
78    inode: usize,
79    file_type: format::FileType,
80}
81
82#[derive(Debug, Default)]
83struct Directory<'a> {
84    blocks: Box<[Box<[DirEnt<'a>]>]>,
85    inline: Box<[DirEnt<'a>]>,
86    size: usize,
87    nlink: usize,
88}
89
90#[derive(Debug)]
91struct Leaf<'a, ObjectID: FsVerityHashValue> {
92    content: &'a tree::LeafContent<ObjectID>,
93    nlink: usize,
94}
95
96#[derive(Debug)]
97enum InodeContent<'a, ObjectID: FsVerityHashValue> {
98    Directory(Directory<'a>),
99    Leaf(Leaf<'a, ObjectID>),
100}
101
102struct Inode<'a, ObjectID: FsVerityHashValue> {
103    stat: &'a tree::Stat,
104    xattrs: InodeXAttrs,
105    content: InodeContent<'a, ObjectID>,
106}
107
108impl XAttr {
109    pub fn write(&self, output: &mut impl Output) {
110        output.write_struct(format::XAttrHeader {
111            name_len: self.suffix.len() as u8,
112            name_index: self.prefix,
113            value_size: (self.value.len() as u16).into(),
114        });
115        output.write(&self.suffix);
116        output.write(&self.value);
117        output.pad(4);
118    }
119}
120
121impl InodeXAttrs {
122    fn add(&mut self, name: &[u8], value: &[u8]) {
123        for (idx, prefix) in format::XATTR_PREFIXES.iter().enumerate().rev() {
124            if let Some(suffix) = name.strip_prefix(*prefix) {
125                self.filter |= 1 << (xxh32(suffix, format::XATTR_FILTER_SEED + idx as u32) % 32);
126                self.local.push(XAttr {
127                    prefix: idx as u8,
128                    suffix: Box::from(suffix),
129                    value: Box::from(value),
130                });
131                return;
132            }
133        }
134        unreachable!("{:?}", std::str::from_utf8(name)); // worst case: we matched the empty prefix (0)
135    }
136
137    fn write(&self, output: &mut impl Output) {
138        if self.filter != 0 {
139            trace!("  write xattrs block");
140            output.write_struct(format::InodeXAttrHeader {
141                name_filter: (!self.filter).into(),
142                shared_count: self.shared.len() as u8,
143                ..Default::default()
144            });
145            for idx in &self.shared {
146                trace!("    shared {} @{}", idx, output.len());
147                output.write(&output.get_xattr(*idx).to_le_bytes());
148            }
149            for attr in &self.local {
150                trace!("    local @{}", output.len());
151                attr.write(output);
152            }
153        }
154        // our alignment is equal to xattr alignment: no need to pad
155    }
156}
157
158impl<'a> Directory<'a> {
159    pub fn from_entries(entries: Vec<DirEnt<'a>>) -> Self {
160        let mut blocks = vec![];
161        let mut rest = vec![];
162
163        let mut n_bytes = 0;
164        let mut nlink = 0;
165
166        trace!("Directory with {} items", entries.len());
167
168        // The content of the directory is fixed at this point so we may as well split it into
169        // blocks.  This lets us avoid measuring and re-measuring.
170        for entry in entries.into_iter() {
171            let entry_size = size_of::<format::DirectoryEntryHeader>() + entry.name.len();
172            assert!(entry_size <= 4096);
173
174            trace!("    {:?}", entry.file_type);
175
176            if matches!(entry.file_type, format::FileType::Directory) {
177                nlink += 1;
178            }
179
180            n_bytes += entry_size;
181            if n_bytes <= 4096 {
182                rest.push(entry);
183            } else {
184                // It won't fit, so we need to store the existing entries in a block.
185                trace!("    block {}", rest.len());
186                blocks.push(rest.into_boxed_slice());
187
188                // Start over
189                rest = vec![entry];
190                n_bytes = entry_size;
191            }
192        }
193
194        // Don't try to store more than 2048 bytes of tail data
195        if n_bytes > 2048 {
196            blocks.push(rest.into_boxed_slice());
197            rest = vec![];
198            n_bytes = 0;
199        }
200
201        trace!(
202            "  blocks {} inline {} inline_size {n_bytes}",
203            blocks.len(),
204            rest.len()
205        );
206
207        let size = format::BLOCK_SIZE * blocks.len() + n_bytes;
208        Self {
209            blocks: blocks.into_boxed_slice(),
210            inline: rest.into_boxed_slice(),
211            size,
212            nlink,
213        }
214    }
215
216    fn write_block(&self, output: &mut impl Output, block: &[DirEnt]) {
217        trace!("    write dir block {} @{}", block.len(), output.len());
218        let mut nameofs = size_of::<format::DirectoryEntryHeader>() * block.len();
219
220        for entry in block {
221            trace!(
222                "      entry {:?} name {} @{}",
223                entry.file_type,
224                nameofs,
225                output.len()
226            );
227            output.write_struct(format::DirectoryEntryHeader {
228                name_offset: (nameofs as u16).into(),
229                inode_offset: output.get_nid(entry.inode).into(),
230                file_type: entry.file_type.into(),
231                ..Default::default()
232            });
233            nameofs += entry.name.len();
234        }
235
236        for entry in block {
237            trace!("      name @{}", output.len());
238            output.write(entry.name.as_bytes());
239        }
240    }
241
242    fn write_inline(&self, output: &mut impl Output) {
243        trace!(
244            "  write inline len {} expected size {} of {}",
245            self.inline.len(),
246            self.size % 4096,
247            self.size
248        );
249        self.write_block(output, &self.inline);
250    }
251
252    fn write_blocks(&self, output: &mut impl Output) {
253        for block in &self.blocks {
254            assert_eq!(output.len() % format::BLOCK_SIZE, 0);
255            self.write_block(output, block);
256            output.pad(format::BLOCK_SIZE);
257        }
258    }
259
260    fn inode_meta(&self, block_offset: usize) -> (format::DataLayout, u32, u64, usize) {
261        let (layout, u) = if self.inline.is_empty() {
262            (format::DataLayout::FlatPlain, block_offset as u32 / 4096)
263        } else if !self.blocks.is_empty() {
264            (format::DataLayout::FlatInline, block_offset as u32 / 4096)
265        } else {
266            (format::DataLayout::FlatInline, 0)
267        };
268        (layout, u, self.size as u64, self.nlink)
269    }
270}
271
272impl<ObjectID: FsVerityHashValue> Leaf<'_, ObjectID> {
273    fn inode_meta(&self) -> (format::DataLayout, u32, u64, usize) {
274        let (layout, u, size) = match &self.content {
275            tree::LeafContent::Regular(tree::RegularFile::Inline(data)) => {
276                if data.is_empty() {
277                    (format::DataLayout::FlatPlain, 0, data.len() as u64)
278                } else {
279                    (format::DataLayout::FlatInline, 0, data.len() as u64)
280                }
281            }
282            tree::LeafContent::Regular(tree::RegularFile::External(.., size)) => {
283                (format::DataLayout::ChunkBased, 31, *size)
284            }
285            tree::LeafContent::CharacterDevice(rdev) | tree::LeafContent::BlockDevice(rdev) => {
286                (format::DataLayout::FlatPlain, *rdev as u32, 0)
287            }
288            tree::LeafContent::Fifo | tree::LeafContent::Socket => {
289                (format::DataLayout::FlatPlain, 0, 0)
290            }
291            tree::LeafContent::Symlink(target) => {
292                (format::DataLayout::FlatInline, 0, target.len() as u64)
293            }
294        };
295        (layout, u, size, self.nlink)
296    }
297
298    fn write_inline(&self, output: &mut impl Output) {
299        output.write(match self.content {
300            tree::LeafContent::Regular(tree::RegularFile::Inline(data)) => data,
301            tree::LeafContent::Regular(tree::RegularFile::External(..)) => b"\xff\xff\xff\xff", // null chunk
302            tree::LeafContent::Symlink(target) => target.as_bytes(),
303            _ => &[],
304        });
305    }
306}
307
308impl<ObjectID: FsVerityHashValue> Inode<'_, ObjectID> {
309    fn file_type(&self) -> format::FileType {
310        match &self.content {
311            InodeContent::Directory(..) => format::FileType::Directory,
312            InodeContent::Leaf(leaf) => match &leaf.content {
313                tree::LeafContent::Regular(..) => format::FileType::RegularFile,
314                tree::LeafContent::CharacterDevice(..) => format::FileType::CharacterDevice,
315                tree::LeafContent::BlockDevice(..) => format::FileType::BlockDevice,
316                tree::LeafContent::Fifo => format::FileType::Fifo,
317                tree::LeafContent::Socket => format::FileType::Socket,
318                tree::LeafContent::Symlink(..) => format::FileType::Symlink,
319            },
320        }
321    }
322
323    fn write_inode(&self, output: &mut impl Output, idx: usize) {
324        let (layout, u, size, nlink) = match &self.content {
325            InodeContent::Directory(dir) => dir.inode_meta(output.get(Offset::Block, idx)),
326            InodeContent::Leaf(leaf) => leaf.inode_meta(),
327        };
328
329        let xattr_size = {
330            let mut xattr = FirstPass::default();
331            self.xattrs.write(&mut xattr);
332            xattr.offset
333        };
334
335        // We need to make sure the inline part doesn't overlap a block boundary
336        output.pad(32);
337        if matches!(layout, format::DataLayout::FlatInline) {
338            let inode_and_xattr_size = size_of::<format::ExtendedInodeHeader>() + xattr_size;
339            let inline_start = output.len() + inode_and_xattr_size;
340            let end_of_metadata = inline_start - 1;
341            let inline_end = inline_start + (size as usize % format::BLOCK_SIZE);
342            if end_of_metadata / format::BLOCK_SIZE != inline_end / format::BLOCK_SIZE {
343                // If we proceed, then we'll violate the rule about crossing block boundaries.
344                // The easiest thing to do is to add padding so that the inline data starts close
345                // to the start of a fresh block boundary, while ensuring inode alignment.
346                let pad = vec![0; 4096 - end_of_metadata % 4096];
347                trace!("added pad {}", pad.len());
348                output.write(&pad);
349                output.pad(32);
350            }
351        }
352
353        let format = format::InodeLayout::Extended | layout;
354
355        trace!(
356            "write inode {idx} nid {} {:?} {:?} xattrsize{xattr_size} icount{} inline{} @{}",
357            output.len() / 32,
358            format,
359            self.file_type(),
360            match xattr_size {
361                0 => 0,
362                n => (1 + (n - 12) / 4) as u16,
363            },
364            size % 4096,
365            output.len()
366        );
367
368        output.note_offset(Offset::Inode);
369        output.write_struct(format::ExtendedInodeHeader {
370            format,
371            xattr_icount: match xattr_size {
372                0 => 0,
373                n => (1 + (n - 12) / 4) as u16,
374            }
375            .into(),
376            mode: self.file_type() | self.stat.st_mode,
377            size: size.into(),
378            u: u.into(),
379            ino: ((output.len() / 32) as u32).into(),
380            uid: self.stat.st_uid.into(),
381            gid: self.stat.st_gid.into(),
382            mtime: (self.stat.st_mtim_sec as u64).into(),
383            nlink: (nlink as u32).into(),
384            ..Default::default()
385        });
386
387        self.xattrs.write(output);
388
389        match &self.content {
390            InodeContent::Directory(dir) => dir.write_inline(output),
391            InodeContent::Leaf(leaf) => leaf.write_inline(output),
392        };
393
394        output.pad(32);
395    }
396
397    fn write_blocks(&self, output: &mut impl Output) {
398        if let InodeContent::Directory(dir) = &self.content {
399            dir.write_blocks(output);
400        }
401    }
402}
403
404struct InodeCollector<'a, ObjectID: FsVerityHashValue> {
405    inodes: Vec<Inode<'a, ObjectID>>,
406    hardlinks: HashMap<*const tree::Leaf<ObjectID>, usize>,
407}
408
409impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> {
410    fn push_inode(&mut self, stat: &'a tree::Stat, content: InodeContent<'a, ObjectID>) -> usize {
411        let mut xattrs = InodeXAttrs::default();
412
413        // We need to record extra xattrs for some files.  These come first.
414        if let InodeContent::Leaf(Leaf {
415            content: tree::LeafContent::Regular(tree::RegularFile::External(id, ..)),
416            ..
417        }) = content
418        {
419            xattrs.add(
420                b"trusted.overlay.metacopy",
421                OverlayMetacopy::new(id).as_bytes(),
422            );
423
424            let redirect = format!("/{}", id.to_object_pathname());
425            xattrs.add(b"trusted.overlay.redirect", redirect.as_bytes());
426        }
427
428        // Add the normal xattrs.  They're already listed in sorted order.
429        for (name, value) in RefCell::borrow(&stat.xattrs).iter() {
430            let name = name.as_bytes();
431
432            if let Some(escapee) = name.strip_prefix(b"trusted.overlay.") {
433                let escaped = [b"trusted.overlay.overlay.", escapee].concat();
434                xattrs.add(&escaped, value);
435            } else {
436                xattrs.add(name, value);
437            }
438        }
439
440        // Allocate an inode for ourselves.  At first we write all xattrs as local.  Later (after
441        // we've determined which xattrs ought to be shared) we'll come and move some of them over.
442        let inode = self.inodes.len();
443        self.inodes.push(Inode {
444            stat,
445            xattrs,
446            content,
447        });
448        inode
449    }
450
451    fn collect_leaf(&mut self, leaf: &'a Rc<tree::Leaf<ObjectID>>) -> usize {
452        let nlink = Rc::strong_count(leaf);
453
454        if nlink > 1 {
455            if let Some(inode) = self.hardlinks.get(&Rc::as_ptr(leaf)) {
456                return *inode;
457            }
458        }
459
460        let inode = self.push_inode(
461            &leaf.stat,
462            InodeContent::Leaf(Leaf {
463                content: &leaf.content,
464                nlink,
465            }),
466        );
467
468        if nlink > 1 {
469            self.hardlinks.insert(Rc::as_ptr(leaf), inode);
470        }
471
472        inode
473    }
474
475    fn insert_sorted(
476        entries: &mut Vec<DirEnt<'a>>,
477        name: &'a [u8],
478        inode: usize,
479        file_type: format::FileType,
480    ) {
481        let entry = DirEnt {
482            name,
483            inode,
484            file_type,
485        };
486        let point = entries.partition_point(|e| e.name < entry.name);
487        entries.insert(point, entry);
488    }
489
490    fn collect_dir(&mut self, dir: &'a tree::Directory<ObjectID>, parent: usize) -> usize {
491        // The root inode number needs to fit in a u16.  That more or less compels us to write the
492        // directory inode before the inode of the children of the directory.  Reserve a slot.
493        let me = self.push_inode(&dir.stat, InodeContent::Directory(Directory::default()));
494
495        let mut entries = vec![];
496
497        for (name, inode) in dir.sorted_entries() {
498            let child = match inode {
499                tree::Inode::Directory(dir) => self.collect_dir(dir, me),
500                tree::Inode::Leaf(leaf) => self.collect_leaf(leaf),
501            };
502            entries.push(DirEnt {
503                name: name.as_bytes(),
504                inode: child,
505                file_type: self.inodes[child].file_type(),
506            });
507        }
508
509        // We're expected to add those, too
510        Self::insert_sorted(&mut entries, b".", me, format::FileType::Directory);
511        Self::insert_sorted(&mut entries, b"..", parent, format::FileType::Directory);
512
513        // Now that we know the actual content, we can write it to our reserved slot
514        self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries));
515        me
516    }
517
518    pub fn collect(fs: &'a tree::FileSystem<ObjectID>) -> Vec<Inode<'a, ObjectID>> {
519        let mut this = Self {
520            inodes: vec![],
521            hardlinks: HashMap::new(),
522        };
523
524        // '..' of the root directory is the root directory again
525        let root_inode = this.collect_dir(&fs.root, 0);
526        assert_eq!(root_inode, 0);
527
528        this.inodes
529    }
530}
531
532/// Takes a list of inodes where each inode contains only local xattr values, determines which
533/// xattrs (key, value) pairs appear more than once, and shares them.
534fn share_xattrs(inodes: &mut [Inode<impl FsVerityHashValue>]) -> Vec<XAttr> {
535    let mut xattrs: BTreeMap<XAttr, usize> = BTreeMap::new();
536
537    // Collect all xattrs from the inodes
538    for inode in inodes.iter() {
539        for attr in &inode.xattrs.local {
540            if let Some(count) = xattrs.get_mut(attr) {
541                *count += 1;
542            } else {
543                xattrs.insert(attr.clone(), 1);
544            }
545        }
546    }
547
548    // Share only xattrs with more than one user
549    xattrs.retain(|_k, v| *v > 1);
550
551    // Repurpose the refcount field as an index lookup
552    for (idx, value) in xattrs.values_mut().enumerate() {
553        *value = idx;
554    }
555
556    // Visit each inode and change local xattrs into shared xattrs
557    for inode in inodes.iter_mut() {
558        inode.xattrs.local.retain(|attr| {
559            if let Some(idx) = xattrs.get(attr) {
560                inode.xattrs.shared.push(*idx);
561                false // drop the local xattr: we converted it
562            } else {
563                true // retain the local xattr: we didn't convert it
564            }
565        });
566    }
567
568    // Return the shared xattrs as a vec
569    xattrs.into_keys().collect()
570}
571
572fn write_erofs(
573    output: &mut impl Output,
574    inodes: &[Inode<impl FsVerityHashValue>],
575    xattrs: &[XAttr],
576) {
577    // Write composefs header
578    output.note_offset(Offset::Header);
579    output.write_struct(format::ComposefsHeader {
580        magic: format::COMPOSEFS_MAGIC,
581        version: format::VERSION,
582        flags: 0.into(),
583        composefs_version: format::COMPOSEFS_VERSION,
584        ..Default::default()
585    });
586    output.pad(1024);
587
588    // Write superblock
589    output.note_offset(Offset::Superblock);
590    output.write_struct(format::Superblock {
591        magic: format::MAGIC_V1,
592        blkszbits: format::BLOCK_BITS,
593        feature_compat: format::FEATURE_COMPAT_MTIME | format::FEATURE_COMPAT_XATTR_FILTER,
594        root_nid: (output.get_nid(0) as u16).into(),
595        inos: (inodes.len() as u64).into(),
596        blocks: ((output.get(Offset::End, 0) / format::BLOCK_SIZE) as u32).into(),
597        ..Default::default()
598    });
599
600    // Write inode table
601    for (idx, inode) in inodes.iter().enumerate() {
602        // The inode may add padding to itself, so it notes its own offset
603        inode.write_inode(output, idx);
604    }
605
606    // Write shared xattr table
607    for xattr in xattrs {
608        output.note_offset(Offset::XAttr);
609        xattr.write(output);
610    }
611
612    // Write blocks from inodes that have them
613    output.pad(4096);
614    for inode in inodes.iter() {
615        output.note_offset(Offset::Block);
616        inode.write_blocks(output);
617    }
618
619    // That's it
620    output.note_offset(Offset::End);
621}
622
623#[derive(Default)]
624struct Layout {
625    offset_types: Vec<usize>,
626    offsets: Vec<usize>,
627}
628
629#[derive(Default)]
630struct FirstPass {
631    offset: usize,
632    layout: Layout,
633}
634
635struct SecondPass {
636    output: Vec<u8>,
637    layout: Layout,
638}
639
640impl Output for SecondPass {
641    fn note_offset(&mut self, _offset_type: Offset) {
642        /* no-op */
643    }
644
645    fn get(&self, offset_type: Offset, idx: usize) -> usize {
646        let start = self.layout.offset_types[offset_type as usize];
647        self.layout.offsets[start + idx]
648    }
649
650    fn write(&mut self, data: &[u8]) {
651        self.output.extend_from_slice(data);
652    }
653
654    fn pad(&mut self, alignment: usize) {
655        self.output
656            .resize(round_up(self.output.len(), alignment), 0);
657    }
658
659    fn len(&self) -> usize {
660        self.output.len()
661    }
662}
663
664impl Output for FirstPass {
665    fn note_offset(&mut self, offset_type: Offset) {
666        while self.layout.offset_types.len() <= offset_type as usize {
667            self.layout.offset_types.push(self.layout.offsets.len());
668        }
669        assert_eq!(self.layout.offset_types.len(), offset_type as usize + 1);
670
671        trace!(
672            "{:?} #{} @{}",
673            offset_type,
674            self.layout.offsets.len() - self.layout.offset_types[offset_type as usize],
675            self.offset
676        );
677        self.layout.offsets.push(self.offset);
678    }
679
680    fn get(&self, _: Offset, _: usize) -> usize {
681        0 // We don't know offsets in the first pass, so fake it
682    }
683
684    fn write(&mut self, data: &[u8]) {
685        self.offset += data.len();
686    }
687
688    fn pad(&mut self, alignment: usize) {
689        self.offset = round_up(self.offset, alignment);
690    }
691
692    fn len(&self) -> usize {
693        self.offset
694    }
695}
696
697/// Creates an EROFS filesystem image from a composefs tree
698///
699/// This function performs a two-pass generation:
700/// 1. First pass determines the layout and sizes of all structures
701/// 2. Second pass writes the actual image data
702///
703/// Returns the complete EROFS image as a byte array.
704pub fn mkfs_erofs<ObjectID: FsVerityHashValue>(fs: &tree::FileSystem<ObjectID>) -> Box<[u8]> {
705    // Create the intermediate representation: flattened inodes and shared xattrs
706    let mut inodes = InodeCollector::collect(fs);
707    let xattrs = share_xattrs(&mut inodes);
708
709    // Do a first pass with the writer to determine the layout
710    let mut first_pass = FirstPass::default();
711    write_erofs(&mut first_pass, &inodes, &xattrs);
712
713    // Do a second pass with the writer to get the actual bytes
714    let mut second_pass = SecondPass {
715        output: vec![],
716        layout: first_pass.layout,
717    };
718    write_erofs(&mut second_pass, &inodes, &xattrs);
719
720    // That's it
721    second_pass.output.into_boxed_slice()
722}