composefs/
fs.rs

1//! Reading and writing filesystem trees to/from disk.
2//!
3//! This module provides functionality to read filesystem structures from
4//! disk into composefs tree representations and write them back, including
5//! handling of hardlinks, extended attributes, and repository integration.
6
7use std::{
8    cell::RefCell,
9    collections::{BTreeMap, HashMap},
10    ffi::{CStr, OsStr},
11    fs::File,
12    io::{Read, Write},
13    mem::MaybeUninit,
14    os::unix::ffi::OsStrExt,
15    path::Path,
16    rc::Rc,
17};
18
19use anyhow::{ensure, Result};
20use rustix::{
21    buffer::spare_capacity,
22    fd::{AsFd, OwnedFd},
23    fs::{
24        fstat, getxattr, linkat, listxattr, mkdirat, mknodat, openat, readlinkat, symlinkat,
25        AtFlags, Dir, FileType, Mode, OFlags, CWD,
26    },
27    io::{read, Errno},
28};
29use zerocopy::IntoBytes;
30
31use crate::{
32    fsverity::{compute_verity, FsVerityHashValue},
33    repository::Repository,
34    tree::{Directory, FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat},
35    util::proc_self_fd,
36    INLINE_CONTENT_MAX,
37};
38
39/// Attempt to use O_TMPFILE + rename to atomically set file contents.
40/// Will fall back to a non-atomic write if the target doesn't support O_TMPFILE.
41fn set_file_contents(dirfd: &OwnedFd, name: &OsStr, stat: &Stat, data: &[u8]) -> Result<()> {
42    match openat(
43        dirfd,
44        ".",
45        OFlags::WRONLY | OFlags::TMPFILE | OFlags::CLOEXEC,
46        stat.st_mode.into(),
47    ) {
48        Ok(tmp) => {
49            let mut tmp = File::from(tmp);
50            tmp.write_all(data)?;
51            tmp.sync_data()?;
52            linkat(
53                CWD,
54                proc_self_fd(&tmp),
55                dirfd,
56                name,
57                AtFlags::SYMLINK_FOLLOW,
58            )?;
59        }
60        Err(Errno::OPNOTSUPP) => {
61            // vfat? yolo...
62            let fd = openat(
63                dirfd,
64                name,
65                OFlags::CREATE | OFlags::WRONLY | OFlags::CLOEXEC,
66                stat.st_mode.into(),
67            )?;
68            let mut f = File::from(fd);
69            f.write_all(data)?;
70            f.sync_data()?;
71        }
72        Err(e) => Err(e)?,
73    }
74    Ok(())
75}
76
77fn write_directory<ObjectID: FsVerityHashValue>(
78    dir: &Directory<ObjectID>,
79    dirfd: &OwnedFd,
80    name: &OsStr,
81    repo: &Repository<ObjectID>,
82) -> Result<()> {
83    match mkdirat(dirfd, name, dir.stat.st_mode.into()) {
84        Ok(()) | Err(Errno::EXIST) => {}
85        Err(e) => Err(e)?,
86    }
87
88    let fd = openat(dirfd, name, OFlags::PATH | OFlags::DIRECTORY, 0.into())?;
89    write_directory_contents(dir, &fd, repo)
90}
91
92fn write_leaf<ObjectID: FsVerityHashValue>(
93    leaf: &Leaf<ObjectID>,
94    dirfd: &OwnedFd,
95    name: &OsStr,
96    repo: &Repository<ObjectID>,
97) -> Result<()> {
98    let mode = leaf.stat.st_mode.into();
99
100    match &leaf.content {
101        LeafContent::Regular(RegularFile::Inline(ref data)) => {
102            set_file_contents(dirfd, name, &leaf.stat, data)?
103        }
104        LeafContent::Regular(RegularFile::External(ref id, size)) => {
105            let object = repo.open_object(id)?;
106            // TODO: make this better.  At least needs to be EINTR-safe.  Could even do reflink in some cases...
107            let mut buffer = vec![MaybeUninit::uninit(); *size as usize];
108            let (data, _) = read(object, &mut buffer)?;
109            set_file_contents(dirfd, name, &leaf.stat, data)?;
110        }
111        LeafContent::BlockDevice(rdev) => mknodat(dirfd, name, FileType::BlockDevice, mode, *rdev)?,
112        LeafContent::CharacterDevice(rdev) => {
113            mknodat(dirfd, name, FileType::CharacterDevice, mode, *rdev)?
114        }
115        LeafContent::Socket => mknodat(dirfd, name, FileType::Socket, mode, 0)?,
116        LeafContent::Fifo => mknodat(dirfd, name, FileType::Fifo, mode, 0)?,
117        LeafContent::Symlink(target) => symlinkat(target.as_ref(), dirfd, name)?,
118    }
119
120    Ok(())
121}
122
123fn write_directory_contents<ObjectID: FsVerityHashValue>(
124    dir: &Directory<ObjectID>,
125    fd: &OwnedFd,
126    repo: &Repository<ObjectID>,
127) -> Result<()> {
128    for (name, inode) in dir.entries() {
129        match inode {
130            Inode::Directory(ref dir) => write_directory(dir, fd, name, repo),
131            Inode::Leaf(ref leaf) => write_leaf(leaf, fd, name, repo),
132        }?;
133    }
134
135    Ok(())
136}
137
138/// Writes a directory tree from composefs representation to a filesystem path.
139///
140/// Reconstructs the filesystem structure at the specified output directory,
141/// creating directories, files, symlinks, and device nodes as needed. External
142/// file content is read from the repository. Note that hardlinks are not supported.
143pub fn write_to_path<ObjectID: FsVerityHashValue>(
144    repo: &Repository<ObjectID>,
145    dir: &Directory<ObjectID>,
146    output_dir: &Path,
147) -> Result<()> {
148    let fd = openat(CWD, output_dir, OFlags::PATH | OFlags::DIRECTORY, 0.into())?;
149    write_directory_contents(dir, &fd, repo)
150}
151
/// Helper for reading filesystem trees from disk into composefs representation.
///
/// Tracks hardlinks via inode numbers and handles integration with repositories
/// for storing large file content.
#[derive(Debug)]
pub struct FilesystemReader<'repo, ObjectID: FsVerityHashValue> {
    /// Optional repository.  When present, the content of regular files that are too large
    /// to be kept inline is stored into it while reading; when absent, only the fs-verity
    /// digest of such content is computed.
    repo: Option<&'repo Repository<ObjectID>>,
    /// Leaves read so far, keyed by `(st_dev, st_ino)`, so that further names referring to
    /// the same inode share a single `Rc<Leaf>`.
    inodes: HashMap<(u64, u64), Rc<Leaf<ObjectID>>>,
}
161
162impl<ObjectID: FsVerityHashValue> FilesystemReader<'_, ObjectID> {
163    fn read_xattrs(fd: &OwnedFd) -> Result<BTreeMap<Box<OsStr>, Box<[u8]>>> {
164        // flistxattr() and fgetxattr() don't work with with O_PATH fds, so go via /proc/self/fd.
165        // Note: we want the symlink-following version of this call, which produces the correct
166        // behaviour even when trying to read xattrs from symlinks themselves.  See
167        // https://gist.github.com/allisonkarlitskaya/7a80f2ebb3314d80f45c653a1ba0e398
168        let filename = proc_self_fd(fd);
169
170        let mut xattrs = BTreeMap::new();
171
172        let mut names = [MaybeUninit::new(0); 65536];
173        let (names, _) = listxattr(&filename, &mut names)?;
174
175        for name in names.split_inclusive(|c| *c == 0) {
176            let mut buffer = [MaybeUninit::new(0); 65536];
177            let name: &[u8] = name.as_bytes();
178            let name = CStr::from_bytes_with_nul(name)?;
179            let (value, _) = getxattr(&filename, name, &mut buffer)?;
180            let key = Box::from(OsStr::from_bytes(name.to_bytes()));
181            xattrs.insert(key, Box::from(value));
182        }
183
184        Ok(xattrs)
185    }
186
187    fn stat(fd: &OwnedFd, ifmt: FileType) -> Result<(rustix::fs::Stat, Stat)> {
188        let buf = fstat(fd)?;
189
190        ensure!(
191            FileType::from_raw_mode(buf.st_mode) == ifmt,
192            "File type changed
193            between readdir() and fstat()"
194        );
195
196        Ok((
197            buf,
198            Stat {
199                st_mode: buf.st_mode & 0o7777,
200                st_uid: buf.st_uid,
201                st_gid: buf.st_gid,
202                st_mtim_sec: buf.st_mtime as i64,
203                xattrs: RefCell::new(Self::read_xattrs(fd)?),
204            },
205        ))
206    }
207
208    fn read_leaf_content(
209        &mut self,
210        fd: OwnedFd,
211        buf: rustix::fs::Stat,
212    ) -> Result<LeafContent<ObjectID>> {
213        let content = match FileType::from_raw_mode(buf.st_mode) {
214            FileType::Directory | FileType::Unknown => unreachable!(),
215            FileType::RegularFile => {
216                let mut buffer = Vec::with_capacity(buf.st_size as usize);
217                if buf.st_size > 0 {
218                    read(fd, spare_capacity(&mut buffer))?;
219                }
220                let buffer = Box::from(buffer);
221
222                if buf.st_size > INLINE_CONTENT_MAX as i64 {
223                    let id = if let Some(repo) = self.repo {
224                        repo.ensure_object(&buffer)?
225                    } else {
226                        compute_verity(&buffer)
227                    };
228                    LeafContent::Regular(RegularFile::External(id, buf.st_size as u64))
229                } else {
230                    LeafContent::Regular(RegularFile::Inline(buffer))
231                }
232            }
233            FileType::Symlink => {
234                let target = readlinkat(fd, "", [])?;
235                LeafContent::Symlink(OsStr::from_bytes(target.as_bytes()).into())
236            }
237            FileType::CharacterDevice => LeafContent::CharacterDevice(buf.st_rdev),
238            FileType::BlockDevice => LeafContent::BlockDevice(buf.st_rdev),
239            FileType::Fifo => LeafContent::Fifo,
240            FileType::Socket => LeafContent::Socket,
241        };
242        Ok(content)
243    }
244
245    fn read_leaf(
246        &mut self,
247        dirfd: &OwnedFd,
248        name: &OsStr,
249        ifmt: FileType,
250    ) -> Result<Rc<Leaf<ObjectID>>> {
251        let oflags = match ifmt {
252            FileType::RegularFile => OFlags::RDONLY,
253            _ => OFlags::PATH,
254        };
255
256        let fd = openat(
257            dirfd,
258            name,
259            oflags | OFlags::NOFOLLOW | OFlags::CLOEXEC,
260            Mode::empty(),
261        )?;
262
263        let (buf, stat) = Self::stat(&fd, ifmt)?;
264
265        // NB: We could check `st_nlink > 1` to find out if we should track a file as a potential
266        // hardlink or not, but some filesystems (like fuse-overlayfs) can report this incorrectly.
267        // Track all files.  https://github.com/containers/fuse-overlayfs/issues/435
268        let key = (buf.st_dev, buf.st_ino);
269        if let Some(leafref) = self.inodes.get(&key) {
270            Ok(Rc::clone(leafref))
271        } else {
272            let content = self.read_leaf_content(fd, buf)?;
273            let leaf = Rc::new(Leaf { stat, content });
274            self.inodes.insert(key, Rc::clone(&leaf));
275            Ok(leaf)
276        }
277    }
278
279    /// Reads a directory from disk into composefs representation.
280    ///
281    /// Recursively reads directory contents, tracking hardlinks and optionally
282    /// reading the directory's own metadata. Large files are stored in the repository
283    /// if one was provided.
284    pub fn read_directory(
285        &mut self,
286        dirfd: impl AsFd,
287        name: &OsStr,
288        stat_self: bool,
289    ) -> Result<Directory<ObjectID>> {
290        let fd = openat(
291            dirfd,
292            name,
293            OFlags::RDONLY | OFlags::DIRECTORY | OFlags::NOFOLLOW | OFlags::CLOEXEC,
294            Mode::empty(),
295        )?;
296
297        let mut directory = if stat_self {
298            let (_, stat) = Self::stat(&fd, FileType::Directory)?;
299            Directory::new(stat)
300        } else {
301            Directory::default()
302        };
303
304        for item in Dir::read_from(&fd)? {
305            let entry = item?;
306            let name = OsStr::from_bytes(entry.file_name().to_bytes());
307
308            if name == "." || name == ".." {
309                continue;
310            }
311
312            let inode = self.read_inode(&fd, name, entry.file_type())?;
313            directory.insert(name, inode);
314        }
315
316        Ok(directory)
317    }
318
319    fn read_inode(
320        &mut self,
321        dirfd: &OwnedFd,
322        name: &OsStr,
323        ifmt: FileType,
324    ) -> Result<Inode<ObjectID>> {
325        if ifmt == FileType::Directory {
326            let dir = self.read_directory(dirfd, name, true)?;
327            Ok(Inode::Directory(Box::new(dir)))
328        } else {
329            let leaf = self.read_leaf(dirfd, name, ifmt)?;
330            Ok(Inode::Leaf(leaf))
331        }
332    }
333}
334
335/// Load a filesystem tree from the given path. A repository may
336/// be provided; if it is, then all files found in the filesystem
337/// are copied in.
338pub fn read_filesystem<ObjectID: FsVerityHashValue>(
339    dirfd: impl AsFd,
340    path: &Path,
341    repo: Option<&Repository<ObjectID>>,
342    stat_root: bool,
343) -> Result<FileSystem<ObjectID>> {
344    let mut reader = FilesystemReader {
345        repo,
346        inodes: HashMap::new(),
347    };
348
349    let root = reader.read_directory(dirfd, path.as_os_str(), stat_root)?;
350
351    Ok(FileSystem {
352        root,
353        have_root_stat: stat_root,
354    })
355}
356
357/// Read the contents of a file.
358pub fn read_file<ObjectID: FsVerityHashValue>(
359    file: &RegularFile<ObjectID>,
360    repo: &Repository<ObjectID>,
361) -> Result<Box<[u8]>> {
362    match file {
363        RegularFile::Inline(data) => Ok(data.clone()),
364        RegularFile::External(id, size) => {
365            let mut data = Vec::with_capacity(*size as usize);
366            std::fs::File::from(repo.open_object(id)?).read_to_end(&mut data)?;
367            ensure!(
368                *size == data.len() as u64,
369                "File content doesn't have the expected length"
370            );
371            Ok(data.into_boxed_slice())
372        }
373    }
374}
375
#[cfg(test)]
mod tests {
    use super::*;
    use rustix::fs::{openat, CWD};

    /// `set_file_contents()` creates a new file holding exactly the bytes it was given.
    #[test]
    fn test_write_contents() -> Result<()> {
        // The temporary directory has to stay alive until the file has been read back.
        let tempdir = tempfile::tempdir()?;
        let expected_path = tempdir.path().join("testfile");

        let dir_flags = OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC;
        let dirfd = openat(CWD, tempdir.path(), dir_flags, Mode::from_raw_mode(0))?;

        let stat = Stat {
            st_mode: 0o755,
            st_uid: 0,
            st_gid: 0,
            st_mtim_sec: Default::default(),
            xattrs: Default::default(),
        };
        set_file_contents(&dirfd, OsStr::new("testfile"), &stat, b"new contents").unwrap();
        drop(dirfd);

        assert_eq!(std::fs::read(&expected_path)?, b"new contents");
        Ok(())
    }
}
403}