1use std::{
13 cell::RefCell,
14 collections::BTreeMap,
15 ffi::{OsStr, OsString},
16 fmt,
17 io::Read,
18 os::unix::prelude::{OsStrExt, OsStringExt},
19 path::PathBuf,
20};
21
22use anyhow::{bail, ensure, Result};
23use rustix::fs::makedev;
24use tar::{EntryType, Header, PaxExtensions};
25use tokio::io::{AsyncRead, AsyncReadExt};
26
27use composefs::{
28 dumpfile,
29 fsverity::FsVerityHashValue,
30 splitstream::{SplitStreamData, SplitStreamReader, SplitStreamWriter},
31 tree::{LeafContent, RegularFile, Stat},
32 util::{read_exactish, read_exactish_async},
33 INLINE_CONTENT_MAX,
34};
35
36fn read_header<R: Read>(reader: &mut R) -> Result<Option<Header>> {
37 let mut header = Header::new_gnu();
38 if read_exactish(reader, header.as_mut_bytes())? {
39 Ok(Some(header))
40 } else {
41 Ok(None)
42 }
43}
44
45async fn read_header_async(reader: &mut (impl AsyncRead + Unpin)) -> Result<Option<Header>> {
46 let mut header = Header::new_gnu();
47 if read_exactish_async(reader, header.as_mut_bytes()).await? {
48 Ok(Some(header))
49 } else {
50 Ok(None)
51 }
52}
53
54pub fn split(
58 tar_stream: &mut impl Read,
59 writer: &mut SplitStreamWriter<impl FsVerityHashValue>,
60) -> Result<()> {
61 while let Some(header) = read_header(tar_stream)? {
62 writer.write_inline(header.as_bytes());
64
65 if header.as_bytes() == &[0u8; 512] {
66 continue;
67 }
68
69 let actual_size = header.entry_size()? as usize;
71 let storage_size = (actual_size + 511) & !511;
72 let mut buffer = vec![0u8; storage_size];
73 tar_stream.read_exact(&mut buffer)?;
74
75 if header.entry_type() == EntryType::Regular && actual_size > INLINE_CONTENT_MAX {
76 let padding = buffer.split_off(actual_size);
78 writer.write_external(&buffer, padding)?;
79 } else {
80 writer.write_inline(&buffer);
82 }
83 }
84 Ok(())
85}
86
87pub async fn split_async(
95 mut tar_stream: impl AsyncRead + Unpin,
96 writer: &mut SplitStreamWriter<impl FsVerityHashValue>,
97) -> Result<()> {
98 while let Some(header) = read_header_async(&mut tar_stream).await? {
99 writer.write_inline(header.as_bytes());
101
102 if header.as_bytes() == &[0u8; 512] {
103 continue;
104 }
105
106 let actual_size = header.entry_size()? as usize;
108 let storage_size = (actual_size + 511) & !511;
109 let mut buffer = vec![0u8; storage_size];
110 tar_stream.read_exact(&mut buffer).await?;
111
112 if header.entry_type() == EntryType::Regular && actual_size > INLINE_CONTENT_MAX {
113 let padding = buffer.split_off(actual_size);
115 writer.write_external_async(buffer, padding).await?;
116 } else {
117 writer.write_inline(&buffer);
119 }
120 }
121 Ok(())
122}
123
/// The kind of filesystem object described by a tar entry, as produced by `get_entry`.
#[derive(Debug)]
pub enum TarItem<ObjectID: FsVerityHashValue> {
    /// A directory entry (`EntryType::Directory`).
    Directory,
    /// A non-directory object: `get_entry` builds this for regular files (inline or
    /// external), symlinks, block/character devices and FIFOs.
    Leaf(LeafContent<ObjectID>),
    /// A hard link. The payload is the link target as built by `path_from_tar`,
    /// i.e. prefixed with `/`.
    Hardlink(OsString),
}
137
/// A single decoded tar entry, as returned by `get_entry`.
#[derive(Debug)]
pub struct TarEntry<ObjectID: FsVerityHashValue> {
    /// Path of the entry as built by `path_from_tar`: prefixed with `/`, with one
    /// trailing `/` removed.
    pub path: PathBuf,
    /// Ownership, mode, mtime and extended attributes; `get_entry` fills these from
    /// the tar header and from `SCHILY.xattr.*` pax records.
    pub stat: Stat,
    /// What kind of object this entry describes.
    pub item: TarItem<ObjectID>,
}
151
152impl<ObjectID: FsVerityHashValue> fmt::Display for TarEntry<ObjectID> {
153 fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
154 match self.item {
155 TarItem::Hardlink(ref target) => dumpfile::write_hardlink(fmt, &self.path, target),
156 TarItem::Directory => dumpfile::write_directory(fmt, &self.path, &self.stat, 1),
157 TarItem::Leaf(ref content) => {
158 dumpfile::write_leaf(fmt, &self.path, &self.stat, content, 1)
159 }
160 }
161 }
162}
163
/// Builds the absolute path of a tar entry from the available name sources.
///
/// Precedence: the pax name if one is present (even if empty), otherwise the GNU
/// long name if it is non-empty, otherwise the short name from the header. The
/// chosen bytes are prefixed with `/`, and if the result ends in `/` exactly one
/// trailing `/` is removed (so an empty name yields an empty path).
fn path_from_tar(pax: Option<Box<[u8]>>, gnu: Vec<u8>, short: &[u8]) -> PathBuf {
    let name = match pax.as_deref() {
        Some(pax_name) => pax_name,
        None if !gnu.is_empty() => gnu.as_slice(),
        None => short,
    };

    let mut bytes = Vec::with_capacity(name.len() + 1);
    bytes.push(b'/');
    bytes.extend_from_slice(name);

    // Strip a single trailing separator (e.g. from directory entries).
    if bytes.ends_with(b"/") {
        bytes.pop();
    }

    PathBuf::from(OsString::from_vec(bytes))
}
184
/// Picks the symlink target from the available link-name sources.
///
/// Precedence: the pax link path if one is present (even if empty), otherwise the
/// GNU long link if it is non-empty, otherwise the short link name from the header.
/// The chosen bytes are returned unmodified.
fn symlink_target_from_tar(pax: Option<Box<[u8]>>, gnu: Vec<u8>, short: &[u8]) -> Box<OsStr> {
    let target = match pax.as_deref() {
        Some(pax_target) => pax_target,
        None if !gnu.is_empty() => gnu.as_slice(),
        None => short,
    };
    Box::from(OsStr::from_bytes(target))
}
194
195pub fn get_entry<R: Read, ObjectID: FsVerityHashValue>(
203 reader: &mut SplitStreamReader<R, ObjectID>,
204) -> Result<Option<TarEntry<ObjectID>>> {
205 let mut gnu_longlink: Vec<u8> = vec![];
206 let mut gnu_longname: Vec<u8> = vec![];
207 let mut pax_longlink: Option<Box<[u8]>> = None;
208 let mut pax_longname: Option<Box<[u8]>> = None;
209 let mut xattrs = BTreeMap::new();
210
211 loop {
212 let mut buf = [0u8; 512];
213 if !reader.read_inline_exact(&mut buf)? || buf == [0u8; 512] {
214 return Ok(None);
215 }
216
217 let header = tar::Header::from_byte_slice(&buf);
218
219 let size = header.entry_size()?;
220
221 let item = match reader.read_exact(size as usize, ((size + 511) & !511) as usize)? {
222 SplitStreamData::External(id) => match header.entry_type() {
223 EntryType::Regular | EntryType::Continuous => {
224 ensure!(
225 size as usize > INLINE_CONTENT_MAX,
226 "Splitstream incorrectly stored a small ({size} byte) file external"
227 );
228 TarItem::Leaf(LeafContent::Regular(RegularFile::External(id, size)))
229 }
230 _ => bail!("Unsupported external-chunked entry {header:?} {id:?}"),
231 },
232 SplitStreamData::Inline(content) => match header.entry_type() {
233 EntryType::GNULongLink => {
234 gnu_longlink.extend(content);
235
236 gnu_longlink.pop_if(|x| *x == b'\0');
241
242 continue;
243 }
244 EntryType::GNULongName => {
245 gnu_longname.extend(content);
246 gnu_longname.pop_if(|x| *x == b'\0');
247 continue;
248 }
249 EntryType::XGlobalHeader => {
250 todo!();
251 }
252 EntryType::XHeader => {
253 for item in PaxExtensions::new(&content) {
254 let extension = item?;
255 let key = extension.key()?;
256 let value = Box::from(extension.value_bytes());
257
258 if key == "path" {
259 pax_longname = Some(value);
260 } else if key == "linkpath" {
261 pax_longlink = Some(value);
262 } else if let Some(xattr) = key.strip_prefix("SCHILY.xattr.") {
263 xattrs.insert(Box::from(OsStr::new(xattr)), value);
264 }
265 }
266 continue;
267 }
268 EntryType::Directory => TarItem::Directory,
269 EntryType::Regular | EntryType::Continuous => {
270 ensure!(
271 content.len() <= INLINE_CONTENT_MAX,
272 "Splitstream incorrectly stored a large ({} byte) file inline",
273 content.len()
274 );
275 TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(content)))
276 }
277 EntryType::Link => TarItem::Hardlink({
278 let Some(link_name) = header.link_name_bytes() else {
279 bail!("link without a name?")
280 };
281 OsString::from(path_from_tar(pax_longlink, gnu_longlink, &link_name))
282 }),
283 EntryType::Symlink => TarItem::Leaf(LeafContent::Symlink({
284 let Some(link_name) = header.link_name_bytes() else {
285 bail!("symlink without a name?")
286 };
287 symlink_target_from_tar(pax_longlink, gnu_longlink, &link_name)
288 })),
289 EntryType::Block => TarItem::Leaf(LeafContent::BlockDevice(
290 match (header.device_major()?, header.device_minor()?) {
291 (Some(major), Some(minor)) => makedev(major, minor),
292 _ => bail!("Device entry without device numbers?"),
293 },
294 )),
295 EntryType::Char => TarItem::Leaf(LeafContent::CharacterDevice(
296 match (header.device_major()?, header.device_minor()?) {
297 (Some(major), Some(minor)) => makedev(major, minor),
298 _ => bail!("Device entry without device numbers?"),
299 },
300 )),
301 EntryType::Fifo => TarItem::Leaf(LeafContent::Fifo),
302 _ => {
303 todo!("Unsupported entry {:?}", header);
304 }
305 },
306 };
307
308 return Ok(Some(TarEntry {
309 path: path_from_tar(pax_longname, gnu_longname, &header.path_bytes()),
310 stat: Stat {
311 st_uid: header.uid()? as u32,
312 st_gid: header.gid()? as u32,
313 st_mode: header.mode()?,
314 st_mtim_sec: header.mtime()? as i64,
315 xattrs: RefCell::new(xattrs),
316 },
317 item,
318 }));
319 }
320}
321
322#[cfg(test)]
323mod tests {
324 use super::*;
325 use composefs::{
326 fsverity::Sha256HashValue, generic_tree::LeafContent, repository::Repository,
327 splitstream::SplitStreamReader,
328 };
329 use std::{io::Cursor, path::Path, sync::Arc};
330 use tar::Builder;
331
332 use once_cell::sync::Lazy;
333 use std::sync::Mutex;
334
    // Collects every `TempDir` handed over by `create_test_repository` so the handles
    // are not dropped when that function returns (presumably this keeps the backing
    // directories on disk while the tests run — confirm against `tempfile::TempDir`).
    static TEST_TEMPDIRS: Lazy<Mutex<Vec<tempfile::TempDir>>> =
        Lazy::new(|| Mutex::new(Vec::new()));
337
338 pub(crate) fn create_test_repository() -> Result<Arc<Repository<Sha256HashValue>>> {
339 let tempdir = tempfile::TempDir::new().unwrap();
341 let fd = rustix::fs::open(
342 tempdir.path(),
343 rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::PATH,
344 0.into(),
345 )?;
346
347 {
349 let mut guard = TEST_TEMPDIRS.lock().unwrap();
350 guard.push(tempdir);
351 }
352
353 let mut repo = Repository::open_path(&fd, ".").unwrap();
354 repo.set_insecure(true);
355
356 Ok(Arc::new(repo))
357 }
358
359 fn append_file(
361 builder: &mut Builder<&mut Vec<u8>>,
362 path: &str,
363 content: &[u8],
364 ) -> Result<tar::Header> {
365 let mut header = tar::Header::new_gnu();
366 header.set_mode(0o644);
367 header.set_uid(1000);
368 header.set_gid(1000);
369 header.set_mtime(1234567890);
370 header.set_size(content.len() as u64);
371 header.set_entry_type(tar::EntryType::Regular);
372 builder.append_data(&mut header, path, content)?;
373 Ok(header)
374 }
375
376 fn read_all_via_splitstream(tar_data: Vec<u8>) -> Result<Vec<TarEntry<Sha256HashValue>>> {
378 let mut tar_cursor = Cursor::new(tar_data);
379 let repo = create_test_repository()?;
380 let mut writer = repo.create_stream(None, None);
381
382 split(&mut tar_cursor, &mut writer)?;
383 let object_id = writer.done()?;
384
385 let mut reader: SplitStreamReader<std::fs::File, Sha256HashValue> =
386 SplitStreamReader::new(repo.open_object(&object_id)?.into())?;
387
388 let mut entries = Vec::new();
389 while let Some(entry) = get_entry(&mut reader)? {
390 entries.push(entry);
391 }
392 Ok(entries)
393 }
394
395 #[test]
396 fn test_empty_tar() {
397 let mut tar_data = Vec::new();
398 {
399 let mut builder = Builder::new(&mut tar_data);
400 builder.finish().unwrap();
401 }
402
403 let mut tar_cursor = Cursor::new(tar_data);
404 let repo = create_test_repository().unwrap();
405 let mut writer = repo.create_stream(None, None);
406
407 split(&mut tar_cursor, &mut writer).unwrap();
408 let object_id = writer.done().unwrap();
409
410 let mut reader: SplitStreamReader<std::fs::File, Sha256HashValue> =
411 SplitStreamReader::new(repo.open_object(&object_id).unwrap().into()).unwrap();
412 assert!(get_entry(&mut reader).unwrap().is_none());
413 }
414
415 #[test]
416 fn test_single_small_file() {
417 let mut tar_data = Vec::new();
418 let original_header = {
419 let mut builder = Builder::new(&mut tar_data);
420
421 let content = b"Hello, World!";
423 let header = append_file(&mut builder, "hello.txt", content).unwrap();
424
425 builder.finish().unwrap();
426 header
427 };
428
429 let mut tar_cursor = Cursor::new(tar_data);
430 let repo = create_test_repository().unwrap();
431 let mut writer = repo.create_stream(None, None);
432
433 split(&mut tar_cursor, &mut writer).unwrap();
434 let object_id = writer.done().unwrap();
435
436 let mut reader: SplitStreamReader<std::fs::File, Sha256HashValue> =
437 SplitStreamReader::new(repo.open_object(&object_id).unwrap().into()).unwrap();
438
439 let entry = get_entry(&mut reader)
441 .unwrap()
442 .expect("Should have one entry");
443 assert_eq!(entry.path, PathBuf::from("/hello.txt"));
444 assert!(matches!(
445 entry.item,
446 TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(_)))
447 ));
448
449 assert_header_stat_equal(&original_header, &entry.stat, "hello.txt");
451
452 if let TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(ref content))) = entry.item {
453 assert_eq!(content.as_ref(), b"Hello, World!");
454 }
455
456 assert!(get_entry(&mut reader).unwrap().is_none());
458 }
459
460 #[test]
461 fn test_inline_threshold() {
462 let mut tar_data = Vec::new();
463 let (threshold_header, over_threshold_header) = {
464 let mut builder = Builder::new(&mut tar_data);
465
466 let threshold_content = vec![b'X'; INLINE_CONTENT_MAX];
468 let header1 =
469 append_file(&mut builder, "threshold_file.txt", &threshold_content).unwrap();
470
471 let over_threshold_content = vec![b'Y'; INLINE_CONTENT_MAX + 1];
473 let header2 = append_file(
474 &mut builder,
475 "over_threshold_file.txt",
476 &over_threshold_content,
477 )
478 .unwrap();
479
480 builder.finish().unwrap();
481 (header1, header2)
482 };
483
484 let mut tar_cursor = Cursor::new(tar_data);
485 let repo = create_test_repository().unwrap();
486 let mut writer = repo.create_stream(None, None);
487
488 split(&mut tar_cursor, &mut writer).unwrap();
489 let object_id = writer.done().unwrap();
490
491 let mut reader: SplitStreamReader<std::fs::File, Sha256HashValue> =
492 SplitStreamReader::new(repo.open_object(&object_id).unwrap().into()).unwrap();
493 let mut entries = Vec::new();
494
495 while let Some(entry) = get_entry(&mut reader).unwrap() {
496 entries.push(entry);
497 }
498
499 assert_eq!(entries.len(), 2);
500
501 assert_eq!(entries[0].path, PathBuf::from("/threshold_file.txt"));
503 assert_header_stat_equal(&threshold_header, &entries[0].stat, "threshold_file.txt");
504 if let TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(ref content))) =
505 entries[0].item
506 {
507 assert_eq!(content.len(), INLINE_CONTENT_MAX);
508 assert_eq!(content[0], b'X');
509 } else {
510 panic!("Expected inline regular file for threshold file");
511 }
512
513 assert_eq!(entries[1].path, PathBuf::from("/over_threshold_file.txt"));
515 assert_header_stat_equal(
516 &over_threshold_header,
517 &entries[1].stat,
518 "over_threshold_file.txt",
519 );
520 if let TarItem::Leaf(LeafContent::Regular(RegularFile::External(_, size))) = entries[1].item
521 {
522 assert_eq!(size, (INLINE_CONTENT_MAX + 1) as u64);
523 } else {
524 panic!("Expected external regular file for over-threshold file");
525 }
526 }
527
528 #[test]
529 fn test_round_trip_simple() {
530 let mut original_tar = Vec::new();
532 let (small_header, large_header) = {
533 let mut builder = Builder::new(&mut original_tar);
534
535 let small_content = b"Small file content";
537 let header1 = append_file(&mut builder, "small.txt", small_content).unwrap();
538
539 let large_content = vec![b'L'; INLINE_CONTENT_MAX + 100];
541 let header2 = append_file(&mut builder, "large.txt", &large_content).unwrap();
542
543 builder.finish().unwrap();
544 (header1, header2)
545 };
546
547 let mut tar_cursor = Cursor::new(original_tar.clone());
549 let repo = create_test_repository().unwrap();
550 let mut writer = repo.create_stream(None, None);
551 split(&mut tar_cursor, &mut writer).unwrap();
552 let object_id = writer.done().unwrap();
553
554 let mut reader: SplitStreamReader<std::fs::File, Sha256HashValue> =
556 SplitStreamReader::new(repo.open_object(&object_id).unwrap().into()).unwrap();
557 let mut entries = Vec::new();
558
559 while let Some(entry) = get_entry(&mut reader).unwrap() {
560 entries.push(entry);
561 }
562
563 assert_eq!(entries.len(), 2, "Should have exactly 2 entries");
564
565 assert_eq!(entries[0].path, PathBuf::from("/small.txt"));
567 assert_header_stat_equal(&small_header, &entries[0].stat, "small.txt");
568
569 if let TarItem::Leaf(LeafContent::Regular(RegularFile::Inline(ref content))) =
570 entries[0].item
571 {
572 assert_eq!(content.as_ref(), b"Small file content");
573 } else {
574 panic!("Expected inline regular file for small.txt");
575 }
576
577 assert_eq!(entries[1].path, PathBuf::from("/large.txt"));
579 assert_header_stat_equal(&large_header, &entries[1].stat, "large.txt");
580
581 if let TarItem::Leaf(LeafContent::Regular(RegularFile::External(ref id, size))) =
582 entries[1].item
583 {
584 assert_eq!(size, (INLINE_CONTENT_MAX + 100) as u64);
585 use std::io::Read;
587 let mut external_data = Vec::new();
588 std::fs::File::from(repo.open_object(id).unwrap())
589 .read_to_end(&mut external_data)
590 .unwrap();
591 let expected_content = vec![b'L'; INLINE_CONTENT_MAX + 100];
592 assert_eq!(
593 external_data, expected_content,
594 "External file content should match"
595 );
596 } else {
597 panic!("Expected external regular file for large.txt");
598 }
599 }
600
601 #[test]
602 fn test_special_filename_cases() {
603 let mut tar_data = Vec::new();
604 {
605 let mut builder = Builder::new(&mut tar_data);
606
607 let content1 = b"Special chars content";
609 append_file(&mut builder, "file-with_special.chars@123", content1).unwrap();
610
611 let long_name = "a".repeat(100);
613 let content2 = b"Long filename content";
614 append_file(&mut builder, &long_name, content2).unwrap();
615
616 builder.finish().unwrap();
617 };
618
619 let entries = read_all_via_splitstream(tar_data).unwrap();
620 assert_eq!(entries.len(), 2);
621
622 assert_eq!(
624 entries[0].path,
625 PathBuf::from("/file-with_special.chars@123")
626 );
627 assert_eq!(
628 entries[0].path.file_name().unwrap(),
629 "file-with_special.chars@123"
630 );
631
632 let expected_long_path = format!("/{}", "a".repeat(100));
634 assert_eq!(entries[1].path, PathBuf::from(expected_long_path));
635 assert_eq!(entries[1].path.file_name().unwrap(), &*"a".repeat(100));
636 }
637
638 #[test]
639 fn test_gnu_long_filename_reproduction() {
640 let very_long_path = format!(
642 "very/long/path/that/exceeds/the/normal/tar/header/limit/{}",
643 "x".repeat(120)
644 );
645 let content = b"Content for very long path";
646
647 let mut tar_data = Vec::new();
649 {
650 let mut builder = Builder::new(&mut tar_data);
651 append_file(&mut builder, &very_long_path, content).unwrap();
652 builder.finish().unwrap();
653 };
654
655 let entries = read_all_via_splitstream(tar_data).unwrap();
656 assert_eq!(entries.len(), 1);
657 let abspath = format!("/{very_long_path}");
658 assert_eq!(entries[0].path, Path::new(&abspath));
659 }
660
661 #[test]
662 fn test_gnu_longlink() {
663 let very_long_path = format!(
664 "very/long/path/that/exceeds/the/normal/tar/header/limit/{}",
665 "x".repeat(120)
666 );
667
668 let mut tar_data = Vec::new();
670 {
671 let mut builder = Builder::new(&mut tar_data);
672 let mut header = tar::Header::new_gnu();
673 header.set_mode(0o777);
674 header.set_entry_type(EntryType::Symlink);
675 header.set_size(0);
676 header.set_uid(0);
677 header.set_gid(0);
678 builder
679 .append_link(&mut header, "long-symlink", &very_long_path)
680 .unwrap();
681 builder.finish().unwrap();
682 };
683
684 let entries = read_all_via_splitstream(tar_data).unwrap();
685 assert_eq!(entries.len(), 1);
686 match &entries[0].item {
687 TarItem::Leaf(LeafContent::Symlink(ref target)) => {
688 assert_eq!(&**target, OsStr::new(&very_long_path));
689 }
690 _ => unreachable!(),
691 };
692 }
693
694 fn assert_header_stat_equal(header: &tar::Header, stat: &Stat, msg_prefix: &str) {
696 assert_eq!(
697 header.mode().unwrap(),
698 stat.st_mode,
699 "{}: mode mismatch",
700 msg_prefix
701 );
702 assert_eq!(
703 header.uid().unwrap() as u32,
704 stat.st_uid,
705 "{}: uid mismatch",
706 msg_prefix
707 );
708 assert_eq!(
709 header.gid().unwrap() as u32,
710 stat.st_gid,
711 "{}: gid mismatch",
712 msg_prefix
713 );
714 assert_eq!(
715 header.mtime().unwrap() as i64,
716 stat.st_mtim_sec,
717 "{}: mtime mismatch",
718 msg_prefix
719 );
720 }
721}