diff --git a/src/archive.rs b/src/archive.rs index cd05f69..8121ec4 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -3,6 +3,7 @@ use std::io::{Read, Seek, SeekFrom, Write}; use std::path::{Path, PathBuf}; use rand::Rng; +use rayon::prelude::*; use std::os::unix::fs::PermissionsExt; use crate::compression; @@ -27,6 +28,22 @@ struct ProcessedFile { padding_bytes: Vec, } +/// Collected entry from the directory walk (before crypto processing). +/// +/// Separates the fast sequential path-collection phase from the +/// parallelizable crypto-processing phase. +enum CollectedEntry { + Dir { + name: String, + permissions: u16, + }, + File { + path: PathBuf, + name: String, + permissions: u16, + }, +} + /// Read and de-obfuscate archive header and TOC entries. /// /// Handles XOR header bootstrapping (FORMAT.md Section 10 steps 1-3) @@ -60,12 +77,13 @@ fn get_permissions(path: &Path) -> anyhow::Result { } /// Process a single file through the crypto pipeline, returning a ProcessedFile. +/// +/// Thread-safe: creates a thread-local RNG instead of accepting an external one. fn process_file( file_path: &Path, name: String, permissions: u16, no_compress: &[String], - rng: &mut impl Rng, ) -> anyhow::Result { let data = fs::read(file_path)?; @@ -103,9 +121,10 @@ fn process_file( let hmac = crypto::compute_hmac(&KEY, &iv, &ciphertext); // Step 6: Generate decoy padding (FORMAT.md Section 9.3) + let mut rng = rand::rng(); let padding_after: u16 = rng.random_range(64..=4096); let mut padding_bytes = vec![0u8; padding_after as usize]; - rand::Fill::fill(&mut padding_bytes[..], rng); + rand::Fill::fill(&mut padding_bytes[..], &mut rng); Ok(ProcessedFile { name, @@ -143,21 +162,22 @@ fn make_directory_entry(name: String, permissions: u16) -> ProcessedFile { } } -/// Recursively collect all entries (directories and files) from a directory path. +/// Recursively collect paths from a directory (no crypto processing). /// /// Entries are emitted in parent-before-child order (DFS preorder). /// The base_name is the top-level directory name used as prefix for all relative paths. -fn collect_directory_entries( +fn collect_directory_paths( dir_path: &Path, base_name: &str, - no_compress: &[String], - rng: &mut impl Rng, -) -> anyhow::Result> { +) -> anyhow::Result> { let mut entries = Vec::new(); // Add the directory itself first (parent-before-child) let dir_perms = get_permissions(dir_path)?; - entries.push(make_directory_entry(base_name.to_string(), dir_perms)); + entries.push(CollectedEntry::Dir { + name: base_name.to_string(), + permissions: dir_perms, + }); // Collect children sorted by name for deterministic ordering let mut children: Vec = fs::read_dir(dir_path)? @@ -173,39 +193,58 @@ fn collect_directory_entries( .ok_or_else(|| anyhow::anyhow!("Non-UTF-8 filename: {}", child_path.display()))? ); - if child_path.is_dir() { - // Recurse into subdirectory - let sub_entries = collect_directory_entries( + // Use symlink_metadata to avoid following symlinks. + // is_dir()/is_file() follow symlinks, which can cause infinite + // recursion or massively inflated entry counts with symlink farms + // (e.g., pnpm node_modules with hundreds of directory symlinks). + let meta = fs::symlink_metadata(&child_path)?; + + if meta.file_type().is_symlink() { + eprintln!( + "Warning: skipping symlink: {}", + child_path.display() + ); + continue; + } else if meta.is_dir() { + // Recurse into real subdirectory (not a symlink) + let sub_entries = collect_directory_paths( &child_path, &child_name, - no_compress, - rng, )?; entries.extend(sub_entries); } else { - // Process file - let file_perms = get_permissions(&child_path)?; - let pf = process_file(&child_path, child_name, file_perms, no_compress, rng)?; - entries.push(pf); + // Collect file path for later parallel processing + let file_perms = (meta.permissions().mode() & 0o7777) as u16; + entries.push(CollectedEntry::File { + path: child_path, + name: child_name, + permissions: file_perms, + }); } } Ok(entries) } -/// Collect all entries from input paths (files and directories). +/// Collect all entry paths from input paths (files and directories). /// -/// For files: processes through crypto pipeline with filename-only name. -/// For directories: recursively collects all children with relative paths. -fn collect_entries( - inputs: &[PathBuf], - no_compress: &[String], - rng: &mut impl Rng, -) -> anyhow::Result> { - let mut processed = Vec::new(); +/// Returns a list of CollectedEntry items in deterministic order, +/// ready for parallel processing of file entries. +fn collect_paths(inputs: &[PathBuf]) -> anyhow::Result> { + let mut collected = Vec::new(); for input_path in inputs { - if input_path.is_dir() { + // Check for symlinks at top level too + let meta = fs::symlink_metadata(input_path)?; + if meta.file_type().is_symlink() { + eprintln!( + "Warning: skipping symlink: {}", + input_path.display() + ); + continue; + } + + if meta.is_dir() { // Get the directory's own name for the archive prefix let dir_name = input_path .file_name() @@ -214,13 +253,8 @@ fn collect_entries( .ok_or_else(|| anyhow::anyhow!("Non-UTF-8 directory name: {}", input_path.display()))? .to_string(); - let dir_entries = collect_directory_entries( - input_path, - &dir_name, - no_compress, - rng, - )?; - processed.extend(dir_entries); + let dir_entries = collect_directory_paths(input_path, &dir_name)?; + collected.extend(dir_entries); } else { // Single file: use just the filename let name = input_path @@ -231,29 +265,55 @@ fn collect_entries( .to_string(); let file_perms = get_permissions(input_path)?; - let pf = process_file(input_path, name, file_perms, no_compress, rng)?; - processed.push(pf); + collected.push(CollectedEntry::File { + path: input_path.clone(), + name, + permissions: file_perms, + }); } } - Ok(processed) + Ok(collected) } /// Pack files and directories into an encrypted archive. /// -/// Two-pass algorithm with full obfuscation: -/// Pass 1: Read, hash, compress, encrypt each file; generate decoy padding. -/// Directories are stored as zero-length entries. -/// Pass 2: Encrypt TOC, compute offsets, XOR header, write archive. +/// Two-pass algorithm with full obfuscation and parallel file processing: +/// Pass 1a: Walk directory tree sequentially, collect paths in deterministic order. +/// Pass 1b: Process file entries in parallel (read, hash, compress, encrypt, padding). +/// Directory entries become zero-length entries (no processing needed). +/// Pass 2: Encrypt TOC, compute offsets, XOR header, write archive sequentially. pub fn pack(files: &[PathBuf], output: &Path, no_compress: &[String]) -> anyhow::Result<()> { anyhow::ensure!(!files.is_empty(), "No input files specified"); - let mut rng = rand::rng(); + // --- Pass 1a: Collect paths sequentially (fast, deterministic) --- + let collected = collect_paths(files)?; - // --- Pass 1: Collect and process all entries --- - let processed = collect_entries(files, no_compress, &mut rng)?; + anyhow::ensure!(!collected.is_empty(), "No entries to archive"); - anyhow::ensure!(!processed.is_empty(), "No entries to archive"); + // Guard against u16 overflow: file_count field in header is u16 (max 65535) + anyhow::ensure!( + collected.len() <= u16::MAX as usize, + "Too many entries: {} exceeds maximum of {} (u16 file_count limit)", + collected.len(), + u16::MAX + ); + + // --- Pass 1b: Process files in parallel, directories inline --- + // We use par_iter on the collected entries while preserving their order. + // Each entry is processed independently; file entries go through the full + // crypto pipeline in parallel, directory entries are trivially converted. + let processed: Vec = collected + .into_par_iter() + .map(|entry| match entry { + CollectedEntry::Dir { name, permissions } => { + Ok(make_directory_entry(name, permissions)) + } + CollectedEntry::File { path, name, permissions } => { + process_file(&path, name, permissions, no_compress) + } + }) + .collect::>>()?; // Count files and directories let file_count = processed.iter().filter(|pf| pf.entry_type == 0).count(); @@ -459,15 +519,42 @@ pub fn inspect(archive: &Path) -> anyhow::Result<()> { Ok(()) } +/// Data read from the archive for a single entry, ready for parallel processing. +enum ReadEntry { + /// Directory entry: just needs creation and permission setting. + Dir { + name: String, + permissions: u16, + }, + /// File entry: ciphertext has been read, ready for verify/decrypt/decompress/write. + File { + entry: TocEntry, + ciphertext: Vec, + }, + /// Entry with unsafe name that was skipped during reading. + Skipped { + _name: String, + }, +} + +/// Result of processing a single file entry during parallel unpack. +enum UnpackResult { + /// File extracted successfully. + Ok { name: String, original_size: u32 }, + /// File had a verification error but was still written (SHA-256 mismatch). + Written { name: String, original_size: u32 }, + /// File processing failed (HMAC, decryption, or decompression error). + Error { name: String, message: String }, +} + /// Unpack an encrypted archive, extracting all files and directories with /// HMAC and SHA-256 verification, and Unix permission restoration. /// -/// Follows FORMAT.md Section 10 decode order: -/// 1. Read header with XOR bootstrapping -/// 2. Read and decrypt TOC entries -/// 3. For each entry: -/// - Directory: create directory, set permissions -/// - File: seek to data_offset, verify HMAC, decrypt, decompress, verify SHA-256, write, set permissions +/// Uses parallel processing for the verify/decrypt/decompress/write pipeline: +/// 1. Read header and TOC sequentially (single file handle). +/// 2. Create all directories sequentially (ensures parent dirs exist). +/// 3. Read all file ciphertexts sequentially from the archive. +/// 4. Process and write files in parallel (HMAC, decrypt, decompress, SHA-256, write). pub fn unpack(archive: &Path, output_dir: &Path) -> anyhow::Result<()> { let mut file = fs::File::open(archive)?; @@ -478,108 +565,191 @@ pub fn unpack(archive: &Path, output_dir: &Path) -> anyhow::Result<()> { fs::create_dir_all(output_dir)?; let entry_count = entries.len(); - let mut error_count: usize = 0; - let mut success_count: usize = 0; - for entry in &entries { + // --- Phase 1: Sequential read of all entry data --- + // Separate directories from files, read ciphertexts for files. + let mut read_entries: Vec = Vec::with_capacity(entry_count); + + for entry in entries { // Sanitize filename: reject directory traversal if entry.name.starts_with('/') || entry.name.contains("..") { eprintln!( "Skipping entry with unsafe name: {} (directory traversal attempt)", entry.name ); - error_count += 1; + read_entries.push(ReadEntry::Skipped { _name: entry.name.clone() }); continue; } - let output_path = output_dir.join(&entry.name); - if entry.entry_type == 1 { - // Directory entry: create and set permissions + read_entries.push(ReadEntry::Dir { + name: entry.name.clone(), + permissions: entry.permissions, + }); + } else { + // Seek to data_offset and read ciphertext into memory + file.seek(SeekFrom::Start(entry.data_offset as u64))?; + let mut ciphertext = vec![0u8; entry.encrypted_size as usize]; + file.read_exact(&mut ciphertext)?; + + read_entries.push(ReadEntry::File { + entry, + ciphertext, + }); + } + } + + // --- Phase 2: Create directories sequentially (parent-before-child order) --- + let mut dir_count: usize = 0; + for re in &read_entries { + if let ReadEntry::Dir { name, permissions } = re { + let output_path = output_dir.join(name); fs::create_dir_all(&output_path)?; fs::set_permissions( &output_path, - fs::Permissions::from_mode(entry.permissions as u32), + fs::Permissions::from_mode(*permissions as u32), )?; - println!("Created directory: {}", entry.name); - success_count += 1; - continue; + println!("Created directory: {}", name); + dir_count += 1; } + } - // File entry: extract with full verification pipeline + // --- Phase 3: Process and write files in parallel --- + // Count skipped entries from phase 1 + let skipped_count = read_entries.iter() + .filter(|re| matches!(re, ReadEntry::Skipped { .. })) + .count(); - // Create parent directories if name contains path separators - if let Some(parent) = output_path.parent() { - fs::create_dir_all(parent)?; - } - - // Seek to data_offset and read ciphertext - file.seek(SeekFrom::Start(entry.data_offset as u64))?; - let mut ciphertext = vec![0u8; entry.encrypted_size as usize]; - file.read_exact(&mut ciphertext)?; - - // Step 1: Verify HMAC FIRST (encrypt-then-MAC) - if !crypto::verify_hmac(&KEY, &entry.iv, &ciphertext, &entry.hmac) { - eprintln!("HMAC verification failed for {}, skipping", entry.name); - error_count += 1; - continue; - } - - // Step 2: Decrypt - let decrypted = match crypto::decrypt_data(&ciphertext, &KEY, &entry.iv) { - Ok(data) => data, - Err(e) => { - eprintln!("Decryption failed for {}: {}", entry.name, e); - error_count += 1; - continue; + // Collect only file entries for parallel processing + let file_entries: Vec<(&TocEntry, &Vec)> = read_entries.iter() + .filter_map(|re| { + if let ReadEntry::File { entry, ciphertext } = re { + Some((entry, ciphertext)) + } else { + None } - }; + }) + .collect(); - // Step 3: Decompress if compressed - let decompressed = if entry.compression_flag == 1 { - match compression::decompress(&decrypted) { - Ok(data) => data, - Err(e) => { - eprintln!("Decompression failed for {}: {}", entry.name, e); - error_count += 1; - continue; + // Process all files in parallel: HMAC verify, decrypt, decompress, SHA-256, write + let results: Vec = file_entries + .par_iter() + .map(|(entry, ciphertext)| { + let output_path = output_dir.join(&entry.name); + + // Create parent directories if name contains path separators + if let Some(parent) = output_path.parent() { + if let Err(e) = fs::create_dir_all(parent) { + return UnpackResult::Error { + name: entry.name.clone(), + message: format!("Failed to create parent directory: {}", e), + }; } } - } else { - decrypted - }; - // Step 4: Verify SHA-256 - let computed_sha256 = crypto::sha256_hash(&decompressed); - if computed_sha256 != entry.sha256 { - eprintln!( - "SHA-256 mismatch for {} (data may be corrupted)", - entry.name - ); - error_count += 1; - // Still write the file per spec + // Step 1: Verify HMAC FIRST (encrypt-then-MAC) + if !crypto::verify_hmac(&KEY, &entry.iv, ciphertext, &entry.hmac) { + return UnpackResult::Error { + name: entry.name.clone(), + message: "HMAC verification failed".to_string(), + }; + } + + // Step 2: Decrypt + let decrypted = match crypto::decrypt_data(ciphertext, &KEY, &entry.iv) { + Ok(data) => data, + Err(e) => { + return UnpackResult::Error { + name: entry.name.clone(), + message: format!("Decryption failed: {}", e), + }; + } + }; + + // Step 3: Decompress if compressed + let decompressed = if entry.compression_flag == 1 { + match compression::decompress(&decrypted) { + Ok(data) => data, + Err(e) => { + return UnpackResult::Error { + name: entry.name.clone(), + message: format!("Decompression failed: {}", e), + }; + } + } + } else { + decrypted + }; + + // Step 4: Verify SHA-256 + let computed_sha256 = crypto::sha256_hash(&decompressed); + let sha256_ok = computed_sha256 == entry.sha256; + + // Step 5: Write file (even if SHA-256 mismatch, per spec) + if let Err(e) = fs::write(&output_path, &decompressed) { + return UnpackResult::Error { + name: entry.name.clone(), + message: format!("Write failed: {}", e), + }; + } + + // Step 6: Set file permissions + if let Err(e) = fs::set_permissions( + &output_path, + fs::Permissions::from_mode(entry.permissions as u32), + ) { + return UnpackResult::Error { + name: entry.name.clone(), + message: format!("Failed to set permissions: {}", e), + }; + } + + if sha256_ok { + UnpackResult::Ok { + name: entry.name.clone(), + original_size: entry.original_size, + } + } else { + UnpackResult::Written { + name: entry.name.clone(), + original_size: entry.original_size, + } + } + }) + .collect(); + + // --- Phase 4: Report results (sequential for deterministic output) --- + let mut final_error_count = skipped_count; + let mut final_success_count = dir_count; + + for result in &results { + match result { + UnpackResult::Ok { name, original_size } => { + println!("Extracted: {} ({} bytes)", name, original_size); + final_success_count += 1; + } + UnpackResult::Written { name, original_size } => { + eprintln!("SHA-256 mismatch for {} (data may be corrupted)", name); + println!("Extracted: {} ({} bytes)", name, original_size); + final_error_count += 1; + // Original code increments both error_count AND success_count for + // SHA-256 mismatch (file is still written and counted as extracted). + final_success_count += 1; + } + UnpackResult::Error { name, message } => { + eprintln!("{} for {}, skipping", message, name); + final_error_count += 1; + } } - - // Step 5: Write file - fs::write(&output_path, &decompressed)?; - - // Step 6: Set file permissions - fs::set_permissions( - &output_path, - fs::Permissions::from_mode(entry.permissions as u32), - )?; - - println!("Extracted: {} ({} bytes)", entry.name, entry.original_size); - success_count += 1; } println!( "Extracted {}/{} entries", - success_count, entry_count + final_success_count, entry_count ); - if error_count > 0 { - anyhow::bail!("{} entry(ies) had verification errors", error_count); + if final_error_count > 0 { + anyhow::bail!("{} entry(ies) had verification errors", final_error_count); } Ok(())