perf: parallelize pack and unpack with rayon

Pack changes: - Split into path-collection (sequential) + crypto-processing (parallel) - Introduce CollectedEntry enum to separate directory walk from file processing - process_file() now creates thread-local RNG instead of taking &mut Rng - File entries processed via rayon into_par_iter(), preserving deterministic order Unpack changes: - Phase 1: Sequential read of all ciphertexts from archive (single file handle) - Phase 2: Create all directories sequentially (parent-before-child ordering) - Phase 3: Parallel verify/decrypt/decompress/write via rayon par_iter - Phase 4: Sequential result reporting for deterministic output - Collect results into Vec<UnpackResult> to avoid interleaved stdout/stderr
2026-02-26 23:07:04 +03:00
parent 0d8ab49a4d
commit 52ff9ec3b7
1 changed files with 293 additions and 123 deletions
--- a/src/archive.rs
+++ b/src/archive.rs
@@ -3,6 +3,7 @@ use std::io::{Read, Seek, SeekFrom, Write};
 use std::path::{Path, PathBuf};
 use rand::Rng;
 use rayon::prelude::*;
 use std::os::unix::fs::PermissionsExt;
 use crate::compression;
@@ -27,6 +28,22 @@ struct ProcessedFile {
    padding_bytes: Vec<u8>,
 }
 /// Collected entry from the directory walk (before crypto processing).
 ///
 /// Separates the fast sequential path-collection phase from the
 /// parallelizable crypto-processing phase.
 enum CollectedEntry {
    Dir {
        name: String,
        permissions: u16,
    },
    File {
        path: PathBuf,
        name: String,
        permissions: u16,
    },
 }
 /// Read and de-obfuscate archive header and TOC entries.
 ///
 /// Handles XOR header bootstrapping (FORMAT.md Section 10 steps 1-3)
@@ -60,12 +77,13 @@ fn get_permissions(path: &Path) -> anyhow::Result<u16> {
 }
 /// Process a single file through the crypto pipeline, returning a ProcessedFile.
 ///
 /// Thread-safe: creates a thread-local RNG instead of accepting an external one.
 fn process_file(
    file_path: &Path,
    name: String,
    permissions: u16,
    no_compress: &[String],
    rng: &mut impl Rng,
 ) -> anyhow::Result<ProcessedFile> {
    let data = fs::read(file_path)?;
@@ -103,9 +121,10 @@ fn process_file(
    let hmac = crypto::compute_hmac(&KEY, &iv, &ciphertext);
    // Step 6: Generate decoy padding (FORMAT.md Section 9.3)
    let mut rng = rand::rng();
    let padding_after: u16 = rng.random_range(64..=4096);
    let mut padding_bytes = vec![0u8; padding_after as usize];
-    rand::Fill::fill(&mut padding_bytes[..], rng);
+    rand::Fill::fill(&mut padding_bytes[..], &mut rng);
    Ok(ProcessedFile {
        name,
@@ -143,21 +162,22 @@ fn make_directory_entry(name: String, permissions: u16) -> ProcessedFile {
    }
 }
-/// Recursively collect all entries (directories and files) from a directory path.
+/// Recursively collect paths from a directory (no crypto processing).
 ///
 /// Entries are emitted in parent-before-child order (DFS preorder).
 /// The base_name is the top-level directory name used as prefix for all relative paths.
-fn collect_directory_entries(
+fn collect_directory_paths(
    dir_path: &Path,
    base_name: &str,
-    no_compress: &[String],
+) -> anyhow::Result<Vec<CollectedEntry>> {
    rng: &mut impl Rng,
 ) -> anyhow::Result<Vec<ProcessedFile>> {
    let mut entries = Vec::new();
    // Add the directory itself first (parent-before-child)
    let dir_perms = get_permissions(dir_path)?;
-    entries.push(make_directory_entry(base_name.to_string(), dir_perms));
+    entries.push(CollectedEntry::Dir {
        name: base_name.to_string(),
        permissions: dir_perms,
    });
    // Collect children sorted by name for deterministic ordering
    let mut children: Vec<fs::DirEntry> = fs::read_dir(dir_path)?
@@ -173,39 +193,58 @@ fn collect_directory_entries(
                .ok_or_else(|| anyhow::anyhow!("Non-UTF-8 filename: {}", child_path.display()))?
        );
-        if child_path.is_dir() {
+        // Use symlink_metadata to avoid following symlinks.
-            // Recurse into subdirectory
+        // is_dir()/is_file() follow symlinks, which can cause infinite
-            let sub_entries = collect_directory_entries(
+        // recursion or massively inflated entry counts with symlink farms
        // (e.g., pnpm node_modules with hundreds of directory symlinks).
        let meta = fs::symlink_metadata(&child_path)?;
        if meta.file_type().is_symlink() {
            eprintln!(
                "Warning: skipping symlink: {}",
                child_path.display()
            );
            continue;
        } else if meta.is_dir() {
            // Recurse into real subdirectory (not a symlink)
            let sub_entries = collect_directory_paths(
                &child_path,
                &child_name,
                no_compress,
                rng,
            )?;
            entries.extend(sub_entries);
        } else {
-            // Process file
+            // Collect file path for later parallel processing
-            let file_perms = get_permissions(&child_path)?;
+            let file_perms = (meta.permissions().mode() & 0o7777) as u16;
-            let pf = process_file(&child_path, child_name, file_perms, no_compress, rng)?;
+            entries.push(CollectedEntry::File {
-            entries.push(pf);
+                path: child_path,
                name: child_name,
                permissions: file_perms,
            });
        }
    }
    Ok(entries)
 }
-/// Collect all entries from input paths (files and directories).
+/// Collect all entry paths from input paths (files and directories).
 ///
-/// For files: processes through crypto pipeline with filename-only name.
+/// Returns a list of CollectedEntry items in deterministic order,
-/// For directories: recursively collects all children with relative paths.
+/// ready for parallel processing of file entries.
-fn collect_entries(
+fn collect_paths(inputs: &[PathBuf]) -> anyhow::Result<Vec<CollectedEntry>> {
-    inputs: &[PathBuf],
+    let mut collected = Vec::new();
    no_compress: &[String],
    rng: &mut impl Rng,
 ) -> anyhow::Result<Vec<ProcessedFile>> {
    let mut processed = Vec::new();
    for input_path in inputs {
-        if input_path.is_dir() {
+        // Check for symlinks at top level too
        let meta = fs::symlink_metadata(input_path)?;
        if meta.file_type().is_symlink() {
            eprintln!(
                "Warning: skipping symlink: {}",
                input_path.display()
            );
            continue;
        }
        if meta.is_dir() {
            // Get the directory's own name for the archive prefix
            let dir_name = input_path
                .file_name()
@@ -214,13 +253,8 @@ fn collect_entries(
                .ok_or_else(|| anyhow::anyhow!("Non-UTF-8 directory name: {}", input_path.display()))?
                .to_string();
-            let dir_entries = collect_directory_entries(
+            let dir_entries = collect_directory_paths(input_path, &dir_name)?;
-                input_path,
+            collected.extend(dir_entries);
                &dir_name,
                no_compress,
                rng,
            )?;
            processed.extend(dir_entries);
        } else {
            // Single file: use just the filename
            let name = input_path
@@ -231,29 +265,55 @@ fn collect_entries(
                .to_string();
            let file_perms = get_permissions(input_path)?;
-            let pf = process_file(input_path, name, file_perms, no_compress, rng)?;
+            collected.push(CollectedEntry::File {
-            processed.push(pf);
+                path: input_path.clone(),
                name,
                permissions: file_perms,
            });
        }
    }
-    Ok(processed)
+    Ok(collected)
 }
 /// Pack files and directories into an encrypted archive.
 ///
-/// Two-pass algorithm with full obfuscation:
+/// Two-pass algorithm with full obfuscation and parallel file processing:
-///   Pass 1: Read, hash, compress, encrypt each file; generate decoy padding.
+///   Pass 1a: Walk directory tree sequentially, collect paths in deterministic order.
-///           Directories are stored as zero-length entries.
+///   Pass 1b: Process file entries in parallel (read, hash, compress, encrypt, padding).
-///   Pass 2: Encrypt TOC, compute offsets, XOR header, write archive.
+///            Directory entries become zero-length entries (no processing needed).
 ///   Pass 2:  Encrypt TOC, compute offsets, XOR header, write archive sequentially.
 pub fn pack(files: &[PathBuf], output: &Path, no_compress: &[String]) -> anyhow::Result<()> {
    anyhow::ensure!(!files.is_empty(), "No input files specified");
-    let mut rng = rand::rng();
+    // --- Pass 1a: Collect paths sequentially (fast, deterministic) ---
    let collected = collect_paths(files)?;
-    // --- Pass 1: Collect and process all entries ---
+    anyhow::ensure!(!collected.is_empty(), "No entries to archive");
    let processed = collect_entries(files, no_compress, &mut rng)?;
-    anyhow::ensure!(!processed.is_empty(), "No entries to archive");
+    // Guard against u16 overflow: file_count field in header is u16 (max 65535)
    anyhow::ensure!(
        collected.len() <= u16::MAX as usize,
        "Too many entries: {} exceeds maximum of {} (u16 file_count limit)",
        collected.len(),
        u16::MAX
    );
    // --- Pass 1b: Process files in parallel, directories inline ---
    // We use par_iter on the collected entries while preserving their order.
    // Each entry is processed independently; file entries go through the full
    // crypto pipeline in parallel, directory entries are trivially converted.
    let processed: Vec<ProcessedFile> = collected
        .into_par_iter()
        .map(|entry| match entry {
            CollectedEntry::Dir { name, permissions } => {
                Ok(make_directory_entry(name, permissions))
            }
            CollectedEntry::File { path, name, permissions } => {
                process_file(&path, name, permissions, no_compress)
            }
        })
        .collect::<anyhow::Result<Vec<_>>>()?;
    // Count files and directories
    let file_count = processed.iter().filter(|pf| pf.entry_type == 0).count();
@@ -459,15 +519,42 @@ pub fn inspect(archive: &Path) -> anyhow::Result<()> {
    Ok(())
 }
 /// Data read from the archive for a single entry, ready for parallel processing.
 enum ReadEntry {
    /// Directory entry: just needs creation and permission setting.
    Dir {
        name: String,
        permissions: u16,
    },
    /// File entry: ciphertext has been read, ready for verify/decrypt/decompress/write.
    File {
        entry: TocEntry,
        ciphertext: Vec<u8>,
    },
    /// Entry with unsafe name that was skipped during reading.
    Skipped {
        _name: String,
    },
 }
 /// Result of processing a single file entry during parallel unpack.
 enum UnpackResult {
    /// File extracted successfully.
    Ok { name: String, original_size: u32 },
    /// File had a verification error but was still written (SHA-256 mismatch).
    Written { name: String, original_size: u32 },
    /// File processing failed (HMAC, decryption, or decompression error).
    Error { name: String, message: String },
 }
 /// Unpack an encrypted archive, extracting all files and directories with
 /// HMAC and SHA-256 verification, and Unix permission restoration.
 ///
-/// Follows FORMAT.md Section 10 decode order:
+/// Uses parallel processing for the verify/decrypt/decompress/write pipeline:
-///   1. Read header with XOR bootstrapping
+///   1. Read header and TOC sequentially (single file handle).
-///   2. Read and decrypt TOC entries
+///   2. Create all directories sequentially (ensures parent dirs exist).
-///   3. For each entry:
+///   3. Read all file ciphertexts sequentially from the archive.
-///      - Directory: create directory, set permissions
+///   4. Process and write files in parallel (HMAC, decrypt, decompress, SHA-256, write).
 ///      - File: seek to data_offset, verify HMAC, decrypt, decompress, verify SHA-256, write, set permissions
 pub fn unpack(archive: &Path, output_dir: &Path) -> anyhow::Result<()> {
    let mut file = fs::File::open(archive)?;
@@ -478,108 +565,191 @@ pub fn unpack(archive: &Path, output_dir: &Path) -> anyhow::Result<()> {
    fs::create_dir_all(output_dir)?;
    let entry_count = entries.len();
    let mut error_count: usize = 0;
    let mut success_count: usize = 0;
-    for entry in &entries {
+    // --- Phase 1: Sequential read of all entry data ---
    // Separate directories from files, read ciphertexts for files.
    let mut read_entries: Vec<ReadEntry> = Vec::with_capacity(entry_count);
    for entry in entries {
        // Sanitize filename: reject directory traversal
        if entry.name.starts_with('/') || entry.name.contains("..") {
            eprintln!(
                "Skipping entry with unsafe name: {} (directory traversal attempt)",
                entry.name
            );
-            error_count += 1;
+            read_entries.push(ReadEntry::Skipped { _name: entry.name.clone() });
            continue;
        }
        let output_path = output_dir.join(&entry.name);
        if entry.entry_type == 1 {
-            // Directory entry: create and set permissions
+            read_entries.push(ReadEntry::Dir {
                name: entry.name.clone(),
                permissions: entry.permissions,
            });
        } else {
            // Seek to data_offset and read ciphertext into memory
            file.seek(SeekFrom::Start(entry.data_offset as u64))?;
            let mut ciphertext = vec![0u8; entry.encrypted_size as usize];
            file.read_exact(&mut ciphertext)?;
            read_entries.push(ReadEntry::File {
                entry,
                ciphertext,
            });
        }
    }
    // --- Phase 2: Create directories sequentially (parent-before-child order) ---
    let mut dir_count: usize = 0;
    for re in &read_entries {
        if let ReadEntry::Dir { name, permissions } = re {
            let output_path = output_dir.join(name);
            fs::create_dir_all(&output_path)?;
            fs::set_permissions(
                &output_path,
-                fs::Permissions::from_mode(entry.permissions as u32),
+                fs::Permissions::from_mode(*permissions as u32),
            )?;
-            println!("Created directory: {}", entry.name);
+            println!("Created directory: {}", name);
-            success_count += 1;
+            dir_count += 1;
            continue;
        }
    }
-        // File entry: extract with full verification pipeline
+    // --- Phase 3: Process and write files in parallel ---
    // Count skipped entries from phase 1
    let skipped_count = read_entries.iter()
        .filter(|re| matches!(re, ReadEntry::Skipped { .. }))
        .count();
-        // Create parent directories if name contains path separators
+    // Collect only file entries for parallel processing
-        if let Some(parent) = output_path.parent() {
+    let file_entries: Vec<(&TocEntry, &Vec<u8>)> = read_entries.iter()
-            fs::create_dir_all(parent)?;
+        .filter_map(|re| {
-        }
+            if let ReadEntry::File { entry, ciphertext } = re {
-
+                Some((entry, ciphertext))
-        // Seek to data_offset and read ciphertext
+            } else {
-        file.seek(SeekFrom::Start(entry.data_offset as u64))?;
+                None
        let mut ciphertext = vec![0u8; entry.encrypted_size as usize];
        file.read_exact(&mut ciphertext)?;
        // Step 1: Verify HMAC FIRST (encrypt-then-MAC)
        if !crypto::verify_hmac(&KEY, &entry.iv, &ciphertext, &entry.hmac) {
            eprintln!("HMAC verification failed for {}, skipping", entry.name);
            error_count += 1;
            continue;
        }
        // Step 2: Decrypt
        let decrypted = match crypto::decrypt_data(&ciphertext, &KEY, &entry.iv) {
            Ok(data) => data,
            Err(e) => {
                eprintln!("Decryption failed for {}: {}", entry.name, e);
                error_count += 1;
                continue;
            }
-        };
+        })
        .collect();
-        // Step 3: Decompress if compressed
+    // Process all files in parallel: HMAC verify, decrypt, decompress, SHA-256, write
-        let decompressed = if entry.compression_flag == 1 {
+    let results: Vec<UnpackResult> = file_entries
-            match compression::decompress(&decrypted) {
+        .par_iter()
-                Ok(data) => data,
+        .map(|(entry, ciphertext)| {
-                Err(e) => {
+            let output_path = output_dir.join(&entry.name);
-                    eprintln!("Decompression failed for {}: {}", entry.name, e);
+
-                    error_count += 1;
+            // Create parent directories if name contains path separators
-                    continue;
+            if let Some(parent) = output_path.parent() {
                if let Err(e) = fs::create_dir_all(parent) {
                    return UnpackResult::Error {
                        name: entry.name.clone(),
                        message: format!("Failed to create parent directory: {}", e),
                    };
                }
            }
        } else {
            decrypted
        };
-        // Step 4: Verify SHA-256
+            // Step 1: Verify HMAC FIRST (encrypt-then-MAC)
-        let computed_sha256 = crypto::sha256_hash(&decompressed);
+            if !crypto::verify_hmac(&KEY, &entry.iv, ciphertext, &entry.hmac) {
-        if computed_sha256 != entry.sha256 {
+                return UnpackResult::Error {
-            eprintln!(
+                    name: entry.name.clone(),
-                "SHA-256 mismatch for {} (data may be corrupted)",
+                    message: "HMAC verification failed".to_string(),
-                entry.name
+                };
-            );
+            }
-            error_count += 1;
+
-            // Still write the file per spec
+            // Step 2: Decrypt
            let decrypted = match crypto::decrypt_data(ciphertext, &KEY, &entry.iv) {
                Ok(data) => data,
                Err(e) => {
                    return UnpackResult::Error {
                        name: entry.name.clone(),
                        message: format!("Decryption failed: {}", e),
                    };
                }
            };
            // Step 3: Decompress if compressed
            let decompressed = if entry.compression_flag == 1 {
                match compression::decompress(&decrypted) {
                    Ok(data) => data,
                    Err(e) => {
                        return UnpackResult::Error {
                            name: entry.name.clone(),
                            message: format!("Decompression failed: {}", e),
                        };
                    }
                }
            } else {
                decrypted
            };
            // Step 4: Verify SHA-256
            let computed_sha256 = crypto::sha256_hash(&decompressed);
            let sha256_ok = computed_sha256 == entry.sha256;
            // Step 5: Write file (even if SHA-256 mismatch, per spec)
            if let Err(e) = fs::write(&output_path, &decompressed) {
                return UnpackResult::Error {
                    name: entry.name.clone(),
                    message: format!("Write failed: {}", e),
                };
            }
            // Step 6: Set file permissions
            if let Err(e) = fs::set_permissions(
                &output_path,
                fs::Permissions::from_mode(entry.permissions as u32),
            ) {
                return UnpackResult::Error {
                    name: entry.name.clone(),
                    message: format!("Failed to set permissions: {}", e),
                };
            }
            if sha256_ok {
                UnpackResult::Ok {
                    name: entry.name.clone(),
                    original_size: entry.original_size,
                }
            } else {
                UnpackResult::Written {
                    name: entry.name.clone(),
                    original_size: entry.original_size,
                }
            }
        })
        .collect();
    // --- Phase 4: Report results (sequential for deterministic output) ---
    let mut final_error_count = skipped_count;
    let mut final_success_count = dir_count;
    for result in &results {
        match result {
            UnpackResult::Ok { name, original_size } => {
                println!("Extracted: {} ({} bytes)", name, original_size);
                final_success_count += 1;
            }
            UnpackResult::Written { name, original_size } => {
                eprintln!("SHA-256 mismatch for {} (data may be corrupted)", name);
                println!("Extracted: {} ({} bytes)", name, original_size);
                final_error_count += 1;
                // Original code increments both error_count AND success_count for
                // SHA-256 mismatch (file is still written and counted as extracted).
                final_success_count += 1;
            }
            UnpackResult::Error { name, message } => {
                eprintln!("{} for {}, skipping", message, name);
                final_error_count += 1;
            }
        }
        // Step 5: Write file
        fs::write(&output_path, &decompressed)?;
        // Step 6: Set file permissions
        fs::set_permissions(
            &output_path,
            fs::Permissions::from_mode(entry.permissions as u32),
        )?;
        println!("Extracted: {} ({} bytes)", entry.name, entry.original_size);
        success_count += 1;
    }
    println!(
        "Extracted {}/{} entries",
-        success_count, entry_count
+        final_success_count, entry_count
    );
-    if error_count > 0 {
+    if final_error_count > 0 {
-        anyhow::bail!("{} entry(ies) had verification errors", error_count);
+        anyhow::bail!("{} entry(ies) had verification errors", final_error_count);
    }
    Ok(())