perf: parallelize pack and unpack with rayon
Some checks failed
CI / test (push) Failing after 40s

Pack changes:
- Split into path-collection (sequential) + crypto-processing (parallel)
- Introduce CollectedEntry enum to separate directory walk from file processing
- process_file() now creates thread-local RNG instead of taking &mut Rng
- File entries processed via rayon into_par_iter(), preserving deterministic order

Unpack changes:
- Phase 1: Sequential read of all ciphertexts from archive (single file handle)
- Phase 2: Create all directories sequentially (parent-before-child ordering)
- Phase 3: Parallel verify/decrypt/decompress/write via rayon par_iter
- Phase 4: Sequential result reporting for deterministic output
- Collect results into Vec<UnpackResult> to avoid interleaved stdout/stderr
This commit is contained in:
NikitolProject
2026-02-26 23:07:04 +03:00
parent 0d8ab49a4d
commit 52ff9ec3b7

View File

@@ -3,6 +3,7 @@ use std::io::{Read, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use rand::Rng; use rand::Rng;
use rayon::prelude::*;
use std::os::unix::fs::PermissionsExt; use std::os::unix::fs::PermissionsExt;
use crate::compression; use crate::compression;
@@ -27,6 +28,22 @@ struct ProcessedFile {
padding_bytes: Vec<u8>, padding_bytes: Vec<u8>,
} }
/// An entry discovered during the sequential directory walk, before any
/// crypto processing has taken place.
///
/// Keeping the walk's output in this lightweight form separates the fast
/// sequential path-collection phase from the parallelizable
/// crypto-processing phase.
#[derive(Debug)]
enum CollectedEntry {
    /// A directory: only its metadata is recorded.
    Dir {
        /// Name under which the directory is recorded.
        name: String,
        /// Permission bits of the directory.
        permissions: u16,
    },
    /// A file whose contents still have to be read and processed.
    File {
        /// Filesystem location the contents are read from.
        path: PathBuf,
        /// Name under which the file is recorded.
        name: String,
        /// Permission bits of the file.
        permissions: u16,
    },
}
/// Read and de-obfuscate archive header and TOC entries. /// Read and de-obfuscate archive header and TOC entries.
/// ///
/// Handles XOR header bootstrapping (FORMAT.md Section 10 steps 1-3) /// Handles XOR header bootstrapping (FORMAT.md Section 10 steps 1-3)
@@ -60,12 +77,13 @@ fn get_permissions(path: &Path) -> anyhow::Result<u16> {
} }
/// Process a single file through the crypto pipeline, returning a ProcessedFile. /// Process a single file through the crypto pipeline, returning a ProcessedFile.
///
/// Thread-safe: creates a thread-local RNG instead of accepting an external one.
fn process_file( fn process_file(
file_path: &Path, file_path: &Path,
name: String, name: String,
permissions: u16, permissions: u16,
no_compress: &[String], no_compress: &[String],
rng: &mut impl Rng,
) -> anyhow::Result<ProcessedFile> { ) -> anyhow::Result<ProcessedFile> {
let data = fs::read(file_path)?; let data = fs::read(file_path)?;
@@ -103,9 +121,10 @@ fn process_file(
let hmac = crypto::compute_hmac(&KEY, &iv, &ciphertext); let hmac = crypto::compute_hmac(&KEY, &iv, &ciphertext);
// Step 6: Generate decoy padding (FORMAT.md Section 9.3) // Step 6: Generate decoy padding (FORMAT.md Section 9.3)
let mut rng = rand::rng();
let padding_after: u16 = rng.random_range(64..=4096); let padding_after: u16 = rng.random_range(64..=4096);
let mut padding_bytes = vec![0u8; padding_after as usize]; let mut padding_bytes = vec![0u8; padding_after as usize];
rand::Fill::fill(&mut padding_bytes[..], rng); rand::Fill::fill(&mut padding_bytes[..], &mut rng);
Ok(ProcessedFile { Ok(ProcessedFile {
name, name,
@@ -143,21 +162,22 @@ fn make_directory_entry(name: String, permissions: u16) -> ProcessedFile {
} }
} }
/// Recursively collect all entries (directories and files) from a directory path. /// Recursively collect paths from a directory (no crypto processing).
/// ///
/// Entries are emitted in parent-before-child order (DFS preorder). /// Entries are emitted in parent-before-child order (DFS preorder).
/// The base_name is the top-level directory name used as prefix for all relative paths. /// The base_name is the top-level directory name used as prefix for all relative paths.
fn collect_directory_entries( fn collect_directory_paths(
dir_path: &Path, dir_path: &Path,
base_name: &str, base_name: &str,
no_compress: &[String], ) -> anyhow::Result<Vec<CollectedEntry>> {
rng: &mut impl Rng,
) -> anyhow::Result<Vec<ProcessedFile>> {
let mut entries = Vec::new(); let mut entries = Vec::new();
// Add the directory itself first (parent-before-child) // Add the directory itself first (parent-before-child)
let dir_perms = get_permissions(dir_path)?; let dir_perms = get_permissions(dir_path)?;
entries.push(make_directory_entry(base_name.to_string(), dir_perms)); entries.push(CollectedEntry::Dir {
name: base_name.to_string(),
permissions: dir_perms,
});
// Collect children sorted by name for deterministic ordering // Collect children sorted by name for deterministic ordering
let mut children: Vec<fs::DirEntry> = fs::read_dir(dir_path)? let mut children: Vec<fs::DirEntry> = fs::read_dir(dir_path)?
@@ -173,39 +193,58 @@ fn collect_directory_entries(
.ok_or_else(|| anyhow::anyhow!("Non-UTF-8 filename: {}", child_path.display()))? .ok_or_else(|| anyhow::anyhow!("Non-UTF-8 filename: {}", child_path.display()))?
); );
if child_path.is_dir() { // Use symlink_metadata to avoid following symlinks.
// Recurse into subdirectory // is_dir()/is_file() follow symlinks, which can cause infinite
let sub_entries = collect_directory_entries( // recursion or massively inflated entry counts with symlink farms
// (e.g., pnpm node_modules with hundreds of directory symlinks).
let meta = fs::symlink_metadata(&child_path)?;
if meta.file_type().is_symlink() {
eprintln!(
"Warning: skipping symlink: {}",
child_path.display()
);
continue;
} else if meta.is_dir() {
// Recurse into real subdirectory (not a symlink)
let sub_entries = collect_directory_paths(
&child_path, &child_path,
&child_name, &child_name,
no_compress,
rng,
)?; )?;
entries.extend(sub_entries); entries.extend(sub_entries);
} else { } else {
// Process file // Collect file path for later parallel processing
let file_perms = get_permissions(&child_path)?; let file_perms = (meta.permissions().mode() & 0o7777) as u16;
let pf = process_file(&child_path, child_name, file_perms, no_compress, rng)?; entries.push(CollectedEntry::File {
entries.push(pf); path: child_path,
name: child_name,
permissions: file_perms,
});
} }
} }
Ok(entries) Ok(entries)
} }
/// Collect all entries from input paths (files and directories). /// Collect all entry paths from input paths (files and directories).
/// ///
/// For files: processes through crypto pipeline with filename-only name. /// Returns a list of CollectedEntry items in deterministic order,
/// For directories: recursively collects all children with relative paths. /// ready for parallel processing of file entries.
fn collect_entries( fn collect_paths(inputs: &[PathBuf]) -> anyhow::Result<Vec<CollectedEntry>> {
inputs: &[PathBuf], let mut collected = Vec::new();
no_compress: &[String],
rng: &mut impl Rng,
) -> anyhow::Result<Vec<ProcessedFile>> {
let mut processed = Vec::new();
for input_path in inputs { for input_path in inputs {
if input_path.is_dir() { // Check for symlinks at top level too
let meta = fs::symlink_metadata(input_path)?;
if meta.file_type().is_symlink() {
eprintln!(
"Warning: skipping symlink: {}",
input_path.display()
);
continue;
}
if meta.is_dir() {
// Get the directory's own name for the archive prefix // Get the directory's own name for the archive prefix
let dir_name = input_path let dir_name = input_path
.file_name() .file_name()
@@ -214,13 +253,8 @@ fn collect_entries(
.ok_or_else(|| anyhow::anyhow!("Non-UTF-8 directory name: {}", input_path.display()))? .ok_or_else(|| anyhow::anyhow!("Non-UTF-8 directory name: {}", input_path.display()))?
.to_string(); .to_string();
let dir_entries = collect_directory_entries( let dir_entries = collect_directory_paths(input_path, &dir_name)?;
input_path, collected.extend(dir_entries);
&dir_name,
no_compress,
rng,
)?;
processed.extend(dir_entries);
} else { } else {
// Single file: use just the filename // Single file: use just the filename
let name = input_path let name = input_path
@@ -231,29 +265,55 @@ fn collect_entries(
.to_string(); .to_string();
let file_perms = get_permissions(input_path)?; let file_perms = get_permissions(input_path)?;
let pf = process_file(input_path, name, file_perms, no_compress, rng)?; collected.push(CollectedEntry::File {
processed.push(pf); path: input_path.clone(),
name,
permissions: file_perms,
});
} }
} }
Ok(processed) Ok(collected)
} }
/// Pack files and directories into an encrypted archive. /// Pack files and directories into an encrypted archive.
/// ///
/// Two-pass algorithm with full obfuscation: /// Two-pass algorithm with full obfuscation and parallel file processing:
/// Pass 1: Read, hash, compress, encrypt each file; generate decoy padding. /// Pass 1a: Walk directory tree sequentially, collect paths in deterministic order.
/// Directories are stored as zero-length entries. /// Pass 1b: Process file entries in parallel (read, hash, compress, encrypt, padding).
/// Pass 2: Encrypt TOC, compute offsets, XOR header, write archive. /// Directory entries become zero-length entries (no processing needed).
/// Pass 2: Encrypt TOC, compute offsets, XOR header, write archive sequentially.
pub fn pack(files: &[PathBuf], output: &Path, no_compress: &[String]) -> anyhow::Result<()> { pub fn pack(files: &[PathBuf], output: &Path, no_compress: &[String]) -> anyhow::Result<()> {
anyhow::ensure!(!files.is_empty(), "No input files specified"); anyhow::ensure!(!files.is_empty(), "No input files specified");
let mut rng = rand::rng(); // --- Pass 1a: Collect paths sequentially (fast, deterministic) ---
let collected = collect_paths(files)?;
// --- Pass 1: Collect and process all entries --- anyhow::ensure!(!collected.is_empty(), "No entries to archive");
let processed = collect_entries(files, no_compress, &mut rng)?;
anyhow::ensure!(!processed.is_empty(), "No entries to archive"); // Guard against u16 overflow: file_count field in header is u16 (max 65535)
anyhow::ensure!(
collected.len() <= u16::MAX as usize,
"Too many entries: {} exceeds maximum of {} (u16 file_count limit)",
collected.len(),
u16::MAX
);
// --- Pass 1b: Process files in parallel, directories inline ---
// We use par_iter on the collected entries while preserving their order.
// Each entry is processed independently; file entries go through the full
// crypto pipeline in parallel, directory entries are trivially converted.
let processed: Vec<ProcessedFile> = collected
.into_par_iter()
.map(|entry| match entry {
CollectedEntry::Dir { name, permissions } => {
Ok(make_directory_entry(name, permissions))
}
CollectedEntry::File { path, name, permissions } => {
process_file(&path, name, permissions, no_compress)
}
})
.collect::<anyhow::Result<Vec<_>>>()?;
// Count files and directories // Count files and directories
let file_count = processed.iter().filter(|pf| pf.entry_type == 0).count(); let file_count = processed.iter().filter(|pf| pf.entry_type == 0).count();
@@ -459,15 +519,42 @@ pub fn inspect(archive: &Path) -> anyhow::Result<()> {
Ok(()) Ok(())
} }
/// What the sequential read pass produced for one archive entry; this is the
/// input handed to the later unpack phases.
enum ReadEntry {
    /// A directory, which only has to be created and given its permissions.
    Dir { name: String, permissions: u16 },
    /// A file whose ciphertext is already in memory and awaits
    /// verify/decrypt/decompress/write.
    File { entry: TocEntry, ciphertext: Vec<u8> },
    /// An entry whose name was judged unsafe, so it was skipped while reading.
    Skipped { _name: String },
}
/// Outcome of processing a single file entry during parallel unpack.
///
/// Results are collected first and reported afterwards, so the output order
/// does not depend on how the parallel work was scheduled.
#[derive(Debug)]
enum UnpackResult {
    /// File extracted successfully and its SHA-256 matched.
    Ok {
        /// Entry name as stored in the archive.
        name: String,
        /// Original-size value carried by the entry, used in the report line.
        original_size: u32,
    },
    /// File was written, but its SHA-256 did not match the stored value.
    Written {
        /// Entry name as stored in the archive.
        name: String,
        /// Original-size value carried by the entry, used in the report line.
        original_size: u32,
    },
    /// File processing failed. Besides HMAC, decryption and decompression
    /// errors, this also covers failures to create the parent directory,
    /// write the file, or set its permissions.
    Error {
        /// Entry name as stored in the archive.
        name: String,
        /// Human-readable description of what went wrong.
        message: String,
    },
}
/// Unpack an encrypted archive, extracting all files and directories with /// Unpack an encrypted archive, extracting all files and directories with
/// HMAC and SHA-256 verification, and Unix permission restoration. /// HMAC and SHA-256 verification, and Unix permission restoration.
/// ///
/// Follows FORMAT.md Section 10 decode order: /// Uses parallel processing for the verify/decrypt/decompress/write pipeline:
/// 1. Read header with XOR bootstrapping /// 1. Read header and TOC sequentially (single file handle).
/// 2. Read and decrypt TOC entries /// 2. Read all file ciphertexts sequentially from the archive.
/// 3. For each entry: /// 3. Create all directories sequentially (ensures parent dirs exist).
/// - Directory: create directory, set permissions /// 4. Process and write files in parallel (HMAC, decrypt, decompress, SHA-256, write).
/// - File: seek to data_offset, verify HMAC, decrypt, decompress, verify SHA-256, write, set permissions
pub fn unpack(archive: &Path, output_dir: &Path) -> anyhow::Result<()> { pub fn unpack(archive: &Path, output_dir: &Path) -> anyhow::Result<()> {
let mut file = fs::File::open(archive)?; let mut file = fs::File::open(archive)?;
@@ -478,108 +565,191 @@ pub fn unpack(archive: &Path, output_dir: &Path) -> anyhow::Result<()> {
fs::create_dir_all(output_dir)?; fs::create_dir_all(output_dir)?;
let entry_count = entries.len(); let entry_count = entries.len();
let mut error_count: usize = 0;
let mut success_count: usize = 0;
for entry in &entries { // --- Phase 1: Sequential read of all entry data ---
// Separate directories from files, read ciphertexts for files.
let mut read_entries: Vec<ReadEntry> = Vec::with_capacity(entry_count);
for entry in entries {
// Sanitize filename: reject directory traversal // Sanitize filename: reject directory traversal
if entry.name.starts_with('/') || entry.name.contains("..") { if entry.name.starts_with('/') || entry.name.contains("..") {
eprintln!( eprintln!(
"Skipping entry with unsafe name: {} (directory traversal attempt)", "Skipping entry with unsafe name: {} (directory traversal attempt)",
entry.name entry.name
); );
error_count += 1; read_entries.push(ReadEntry::Skipped { _name: entry.name.clone() });
continue; continue;
} }
let output_path = output_dir.join(&entry.name);
if entry.entry_type == 1 { if entry.entry_type == 1 {
// Directory entry: create and set permissions read_entries.push(ReadEntry::Dir {
name: entry.name.clone(),
permissions: entry.permissions,
});
} else {
// Seek to data_offset and read ciphertext into memory
file.seek(SeekFrom::Start(entry.data_offset as u64))?;
let mut ciphertext = vec![0u8; entry.encrypted_size as usize];
file.read_exact(&mut ciphertext)?;
read_entries.push(ReadEntry::File {
entry,
ciphertext,
});
}
}
// --- Phase 2: Create directories sequentially (parent-before-child order) ---
let mut dir_count: usize = 0;
for re in &read_entries {
if let ReadEntry::Dir { name, permissions } = re {
let output_path = output_dir.join(name);
fs::create_dir_all(&output_path)?; fs::create_dir_all(&output_path)?;
fs::set_permissions( fs::set_permissions(
&output_path, &output_path,
fs::Permissions::from_mode(entry.permissions as u32), fs::Permissions::from_mode(*permissions as u32),
)?; )?;
println!("Created directory: {}", entry.name); println!("Created directory: {}", name);
success_count += 1; dir_count += 1;
continue;
} }
}
// File entry: extract with full verification pipeline // --- Phase 3: Process and write files in parallel ---
// Count skipped entries from phase 1
let skipped_count = read_entries.iter()
.filter(|re| matches!(re, ReadEntry::Skipped { .. }))
.count();
// Create parent directories if name contains path separators // Collect only file entries for parallel processing
if let Some(parent) = output_path.parent() { let file_entries: Vec<(&TocEntry, &Vec<u8>)> = read_entries.iter()
fs::create_dir_all(parent)?; .filter_map(|re| {
} if let ReadEntry::File { entry, ciphertext } = re {
Some((entry, ciphertext))
// Seek to data_offset and read ciphertext } else {
file.seek(SeekFrom::Start(entry.data_offset as u64))?; None
let mut ciphertext = vec![0u8; entry.encrypted_size as usize];
file.read_exact(&mut ciphertext)?;
// Step 1: Verify HMAC FIRST (encrypt-then-MAC)
if !crypto::verify_hmac(&KEY, &entry.iv, &ciphertext, &entry.hmac) {
eprintln!("HMAC verification failed for {}, skipping", entry.name);
error_count += 1;
continue;
}
// Step 2: Decrypt
let decrypted = match crypto::decrypt_data(&ciphertext, &KEY, &entry.iv) {
Ok(data) => data,
Err(e) => {
eprintln!("Decryption failed for {}: {}", entry.name, e);
error_count += 1;
continue;
} }
}; })
.collect();
// Step 3: Decompress if compressed // Process all files in parallel: HMAC verify, decrypt, decompress, SHA-256, write
let decompressed = if entry.compression_flag == 1 { let results: Vec<UnpackResult> = file_entries
match compression::decompress(&decrypted) { .par_iter()
Ok(data) => data, .map(|(entry, ciphertext)| {
Err(e) => { let output_path = output_dir.join(&entry.name);
eprintln!("Decompression failed for {}: {}", entry.name, e);
error_count += 1; // Create parent directories if name contains path separators
continue; if let Some(parent) = output_path.parent() {
if let Err(e) = fs::create_dir_all(parent) {
return UnpackResult::Error {
name: entry.name.clone(),
message: format!("Failed to create parent directory: {}", e),
};
} }
} }
} else {
decrypted
};
// Step 4: Verify SHA-256 // Step 1: Verify HMAC FIRST (encrypt-then-MAC)
let computed_sha256 = crypto::sha256_hash(&decompressed); if !crypto::verify_hmac(&KEY, &entry.iv, ciphertext, &entry.hmac) {
if computed_sha256 != entry.sha256 { return UnpackResult::Error {
eprintln!( name: entry.name.clone(),
"SHA-256 mismatch for {} (data may be corrupted)", message: "HMAC verification failed".to_string(),
entry.name };
); }
error_count += 1;
// Still write the file per spec // Step 2: Decrypt
let decrypted = match crypto::decrypt_data(ciphertext, &KEY, &entry.iv) {
Ok(data) => data,
Err(e) => {
return UnpackResult::Error {
name: entry.name.clone(),
message: format!("Decryption failed: {}", e),
};
}
};
// Step 3: Decompress if compressed
let decompressed = if entry.compression_flag == 1 {
match compression::decompress(&decrypted) {
Ok(data) => data,
Err(e) => {
return UnpackResult::Error {
name: entry.name.clone(),
message: format!("Decompression failed: {}", e),
};
}
}
} else {
decrypted
};
// Step 4: Verify SHA-256
let computed_sha256 = crypto::sha256_hash(&decompressed);
let sha256_ok = computed_sha256 == entry.sha256;
// Step 5: Write file (even if SHA-256 mismatch, per spec)
if let Err(e) = fs::write(&output_path, &decompressed) {
return UnpackResult::Error {
name: entry.name.clone(),
message: format!("Write failed: {}", e),
};
}
// Step 6: Set file permissions
if let Err(e) = fs::set_permissions(
&output_path,
fs::Permissions::from_mode(entry.permissions as u32),
) {
return UnpackResult::Error {
name: entry.name.clone(),
message: format!("Failed to set permissions: {}", e),
};
}
if sha256_ok {
UnpackResult::Ok {
name: entry.name.clone(),
original_size: entry.original_size,
}
} else {
UnpackResult::Written {
name: entry.name.clone(),
original_size: entry.original_size,
}
}
})
.collect();
// --- Phase 4: Report results (sequential for deterministic output) ---
let mut final_error_count = skipped_count;
let mut final_success_count = dir_count;
for result in &results {
match result {
UnpackResult::Ok { name, original_size } => {
println!("Extracted: {} ({} bytes)", name, original_size);
final_success_count += 1;
}
UnpackResult::Written { name, original_size } => {
eprintln!("SHA-256 mismatch for {} (data may be corrupted)", name);
println!("Extracted: {} ({} bytes)", name, original_size);
final_error_count += 1;
// Original code increments both error_count AND success_count for
// SHA-256 mismatch (file is still written and counted as extracted).
final_success_count += 1;
}
UnpackResult::Error { name, message } => {
eprintln!("{} for {}, skipping", message, name);
final_error_count += 1;
}
} }
// Step 5: Write file
fs::write(&output_path, &decompressed)?;
// Step 6: Set file permissions
fs::set_permissions(
&output_path,
fs::Permissions::from_mode(entry.permissions as u32),
)?;
println!("Extracted: {} ({} bytes)", entry.name, entry.original_size);
success_count += 1;
} }
println!( println!(
"Extracted {}/{} entries", "Extracted {}/{} entries",
success_count, entry_count final_success_count, entry_count
); );
if error_count > 0 { if final_error_count > 0 {
anyhow::bail!("{} entry(ies) had verification errors", error_count); anyhow::bail!("{} entry(ies) had verification errors", final_error_count);
} }
Ok(()) Ok(())