diff --git a/CHANGELOG.md b/CHANGELOG.md index 35e9f89..551e905 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- **Auto-grouping split archives → package** (scope `link`, sprint task 31, PRD-v2 §P1.12 / PRD §6.3): new `application/services/split_archive_grouper.rs` clusters resolved Link-Grabber URLs that match split-archive patterns (`*.partNN.rar`, `*.rNN` plus the legacy terminal `.rar` header, `*.7z.NNN`, `*.zip.NNN`, `*.tar.{gz,bz2,xz}.NNN`) by base name + format and creates one `Package` per cluster with `source_type = SplitArchive` and `external_id = "split-archive:{format_tag}:{base}"` (format-namespaced so a RAR set and a ZIP set sharing a base name produce two distinct packages). New `GroupSplitArchivesCommand` handler + `link_group_split_archives` Tauri IPC mirror the playlist grouper flow, capped at `MAX_LINKS = 500` per call to mirror `MAX_URLS` in `resolve_links` and bound the cluster-state allocation; `MAX_PART_INDEX = 10_000` rejects absurd part suffixes so `compute_missing_parts` cannot be coerced into a multi-billion-step iteration. The minimum-parts gate counts distinct `part_num`s rather than raw link count, so duplicate mirrors of one volume cannot satisfy the singleton guard. Gaps in the part numbering emit `DomainEvent::SplitArchiveIncomplete { package_id, base_name, missing_parts }` (forwarded to the frontend as `split-archive-incomplete`) so the UI can warn the user before extraction blocks; legacy RAR completeness now treats the terminal `.rar` header as part 0 so a missing header is reported instead of silently dropped. Frontend `SplitArchiveLinkInput` / `SplitArchiveGroupResult` types added in `src/types/media.ts`. 31 service unit tests (matcher fixtures + grouping integration + DoS caps + legacy-header coverage + distinct-parts gate) + 3 handler tests cover the contract. + +- **Shared grouper lock** (scope `core`): new `application/services/group_lock` module factors out the OnceLock + poisoned-mutex-recovery pair that `PlaylistGrouper` and `SplitArchiveGrouper` were each rolling locally. Both groupers now scope the lock to the find-then-save window and release it before publishing `PackageCreated` / `SplitArchiveIncomplete` so synchronous event-bus subscribers cannot block other concurrent grouping calls. + - **CodSpeed performance benchmarks** (CI): new `domain_benchmarks` Criterion harness in `src-tauri/benches/domain_benchmarks.rs` exercising the pure `domain::model::config` helpers (`apply_patch`, `normalize_link_check_parallelism`, `normalize_max_concurrent`). Wired through a new `.github/workflows/codspeed.yml` workflow that runs the benches under CodSpeed on every PR, providing automated perf-regression tracking for the domain layer. `criterion` + `codspeed-criterion-compat` added as dev-dependencies; `[[bench]]` target declared with `harness = false` so Criterion drives the run. - **CI hardening** (scope `ci`): new GitHub Actions jobs `secrets-scan` (rejects `.env`/`.pem`/`.key`/etc. tracked files plus `AKIA*`/`sk-ant-*`/`ghp_*`/`AIza*` API key patterns in tracked content), `forbidden-tools` (rejects `pnpm-lock.yaml`/`yarn.lock`, `.eslintrc*`/`biome.json*`/`.prettierrc*` configs, and any `#[allow(dead_code|unused|...)]` / `@ts-ignore` / `@ts-expect-error` / `oxlint-disable` comment), and `changelog-check` (PR-only — fails when `*.rs` / `*.ts` / `*.tsx` change without a matching `CHANGELOG.md` edit). 
Existing `cargo audit` swapped for `cargo deny check` covering advisories + licenses + bans + sources via the new `deny.toml`. Frontend job now runs `oxfmt --check`, `knip --reporter compact`, and uploads `coverage/` as an artifact. New `mutants.yml` workflow runs `cargo mutants --in-diff` on PRs touching `src-tauri/**` and a 4-shard nightly sweep on `main`. diff --git a/src-tauri/src/adapters/driven/event/tauri_bridge.rs b/src-tauri/src/adapters/driven/event/tauri_bridge.rs index 53ed2cb..9edaded 100644 --- a/src-tauri/src/adapters/driven/event/tauri_bridge.rs +++ b/src-tauri/src/adapters/driven/event/tauri_bridge.rs @@ -56,6 +56,7 @@ fn event_name(event: &DomainEvent) -> &'static str { DomainEvent::PackageCreated { .. } => "package-created", DomainEvent::PackageUpdated { .. } => "package-updated", DomainEvent::PackageDeleted { .. } => "package-deleted", + DomainEvent::SplitArchiveIncomplete { .. } => "split-archive-incomplete", DomainEvent::ClipboardUrlDetected { .. } => "clipboard-url-detected", DomainEvent::SettingsUpdated => "settings-updated", DomainEvent::ChecksumVerified { .. } => "checksum-verified", @@ -235,6 +236,17 @@ fn event_payload(event: &DomainEvent) -> serde_json::Value { DomainEvent::LinkStatusUpdated { url, status } => { json!({ "url": url, "status": link_status_payload(status) }) } + DomainEvent::SplitArchiveIncomplete { + package_id, + base_name, + missing_parts, + } => { + json!({ + "packageId": package_id.to_string(), + "baseName": base_name, + "missingParts": missing_parts, + }) + } } } diff --git a/src-tauri/src/adapters/driven/logging/download_log_bridge.rs b/src-tauri/src/adapters/driven/logging/download_log_bridge.rs index 3de5037..a66fe17 100644 --- a/src-tauri/src/adapters/driven/logging/download_log_bridge.rs +++ b/src-tauri/src/adapters/driven/logging/download_log_bridge.rs @@ -143,7 +143,8 @@ fn record_download_event(store: &DownloadLogStore, event: &DomainEvent) { | DomainEvent::NoAccountAvailable { .. } | DomainEvent::AccountSelected { .. } | DomainEvent::AccountExhausted { .. } - | DomainEvent::LinkStatusUpdated { .. } => {} + | DomainEvent::LinkStatusUpdated { .. } + | DomainEvent::SplitArchiveIncomplete { .. } => {} } } diff --git a/src-tauri/src/adapters/driving/tauri_ipc.rs b/src-tauri/src/adapters/driving/tauri_ipc.rs index 996df37..1b7ff88 100644 --- a/src-tauri/src/adapters/driving/tauri_ipc.rs +++ b/src-tauri/src/adapters/driving/tauri_ipc.rs @@ -880,6 +880,73 @@ pub async fn link_group_playlists( .collect()) } +/// Inbound IPC payload for [`link_group_split_archives`]. Mirrors +/// [`crate::application::services::SplitArchiveLink`] in camelCase. +#[derive(Debug, serde::Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct SplitArchiveLinkInputDto { + pub url: String, + pub filename: String, +} + +/// IPC return shape for [`link_group_split_archives`]. Mirrors +/// [`crate::application::services::SplitArchiveGroupResult`] in +/// camelCase so the Link Grabber preview can render the "Will create +/// package X with N parts (Y missing)" banner before Start. 
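+/// +/// A serialised sketch with illustrative values (field names follow the +/// `camelCase` rename): `{ "packageId": "…", "baseName": "movie", "packageName": "movie", "created": true, "urls": ["…"], "missingParts": ["part05.rar"] }`.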
+#[derive(Debug, serde::Serialize)] +#[serde(rename_all = "camelCase")] +pub struct SplitArchiveGroupResultDto { + pub package_id: String, + pub base_name: String, + pub package_name: String, + pub created: bool, + pub urls: Vec<String>, + pub missing_parts: Vec<String>, +} + +#[tauri::command] +pub async fn link_group_split_archives( + state: State<'_, AppState>, + links: Vec<SplitArchiveLinkInputDto>, +) -> Result<Vec<SplitArchiveGroupResultDto>, String> { + use crate::application::commands::GroupSplitArchivesCommand; + use crate::application::services::SplitArchiveLink; + + let cmd = GroupSplitArchivesCommand { + links: links + .into_iter() + .map(|l| SplitArchiveLink { + url: l.url, + filename: l.filename, + }) + .collect(), + }; + + let results = state + .command_bus + .handle_group_split_archives(cmd) + .await + .map_err(|e| match &e { + AppError::Validation(msg) => msg.clone(), + other => { + tracing::error!(error = %other, "split-archive grouping failed"); + "Failed to group split archives".to_string() + } + })?; + + Ok(results + .into_iter() + .map(|r| SplitArchiveGroupResultDto { + package_id: r.package_id.as_str().to_string(), + base_name: r.base_name, + package_name: r.package_name, + created: r.created, + urls: r.urls, + missing_parts: r.missing_parts, + }) + .collect()) +} + // ── Clipboard ──────────────────────────────────────────────────────── #[tauri::command] pub async fn diff --git a/src-tauri/src/application/commands/group_split_archives.rs b/src-tauri/src/application/commands/group_split_archives.rs new file mode 100644 index 0000000..e82e2b2 --- /dev/null +++ b/src-tauri/src/application/commands/group_split_archives.rs @@ -0,0 +1,128 @@ +//! Handler for [`GroupSplitArchivesCommand`](super::GroupSplitArchivesCommand). +//! +//! Routes the request through [`SplitArchiveGrouper`] so the same +//! idempotent natural-key logic backs both the IPC entry-point and any +//! future internal caller (e.g. the Link Grabber commit flow once it +//! learns to bundle split-archive links). The handler does NOT attach +//! downloads itself — it only ensures one [`Package`](crate::domain::model::package::Package) +//! exists per detected base name. Attaching member downloads is the +//! caller's responsibility once the resolved links have produced +//! [`DownloadId`](crate::domain::model::download::DownloadId)s.
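+//!
+//! A minimal call sketch, assuming a fully wired `CommandBus` (URL and
+//! filename values are illustrative; two parts are needed to clear the
+//! singleton guard):
+//!
+//! ```ignore
+//! let results = command_bus
+//!     .handle_group_split_archives(GroupSplitArchivesCommand {
+//!         links: vec![
+//!             SplitArchiveLink { url: "https://ex.com/movie.part01.rar".into(), filename: "movie.part01.rar".into() },
+//!             SplitArchiveLink { url: "https://ex.com/movie.part02.rar".into(), filename: "movie.part02.rar".into() },
+//!         ],
+//!     })
+//!     .await?;
+//! assert_eq!(results[0].base_name, "movie");
+//! ```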
+ +use std::time::{SystemTime, UNIX_EPOCH}; + +use crate::application::command_bus::CommandBus; +use crate::application::error::AppError; +use crate::application::services::{SplitArchiveGroupResult, SplitArchiveGrouper}; + +impl CommandBus { + pub async fn handle_group_split_archives( + &self, + cmd: super::GroupSplitArchivesCommand, + ) -> Result<Vec<SplitArchiveGroupResult>, AppError> { + let repo = self + .package_repo_arc() + .ok_or_else(|| AppError::Validation("package repository not configured".into()))?; + let grouper = SplitArchiveGrouper::new(repo, self.event_bus_arc()); + + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + + grouper.group_all(&cmd.links, now_ms) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::application::commands::GroupSplitArchivesCommand; + use crate::application::commands::tests_support::{ + CapturingEventBus, InMemoryCredentialStore, InMemoryDownloadRepo, InMemoryPackageRepo, + build_package_bus, bus_without_account_ports, + }; + use crate::application::error::AppError; + use crate::application::services::SplitArchiveLink; + use crate::domain::ports::driven::PackageRepository; + + fn link(url: &str, filename: &str) -> SplitArchiveLink { + SplitArchiveLink { + url: url.to_string(), + filename: filename.to_string(), + } + } + + fn ten_part_links(base: &str) -> Vec<SplitArchiveLink> { + (1..=10) + .map(|n| { + let name = format!("{base}.part{:02}.rar", n); + let url = format!("https://ex.com/{name}"); + link(&url, &name) + }) + .collect() + } + + #[tokio::test] + async fn test_handle_group_split_archives_creates_one_package_per_base() { + let repo = Arc::new(InMemoryPackageRepo::new()); + let creds = Arc::new(InMemoryCredentialStore::new()); + let dl_repo = Arc::new(InMemoryDownloadRepo::new()); + let events = Arc::new(CapturingEventBus::new()); + let bus = build_package_bus(repo.clone(), creds, events, dl_repo); + + let mut links = ten_part_links("alpha"); + links.extend(ten_part_links("bravo")); + + let results = bus + .handle_group_split_archives(GroupSplitArchivesCommand { links }) + .await + .expect("group"); + + assert_eq!(results.len(), 2); + assert!(results.iter().all(|r| r.created)); + assert_eq!(repo.list().unwrap().len(), 2); + } + + #[tokio::test] + async fn test_handle_group_split_archives_reuses_existing_package_on_re_resolve() { + let repo = Arc::new(InMemoryPackageRepo::new()); + let creds = Arc::new(InMemoryCredentialStore::new()); + let dl_repo = Arc::new(InMemoryDownloadRepo::new()); + let events = Arc::new(CapturingEventBus::new()); + let bus = build_package_bus(repo.clone(), creds, events, dl_repo); + + let first = bus + .handle_group_split_archives(GroupSplitArchivesCommand { + links: ten_part_links("movie"), + }) + .await + .unwrap(); + let second = bus + .handle_group_split_archives(GroupSplitArchivesCommand { + links: ten_part_links("movie"), + }) + .await + .unwrap(); + + assert!(first[0].created); + assert!(!second[0].created); + assert_eq!(first[0].package_id, second[0].package_id); + assert_eq!(repo.list().unwrap().len(), 1, "no duplicate package"); + } + + #[tokio::test] + async fn test_handle_group_split_archives_returns_validation_when_repo_missing() { + let events = Arc::new(CapturingEventBus::new()); + let bus = bus_without_account_ports(events); + + let err = bus + .handle_group_split_archives(GroupSplitArchivesCommand { + links: ten_part_links("movie"), + }) + .await + .expect_err("missing repo"); + assert!(matches!(err, AppError::Validation(_))); + } +} diff --git
a/src-tauri/src/application/commands/mod.rs b/src-tauri/src/application/commands/mod.rs index 1559f14..9a4891f 100644 --- a/src-tauri/src/application/commands/mod.rs +++ b/src-tauri/src/application/commands/mod.rs @@ -20,6 +20,7 @@ mod export_accounts; mod export_history; mod extract_archive; mod group_playlists; +mod group_split_archives; mod import_accounts; mod install_plugin; mod move_package_to_folder; @@ -211,6 +212,17 @@ pub struct GroupPlaylistsCommand { } impl Command for GroupPlaylistsCommand {} +/// Auto-group resolved split-archive parts into one [`Package`] per +/// detected base name. Re-running with the same set reuses the existing +/// package (PRD-v2 §P1.12). The handler also detects gaps in the part +/// numbering and emits [`crate::domain::event::DomainEvent::SplitArchiveIncomplete`] +/// so the UI can warn the user before the extraction step blocks. +#[derive(Debug)] +pub struct GroupSplitArchivesCommand { + pub links: Vec<SplitArchiveLink>, +} +impl Command for GroupSplitArchivesCommand {} + // Handler: task 23 (settings) #[derive(Debug)] pub struct UpdateConfigCommand { diff --git a/src-tauri/src/application/services/group_lock.rs b/src-tauri/src/application/services/group_lock.rs new file mode 100644 index 0000000..589d1cf --- /dev/null +++ b/src-tauri/src/application/services/group_lock.rs @@ -0,0 +1,36 @@ +//! Process-wide lock shared by the package groupers +//! ([`crate::application::services::PlaylistGrouper`], +//! [`crate::application::services::SplitArchiveGrouper`]) to serialise +//! find-then-save sequences. +//! +//! Without this lock, two concurrent IPC invocations for the same +//! natural key could both observe "not found" in `find_by_external_id` +//! and each insert a new `Package`, breaking the idempotent-reuse +//! guarantee. The lock window covers only the lookup + save, never the +//! downstream event publish, so the contention window stays tiny (a +//! few SQLite writes). +//! +//! A single shared mutex is intentional. The cost of mild cross-grouper +//! serialisation is negligible (groupers run only at Link-Grabber +//! commit time, far from any hot path), and a shared mutex makes +//! reasoning about the SQLite UNIQUE-index contract trivial: at most +//! one writer per process competes for any given external_id at a +//! time. + +use std::sync::{Mutex, MutexGuard, OnceLock}; + +fn lock() -> &'static Mutex<()> { + static GROUP_LOCK: OnceLock<Mutex<()>> = OnceLock::new(); + GROUP_LOCK.get_or_init(|| Mutex::new(())) +} + +/// Acquire the shared grouper lock, recovering from a poisoned mutex +/// (a previous panic while holding the guard) instead of panicking +/// again. Domain state lives in SQLite, not in the guard, so the next +/// caller can safely proceed.
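+///
+/// A sketch of the intended find-then-save pattern (the `repo` calls
+/// are illustrative of how the groupers use the guard):
+///
+/// ```ignore
+/// let created = {
+///     let _guard = acquire_grouper_lock();
+///     match repo.find_by_external_id(key)? {
+///         Some(_) => false,
+///         None => {
+///             repo.save(&package)?;
+///             true
+///         }
+///     }
+/// }; // guard dropped here, before any event publish
+/// ```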
+pub(crate) fn acquire_grouper_lock() -> MutexGuard<'static, ()> { + match lock().lock() { + Ok(g) => g, + Err(poisoned) => poisoned.into_inner(), + } +} diff --git a/src-tauri/src/application/services/mod.rs b/src-tauri/src/application/services/mod.rs index 70a8fcf..7750f05 100644 --- a/src-tauri/src/application/services/mod.rs +++ b/src-tauri/src/application/services/mod.rs @@ -2,10 +2,12 @@ pub mod account_rotator; pub mod account_selector; pub mod checksum_validator; pub mod engine_config_bridge; +pub(crate) mod group_lock; pub mod history_backfill; pub mod playlist_grouper; pub mod queue_config_bridge; pub mod queue_manager; +pub mod split_archive_grouper; pub mod startup_recovery; pub use account_rotator::AccountRotator; @@ -16,3 +18,4 @@ pub use history_backfill::backfill_history_for_completed_downloads; pub use playlist_grouper::{PlaylistGroup, PlaylistGroupResult, PlaylistGrouper}; pub use queue_config_bridge::subscribe_queue_to_config; pub use queue_manager::QueueManager; +pub use split_archive_grouper::{SplitArchiveGroupResult, SplitArchiveGrouper, SplitArchiveLink}; diff --git a/src-tauri/src/application/services/playlist_grouper.rs b/src-tauri/src/application/services/playlist_grouper.rs index 49191a9..ff97ecb 100644 --- a/src-tauri/src/application/services/playlist_grouper.rs +++ b/src-tauri/src/application/services/playlist_grouper.rs @@ -22,39 +22,17 @@ //! the grouper only cares about the natural key, not about which //! plugin emitted it. -use std::sync::{Arc, Mutex, MutexGuard, OnceLock}; +use std::sync::Arc; use uuid::Uuid; use crate::application::error::AppError; +use crate::application::services::group_lock::acquire_grouper_lock; use crate::domain::event::DomainEvent; use crate::domain::model::package::{Package, PackageId, PackageSourceType}; use crate::domain::ports::driven::EventBus; use crate::domain::ports::driven::PackageRepository; -/// Process-wide lock that serialises the find-then-save sequence in -/// [`PlaylistGrouper::group_one`]. Without it, two concurrent IPC -/// invocations (rapid double-Start, two windows) for the same -/// `playlist_id` could both observe "not found" and each insert a new -/// `Package`, breaking the idempotent-reuse guarantee. The lock is held -/// only across the lookup + save, never during downstream work, so the -/// contention window is tiny (a few SQLite writes). -fn group_lock() -> &'static Mutex<()> { - static GROUP_LOCK: OnceLock<Mutex<()>> = OnceLock::new(); - GROUP_LOCK.get_or_init(|| Mutex::new(())) -} - -/// Acquire the global grouper lock, recovering from a poisoned mutex -/// (a previous panic while holding the guard) instead of panicking -/// again. Domain state lives in SQLite, not in the lock guard, so the -/// next caller can safely proceed. -fn acquire_group_lock() -> MutexGuard<'static, ()> { - match group_lock().lock() { - Ok(g) => g, - Err(poisoned) => poisoned.into_inner(), - } -} - /// One playlist seen by the resolver. The grouper turns one or more /// `PlaylistGroup` instances into a `Package` per unique `playlist_id`. #[derive(Debug, Clone, PartialEq, Eq)] @@ -122,40 +100,14 @@ impl PlaylistGrouper { return Err(AppError::Validation("playlist_id must not be empty".into())); } - let _guard = acquire_group_lock(); - - if let Some(existing) = self.repo.find_by_external_id(trimmed_id)?
{ - return Ok(PlaylistGroupResult { - package_id: existing.id().clone(), - package_name: existing.name().to_string(), - created: false, - item_count: group.item_count, - }); - } - - let trimmed_name = group.playlist_name.trim(); - let name = if trimmed_name.is_empty() { - fallback_name() - } else { - trimmed_name.to_string() - }; + // Hold the shared grouper lock only across the find-then-save + // window. Releasing it before publishing keeps a slow subscriber + // from blocking other concurrent grouping calls (and avoids the + // re-entrant-publish deadlock risk if a subscriber ends up + // touching a grouper itself). + let (package_id, name, created) = { + let _guard = acquire_grouper_lock(); - let package_id = PackageId::new(Uuid::new_v4().to_string()); - let mut package = Package::new( - package_id.clone(), - name.clone(), - PackageSourceType::Playlist, - created_at_ms, - ); - package.set_external_id(Some(trimmed_id.to_string())); - - // Save with conflict-recovery: a cross-process writer (the lock - // above only serialises within one process) may have inserted - // the same `external_id` between our `find_by_external_id` and - // here, in which case the SQLite UNIQUE index makes our save - // fail. Re-querying decides whether the failure was a race - // (return the existing package as a reuse) or a real error. - if let Err(save_err) = self.repo.save(&package) { if let Some(existing) = self.repo.find_by_external_id(trimmed_id)? { return Ok(PlaylistGroupResult { package_id: existing.id().clone(), @@ -164,8 +116,43 @@ impl PlaylistGrouper { item_count: group.item_count, }); } - return Err(save_err.into()); - } + + let trimmed_name = group.playlist_name.trim(); + let name = if trimmed_name.is_empty() { + fallback_name() + } else { + trimmed_name.to_string() + }; + + let package_id = PackageId::new(Uuid::new_v4().to_string()); + let mut package = Package::new( + package_id.clone(), + name.clone(), + PackageSourceType::Playlist, + created_at_ms, + ); + package.set_external_id(Some(trimmed_id.to_string())); + + // Save with conflict-recovery: a cross-process writer (the lock + // above only serialises within one process) may have inserted + // the same `external_id` between our `find_by_external_id` and + // here, in which case the SQLite UNIQUE index makes our save + // fail. Re-querying decides whether the failure was a race + // (return the existing package as a reuse) or a real error. + if let Err(save_err) = self.repo.save(&package) { + if let Some(existing) = self.repo.find_by_external_id(trimmed_id)? { + return Ok(PlaylistGroupResult { + package_id: existing.id().clone(), + package_name: existing.name().to_string(), + created: false, + item_count: group.item_count, + }); + } + return Err(save_err.into()); + } + + (package_id, name, true) + }; self.event_bus.publish(DomainEvent::PackageCreated { id: package_id.clone(), @@ -175,7 +162,7 @@ impl PlaylistGrouper { Ok(PlaylistGroupResult { package_id, package_name: name, - created: true, + created, item_count: group.item_count, }) } @@ -201,6 +188,8 @@ impl PlaylistGrouper { #[cfg(test)] mod tests { + use std::sync::Mutex; + use super::*; use crate::application::commands::tests_support::{CapturingEventBus, InMemoryPackageRepo}; diff --git a/src-tauri/src/application/services/split_archive_grouper.rs b/src-tauri/src/application/services/split_archive_grouper.rs new file mode 100644 index 0000000..bc9970e --- /dev/null +++ b/src-tauri/src/application/services/split_archive_grouper.rs @@ -0,0 +1,1017 @@ +//! 
Auto-group resolved split-archive parts into a [`Package`]. +//! +//! When the Link Grabber resolves a batch that contains multiple parts +//! of the same split archive (e.g. `movie.part01.rar`, `movie.part02.rar`, +//! …), this grouper clusters them by base name and ensures one +//! [`Package`](crate::domain::model::package::Package) holds every part. +//! Re-resolving the same set must reuse the previously-created package +//! instead of producing a duplicate (PRD-v2 §P1.12). +//! +//! The grouper is the single point of truth for that idempotency: it +//! looks up the package by its `external_id` +//! (`split-archive:{format_tag}:{base}`) and either returns the +//! existing one or creates a new one. The format tag is part of the +//! key so a RAR set and a ZIP set sharing a base name produce two +//! distinct packages. The caller (the resolver / Link Grabber +//! pipeline) then attaches the resolved items by id once the +//! downloads have been persisted. +//! +//! Domain-pure: no plugin loader, no IPC, no HTTP. Just `PackageRepository` +//! + `EventBus`. Tests run entirely in-memory. +//! +//! # Detected formats +//! +//! - Modern RAR — `name.part01.rar`, `name.part02.rar`, … +//! - Legacy RAR — `name.r00`, `name.r01`, … (terminal `name.rar` joins the same set) +//! - 7z split — `name.7z.001`, `name.7z.002`, … +//! - Zip split — `name.zip.001`, `name.zip.002`, … +//! - Tarball split — `name.tar.gz.001`, `name.tar.bz2.001`, `name.tar.xz.001` +//! +//! Files that do not match any pattern are returned untouched by +//! [`SplitArchiveGrouper::group_all`], which keeps non-archive links +//! flowing through the resolver as before. + +use std::collections::BTreeMap; +use std::sync::{Arc, OnceLock}; + +use regex::Regex; +use uuid::Uuid; + +use crate::application::error::AppError; +use crate::application::services::group_lock::acquire_grouper_lock; +use crate::domain::event::DomainEvent; +use crate::domain::model::package::{Package, PackageId, PackageSourceType}; +use crate::domain::ports::driven::{EventBus, PackageRepository}; + +/// Stable namespace prefix used for the `external_id` natural key of +/// split-archive packages. Prevents collisions with playlist packages +/// (which use raw `playlist_id`s) and lets the SQLite UNIQUE index +/// reject cross-process duplicates. The full key embeds the format +/// after the prefix (`split-archive:{format_tag}:{base}`) so two +/// archives that share a base name but use different formats (a RAR +/// set and a ZIP set both called `mix`) end up in distinct packages. +const EXTERNAL_ID_PREFIX: &str = "split-archive:"; + +/// Minimum number of detected parts required before the grouper bothers +/// creating a package. A lone `.part01.rar` is more useful as a regular +/// download than as an empty package shell — the user can still add the +/// other parts to it later via the package detail view. +const MIN_PARTS_TO_GROUP: usize = 2; + +/// Upper bound on the number of links accepted by a single grouping +/// call. Mirrors `MAX_URLS` in +/// [`crate::application::commands::resolve_links`]: keeps a malicious +/// or accidental million-link payload from allocating an unbounded +/// `BTreeMap` worth of cluster state. +pub const MAX_LINKS: usize = 500; + +/// Upper bound on a single part's numeric suffix. Real-world split +/// archives top out at a few hundred volumes, so a payload like +/// `name.part1000000000.rar` is either a typo or a hostile input +/// trying to force `compute_missing_parts` into a multi-billion-step +/// iteration. 
Matchers reject anything past this cap, which makes the +/// filename fall through to the unmatched path. The two unbounded +/// regexes (modern RAR's `\d+` and legacy RAR's `\d{2,}`) are the only +/// ones that need the explicit check; the other formats already pin +/// the suffix at 3 digits via `\d{3}`. +const MAX_PART_INDEX: u32 = 10_000; + +/// One archive format the grouper recognises. Carried alongside the +/// detected base name so the missing-part error message can render the +/// right suffix (`part05.rar` vs `7z.005`) and the `external_id` can +/// distinguish a RAR set from a ZIP set sharing the same base name. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(crate) enum SplitArchiveFormat { + /// Modern RAR — `name.part01.rar`. + PartRar, + /// Legacy RAR — `name.r00`, `name.r01`, … plus the terminal `.rar` + /// header file. The header is treated as part `0` for continuity. + LegacyRar, + /// 7z multi-volume — `name.7z.001`. + SevenZ, + /// Split ZIP using the `.zip.NNN` convention. + Zip, + /// Gzip tarball split — `name.tar.gz.001`. + TarGz, + /// Bzip2 tarball split — `name.tar.bz2.001`. + TarBz2, + /// XZ tarball split — `name.tar.xz.001`. + TarXz, +} + +impl SplitArchiveFormat { + /// Suffix the user would type (e.g. `"part05.rar"`, `"7z.003"`), + /// surfaced in `missing_parts` and the matching + /// [`DomainEvent::SplitArchiveIncomplete`] event. + /// + /// Legacy RAR uses 0-based suffixes on disk (`r00`, `r01`, …) but + /// we store as 1-based part numbers internally so every format + /// shares the same numbering: detection adds 1 (`r00` → part 1), + /// rendering subtracts 1 (part 1 → `r00`). + fn part_suffix(self, part_num: u32) -> String { + match self { + Self::PartRar => format!("part{:02}.rar", part_num), + Self::LegacyRar => { + if part_num == 0 { + "rar".to_string() + } else { + format!("r{:02}", part_num.saturating_sub(1)) + } + } + Self::SevenZ => format!("7z.{:03}", part_num), + Self::Zip => format!("zip.{:03}", part_num), + Self::TarGz => format!("tar.gz.{:03}", part_num), + Self::TarBz2 => format!("tar.bz2.{:03}", part_num), + Self::TarXz => format!("tar.xz.{:03}", part_num), + } + } + + /// Stable, URL-safe tag used inside the package `external_id`. + /// Distinct values across formats are required so a RAR set and a + /// ZIP set sharing a base name end up in two different packages + /// instead of silently colliding under the same external_id. + fn as_tag(self) -> &'static str { + match self { + Self::PartRar => "part-rar", + Self::LegacyRar => "legacy-rar", + Self::SevenZ => "7z", + Self::Zip => "zip", + Self::TarGz => "tar-gz", + Self::TarBz2 => "tar-bz2", + Self::TarXz => "tar-xz", + } + } +} + +/// One inbound link sent to [`SplitArchiveGrouper::group_all`]. The +/// caller pre-extracts the URL filename (e.g. via the URL path's last +/// segment) so the grouper does not have to parse URLs itself. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SplitArchiveLink { + pub url: String, + pub filename: String, +} + +/// Outcome of grouping for a single detected base name. The caller uses +/// `package_id` to attach the matched downloads via +/// `PackageRepository::attach_download`. `missing_parts` is non-empty +/// when one or more numbered parts are absent from the inbound batch — +/// the grouper also emits a [`DomainEvent::SplitArchiveIncomplete`] in +/// that case so the UI can notify the user. 
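+/// For example, a batch holding only `movie.part01.rar` and `movie.part03.rar` would yield one result with `urls.len() == 2` and `missing_parts == ["part02.rar"]`.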
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SplitArchiveGroupResult { + pub package_id: PackageId, + pub base_name: String, + pub package_name: String, + pub created: bool, + /// URLs from the input batch that belong to this group, ordered by + /// detected part number so the caller can reproduce the visual order + /// expected by the Link Grabber preview. + pub urls: Vec<String>, + /// Human-readable suffixes of the parts that should exist between + /// the format's first part and the highest detected part number but + /// are missing from the input batch. Empty when the batch is + /// contiguous. + pub missing_parts: Vec<String>, +} + +/// Detection output for a single filename — internal carrier between +/// `detect_from_filename` and the cluster builder. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct DetectedPart { + pub base: String, + pub part_num: u32, + pub format: SplitArchiveFormat, +} + +/// Try every supported pattern in order and return the first match. +/// Order matters: the more specific tarball patterns must be tried +/// before the generic `.7z.NNN` / `.zip.NNN` matchers so +/// `archive.tar.gz.001` is not mis-classified as a 7z volume. +pub(crate) fn detect_from_filename(file_name: &str) -> Option<DetectedPart> { + let detected = match_part_rar(file_name) + .or_else(|| match_tar_split(file_name)) + .or_else(|| match_seven_z(file_name)) + .or_else(|| match_zip_split(file_name)) + .or_else(|| match_legacy_rar(file_name)) + .or_else(|| match_legacy_rar_header(file_name))?; + // Cap absurd part indices so a hostile `name.part1000000000.rar` + // cannot force `compute_missing_parts` into a multi-billion-step + // iteration. Applied after the cascade so a rejected index does + // not silently fall through to `match_legacy_rar_header` and end + // up classified as a header (`name.part1000000000` would otherwise + // become base + part 0, which is meaningless).
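+ // e.g. `movie.part10001.rar` parses as part 10001, is rejected here, and falls through to the unmatched path as a plain file.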
+ if detected.part_num > MAX_PART_INDEX { + return None; + } + Some(detected) +} + +fn match_part_rar(file_name: &str) -> Option<DetectedPart> { + static RE: OnceLock<Regex> = OnceLock::new(); + let re = RE.get_or_init(|| Regex::new(r"^(?P<base>.+?)\.part(?P<num>\d+)\.rar$").unwrap()); + let caps = re.captures(file_name)?; + let base = caps.name("base")?.as_str().to_string(); + let part_num = caps.name("num")?.as_str().parse::<u32>().ok()?; + Some(DetectedPart { + base, + part_num, + format: SplitArchiveFormat::PartRar, + }) +} + +fn match_tar_split(file_name: &str) -> Option<DetectedPart> { + static RE: OnceLock<Regex> = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::new(r"^(?P<base>.+?)\.tar\.(?P<comp>gz|bz2|xz)\.(?P<num>\d{3})$").unwrap() + }); + let caps = re.captures(file_name)?; + let base = caps.name("base")?.as_str().to_string(); + let part_num = caps.name("num")?.as_str().parse::<u32>().ok()?; + let format = match caps.name("comp")?.as_str() { + "gz" => SplitArchiveFormat::TarGz, + "bz2" => SplitArchiveFormat::TarBz2, + "xz" => SplitArchiveFormat::TarXz, + _ => return None, + }; + Some(DetectedPart { + base, + part_num, + format, + }) +} + +fn match_seven_z(file_name: &str) -> Option<DetectedPart> { + static RE: OnceLock<Regex> = OnceLock::new(); + let re = RE.get_or_init(|| Regex::new(r"^(?P<base>.+?)\.7z\.(?P<num>\d{3})$").unwrap()); + let caps = re.captures(file_name)?; + let base = caps.name("base")?.as_str().to_string(); + let part_num = caps.name("num")?.as_str().parse::<u32>().ok()?; + Some(DetectedPart { + base, + part_num, + format: SplitArchiveFormat::SevenZ, + }) +} + +fn match_zip_split(file_name: &str) -> Option<DetectedPart> { + static RE: OnceLock<Regex> = OnceLock::new(); + let re = RE.get_or_init(|| Regex::new(r"^(?P<base>.+?)\.zip\.(?P<num>\d{3})$").unwrap()); + let caps = re.captures(file_name)?; + let base = caps.name("base")?.as_str().to_string(); + let part_num = caps.name("num")?.as_str().parse::<u32>().ok()?; + Some(DetectedPart { + base, + part_num, + format: SplitArchiveFormat::Zip, + }) +} + +fn match_legacy_rar(file_name: &str) -> Option<DetectedPart> { + static RE: OnceLock<Regex> = OnceLock::new(); + // Match `name.r00`, `name.r01`, …. The trailing digits are 2+ wide so + // we don't accidentally pick up names that just happen to end in + // `.r1` (which would be an unusual archive convention anyway). + let re = RE.get_or_init(|| Regex::new(r"^(?P<base>.+?)\.r(?P<num>\d{2,})$").unwrap()); + let caps = re.captures(file_name)?; + let base = caps.name("base")?.as_str().to_string(); + let raw_num = caps.name("num")?.as_str().parse::<u32>().ok()?; + // Translate `.r00` → part 1, `.r01` → part 2, … so the legacy set + // shares the same 1-based numbering as the modern formats. The + // optional terminal `.rar` header file is treated as part 0 by + // [`match_legacy_rar_header`]. `checked_add` guards against `u32` + // overflow on a `raw_num == u32::MAX` payload; the global cap in + // [`detect_from_filename`] then rejects anything above + // [`MAX_PART_INDEX`] so the iteration in `compute_missing_parts` + // stays bounded. + let part_num = raw_num.checked_add(1)?; + Some(DetectedPart { + base, + part_num, + format: SplitArchiveFormat::LegacyRar, + }) +} + +fn match_legacy_rar_header(file_name: &str) -> Option<DetectedPart> { + static RE: OnceLock<Regex> = OnceLock::new(); + // The terminal `.rar` header in a legacy multi-volume set + // (`name.rar` + `name.r00` + `name.r01`…). Tried last in + // [`detect_from_filename`] so the more specific patterns + // (`name.partNN.rar`, `name.rNN`) win first.
A standalone `.rar` + // (no companion `.rNN`) survives detection but gets dropped by + // [`MIN_PARTS_TO_GROUP`], so it does not produce a spurious + // singleton package. + let re = RE.get_or_init(|| Regex::new(r"^(?P<base>.+?)\.rar$").unwrap()); + let caps = re.captures(file_name)?; + let base = caps.name("base")?.as_str().to_string(); + Some(DetectedPart { + base, + part_num: 0, + format: SplitArchiveFormat::LegacyRar, + }) +} + +pub struct SplitArchiveGrouper { + repo: Arc<dyn PackageRepository>, + event_bus: Arc<dyn EventBus>, +} + +impl SplitArchiveGrouper { + pub fn new(repo: Arc<dyn PackageRepository>, event_bus: Arc<dyn EventBus>) -> Self { + Self { repo, event_bus } + } + + /// Cluster `links` by detected base name + format and create / + /// reuse one [`Package`] per cluster. Links that do not match any + /// split-archive pattern are silently dropped from the result — the + /// caller is expected to handle them through the regular resolver + /// path. Clusters with fewer than [`MIN_PARTS_TO_GROUP`] detected + /// parts are also dropped (a singleton is more useful as a + /// stand-alone download than as a half-empty package). + /// + /// Returns `AppError::Validation` when `links.len()` exceeds + /// [`MAX_LINKS`] so a runaway IPC payload cannot allocate + /// unbounded cluster state. + pub fn group_all( + &self, + links: &[SplitArchiveLink], + created_at_ms: u64, + ) -> Result<Vec<SplitArchiveGroupResult>, AppError> { + if links.len() > MAX_LINKS { + return Err(AppError::Validation(format!( + "Too many links: {} (max {MAX_LINKS})", + links.len() + ))); + } + + // `BTreeMap` keeps the output deterministic (alphabetical), + // which matters for snapshot tests and Link-Grabber preview. + let mut clusters: BTreeMap<(String, SplitArchiveFormat), Vec<(u32, String)>> = + BTreeMap::new(); + for link in links { + let trimmed = link.filename.trim(); + if trimmed.is_empty() { + continue; + } + if let Some(detected) = detect_from_filename(trimmed) { + clusters + .entry((detected.base, detected.format)) + .or_default() + .push((detected.part_num, link.url.clone())); + } + } + + let mut out = Vec::new(); + for ((base, format), mut parts) in clusters { + // Threshold counts distinct part numbers, not raw link + // count: two mirrors of `name.part01.rar` describe one + // volume, so they must not satisfy the singleton guard on + // their own. Otherwise duplicate URLs for the same part + // would create a "complete" group with `missing_parts` + // empty even though only one volume is actually present. + let distinct_parts: std::collections::HashSet<u32> = + parts.iter().map(|(n, _)| *n).collect(); + if distinct_parts.len() < MIN_PARTS_TO_GROUP { + continue; + } + parts.sort_by_key(|(n, _)| *n); + let result = self.group_one_base(&base, format, &parts, created_at_ms)?; + out.push(result); + } + Ok(out) + } + + fn group_one_base( + &self, + base: &str, + format: SplitArchiveFormat, + sorted_parts: &[(u32, String)], + created_at_ms: u64, + ) -> Result<SplitArchiveGroupResult, AppError> { + let trimmed_base = base.trim(); + if trimmed_base.is_empty() { + return Err(AppError::Validation( + "split-archive base name must not be empty".into(), + )); + } + // Format is part of the natural key: a RAR set and a ZIP set + // sharing the same base name must produce two distinct packages.
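+ // e.g. `split-archive:part-rar:mix` vs `split-archive:zip:mix` for a RAR set and a ZIP set both named `mix`.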
+ let external_id = format!("{EXTERNAL_ID_PREFIX}{}:{trimmed_base}", format.as_tag()); + let urls: Vec<String> = sorted_parts.iter().map(|(_, u)| u.clone()).collect(); + let missing = compute_missing_parts(format, sorted_parts); + + // Hold the lock only across the find-then-save sequence; drop + // it before publishing events so synchronous subscribers cannot + // block other concurrent grouping calls. + let (package_id, package_name, created) = { + let _guard = acquire_grouper_lock(); + + if let Some(existing) = self.repo.find_by_external_id(&external_id)? { + (existing.id().clone(), existing.name().to_string(), false) + } else { + let new_id = PackageId::new(Uuid::new_v4().to_string()); + let mut package = Package::new( + new_id.clone(), + trimmed_base.to_string(), + PackageSourceType::SplitArchive, + created_at_ms, + ); + package.set_external_id(Some(external_id.clone())); + + match self.repo.save(&package) { + Ok(()) => (new_id, trimmed_base.to_string(), true), + Err(save_err) => { + // Cross-process race: another writer inserted the + // same `external_id` between our `find` and + // `save`. Re-query and surface the winner as a + // reuse instead of bubbling the UNIQUE error. + if let Some(existing) = self.repo.find_by_external_id(&external_id)? { + (existing.id().clone(), existing.name().to_string(), false) + } else { + return Err(save_err.into()); + } + } + } + } + }; + + if created { + self.event_bus.publish(DomainEvent::PackageCreated { + id: package_id.clone(), + name: package_name.clone(), + }); + } + if !missing.is_empty() { + self.event_bus.publish(DomainEvent::SplitArchiveIncomplete { + package_id: package_id.clone(), + base_name: trimmed_base.to_string(), + missing_parts: missing.clone(), + }); + } + + Ok(SplitArchiveGroupResult { + package_id, + base_name: trimmed_base.to_string(), + package_name, + created, + urls, + missing_parts: missing, + }) + } +} + +/// Walk `parts` (sorted ascending by part number) from the format's +/// natural baseline up to the highest seen number, emitting a +/// human-readable suffix for every gap. Legacy RAR starts at 0 because +/// the terminal `.rar` header is part 0; all other supported formats +/// are 1-based.
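+/// e.g. PartRar parts `[1, 2, 4]` yield `["part03.rar"]`; LegacyRar parts `[1, 2]` with no header yield `["rar"]`.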
+fn compute_missing_parts( + format: SplitArchiveFormat, + sorted_parts: &[(u32, String)], +) -> Vec<String> { + if sorted_parts.is_empty() { + return Vec::new(); + } + let max = sorted_parts.last().map(|(n, _)| *n).unwrap_or(0); + let present: std::collections::HashSet<u32> = sorted_parts.iter().map(|(n, _)| *n).collect(); + let start = match format { + SplitArchiveFormat::LegacyRar => 0, + _ => 1, + }; + let mut missing = Vec::new(); + for n in start..=max { + if !present.contains(&n) { + missing.push(format.part_suffix(n)); + } + } + missing +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::application::commands::tests_support::{CapturingEventBus, InMemoryPackageRepo}; + use crate::domain::ports::driven::PackageRepository; + + fn arc_repo_and_bus() -> (Arc<InMemoryPackageRepo>, Arc<CapturingEventBus>) { + ( + Arc::new(InMemoryPackageRepo::new()), + Arc::new(CapturingEventBus::new()), + ) + } + + fn link(url: &str, filename: &str) -> SplitArchiveLink { + SplitArchiveLink { + url: url.to_string(), + filename: filename.to_string(), + } + } + + // ── Detection unit tests ──────────────────────────────────────── + + #[test] + fn test_detect_modern_rar_part() { + let part = detect_from_filename("movie.part01.rar").expect("matches"); + assert_eq!(part.base, "movie"); + assert_eq!(part.part_num, 1); + assert_eq!(part.format, SplitArchiveFormat::PartRar); + } + + #[test] + fn test_detect_modern_rar_three_digits() { + let part = detect_from_filename("series.s01e01.part010.rar").expect("matches"); + assert_eq!(part.base, "series.s01e01"); + assert_eq!(part.part_num, 10); + } + + #[test] + fn test_detect_legacy_rar_r00_translates_to_part_one() { + let part = detect_from_filename("backup.r00").expect("matches"); + assert_eq!(part.base, "backup"); + assert_eq!(part.part_num, 1); + assert_eq!(part.format, SplitArchiveFormat::LegacyRar); + } + + #[test] + fn test_detect_legacy_rar_r10() { + let part = detect_from_filename("backup.r10").expect("matches"); + assert_eq!(part.part_num, 11); + } + + #[test] + fn test_detect_seven_z() { + let part = detect_from_filename("dump.7z.001").expect("matches"); + assert_eq!(part.base, "dump"); + assert_eq!(part.part_num, 1); + assert_eq!(part.format, SplitArchiveFormat::SevenZ); + } + + #[test] + fn test_detect_zip_split() { + let part = detect_from_filename("data.zip.005").expect("matches"); + assert_eq!(part.base, "data"); + assert_eq!(part.part_num, 5); + assert_eq!(part.format, SplitArchiveFormat::Zip); + } + + #[test] + fn test_detect_tar_gz_split() { + let part = detect_from_filename("logs.tar.gz.003").expect("matches"); + assert_eq!(part.base, "logs"); + assert_eq!(part.part_num, 3); + assert_eq!(part.format, SplitArchiveFormat::TarGz); + } + + #[test] + fn test_detect_tar_bz2_split() { + let part = detect_from_filename("logs.tar.bz2.002").expect("matches"); + assert_eq!(part.format, SplitArchiveFormat::TarBz2); + } + + #[test] + fn test_detect_tar_xz_split() { + let part = detect_from_filename("logs.tar.xz.001").expect("matches"); + assert_eq!(part.format, SplitArchiveFormat::TarXz); + } + + #[test] + fn test_detect_returns_none_for_regular_filename() { + assert!(detect_from_filename("photo.jpg").is_none()); + assert!(detect_from_filename("archive.zip").is_none()); + assert!(detect_from_filename("archive.7z").is_none()); + assert!(detect_from_filename("notes.tar.gz").is_none()); + } + + #[test] + fn test_detect_legacy_rar_header_is_part_zero() { + let part = detect_from_filename("backup.rar").expect("matches"); + assert_eq!(part.base, "backup"); + assert_eq!(part.part_num, 0); +
assert_eq!(part.format, SplitArchiveFormat::LegacyRar); + } + + #[test] + fn test_modern_part_rar_wins_over_legacy_header_match() { + // `name.part01.rar` ends in `.rar` so the legacy-header regex + // would also match — order in `detect_from_filename` must keep + // PartRar primary so the part number is preserved. + let part = detect_from_filename("movie.part01.rar").expect("matches"); + assert_eq!(part.format, SplitArchiveFormat::PartRar); + assert_eq!(part.part_num, 1); + } + + #[test] + fn test_detect_does_not_pick_up_random_dot_r1_filename() { + // Single-digit `.r1` is not a recognised RAR convention; skipping + // it avoids false positives for filenames that happen to end in + // `.r1` (e.g. a `.r1` config file). + assert!(detect_from_filename("config.r1").is_none()); + } + + #[test] + fn test_part_suffix_for_each_format() { + assert_eq!(SplitArchiveFormat::PartRar.part_suffix(5), "part05.rar"); + assert_eq!(SplitArchiveFormat::PartRar.part_suffix(12), "part12.rar"); + assert_eq!(SplitArchiveFormat::SevenZ.part_suffix(3), "7z.003"); + assert_eq!(SplitArchiveFormat::Zip.part_suffix(7), "zip.007"); + assert_eq!(SplitArchiveFormat::TarGz.part_suffix(1), "tar.gz.001"); + assert_eq!(SplitArchiveFormat::TarBz2.part_suffix(2), "tar.bz2.002"); + assert_eq!(SplitArchiveFormat::TarXz.part_suffix(99), "tar.xz.099"); + // Legacy RAR: part 1 is `.r00`, part 2 is `.r01`, … + assert_eq!(SplitArchiveFormat::LegacyRar.part_suffix(1), "r00"); + assert_eq!(SplitArchiveFormat::LegacyRar.part_suffix(11), "r10"); + } + + // ── Grouping integration tests ─────────────────────────────────── + + fn ten_part_links(host: &str, base: &str) -> Vec<SplitArchiveLink> { + (1..=10) + .map(|n| { + let name = format!("{base}.part{:02}.rar", n); + let url = format!("https://{host}/{name}"); + link(&url, &name) + }) + .collect() + } + + #[test] + fn test_group_all_creates_single_package_for_ten_part_archive() { + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + let links = ten_part_links("ex.com", "movie"); + + let results = grouper.group_all(&links, 1_700_000_000_000).expect("group"); + + assert_eq!( + results.len(), + 1, + "ten matching parts must collapse to one package" + ); + let r = &results[0]; + assert!(r.created); + assert_eq!(r.base_name, "movie"); + assert_eq!(r.package_name, "movie"); + assert_eq!(r.urls.len(), 10); + assert!(r.missing_parts.is_empty()); + + // Persistence side: exactly one package row, with the expected + // external_id namespace and `auto_extract` enabled so the + // downstream extraction pipeline (PRD §7.2) auto-runs once every + // part has finished downloading. + let stored = repo.list().unwrap(); + assert_eq!(stored.len(), 1); + assert_eq!(stored[0].source_type(), PackageSourceType::SplitArchive); + assert_eq!( + stored[0].external_id(), + Some("split-archive:part-rar:movie") + ); + assert!( + stored[0].auto_extract(), + "split-archive packages must default to auto_extract=true so the \ + completed package is extracted without an extra user click" + ); + + // Bus side: PackageCreated fired exactly once, no incomplete event. + let snap = bus.snapshot(); + assert_eq!( + snap.iter() + .filter(|e| matches!(e, DomainEvent::PackageCreated { .. })) + .count(), + 1 + ); + assert!( + !snap + .iter() + .any(|e| matches!(e, DomainEvent::SplitArchiveIncomplete { ..
})), + "no incomplete event when batch is contiguous" + ); + } + + #[test] + fn test_group_all_emits_incomplete_when_part_is_missing() { + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + // Drop part 5 from the 10-part set. + let links: Vec<SplitArchiveLink> = ten_part_links("ex.com", "movie") + .into_iter() + .filter(|l| !l.filename.contains("part05")) + .collect(); + + let results = grouper.group_all(&links, 0).expect("group"); + assert_eq!(results.len(), 1); + let r = &results[0]; + assert_eq!(r.urls.len(), 9); + assert_eq!(r.missing_parts, vec!["part05.rar".to_string()]); + + let snap = bus.snapshot(); + let incomplete: Vec<&DomainEvent> = snap + .iter() + .filter(|e| matches!(e, DomainEvent::SplitArchiveIncomplete { .. })) + .collect(); + assert_eq!(incomplete.len(), 1); + if let DomainEvent::SplitArchiveIncomplete { + base_name, + missing_parts, + .. + } = incomplete[0] + { + assert_eq!(base_name, "movie"); + assert_eq!(missing_parts, &vec!["part05.rar".to_string()]); + } else { + panic!("wrong event variant"); + } + } + + #[test] + fn test_group_all_handles_multiple_bases_in_one_batch() { + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + + let mut links = ten_part_links("ex.com", "alpha"); + links.extend(ten_part_links("ex.com", "bravo")); + + let results = grouper.group_all(&links, 0).expect("group"); + assert_eq!(results.len(), 2); + let names: Vec<&str> = results.iter().map(|r| r.base_name.as_str()).collect(); + assert!(names.contains(&"alpha")); + assert!(names.contains(&"bravo")); + assert_eq!(repo.list().unwrap().len(), 2); + } + + #[test] + fn test_group_all_skips_singleton_part() { + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + let links = vec![link("https://ex.com/lone.part01.rar", "lone.part01.rar")]; + + let results = grouper.group_all(&links, 0).expect("group"); + assert!( + results.is_empty(), + "single part should not create a package" + ); + assert!(repo.list().unwrap().is_empty()); + } + + #[test] + fn test_group_all_ignores_non_archive_links() { + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + let links = vec![ + link("https://ex.com/photo.jpg", "photo.jpg"), + link("https://ex.com/dump.zip", "dump.zip"), + ]; + + let results = grouper.group_all(&links, 0).expect("group"); + assert!(results.is_empty()); + assert!(repo.list().unwrap().is_empty()); + } + + #[test] + fn test_group_all_is_idempotent_on_re_resolve() { + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + let links = ten_part_links("ex.com", "movie"); + + let first = grouper.group_all(&links, 0).expect("first"); + let second = grouper.group_all(&links, 0).expect("second"); + + assert_eq!(first.len(), 1); + assert_eq!(second.len(), 1); + assert!(first[0].created); + assert!( + !second[0].created, + "re-resolve must reuse the existing package" + ); + assert_eq!(first[0].package_id, second[0].package_id); + assert_eq!(repo.list().unwrap().len(), 1, "no duplicate package"); + + let created_events = bus + .snapshot() + .iter() + .filter(|e| matches!(e, DomainEvent::PackageCreated { ..
})) .count(); + assert_eq!( + created_events, 1, + "PackageCreated must fire only on first creation" + ); + } + + #[test] + fn test_group_all_reuse_still_emits_incomplete_when_parts_missing() { + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + // First resolve has all parts. + let _ = grouper + .group_all(&ten_part_links("ex.com", "movie"), 0) + .unwrap(); + // Drain bus events from the first run so the second run's + // assertions are unambiguous. + let _ = bus.snapshot(); + + // Re-resolve with a missing part 7. + let partial: Vec<SplitArchiveLink> = ten_part_links("ex.com", "movie") + .into_iter() + .filter(|l| !l.filename.contains("part07")) + .collect(); + let results = grouper.group_all(&partial, 0).expect("reuse"); + + assert_eq!(results.len(), 1); + assert!(!results[0].created); + assert_eq!(results[0].missing_parts, vec!["part07.rar".to_string()]); + + let incomplete = bus + .snapshot() + .iter() + .filter(|e| matches!(e, DomainEvent::SplitArchiveIncomplete { .. })) + .count(); + // At least one incomplete event for the re-resolve. (The first run had none.) + assert!(incomplete >= 1); + } + + #[test] + fn test_group_all_handles_seven_z_format_with_gap() { + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + let links = vec![ + link("https://ex.com/dump.7z.001", "dump.7z.001"), + link("https://ex.com/dump.7z.002", "dump.7z.002"), + link("https://ex.com/dump.7z.004", "dump.7z.004"), + ]; + + let results = grouper.group_all(&links, 0).expect("group"); + assert_eq!(results.len(), 1); + assert_eq!(results[0].missing_parts, vec!["7z.003".to_string()]); + } + + #[test] + fn test_group_all_creates_distinct_packages_for_same_base_across_formats() { + // A RAR set and a ZIP set sharing the same base name describe + // two different archives — they must produce two packages, not + // collapse under a single external_id. + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + let mut links = ten_part_links("ex.com", "mix"); + links.push(link("https://ex.com/mix.zip.001", "mix.zip.001")); + links.push(link("https://ex.com/mix.zip.002", "mix.zip.002")); + + let results = grouper.group_all(&links, 0).expect("group"); + assert_eq!(results.len(), 2); + let stored = repo.list().unwrap(); + assert_eq!(stored.len(), 2, "RAR and ZIP must not share a package"); + + let mut external_ids: Vec<String> = stored + .iter() + .filter_map(|p| p.external_id().map(str::to_string)) + .collect(); + external_ids.sort(); + assert_eq!( + external_ids, + vec![ + "split-archive:part-rar:mix".to_string(), + "split-archive:zip:mix".to_string(), + ] + ); + } + + #[test] + fn test_group_all_legacy_rar_includes_terminal_header() { + // `backup.rar` + `backup.r00` + `backup.r01` is a valid legacy + // 3-volume set. The header file (`backup.rar`) used to be + // dropped because detection only matched `.rNN`, leaving the + // cluster a singleton that fell below MIN_PARTS_TO_GROUP.
+ let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus.clone()); + let links = vec![ + link("https://ex.com/backup.rar", "backup.rar"), + link("https://ex.com/backup.r00", "backup.r00"), + link("https://ex.com/backup.r01", "backup.r01"), + ]; + + let results = grouper.group_all(&links, 0).expect("group"); + assert_eq!(results.len(), 1, "all three volumes share one package"); + let r = &results[0]; + assert_eq!(r.urls.len(), 3); + assert!(r.missing_parts.is_empty()); + assert_eq!(r.base_name, "backup"); + + let stored = repo.list().unwrap(); + assert_eq!(stored.len(), 1); + assert_eq!( + stored[0].external_id(), + Some("split-archive:legacy-rar:backup") + ); + } + + #[test] + fn test_group_all_legacy_rar_reports_missing_header() { + // Inverse of the previous test: `.r00` + `.r01` only — the + // header (.rar, part 0) is reported as missing so the UI can + // tell the user to fetch it. + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo, bus); + let links = vec![ + link("https://ex.com/backup.r00", "backup.r00"), + link("https://ex.com/backup.r01", "backup.r01"), + ]; + + let results = grouper.group_all(&links, 0).expect("group"); + assert_eq!(results.len(), 1); + assert_eq!(results[0].missing_parts, vec!["rar".to_string()]); + } + + #[test] + fn test_group_all_drops_lone_legacy_rar_header() { + // A standalone `.rar` (no `.rNN` companion) is just a regular + // RAR archive, not a split set — MIN_PARTS_TO_GROUP must keep + // it out of the package list so the resolver sends it through + // the regular single-file path. + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus); + let links = vec![link("https://ex.com/lonely.rar", "lonely.rar")]; + + let results = grouper.group_all(&links, 0).expect("group"); + assert!(results.is_empty()); + assert!(repo.list().unwrap().is_empty()); + } + + #[test] + fn test_detect_rejects_part_num_above_max_part_index() { + // Modern RAR: an absurd part number forces an unbounded + // `compute_missing_parts` iteration if accepted. The matcher + // must drop it instead. + let huge = format!("movie.part{}.rar", MAX_PART_INDEX + 1); + assert!(detect_from_filename(&huge).is_none()); + // Still accept the boundary value itself. + let at_cap = format!("movie.part{}.rar", MAX_PART_INDEX); + assert!(detect_from_filename(&at_cap).is_some()); + } + + #[test] + fn test_detect_legacy_rar_rejects_index_above_cap() { + // Legacy RAR's raw suffix is one less than the stored part + // number (`r00` → 1), so the rejection threshold is `MAX_PART_INDEX - 1`. + let huge = format!("backup.r{}", MAX_PART_INDEX); + assert!(detect_from_filename(&huge).is_none()); + let at_cap = format!("backup.r{}", MAX_PART_INDEX - 1); + assert!(detect_from_filename(&at_cap).is_some()); + } + + #[test] + fn test_group_all_distinct_parts_required_for_min_threshold() { + // Two mirrors of the same part should not satisfy + // MIN_PARTS_TO_GROUP — only one actual volume is present, so + // the resolver should send it through the regular single-file + // path, not create a misleading "complete" package. 
+ let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo.clone(), bus); + let links = vec![ + link("https://mirror1.com/movie.part01.rar", "movie.part01.rar"), + link("https://mirror2.com/movie.part01.rar", "movie.part01.rar"), + ]; + + let results = grouper.group_all(&links, 0).expect("group"); + assert!( + results.is_empty(), + "duplicate mirrors of one part must not satisfy MIN_PARTS_TO_GROUP" + ); + assert!(repo.list().unwrap().is_empty()); + } + + #[test] + fn test_group_all_distinct_parts_threshold_groups_two_real_volumes() { + // Two real volumes (one mirror per volume) still groups, even + // though parts.len() and distinct count happen to match — sanity + // check that the new gate did not regress the happy path. + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo, bus); + let links = vec![ + link("https://ex.com/movie.part01.rar", "movie.part01.rar"), + link("https://ex.com/movie.part02.rar", "movie.part02.rar"), + ]; + + let results = grouper.group_all(&links, 0).expect("group"); + assert_eq!(results.len(), 1); + assert_eq!(results[0].urls.len(), 2); + } + + #[test] + fn test_group_all_caps_link_count_to_avoid_dos() { + // The IPC entry-point can hand us an arbitrarily large batch; + // the grouper must reject it instead of allocating unbounded + // cluster state. Mirrors `MAX_URLS` in `resolve_links`. + let (repo, bus) = arc_repo_and_bus(); + let grouper = SplitArchiveGrouper::new(repo, bus); + + let oversize: Vec<SplitArchiveLink> = (0..MAX_LINKS + 1) + .map(|n| { + let name = format!("file{n}.bin"); + link(&format!("https://ex.com/{name}"), &name) + }) + .collect(); + + let err = grouper + .group_all(&oversize, 0) + .expect_err("oversize batch must be rejected"); + assert!(matches!(err, AppError::Validation(_))); + } +} diff --git a/src-tauri/src/domain/event.rs b/src-tauri/src/domain/event.rs index e85b033..c3a3a3e 100644 --- a/src-tauri/src/domain/event.rs +++ b/src-tauri/src/domain/event.rs @@ -223,6 +223,16 @@ pub enum DomainEvent { id: PackageId, delete_downloads: bool, }, + /// Emitted by the split-archive grouper when the resolved link set + /// for a base name is missing one or more numbered parts. The UI + /// surfaces a notification so the user can fetch the gap before the + /// extraction step blocks. `missing_parts` lists the human-readable + /// suffixes (e.g. `"part05.rar"`, `"7z.003"`) of the gaps detected.
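+ /// The Tauri bridge forwards it to the frontend as `split-archive-incomplete` with a camelCase payload, e.g. `{ "packageId": "…", "baseName": "movie", "missingParts": ["part05.rar"] }`.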
+ SplitArchiveIncomplete { + package_id: PackageId, + base_name: String, + missing_parts: Vec<String>, + }, // Clipboard ClipboardUrlDetected { @@ -445,6 +455,23 @@ mod tests { assert!(s.contains("pkg-del")); } + #[test] + fn test_split_archive_incomplete_event_carries_missing_parts() { + let event = DomainEvent::SplitArchiveIncomplete { + package_id: PackageId::new("pkg-split"), + base_name: "movie".to_string(), + missing_parts: vec!["part05.rar".to_string(), "part07.rar".to_string()], + }; + let s = format!("{event:?}"); + assert!(s.contains("SplitArchiveIncomplete")); + assert!(s.contains("pkg-split")); + assert!(s.contains("movie")); + assert!(s.contains("part05.rar")); + + let cloned = event.clone(); + assert_eq!(event, cloned); + } + #[test] fn test_clipboard_url_detected_event() { let event = DomainEvent::ClipboardUrlDetected { diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index f10561c..78fdac8 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -80,7 +80,7 @@ pub use adapters::driving::tauri_ipc::{ download_set_priority, download_start, download_verify_checksum, history_clear, history_delete_entry, history_export, history_get_by_id, history_list, history_purge_older_than, history_search, link_check_online, link_group_playlists, - link_resolve, package_add_download, package_create, package_delete, + link_group_split_archives, link_resolve, package_add_download, package_create, package_delete, package_find_by_external_id, package_get, package_list, package_list_downloads, package_move_to_folder, package_remove_download, package_set_password, package_set_priority, package_toggle_auto_extract, package_update, plugin_config_get, plugin_config_update, @@ -556,6 +556,7 @@ pub fn run() { link_resolve, link_check_online, link_group_playlists, + link_group_split_archives, clipboard_toggle, clipboard_state, settings_get, diff --git a/src/types/media.ts b/src/types/media.ts index bbc14f3..58ec01c 100644 --- a/src/types/media.ts +++ b/src/types/media.ts @@ -76,3 +76,30 @@ export interface PlaylistGroupResult { created: boolean; itemCount: number; } + +/** Mirror of [`SplitArchiveLinkInputDto`](src-tauri/src/adapters/driving/tauri_ipc.rs). + * Used as the input payload to the `link_group_split_archives` IPC. */ +export interface SplitArchiveLinkInput { + url: string; + filename: string; +} + +/** Mirror of [`SplitArchiveGroupResultDto`](src-tauri/src/adapters/driving/tauri_ipc.rs). + * Returned by the `link_group_split_archives` IPC. One entry per detected + * base name; `missingParts` is non-empty when the input batch had gaps in + * the part numbering (the backend also fires a `split-archive-incomplete` + * event in that case). */ +export interface SplitArchiveGroupResult { + packageId: string; + baseName: string; + packageName: string; + /** True when the package was just created, false when an existing + * package with the same `baseName` and archive format was reused. */ + created: boolean; + /** URLs that belong to this group, sorted by detected part number. */ + urls: string[]; + /** Human-readable suffixes (e.g. `"part05.rar"`, `"7z.003"`) of the + * parts that should exist between the format's first part and the + * highest detected part number but are absent from the input batch. */ + missingParts: string[]; +}