Add Spotify duration-mismatch rematch sweep (issue #100, phase 2)

dprodger · claude · dprodger · commit 0317b9868f6a · 2026-04-27T13:52:26.000-04:00
Adds a third Spotify job type, ('spotify', 'rematch_duration_mismatches'),
target_type='song'. The handler is a thin wrapper around the existing
SpotifyMatcher with duration_mismatch_threshold set — the same code path
that match_spotify_tracks.py --duration-mismatches has always used. For
each song the matcher walks only the releases whose linked Spotify
track's duration differs from the recording's canonical duration by
more than the threshold, swapping the link to a better track if it
finds one.

No auto-unlinking. Stubborn mismatches stay in place and remain visible
in /admin/duration-mismatches for human review — that's a bigger trust
call we can revisit later.

Threshold defaults to 60 seconds (60_000 ms) — matches the admin review
page and the matcher CLI default. Configurable per-job via the payload
and per-sweep via --threshold-seconds, since follow-up passes are likely
to want a tighter threshold once the obvious mismatches are cleaned up.

The job is registered as a separate job_type from match_song so a bulk
cleanup sweep doesn't collide on the (source, job_type, target_type,
target_id) unique index with a user-triggered rematch on the same song.
Different priorities, different metrics, different filter facet on the
admin research dashboard.

Producer: core/spotify_rematch_mismatches.py — sweep enqueuer that
walks get_songs_with_duration_mismatches() and enqueues one job per
song at priority 110 (behind user-initiated at 50 and routine at 100).

CLI: scripts/rematch_spotify_duration_mismatches.py — production path.
The in-process match_spotify_tracks.py --duration-mismatches flag stays
for ad-hoc debugging.

Tests: 13 new in test_spotify_rematch_mismatches.py covering threshold
defaulting, payload override, success/permanent/no-op/retryable error
mapping, and sweep behaviour (threshold pass-through, limit, error
counting).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/backend/core/spotify_rematch_mismatches.py b/backend/core/spotify_rematch_mismatches.py
@@ -0,0 +1,121 @@
+"""
+Spotify duration-mismatch rematch sweep (issue #100, second phase).
+
+Walks songs where at least one Spotify streaming link's duration_ms
+differs from the linked recording's canonical duration_ms by more than
+a threshold, and enqueues one ('spotify', 'rematch_duration_mismatches')
+job per song onto the durable research queue.
+
+The handler — see research_worker/handlers/spotify.py — wraps the same
+SpotifyMatcher path the `match_spotify_tracks.py --duration-mismatches`
+CLI has used historically. It re-runs matching narrowly on the
+mismatched releases; if it finds a better track, it swaps the link, and
+otherwise leaves the existing match in place.
+
+Threshold defaults to 60s (60_000 ms) — same as the
+/admin/duration-mismatches review page and the matcher CLI. 60s is wide
+enough to ignore Spotify's usual 1-2s duration drift, narrow enough to
+catch wrong-track matches (e.g. a 4-min recording linked to a 2-min
+track).
+
+Per-song was chosen over per-link for the same reasons as the backfill
+sweep: matches the existing match_song job shape, dedups cleanly, and
+the matcher itself is per-song-shaped under the hood.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Optional
+
+from core import research_jobs
+from integrations.spotify.db import get_songs_with_duration_mismatches
+
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_THRESHOLD_MS = 60_000  # milliseconds (60 s); sweep and CLI default
+
+
+def find_candidate_song_ids(
+    threshold_ms: int = DEFAULT_THRESHOLD_MS,
+    limit: Optional[int] = None,
+) -> list[str]:
+    """Return ids (str) of songs with a Spotify link whose duration_ms is
+    more than `threshold_ms` off the recording's, capped at `limit` if set.
+    Raises ValueError on a negative `limit` (slicing would drop the tail)."""
+    if limit is not None and limit < 0:
+        raise ValueError(f"limit must be >= 0, got {limit}")
+    rows = get_songs_with_duration_mismatches(threshold_ms=threshold_ms)
+    song_ids = [str(row['id']) for row in rows]
+    return song_ids if limit is None else song_ids[:limit]
+
+
+def enqueue_sweep(
+    threshold_ms: int = DEFAULT_THRESHOLD_MS,
+    limit: Optional[int] = None,
+    priority: int = 110,
+) -> dict[str, int]:
+    """Enqueue one rematch_duration_mismatches job per candidate song.
+
+    Candidates come from find_candidate_song_ids(threshold_ms, limit). Each
+    job's payload carries `threshold_ms` so the handler re-checks against
+    the value this sweep used, not whatever default it has at run time.
+
+    Args:
+        threshold_ms: mismatch threshold in milliseconds.
+        limit: optional cap on the number of candidate songs.
+        priority: queue priority for every job; the default (110) is meant
+            to sit behind user-initiated (50) and routine (100) jobs.
+
+    Returns:
+        {'candidates': N, 'enqueued': M, 'errors': E, 'threshold_ms': T}.
+        `enqueued` counts enqueue() calls that returned a job id; `errors`
+        counts calls that raised or returned None (each logged, none fatal).
+        Re-sweeps are expected to dedup via the research_jobs unique index.
+    """
+    song_ids = find_candidate_song_ids(threshold_ms=threshold_ms, limit=limit)
+    stats = {
+        'candidates': len(song_ids),
+        'enqueued': 0,
+        'errors': 0,
+        'threshold_ms': threshold_ms,
+    }
+    if not song_ids:
+        return stats
+
+    for song_id in song_ids:
+        try:
+            job_id = research_jobs.enqueue(
+                source=research_jobs.SOURCE_SPOTIFY,
+                job_type='rematch_duration_mismatches',
+                target_type=research_jobs.TARGET_SONG,
+                target_id=song_id,
+                payload={'threshold_ms': threshold_ms},
+                priority=priority,
+            )
+        except Exception:
+            logger.exception(
+                "spotify_rematch_mismatches: failed to enqueue song %s",
+                song_id,
+            )
+            stats['errors'] += 1
+            continue
+
+        if job_id is None:
+            # Was counted silently; log so `errors` is traceable per song.
+            logger.warning(
+                "spotify_rematch_mismatches: no job id returned for song %s",
+                song_id,
+            )
+            stats['errors'] += 1
+        else:
+            stats['enqueued'] += 1
+
+    logger.info(
+        "spotify_rematch_mismatches: threshold_ms=%d candidates=%d "
+        "enqueued=%d errors=%d",
+        threshold_ms, stats['candidates'], stats['enqueued'], stats['errors'],
+    )
+    return stats
diff --git a/backend/research_worker/handlers/spotify.py b/backend/research_worker/handlers/spotify.py
@@ -1,7 +1,7 @@
 """
 Spotify handlers on the durable queue.
 
-Two job types are registered here:
+Three job types are registered here:
 
 1. ('spotify', 'match_song'), target_type='song'
    Wraps integrations/spotify/matcher.SpotifyMatcher.match_releases — a
@@ -16,6 +16,16 @@
    backfill historic rows that were inserted before the matcher started
    capturing duration. Issue #100.
 
+3. ('spotify', 'rematch_duration_mismatches'), target_type='song'
+   For one song, re-runs the SpotifyMatcher against only the releases
+   whose linked Spotify track's duration_ms differs from the recording's
+   canonical duration_ms by more than `threshold_ms` (default 60_000 — 60s).
+   The matcher decides whether to swap the link to a better track or
+   leave the existing match alone. No auto-unlinking — leftover bad
+   matches stay visible in the /admin/duration-mismatches review page.
+   Separate job_type from match_song so a bulk-cleanup sweep doesn't
+   collide on the unique index with a user-triggered rematch. Issue #100.
+
 Quota accounting: skipped. Spotify uses HTTP 429 rate limits, not a daily
 budget like YouTube. The SpotifyClient already retries 429s internally
 with exponential backoff. If those retries exhaust, we surface the
@@ -206,3 +216,73 @@ def backfill_durations(payload: dict[str, Any], ctx) -> dict[str, Any]:
         )
 
     return stats
+
+
+# ---------------------------------------------------------------------------
+# rematch_duration_mismatches — issue #100, second phase
+# ---------------------------------------------------------------------------
+
+# Default mismatch threshold in ms (60 s); mirrors DEFAULT_THRESHOLD_MS in
+# core/spotify_rematch_mismatches.py. Noted as matching the
+# /admin/duration-mismatches page and matcher CLI default: ignores Spotify's
+# usual 1-2s drift, still catches e.g. a 4-min recording on a 2-min track.
+_DEFAULT_DURATION_MISMATCH_THRESHOLD_MS = 60_000
+
+
+@handler('spotify', 'rematch_duration_mismatches')
+def rematch_duration_mismatches(payload: dict[str, Any], ctx) -> dict[str, Any]:
+    """Re-run the Spotify matcher on this song's mismatched releases.
+
+    Delegates to `SpotifyMatcher(duration_mismatch_threshold=threshold_ms,
+    logger=ctx.log).match_releases(ctx.target_id)`. `payload['threshold_ms']`
+    (int >= 0) overrides the 60_000 ms default; missing/None uses the default.
+
+    Returns the matcher's counters plus `threshold_ms`, or a no-op dict when
+    the error matches a no-op marker. Raises PermanentError for an invalid
+    threshold or a permanent-marker error, and RetryableError for any other
+    failure.
+    """
+    raw_threshold = (payload or {}).get('threshold_ms')
+    if raw_threshold is None:
+        raw_threshold = _DEFAULT_DURATION_MISMATCH_THRESHOLD_MS
+    try:
+        threshold_ms = int(raw_threshold)
+        if threshold_ms < 0:
+            raise ValueError("threshold_ms must be >= 0")
+    except (TypeError, ValueError) as exc:
+        # Retrying cannot fix a malformed payload, so fail permanently.
+        raise PermanentError(
+            f"Spotify rematch: invalid threshold_ms {raw_threshold!r}"
+        ) from exc
+
+    result = SpotifyMatcher(
+        duration_mismatch_threshold=threshold_ms,
+        logger=ctx.log,
+    ).match_releases(ctx.target_id)
+
+    if result.get('success'):
+        stats = result.get('stats') or {}
+        counters = (
+            'releases_processed', 'releases_updated', 'releases_no_match',
+            'tracks_matched', 'tracks_had_previous', 'cache_hits',
+            'api_calls', 'rate_limit_hits',
+        )
+        return {
+            'threshold_ms': threshold_ms,
+            **{name: stats.get(name, 0) for name in counters},
+        }
+
+    error = str(result.get('error') or 'unknown error')
+    error_lower = error.lower()
+    if any(marker in error_lower for marker in _PERMANENT_ERROR_MARKERS):
+        raise PermanentError(f"Spotify rematch: {error}")
+
+    if any(marker in error_lower for marker in _NO_OP_ERROR_MARKERS):
+        # Nothing above threshold by the time the worker ran — clean no-op.
+        return {
+            'threshold_ms': threshold_ms,
+            'reason': 'no_mismatched_releases',
+            'releases_processed': 0,
+        }
+
+    raise RetryableError(f"Spotify rematch failed: {error}")
diff --git a/backend/scripts/rematch_spotify_duration_mismatches.py b/backend/scripts/rematch_spotify_duration_mismatches.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""
+Rematch Spotify Duration Mismatches — issue #100, second phase.
+
+Enqueues one ('spotify', 'rematch_duration_mismatches') job per song that
+owns at least one Spotify streaming link whose duration differs from the
+linked recording's canonical duration by more than a threshold (default
+60s). The handler — see research_worker/handlers/spotify.py — re-runs
+the SpotifyMatcher narrowly on the mismatched releases.
+
+The matcher swaps the link to a better track if one is found; otherwise
+the existing match stays in place for human review via
+/admin/duration-mismatches. There is no auto-unlinking.
+
+For ad-hoc one-shot work the in-process
+`match_spotify_tracks.py --duration-mismatches <seconds>` flag still
+exists; this CLI is the production path that hands the work to the
+durable worker queue.
+
+Usage:
+    python rematch_spotify_duration_mismatches.py
+    python rematch_spotify_duration_mismatches.py --threshold-seconds 30
+    python rematch_spotify_duration_mismatches.py --limit 100
+    python rematch_spotify_duration_mismatches.py --dry-run
+"""
+
+from script_base import ScriptBase, run_script
+from core.spotify_rematch_mismatches import (
+    DEFAULT_THRESHOLD_MS,
+    enqueue_sweep,
+    find_candidate_song_ids,
+)
+
+
+def main():
+    """Parse CLI args, then list candidates (--dry-run) or enqueue the sweep.
+    Returns True unless the sweep recorded enqueue errors."""
+    script = ScriptBase(
+        name="rematch_spotify_duration_mismatches",
+        description=(
+            "Enqueue per-song Spotify rematch jobs for releases whose "
+            "linked Spotify track duration differs from the recording's "
+            "canonical duration by more than a threshold."
+        ),
+        epilog="""
+Examples:
+  python rematch_spotify_duration_mismatches.py
+  python rematch_spotify_duration_mismatches.py --threshold-seconds 30
+  python rematch_spotify_duration_mismatches.py --limit 100
+  python rematch_spotify_duration_mismatches.py --dry-run
+        """
+    )
+
+    script.add_dry_run_arg()
+    script.add_debug_arg()
+    script.add_limit_arg(default=None)
+
+    # Seconds-keyed to match match_spotify_tracks' --duration-mismatches flag.
+    script.parser.add_argument(
+        '--threshold-seconds',
+        type=int,
+        default=DEFAULT_THRESHOLD_MS // 1000,
+        help=(
+            f'Mismatch threshold in seconds (default: '
+            f'{DEFAULT_THRESHOLD_MS // 1000}). Songs whose Spotify track '
+            f'duration differs from the recording\'s duration by more '
+            f'than this are eligible.'
+        ),
+    )
+
+    args = script.parse_args()
+    if args.threshold_seconds < 0:
+        script.parser.error('--threshold-seconds must be >= 0')
+    threshold_ms = args.threshold_seconds * 1000
+
+    script.print_header({
+        "DRY RUN": args.dry_run,
+        "THRESHOLD": f"{args.threshold_seconds}s",
+        "LIMIT": args.limit if args.limit is not None else 'all candidates',
+    })
+
+    if args.dry_run:
+        song_ids = find_candidate_song_ids(
+            threshold_ms=threshold_ms, limit=args.limit,
+        )
+        script.logger.info(
+            "Would enqueue %d song(s) for Spotify duration-mismatch rematch "
+            "(threshold %ds)",
+            len(song_ids), args.threshold_seconds,
+        )
+        for sid in song_ids[:25]:
+            script.logger.debug("  candidate: %s", sid)
+        if len(song_ids) > 25:
+            script.logger.debug("  ... %d more", len(song_ids) - 25)
+        script.print_summary({
+            'candidates': len(song_ids), 'enqueued': 0,
+            'errors': 0, 'threshold_ms': threshold_ms,
+        })
+        return True
+
+    stats = enqueue_sweep(threshold_ms=threshold_ms, limit=args.limit)
+    script.print_summary(stats)
+    return stats['errors'] == 0
+
+
+if __name__ == "__main__":
+    run_script(main)  # hand main() to the runner imported from script_base
diff --git a/backend/tests/test_spotify_rematch_mismatches.py b/backend/tests/test_spotify_rematch_mismatches.py