From 5e889c403ff71d5f2cf3ad6d154c6f8be4894672 Mon Sep 17 00:00:00 2001 From: InstaZDLL Date: Sat, 20 Jun 2026 21:12:56 +0200 Subject: [PATCH 1/6] fix(lyrics): strip Genius header artifact + treat stub pages as miss MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Genius recently moved the contributor badge + ` Lyrics` heading inside the same `data-lyrics-container` div that holds the actual verses. Once our HTML decoder flattens the tags we end up with a glued-together prefix like `"17 ContributorsFall on Me Lyrics"` sitting at the head of every result, plus — for songs that have been catalogued on Genius without any transcribed lyrics yet — pages whose ONLY visible content is that bare header. Image 1 in issue #284 shows the bare-header case ("Solamente una vez": panel renders `"2 ContributorsSolamente Una Vez Lyrics"` and nothing else); image 2 shows the prefix-glue case ("Fall on Me": header runs into the chorus). Add `strip_genius_header` to pattern-match the `^\d+ Contributors.*? Lyrics` shape and drop it before returning the candidate. Pure prefix surgery: anything that doesn't match the shape (no leading digits, no " Contributors" token, no trailing " Lyrics" closer) is returned untouched, so a future Genius layout change won't silently lose the first line. After stripping, an empty body is treated as a miss — the host then falls through to the next provider in the chain (NetEase / Megalobiz) instead of caching the bare header as the user-visible lyrics. Seven new unit tests cover the happy path, the stub-page case, the multi-byte title path (Déjà Vu), and four no-op shapes (leading whitespace, no digit prefix, missing " Contributors", missing " Lyrics", title starting with a number à la "99 Red Balloons"). --- .../syncedlyrics/src/providers/genius.rs | 114 +++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/src-tauri/crates/syncedlyrics/src/providers/genius.rs b/src-tauri/crates/syncedlyrics/src/providers/genius.rs index 245c6a72..325783b3 100644 --- a/src-tauri/crates/syncedlyrics/src/providers/genius.rs +++ b/src-tauri/crates/syncedlyrics/src/providers/genius.rs @@ -51,7 +51,14 @@ pub async fn search( // attacker-controlled host and we'd happily GET it. let url = validate_outbound_url(url_str, GENIUS_ALLOWED_HOSTS)?; let page = safe_text(http.get(url).send().await?.error_for_status()?).await?; - let text = extract_lyrics_containers(&page); + let raw = extract_lyrics_containers(&page); + let text = strip_genius_header(&raw); + // Treat a stub page (header only, no lyrics body — Genius renders + // these for songs that have been added but never transcribed) as a + // miss so the host can fall through to the next provider instead of + // surfacing the bare "N ContributorsTitle Lyrics" artifact to the + // user (issue #284). The full unsynced lyrics are otherwise + // returned unchanged. Ok((!text.trim().is_empty()).then_some(Candidate { synced: None, unsynced: Some(text), @@ -77,3 +84,108 @@ fn extract_lyrics_containers(html: &str) -> String { } out } + +/// Strip the standard Genius lyrics-page header artifact from the +/// start of an extracted lyrics body. +/// +/// Recent Genius layouts nest the contributor badge + the +/// ` Lyrics` heading inside the same `data-lyrics-container` +/// div that holds the actual verses, so once `html_text_decode` flattens +/// the tags we end up with a prefix like +/// `"17 ContributorsFall on Me Lyrics"` glued directly to the lyrics body +/// (issue #284). Strip the prefix when we recognise the +/// `^\d+ Contributors.*? Lyrics` shape; leave the text untouched +/// otherwise so a different layout doesn't silently lose its first line. +fn strip_genius_header(text: &str) -> String { + let trimmed = text.trim_start(); + let digits_end = trimmed.chars().take_while(|c| c.is_ascii_digit()).count(); + if digits_end == 0 { + return text.to_string(); + } + let after_digits = &trimmed[digits_end..]; + let Some(after_contrib) = after_digits.strip_prefix(" Contributors") else { + return text.to_string(); + }; + // Find the first " Lyrics" token that closes the header — Genius + // glues the title onto the trailing " Lyrics" word with a single + // space separator (e.g. "...Vez Lyrics"). Anything before that is + // the title we want to drop along with the badge text. + let Some(lyrics_idx) = after_contrib.find(" Lyrics") else { + return text.to_string(); + }; + after_contrib[lyrics_idx + " Lyrics".len()..] + .trim_start() + .to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn strip_header_drops_badge_title_and_lyrics_token() { + let input = "17 ContributorsFall on Me Lyrics\n[Chorus]\nFall on me"; + assert_eq!( + strip_genius_header(input), + "[Chorus]\nFall on me".to_string() + ); + } + + #[test] + fn strip_header_returns_empty_for_stub_pages() { + // Genius renders this layout for songs added to the catalogue + // without transcribed lyrics. The bare header is the only + // content the parser sees, and the caller treats empty output + // as a miss so the next provider gets a turn. + let input = "2 ContributorsSolamente Una Vez Lyrics"; + assert_eq!(strip_genius_header(input), "".to_string()); + } + + #[test] + fn strip_header_preserves_leading_whitespace_outside_pattern() { + // Genius occasionally pads the container with leading + // whitespace from inline scripts that don't decode to text. + // The trim_start on the digit probe must NOT eat newlines that + // belong to the lyrics body when no header is present. + let input = "\n[Verse 1]\nA real lyric"; + assert_eq!(strip_genius_header(input), input.to_string()); + } + + #[test] + fn strip_header_no_op_when_no_digit_prefix() { + let input = "[Verse 1]\nA real lyric without a Genius header"; + assert_eq!(strip_genius_header(input), input.to_string()); + } + + #[test] + fn strip_header_no_op_when_contributors_token_missing() { + // A track titled with a leading number (e.g. "99 Red Balloons") + // must not trigger the strip — only " Contributors" right + // after the digits qualifies as a Genius header. + let input = "99 Red Balloons Lyrics\n[Verse 1]"; + assert_eq!(strip_genius_header(input), input.to_string()); + } + + #[test] + fn strip_header_no_op_when_lyrics_token_missing() { + // Defensive: a Genius layout change that drops the trailing + // " Lyrics" word would otherwise let us swallow the entire + // body up to the first match. Leave the input alone instead + // so the user sees raw content and we notice the regression + // in bug reports rather than silent data loss. + let input = "5 ContributorsSong Title Without Trailing Word\n[Verse]"; + assert_eq!(strip_genius_header(input), input.to_string()); + } + + #[test] + fn strip_header_handles_multibyte_title() { + // Solamente Una Vez carries no multibyte chars but other Latin + // titles do (Naïve, Déjà Vu). The strip walks bytes via + // `find(" Lyrics")` + `len()`, both byte-safe. + let input = "3 ContributorsDéjà Vu Lyrics\n[Verse]\nMidnight again"; + assert_eq!( + strip_genius_header(input), + "[Verse]\nMidnight again".to_string() + ); + } +} From 5e8f7c4945a69678b86a476b3605136b0db9e7fe Mon Sep 17 00:00:00 2001 From: InstaZDLL Date: Sat, 20 Jun 2026 22:43:46 +0200 Subject: [PATCH 2/6] fix(lyrics): track real provider per row + add user-driven refetch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend half of the fix for issue #284. Two problems being addressed: 1. The lyrics panel hard-codes "LRCLIB" as the source badge no matter which provider actually responded. After the LRCLIB miss, the waterfall falls through to NetEase / Megalobiz / Genius — and when Genius hands back junk (see the sibling Genius header-strip commit in this branch), the user has no way to know what they're looking at or how to retry from a cleaner source. 2. The auto-waterfall has no manual override. If Genius caches a bad result, the only escape today is "Clear" + "Refetch" which just re-runs the same waterfall and lands on the same Genius hit. ## Migration `20260620120000_lyrics_provider.sql` adds a nullable `provider` column to `app.lyrics`. NULL means "unknown" — covers pre-1.5.1 cached rows plus every non-API tier (embedded / sidecar / manual) where the broad `source` is already meaningful attribution. No CHECK constraint on the allowed set: provider strings come from `Provider::as_str()` at the single write site, and a CHECK that hard-codes the list would force another migration every time the SDK gains a provider. ## Provider plumbing - `LyricsPayload` gains `provider: Option`, serialised via `skip_serializing_if = "Option::is_none"` so the JSON wire shape stays compact on tiers where the field is meaningless. - `Provider::as_str()` + `Provider::from_str()` added to `waveflow_syncedlyrics`, matching the existing `serde(rename_all = "snake_case")` ids (`"lrclib"` / `"genius"` / `"net_ease"` / `"megalobiz"` / `"musixmatch"`). Single canonical id format across DB rows, payload field, and the frontend ↔ backend boundary. - `upsert_lyrics` takes a 6th `provider: Option<&str>` parameter. Every call site updated: embedded / sidecar / manual write `None`; LRCLIB direct + LRCLIB instrumental cache + LRCLIB pre-cache in `fetch_lyrics` and the prefetch path all write `Some("lrclib")`; `cache_external_lyrics` derives the provider id from `LyricsResult::provider.as_str()` so NetEase / Megalobiz / Genius / Musixmatch all attribute correctly. - `read_cached` SELECTs the new column and surfaces it on the payload. - Empty miss rows (after a full waterfall failure) write `provider = None` — attribution to a specific provider would be misleading when every provider returned nothing. ## refetch_lyrics command New `#[tauri::command] pub async fn refetch_lyrics(track_id, provider: Option)`. Drops the cached row first so the waterfall (or single-provider query below) is forced to re-query, then two modes: - `provider = None` → re-runs the full waterfall by calling `fetch_lyrics` directly. Identical to the legacy "Clear + Refetch" flow. - `provider = Some(id)` → bypasses local tiers (embedded / sidecar) and queries ONLY the named provider via `external_lyrics_search`. Honours the Musixmatch translation lang and the offline-mode short-circuit on the same chokepoints as the waterfall. A miss caches an empty row attributed to the requested provider so the UI badge reflects what the user just tried — picking a different provider and trying again is then the natural next step. Registered alongside `fetch_lyrics` in `lib.rs::generate_handler![]`. 84 desktop unit tests + workspace `cargo check --all-targets` still pass with the migration applied to a fresh `app.db`. Frontend wiring (provider badge + dropdown picker) lands in the next commit. --- src-tauri/crates/app/src/commands/lyrics.rs | 214 ++++++++++++++++-- src-tauri/crates/app/src/lib.rs | 1 + .../crates/syncedlyrics/src/providers/mod.rs | 30 +++ .../app/20260620120000_lyrics_provider.sql | 23 ++ 4 files changed, 251 insertions(+), 17 deletions(-) create mode 100644 src-tauri/migrations/app/20260620120000_lyrics_provider.sql diff --git a/src-tauri/crates/app/src/commands/lyrics.rs b/src-tauri/crates/app/src/commands/lyrics.rs index 32da7d40..5e07c127 100644 --- a/src-tauri/crates/app/src/commands/lyrics.rs +++ b/src-tauri/crates/app/src/commands/lyrics.rs @@ -118,6 +118,19 @@ pub struct LyricsPayload { pub content: String, pub format: LyricsFormat, pub source: LyricsSource, + /// Sub-provider that produced this row when `source` is + /// `LyricsSource::Api`. Matches `Provider::as_str()` from + /// `waveflow_syncedlyrics` (snake_case identifier: + /// `"lrclib"` / `"genius"` / `"net_ease"` / `"megalobiz"` / + /// `"musixmatch"`). `None` for embedded / sidecar / manual rows + /// and for pre-1.5.1 cached entries that pre-date the + /// `lyrics.provider` column. The UI surfaces this in the source + /// badge so the user knows whether they're looking at LRCLIB- + /// curated lyrics or a Genius scrape — important when the latter + /// occasionally returns junk (issue #284) and the user wants to + /// re-fetch from a different provider. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub provider: Option, /// Set by `save_lyrics` when destination = `tag` was requested but /// the audio file's tag system can't carry the chosen format (e.g. /// TTML in an MP3's ID3v2 where lofty has no mapping for the @@ -413,12 +426,22 @@ async fn cache_external_lyrics( ) -> AppResult { let format = external_format_to_app(result.format); let source = LyricsSource::Api; - upsert_lyrics(pool, file_hash, &result.content, &format, &source).await?; + let provider = result.provider.as_str(); + upsert_lyrics( + pool, + file_hash, + &result.content, + &format, + &source, + Some(provider), + ) + .await?; Ok(LyricsPayload { track_id, content: result.content, format, source, + provider: Some(provider.to_string()), tag_write_skipped: None, sidecar_write_skipped: None, }) @@ -462,7 +485,9 @@ async fn try_external_fallback_or_miss( // No provider had lyrics. Cache as an empty row so we don't re-hit // the network on every panel open. The user can force a re-search // by clicking "Refetch" in the lyrics panel (clears the row, - // re-runs the waterfall). + // re-runs the waterfall). `provider = None` for the miss row — + // attribution to a specific provider would be misleading when + // every provider returned nothing. let empty = String::new(); upsert_lyrics( pool, @@ -470,6 +495,7 @@ async fn try_external_fallback_or_miss( &empty, &LyricsFormat::Plain, &LyricsSource::Api, + None, ) .await?; Ok(LyricsPayload { @@ -477,6 +503,7 @@ async fn try_external_fallback_or_miss( content: empty, format: LyricsFormat::Plain, source: LyricsSource::Api, + provider: None, tag_write_skipped: None, sidecar_write_skipped: None, }) @@ -679,26 +706,35 @@ fn read_non_empty_file(path: &Path) -> Option { /// Insert (or replace) the lyrics row, keyed by file content hash so the /// cache is shared across profiles that contain the same audio file. +/// +/// `provider` carries the sub-source identifier when `source` is +/// `LyricsSource::Api` (e.g. `"lrclib"`, `"genius"`) and `None` for +/// embedded / sidecar / manual writes where the broad `source` is +/// the only meaningful attribution. The DB column allows NULL so +/// pre-1.5.1 rows + non-API tiers store cleanly. async fn upsert_lyrics( pool: &sqlx::SqlitePool, file_hash: &str, content: &str, format: &LyricsFormat, source: &LyricsSource, + provider: Option<&str>, ) -> AppResult<()> { sqlx::query( - "INSERT INTO app.lyrics (file_hash, content, format, source, language, fetched_at) - VALUES (?, ?, ?, ?, NULL, ?) + "INSERT INTO app.lyrics (file_hash, content, format, source, provider, language, fetched_at) + VALUES (?, ?, ?, ?, ?, NULL, ?) ON CONFLICT(file_hash) DO UPDATE SET content = excluded.content, format = excluded.format, source = excluded.source, + provider = excluded.provider, fetched_at = excluded.fetched_at", ) .bind(file_hash) .bind(content) .bind(format_to_db(format)) .bind(source_to_db(source)) + .bind(provider) .bind(now_ms()) .execute(pool) .await?; @@ -709,8 +745,8 @@ async fn upsert_lyrics( /// numeric `track_id` so we look up the file hash first, then key into the /// shared `app.lyrics` cache. async fn read_cached(pool: &sqlx::SqlitePool, track_id: i64) -> AppResult> { - let row: Option<(String, String, String)> = sqlx::query_as( - "SELECT l.content, l.format, l.source + let row: Option<(String, String, String, Option)> = sqlx::query_as( + "SELECT l.content, l.format, l.source, l.provider FROM track t JOIN app.lyrics l ON l.file_hash = t.file_hash WHERE t.id = ?", @@ -718,11 +754,12 @@ async fn read_cached(pool: &sqlx::SqlitePool, track_id: i64) -> AppResult