Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 197 additions & 17 deletions src-tauri/crates/app/src/commands/lyrics.rs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src-tauri/crates/app/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,7 @@ pub fn run() {
commands::tray::set_tray_labels,
commands::lyrics::get_lyrics,
commands::lyrics::fetch_lyrics,
commands::lyrics::refetch_lyrics,
commands::lyrics::import_lrc_file,
commands::lyrics::save_lyrics,
commands::lyrics::export_lyrics_to_path,
Expand Down
183 changes: 182 additions & 1 deletion src-tauri/crates/syncedlyrics/src/providers/genius.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,14 @@ pub async fn search(
// attacker-controlled host and we'd happily GET it.
let url = validate_outbound_url(url_str, GENIUS_ALLOWED_HOSTS)?;
let page = safe_text(http.get(url).send().await?.error_for_status()?).await?;
let text = extract_lyrics_containers(&page);
let raw = extract_lyrics_containers(&page);
let text = strip_genius_header(&raw);
// Treat a stub page (header only, no lyrics body — Genius renders
// these for songs that have been added but never transcribed) as a
// miss so the host can fall through to the next provider instead of
// surfacing the bare "N ContributorsTitle Lyrics" artifact to the
// user (issue #284). The full unsynced lyrics are otherwise
// returned unchanged.
Ok((!text.trim().is_empty()).then_some(Candidate {
synced: None,
unsynced: Some(text),
Expand All @@ -77,3 +84,177 @@ fn extract_lyrics_containers(html: &str) -> String {
}
out
}

/// Strip the standard Genius lyrics-page header artifact from the
/// start of an extracted lyrics body.
///
/// Recent Genius layouts nest the contributor badge + the
/// `<song title> Lyrics` heading inside the same `data-lyrics-container`
/// div that holds the actual verses, so once `html_text_decode` flattens
/// the tags we end up with a prefix like
/// `"17 ContributorsFall on Me Lyrics"` glued directly to the lyrics body
/// (issue #284). Strip the prefix when we recognise the
/// `^\d+ Contributors.*? Lyrics` shape; leave the text untouched
/// otherwise so a different layout doesn't silently lose its first line.
fn strip_genius_header(text: &str) -> String {
let trimmed = text.trim_start();
let digits_end = trimmed.chars().take_while(|c| c.is_ascii_digit()).count();
if digits_end == 0 {
return text.to_string();
}
let after_digits = &trimmed[digits_end..];
// Genius writes "1 Contributor" for a single transcriber and
// "N Contributors" for two or more — the plural-s is conditional
// on the badge count. Strip the singular core first; consume an
// optional trailing 's' to cover both shapes without two near-
// identical branches.
let Some(after_contrib) = after_digits.strip_prefix(" Contributor") else {
return text.to_string();
};
let after_contrib = after_contrib.strip_prefix('s').unwrap_or(after_contrib);
// Find the " Lyrics" token that closes the header. Genius glues
// the title onto a trailing " Lyrics" word with a single space
// separator (e.g. "...Vez Lyrics") and immediately follows that
// closer with either `\n` (when there's a body) or end-of-string
// (when the catalogue entry has no transcribed lyrics yet).
//
// Neither `find` nor `rfind` is correct here in general: a song
// titled "No Lyrics Needed" would put a " Lyrics" inside the
// title (`find` cuts too early), and a body line like "Some
// Lyrics" would do the same for `rfind` (cuts past the closer
// into the verses). The header closer is the FIRST " Lyrics"
// immediately followed by a newline or end-of-string, so we
// iterate forward and stop at that specific shape.
let lyrics_token = " Lyrics";
let lyrics_idx = after_contrib
.match_indices(lyrics_token)
.find(|(i, _)| {
let tail = &after_contrib[i + lyrics_token.len()..];
tail.is_empty()
|| tail.starts_with('\n')
|| tail.starts_with('\r')
})
.map(|(i, _)| i);
let Some(lyrics_idx) = lyrics_idx else {
return text.to_string();
};
after_contrib[lyrics_idx + lyrics_token.len()..]
.trim_start()
.to_string()
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn strip_header_drops_badge_title_and_lyrics_token() {
let input = "17 ContributorsFall on Me Lyrics\n[Chorus]\nFall on me";
assert_eq!(
strip_genius_header(input),
"[Chorus]\nFall on me".to_string()
);
}

#[test]
fn strip_header_returns_empty_for_stub_pages() {
// Genius renders this layout for songs added to the catalogue
// without transcribed lyrics. The bare header is the only
// content the parser sees, and the caller treats empty output
// as a miss so the next provider gets a turn.
let input = "2 ContributorsSolamente Una Vez Lyrics";
assert_eq!(strip_genius_header(input), "".to_string());
}

#[test]
fn strip_header_preserves_leading_whitespace_outside_pattern() {
// Genius occasionally pads the container with leading
// whitespace from inline scripts that don't decode to text.
// The trim_start on the digit probe must NOT eat newlines that
// belong to the lyrics body when no header is present.
let input = "\n[Verse 1]\nA real lyric";
assert_eq!(strip_genius_header(input), input.to_string());
}

#[test]
fn strip_header_no_op_when_no_digit_prefix() {
let input = "[Verse 1]\nA real lyric without a Genius header";
assert_eq!(strip_genius_header(input), input.to_string());
}

#[test]
fn strip_header_no_op_when_contributors_token_missing() {
// A track titled with a leading number (e.g. "99 Red Balloons")
// must not trigger the strip — only " Contributors" right
// after the digits qualifies as a Genius header.
let input = "99 Red Balloons Lyrics\n[Verse 1]";
assert_eq!(strip_genius_header(input), input.to_string());
}

#[test]
fn strip_header_no_op_when_lyrics_token_missing() {
// Defensive: a Genius layout change that drops the trailing
// " Lyrics" word would otherwise let us swallow the entire
// body up to the first match. Leave the input alone instead
// so the user sees raw content and we notice the regression
// in bug reports rather than silent data loss.
let input = "5 ContributorsSong Title Without Trailing Word\n[Verse]";
assert_eq!(strip_genius_header(input), input.to_string());
}

#[test]
fn strip_header_handles_multibyte_title() {
// Solamente Una Vez carries no multibyte chars but other Latin
// titles do (Naïve, Déjà Vu). The strip walks bytes via
// `match_indices(" Lyrics")` + `len()`, both byte-safe.
let input = "3 ContributorsDéjà Vu Lyrics\n[Verse]\nMidnight again";
assert_eq!(
strip_genius_header(input),
"[Verse]\nMidnight again".to_string()
);
}

#[test]
fn strip_header_picks_closer_when_title_contains_lyrics_word() {
// CodeRabbit-flagged edge case (PR #287): a song titled
// "No Lyrics Needed" plants a " Lyrics" inside the title
// BEFORE the trailing closer. A naive `find(" Lyrics")` would
// cut at the title's occurrence and leave " Needed Lyrics\n…"
// in the output. The closer is the FIRST " Lyrics" followed
// by a newline or end-of-string — that's the shape we anchor
// on.
let input = "4 ContributorsNo Lyrics Needed Lyrics\n[Verse]\nReal body";
assert_eq!(
strip_genius_header(input),
"[Verse]\nReal body".to_string()
);
}

#[test]
fn strip_header_does_not_swallow_body_containing_lyrics_word() {
// Complementary to the above: if we'd swapped `find` for
// `rfind` (the original CodeRabbit suggestion) to handle the
// title case, a body line like "Some Lyrics" would now match
// LAST and the strip would eat half the verse. Newline-
// anchored matching keeps both shapes correct.
let input = "5 ContributorsSong Title Lyrics\n[Verse 1]\nSome Lyrics line";
assert_eq!(
strip_genius_header(input),
"[Verse 1]\nSome Lyrics line".to_string()
);
}

#[test]
fn strip_header_handles_singular_contributor_badge() {
// CodeRabbit-flagged edge case: Genius pluralises the badge
// conditionally — a song with a single transcriber renders
// "1 Contributor" (no trailing 's'). The strip MUST consume
// the singular form too, otherwise a 1-transcriber song
// ships the bare header to the user verbatim.
let input = "1 ContributorSomething Lyrics\n[Intro]\nReal body";
assert_eq!(
strip_genius_header(input),
"[Intro]\nReal body".to_string()
);
}
}
30 changes: 30 additions & 0 deletions src-tauri/crates/syncedlyrics/src/providers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,34 @@ impl Provider {
Provider::Genius,
]
}

/// Canonical snake_case identifier. Matches the `serde(rename_all =
/// "snake_case")` shape so the frontend / DB write the exact same
/// string the JSON serialiser would emit — keeps round-trip stable
/// without forcing every caller to spin a `serde_json::to_value`.
pub const fn as_str(&self) -> &'static str {
match self {
Provider::Musixmatch => "musixmatch",
Provider::Lrclib => "lrclib",
Provider::NetEase => "net_ease",
Provider::Megalobiz => "megalobiz",
Provider::Genius => "genius",
}
}

/// Parse the canonical identifier back into a [`Provider`]. Pairs
/// with [`Self::as_str`] so a string round-tripped through the
/// frontend (e.g. user-picked provider in `refetch_lyrics`) maps
/// back to the original variant. Returns `None` for any unknown
/// id so the host can reject unknown values explicitly.
pub fn from_str(s: &str) -> Option<Self> {
match s {
"musixmatch" => Some(Provider::Musixmatch),
"lrclib" => Some(Provider::Lrclib),
"net_ease" => Some(Provider::NetEase),
"megalobiz" => Some(Provider::Megalobiz),
"genius" => Some(Provider::Genius),
_ => None,
}
}
}
23 changes: 23 additions & 0 deletions src-tauri/migrations/app/20260620120000_lyrics_provider.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- =============================================================================
-- Track which external provider (LRCLIB / Genius / NetEase / Megalobiz /
-- Musixmatch) returned the lyrics row, so the UI can show an accurate
-- source badge AND let the user manually re-fetch from a different
-- provider when the auto-waterfall caches garbage from one of them
-- (issue #284).
--
-- NULL = unknown. Two cases:
-- 1. Embedded / sidecar / manual writes (the `source` column already
-- tells the user "Embedded tag" / "Sidecar file" / "Saved by you" —
-- a sub-provider is meaningless for these tiers).
-- 2. Pre-1.5.1 rows. The auto-waterfall didn't track which provider
-- hit historically, and back-filling would require re-fetching
-- every cached row from the network.
--
-- No CHECK constraint: provider strings come from
-- `waveflow_syncedlyrics::Provider::as_str()` which is the only writer,
-- and a CHECK that hard-codes the list here would force a migration
-- every time a new provider lands. Validation lives in Rust at the
-- single write site.
-- =============================================================================

ALTER TABLE lyrics ADD COLUMN provider TEXT NULL;
Loading