Skip to content

Commit

Permalink
Apply dist_thresh to Genius and Google backends
Browse files Browse the repository at this point in the history
This commit introduces a distance threshold mechanism for the Genius and
Google backends.

- Create a new `SearchBackend` base class with a method `check_match`
  that performs checking.
- Start using undocumented `dist_thresh` configuration option for good,
  and mention it in the docs. This controls the maximum allowable
  distance for matching artist and title names.

These changes aim to improve the accuracy of lyrics matching, especially
when there are slight variations in artist or title names, see #4791.
  • Loading branch information
snejus committed Oct 13, 2024
1 parent 65d78cb commit 4bf1061
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 68 deletions.
126 changes: 68 additions & 58 deletions beetsplug/lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

from __future__ import annotations

import difflib
import errno
import itertools
import json
Expand All @@ -25,9 +24,9 @@
import struct
import unicodedata
import warnings
from functools import partial
from functools import cached_property, partial
from typing import ClassVar
from urllib.parse import quote, urlencode
from urllib.parse import quote, urlencode, urlparse

import requests
from unidecode import unidecode
Expand All @@ -50,6 +49,7 @@

import beets
from beets import plugins, ui
from beets.autotag.hooks import string_dist

DIV_RE = re.compile(r"<(/?)div>?", re.I)
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
Expand Down Expand Up @@ -345,15 +345,43 @@ def fetch(self, artist, title, album=None, length=None):
return lyrics


class Genius(Backend):
class SearchBackend(Backend):
REQUIRES_BS = True

@cached_property
def dist_thresh(self) -> float:
return self.config["dist_thresh"].get(float)

def check_match(
self, target_artist: str, target_title: str, artist: str, title: str
) -> bool:
"""Check if the given artist and title are 'good enough' match."""
max_dist = max(
string_dist(target_artist, artist),
string_dist(target_title, title),
)
if round(max_dist, 2) <= self.dist_thresh:
return True

self._log.warning(
"({}, {}) pair does not match ({}, {}), dist {:.2f} > expected {:.2f}",
artist,
title,
target_artist,
target_title,
max_dist,
self.dist_thresh,
)
return False


class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
Simply adapted from
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""

REQUIRES_BS = True

base_url = "https://api.genius.com"

def __init__(self, config, log):
Expand All @@ -376,19 +404,15 @@ def fetch(self, artist, title, album=None, length=None):
self._log.debug("Genius API request returned invalid JSON")
return None

# find a matching artist in the json
check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
hit_artist = hit["result"]["primary_artist"]["name"]

if slug(hit_artist) == slug(artist):
html = self.fetch_url(hit["result"]["url"])
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
html = self.fetch_url(result["url"])
if not html:
return None
return self._scrape_lyrics_from_html(html)

self._log.debug(
"Genius failed to find a matching artist for '{0}'", artist
)
return None

def _search(self, artist, title):
Expand Down Expand Up @@ -582,16 +606,9 @@ def is_text_notcode(text):
return None


class Google(Backend):
class Google(SearchBackend):
"""Fetch lyrics from Google search results."""

REQUIRES_BS = True

def __init__(self, config, log):
super().__init__(config, log)
self.api_key = config["google_API_key"].as_str()
self.engine_id = config["google_engine_ID"].as_str()

def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
Expand Down Expand Up @@ -637,21 +654,20 @@ def slugify(self, text):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]

def is_page_candidate(self, url_link, url_title, title, artist):
def is_page_candidate(
self, artist: str, title: str, url_link: str, url_title: str
) -> bool:
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
title = self.slugify(title.lower())
artist = self.slugify(artist.lower())
sitename = re.search(
"//([^/]+)/.*", self.slugify(url_link.lower())
).group(1)
url_title = self.slugify(url_title.lower())

# Check if URL title contains song title (exact match)
if url_title.find(title) != -1:
title_slug = self.slugify(title.lower())
url_title_slug = self.slugify(url_title.lower())
if title_slug in url_title_slug:
return True

artist = self.slugify(artist.lower())
sitename = urlparse(url_link).netloc

# or try extracting song title from URL title and check if
# they are close enough
tokens = (
Expand All @@ -660,18 +676,15 @@ def is_page_candidate(self, url_link, url_title, title, artist):
+ self.LYRICS_TRANS
)
tokens = [re.escape(t) for t in tokens]
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)

song_title = song_title.strip("_|")
typo_ratio = 0.9
ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
return ratio >= typo_ratio
return self.check_match(artist, title_slug, artist, song_title)

def fetch(self, artist, title, album=None, length=None):
query = f"{artist} {title}"
url = "https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s" % (
self.api_key,
self.engine_id,
self.config["google_API_key"].as_str(),
self.config["google_engine_ID"].as_str(),
quote(query.encode("utf-8")),
)

Expand All @@ -688,24 +701,21 @@ def fetch(self, artist, title, album=None, length=None):
self._log.debug("google backend error: {0}", reason)
return None

if "items" in data.keys():
for item in data["items"]:
url_link = item["link"]
url_title = item.get("title", "")
if not self.is_page_candidate(
url_link, url_title, title, artist
):
continue
html = self.fetch_url(url_link)
if not html:
continue
lyrics = scrape_lyrics_from_html(html)
if not lyrics:
continue

if self.is_lyrics(lyrics, artist):
self._log.debug("got lyrics from {0}", item["displayLink"])
return lyrics
check_candidate = partial(self.is_page_candidate, artist, title)
for item in data.get("items", []):
url_link = item["link"]
if not check_candidate(url_link, item.get("title", "")):
continue
html = self.fetch_url(url_link)
if not html:
continue
lyrics = scrape_lyrics_from_html(html)
if not lyrics:
continue

if self.is_lyrics(lyrics, artist):
self._log.debug("got lyrics from {0}", item["displayLink"])
return lyrics

return None

Expand All @@ -729,6 +739,7 @@ def __init__(self):
"bing_client_secret": None,
"bing_lang_from": [],
"bing_lang_to": None,
"dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
Expand All @@ -740,7 +751,6 @@ def __init__(self):
# Musixmatch is disabled by default as they are currently blocking
# requests with the beets user agent.
"sources": [s for s in self.SOURCES if s != "musixmatch"],
"dist_thresh": 0.1,
}
)
self.config["bing_client_secret"].redact = True
Expand Down
3 changes: 3 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ Bug fixes:
* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
since recent updates to their website made it unsearchable.
:bug:`5456`
* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
to match lyrics when there was a slight variation in the artist name.
:bug:`4791`

For packagers:

Expand Down
6 changes: 6 additions & 0 deletions docs/plugins/lyrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ configuration file. The available options are:
Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
Default: None.
- **dist_thresh**: The maximum distance between the artist and title
combination of the music file and lyrics candidate to consider them a match.
Lower values will make the plugin more strict, higher values will make it
more lenient. This does not apply to the ``lrclib`` backend as it matches
durations.
Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
Default: None.
Expand Down
59 changes: 49 additions & 10 deletions test/plugins/test_lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import os
import re
import unittest
from unittest.mock import MagicMock, patch
from unittest.mock import patch

import confuse
import pytest
Expand All @@ -31,11 +31,12 @@
from beetsplug import lyrics

log = logging.getLogger("beets.test_lyrics")
raw_backend = lyrics.Backend({}, log)
google = lyrics.Google(MagicMock(), log)
genius = lyrics.Genius(MagicMock(), log)
tekstowo = lyrics.Tekstowo(MagicMock(), log)
lrclib = lyrics.LRCLib(MagicMock(), log)
plugin = lyrics.LyricsPlugin()
raw_backend = lyrics.Backend(plugin.config, log)
google = lyrics.Google(plugin.config, log)
genius = lyrics.Genius(plugin.config, log)
tekstowo = lyrics.Tekstowo(plugin.config, log)
lrclib = lyrics.LRCLib(plugin.config, log)


class LyricsPluginTest(unittest.TestCase):
Expand Down Expand Up @@ -244,6 +245,42 @@ def setUp(self):
self.skipTest("Beautiful Soup 4 not available")


class TestSearchBackend:
@pytest.fixture
def backend(self, dist_thresh):
plugin = lyrics.LyricsPlugin()
plugin.config.set({"dist_thresh": dist_thresh})
return lyrics.SearchBackend(plugin.config, plugin._log)

@pytest.mark.parametrize(
"dist_thresh, target_artist, artist, should_match",
[
(0.11, "Target Artist", "Target Artist", True),
(0.11, "Target Artist", "Target Artis", True),
(0.11, "Target Artist", "Target Arti", False),
(0.11, "Psychonaut", "Psychonaut (BEL)", True),
(0.11, "beets song", "beats song", True),
(0.10, "beets song", "beats song", False),
(
0.11,
"Lucid Dreams (Forget Me)",
"Lucid Dreams (Remix) ft. Lil Uzi Vert",
False,
),
(
0.12,
"Lucid Dreams (Forget Me)",
"Lucid Dreams (Remix) ft. Lil Uzi Vert",
True,
),
],
)
def test_check_match(self, backend, target_artist, artist, should_match):
assert (
backend.check_match(target_artist, "", artist, "") == should_match
)


class LyricsPluginSourcesTest(LyricsGoogleBaseTest, LyricsAssertions):
"""Check that beets google custom search engine sources are correctly
scraped.
Expand Down Expand Up @@ -400,7 +437,7 @@ def test_is_page_candidate_exact_match(self):
html, "html.parser", parse_only=SoupStrainer("title")
)
assert google.is_page_candidate(
url, soup.title.string, s["title"], s["artist"]
s["artist"], s["title"], url, soup.title.string
), url

def test_is_page_candidate_fuzzy_match(self):
Expand All @@ -413,12 +450,12 @@ def test_is_page_candidate_fuzzy_match(self):

# very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist
assert google.is_page_candidate(
url, url_title, s["title"], s["artist"]
s["artist"], s["title"], url, url_title
), url
# reject different title
url_title = "example.com | seets bong lyrics by John doe"
assert not google.is_page_candidate(
url, url_title, s["title"], s["artist"]
s["artist"], s["title"], url, url_title
), url

def test_is_page_candidate_special_chars(self):
Expand All @@ -430,7 +467,7 @@ def test_is_page_candidate_special_chars(self):
url = s["url"] + s["path"]
url_title = "foo"

google.is_page_candidate(url, url_title, s["title"], "Sunn O)))")
google.is_page_candidate("Sunn O)))", s["title"], url, url_title)


# test Genius backend
Expand Down Expand Up @@ -506,12 +543,14 @@ def test_json(self, mock_fetch_url, mock_scrape):
"name": "\u200bblackbear",
},
"url": "blackbear_url",
"title": "Idfc",
}
},
{
"result": {
"primary_artist": {"name": "El\u002dp"},
"url": "El-p_url",
"title": "Idfc",
}
},
]
Expand Down

0 comments on commit 4bf1061

Please sign in to comment.