diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
new file mode 100644
index 0000000000..1aa288cbcb
--- /dev/null
+++ b/beetsplug/_typing.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+from typing import Any
+
+from typing_extensions import NotRequired, TypeAlias, TypedDict
+
+JSONDict: TypeAlias = "dict[str, Any]"
+
+
+class LRCLibAPI:
+ class Item(TypedDict):
+ """Lyrics data item returned by the LRCLib API."""
+
+ id: int
+ name: str
+ trackName: str
+ artistName: str
+ albumName: str
+ duration: float | None
+ instrumental: bool
+ plainLyrics: str
+ syncedLyrics: str | None
+
+
+class GeniusAPI:
+ """Genius API data types.
+
+ This documents *only* the fields that are used in the plugin.
+ :attr:`SearchResult` is an exception, since I thought some of the other
+ fields might be useful in the future.
+ """
+
+ class DateComponents(TypedDict):
+ year: int
+ month: int
+ day: int
+
+ class Artist(TypedDict):
+ api_path: str
+ header_image_url: str
+ id: int
+ image_url: str
+ is_meme_verified: bool
+ is_verified: bool
+ name: str
+ url: str
+
+ class Stats(TypedDict):
+ unreviewed_annotations: int
+ hot: bool
+
+ class SearchResult(TypedDict):
+ annotation_count: int
+ api_path: str
+ artist_names: str
+ full_title: str
+ header_image_thumbnail_url: str
+ header_image_url: str
+ id: int
+ lyrics_owner_id: int
+ lyrics_state: str
+ path: str
+ primary_artist_names: str
+ pyongs_count: int | None
+ relationships_index_url: str
+ release_date_components: GeniusAPI.DateComponents
+ release_date_for_display: str
+ release_date_with_abbreviated_month_for_display: str
+ song_art_image_thumbnail_url: str
+ song_art_image_url: str
+ stats: GeniusAPI.Stats
+ title: str
+ title_with_featured: str
+ url: str
+ featured_artists: list[GeniusAPI.Artist]
+ primary_artist: GeniusAPI.Artist
+ primary_artists: list[GeniusAPI.Artist]
+
+ class SearchHit(TypedDict):
+ result: GeniusAPI.SearchResult
+
+ class SearchResponse(TypedDict):
+ hits: list[GeniusAPI.SearchHit]
+
+ class Search(TypedDict):
+ response: GeniusAPI.SearchResponse
+
+
+class GoogleCustomSearchAPI:
+ class Response(TypedDict):
+ """Search response from the Google Custom Search API.
+
+ If the search returns no results, the :attr:`items` field is not found.
+ """
+
+ items: NotRequired[list[GoogleCustomSearchAPI.Item]]
+
+ class Item(TypedDict):
+ """A Google Custom Search API result item.
+
+ :attr:`title` field is shown to the user in the search interface, thus
+ it gets truncated with an ellipsis for longer queries. For most
+ results, the full title is available as ``og:title`` metatag found
+ under the :attr:`pagemap` field. Note neither this metatag nor the
+ ``pagemap`` field is guaranteed to be present in the data.
+ """
+
+ title: str
+ link: str
+ pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
+
+ class Pagemap(TypedDict):
+ """Pagemap data with a single meta tags dict in a list."""
+
+ metatags: list[JSONDict]
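
For orientation, a minimal sketch (not part of the patch) of how these
annotations are meant to be consumed; the pick_synced helper below is
hypothetical:

    from beetsplug._typing import LRCLibAPI

    def pick_synced(items: "list[LRCLibAPI.Item]") -> "str | None":
        # Prefer a synced, non-instrumental candidate.
        for item in items:
            if not item["instrumental"] and item["syncedLyrics"]:
                return item["syncedLyrics"]
        return None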
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index e6ab217c5b..2ef1ac2f30 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -16,52 +16,35 @@
from __future__ import annotations
-import difflib
+import atexit
import errno
import itertools
-import json
+import math
import os.path
import re
-import struct
-import unicodedata
-import warnings
-from contextlib import suppress
+from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
+from html import unescape
from http import HTTPStatus
-from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode
+from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
+from urllib.parse import quote, urlparse
+import langdetect
import requests
-from typing_extensions import TypedDict
+from bs4 import BeautifulSoup
from unidecode import unidecode
import beets
from beets import plugins, ui
+from beets.autotag.hooks import string_dist
if TYPE_CHECKING:
from beets.importer import ImportTask
from beets.library import Item
-try:
- import bs4
- from bs4 import SoupStrainer
+ from ._typing import GeniusAPI, GoogleCustomSearchAPI, LRCLibAPI
- HAS_BEAUTIFUL_SOUP = True
-except ImportError:
- HAS_BEAUTIFUL_SOUP = False
-
-try:
- import langdetect
-
- HAS_LANGDETECT = True
-except ImportError:
- HAS_LANGDETECT = False
-
-DIV_RE = re.compile(r"<(/?)div>?", re.I)
-COMMENT_RE = re.compile(r"<!--.*-->", re.S)
-TAG_RE = re.compile(r"<[^>]*>")
-BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -105,37 +88,36 @@ class NotFoundError(requests.exceptions.HTTPError):
pass
-# Utilities.
+class CaptchaError(requests.exceptions.HTTPError):
+ pass
+
+
+class TimeoutSession(requests.Session):
+ def request(self, *args, **kwargs):
+ """Wrap the request method to raise an exception on HTTP errors."""
+ kwargs.setdefault("timeout", 10)
+ r = super().request(*args, **kwargs)
+ if r.status_code == HTTPStatus.NOT_FOUND:
+ raise NotFoundError("HTTP Error: Not Found", response=r)
+ if 300 <= r.status_code < 400:
+ raise CaptchaError("Captcha is required", response=r)
+ r.raise_for_status()
-def unichar(i):
- try:
- return chr(i)
- except ValueError:
- return struct.pack("i", i).decode("utf-32")
+ return r
-def unescape(text):
- """Resolve xx; HTML entities (and some others)."""
- if isinstance(text, bytes):
- text = text.decode("utf-8", "ignore")
- out = text.replace("&nbsp;", " ")
+r_session = TimeoutSession()
+r_session.headers.update({"User-Agent": USER_AGENT})
- def replchar(m):
- num = m.group(1)
- return unichar(int(num))
- out = re.sub("&#(\\d+);", replchar, out)
- return out
+@atexit.register
+def close_session():
+ """Close the requests session on shut down."""
+ r_session.close()
-def extract_text_between(html, start_marker, end_marker):
- try:
- _, html = html.split(start_marker, 1)
- html, _ = html.split(end_marker, 1)
- except ValueError:
- return ""
- return html
+# Utilities.
def search_pairs(item):
@@ -204,7 +186,7 @@ def generate_alternatives(string, patterns):
return itertools.product(artists, multi_titles)
-def slug(text):
+def slug(text: str) -> str:
"""Make a URL-safe, human-readable version of the given text
This will do the following:
@@ -214,81 +196,69 @@ def slug(text):
3. strip whitespace
4. replace other non-word characters with dashes
5. strip extra dashes
-
- This somewhat duplicates the :func:`Google.slugify` function but
- slugify is not as generic as this one, which can be reused
- elsewhere.
"""
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
-if HAS_BEAUTIFUL_SOUP:
+class RequestHandler:
+ _log: beets.logging.Logger
- def try_parse_html(html, **kwargs):
- return bs4.BeautifulSoup(html, "html.parser", **kwargs)
+ def debug(self, message: str, *args) -> None:
+ """Log a debug message with the class name."""
+ self._log.debug(f"{self.__class__.__name__}: {message}", *args)
-else:
+ def info(self, message: str, *args) -> None:
+ """Log an info message with the class name."""
+ self._log.info(f"{self.__class__.__name__}: {message}", *args)
- def try_parse_html(html, **kwargs):
- return None
+ def warn(self, message: str, *args) -> None:
+ """Log warning with the class name."""
+ self._log.warning(f"{self.__class__.__name__}: {message}", *args)
+ def fetch_text(self, url: str, **kwargs) -> str:
+ """Return text / HTML data from the given URL.
-class Backend:
- REQUIRES_BS = False
+ Set the encoding to None to let requests handle it because some sites
+ set it incorrectly.
+ """
+ self.debug("Fetching HTML from {}", url)
+ r = r_session.get(url, **kwargs)
+ r.encoding = None
+ return r.text
+
+ def fetch_json(self, url: str, **kwargs):
+ """Return JSON data from the given URL."""
+ self.debug("Fetching JSON from {}", url)
+ return r_session.get(url, **kwargs).json()
+
+ @contextmanager
+ def handle_request(self) -> Iterator[None]:
+ try:
+ yield
+ except requests.JSONDecodeError:
+ self.warn("Could not decode response JSON data")
+ except requests.RequestException as exc:
+ self.warn("Request error: {}", exc)
+
+
+class BackendClass(type):
+ @property
+ def name(cls) -> str:
+ """Return lowercase name of the backend class."""
+ return cls.__name__.lower()
+
+class Backend(RequestHandler, metaclass=BackendClass):
def __init__(self, config, log):
self._log = log
self.config = config
- def fetch_url(self, url, **kwargs):
- """Retrieve the content at a given URL, or return None if the source
- is unreachable.
- """
- try:
- # Disable the InsecureRequestWarning that comes from using
- # `verify=false`.
- # https://github.com/kennethreitz/requests/issues/2214
- # We're not overly worried about the NSA MITMing our lyrics scraper
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- r = requests.get(
- url,
- verify=False,
- headers={
- "User-Agent": USER_AGENT,
- },
- timeout=10,
- **kwargs,
- )
- except requests.RequestException as exc:
- self._log.debug("lyrics request failed: {0}", exc)
- return
- if r.status_code == requests.codes.ok:
- return r.text
- else:
- self._log.debug("failed to fetch: {0} ({1})", url, r.status_code)
- return None
-
def fetch(
self, artist: str, title: str, album: str, length: int
- ) -> str | None:
+ ) -> tuple[str, str] | None:
raise NotImplementedError
-class LRCLibItem(TypedDict):
- """Lyrics data item returned by the LRCLib API."""
-
- id: int
- name: str
- trackName: str
- artistName: str
- albumName: str
- duration: float | None
- instrumental: bool
- plainLyrics: str
- syncedLyrics: str | None
-
-
@dataclass
@total_ordering
class LRCLyrics:
@@ -296,6 +266,7 @@ class LRCLyrics:
DURATION_DIFF_TOLERANCE = 0.05
target_duration: float
+ id: int
duration: float
instrumental: bool
plain: str
@@ -306,9 +277,12 @@ def __le__(self, other: LRCLyrics) -> bool:
return self.dist < other.dist
@classmethod
- def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
+ def make(
+ cls, candidate: LRCLibAPI.Item, target_duration: float
+ ) -> LRCLyrics:
return cls(
target_duration,
+ candidate["id"],
candidate["duration"] or 0.0,
candidate["instrumental"],
candidate["plainLyrics"],
@@ -361,24 +335,9 @@ class LRCLib(Backend):
GET_URL = f"{BASE_URL}/get"
SEARCH_URL = f"{BASE_URL}/search"
- def warn(self, message: str, *args) -> None:
- """Log a warning message with the class name."""
- self._log.warning(f"{self.__class__.__name__}: {message}", *args)
-
- def fetch_json(self, *args, **kwargs):
- """Wrap the request method to raise an exception on HTTP errors."""
- kwargs.setdefault("timeout", 10)
- kwargs.setdefault("headers", {"User-Agent": USER_AGENT})
- r = requests.get(*args, **kwargs)
- if r.status_code == HTTPStatus.NOT_FOUND:
- raise NotFoundError("HTTP Error: Not Found", response=r)
- r.raise_for_status()
-
- return r.json()
-
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
- ) -> Iterator[list[LRCLibItem]]:
+ ) -> Iterator[list[LRCLibAPI.Item]]:
"""Yield lyrics candidates for the given song data.
Firstly, attempt to GET lyrics directly, and then search the API if
@@ -403,23 +362,19 @@ def pick_best_match(cls, lyrics: Iterable[LRCLyrics]) -> LRCLyrics | None:
def fetch(
self, artist: str, title: str, album: str, length: int
- ) -> str | None:
+ ) -> tuple[str, str] | None:
"""Fetch lyrics text for the given song data."""
fetch = partial(self.fetch_candidates, artist, title, album, length)
make = partial(LRCLyrics.make, target_duration=length)
pick = self.pick_best_match
try:
- return next(
+ item = next(
filter(None, map(pick, (map(make, x) for x in fetch())))
- ).get_text(self.config["synced"])
+ )
except StopIteration:
- pass
- except requests.JSONDecodeError:
- self.warn("Could not decode response JSON data")
- except requests.RequestException as exc:
- self.warn("Request error: {}", exc)
+ return None
- return None
+ return item.get_text(self.config["synced"]), f"{self.GET_URL}/{item.id}"
class DirectBackend(Backend):
@@ -456,22 +411,18 @@ def encode(cls, text: str) -> str:
return quote(unidecode(text))
- def fetch(self, artist: str, title: str, *_) -> str | None:
+ def fetch(self, artist: str, title: str, *_) -> tuple[str, str] | None:
url = self.build_url(artist, title)
- html = self.fetch_url(url)
- if not html:
- return None
+ html = self.fetch_text(url)
if "We detected that your IP is blocked" in html:
- self._log.warning(
- "we are blocked at MusixMatch: url %s failed" % url
- )
+ self.warn("Failed: Blocked IP address")
return None
html_parts = html.split('<p class="mxm-lyrics__content')
# Sometimes lyrics come in 2 or more parts
lyrics_parts = []
for html_part in html_parts[1:]:
- lyrics_parts.append(extract_text_between(html_part, ">", "</p>"))
+ lyrics_parts.append(re.sub(r"^[^>]+>|</p>.*", "", html_part))
lyrics = "\n".join(lyrics_parts)
lyrics = lyrics.strip(',"').replace("\\n", "\n")
# another odd case: sometimes only that string remains, for
@@ -482,151 +433,165 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
# sometimes there are non-existent lyrics with some content
if "Lyrics | Musixmatch" in lyrics:
return None
- return lyrics
+ return lyrics, url
+
+
+class Html:
+ collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
+ expand_br = partial(re.compile(r"\s*<br[^>]*>\s*", re.I).sub, "\n")
+ #: two newlines between paragraphs on the same line (musica, letras.mus.br)
+ merge_blocks = partial(re.compile(r"(?<!>)</p><p[^>]*>").sub, "\n\n")
+ #: a single new line between paragraphs on separate lines
+ #: (paroles.net, sweetslyrics.com, lacoccinelle.net)
+ merge_lines = partial(re.compile(r"
\s+]*>(?!___)").sub, "\n")
+ #: remove empty divs (lacoccinelle.net)
+ remove_empty_tags = partial(
+ re.compile(r"(<(div|span)[^>]*>\s*\2>)").sub, ""
+ )
+ #: remove Google Ads tags (musica.com)
+ remove_aside = partial(re.compile("<aside .+?</aside>").sub, "")
+ #: remove adslot-Content_1 div from the lyrics text (paroles.net)
+ remove_adslot = partial(
+ re.compile(r"\n[^\n]+-- Content_\d+ --.*?\n", re.S).sub,
+ "\n",
+ )
+ #: remove text formatting (azlyrics.com, lacocinelle.net)
+ remove_formatting = partial(
+ re.compile(r" *?(i|em|pre|strong)[^>]*>").sub, ""
+ )
+ @classmethod
+ def normalize_space(cls, text: str) -> str:
+ text = unescape(text).replace("\r", "").replace("\xa0", " ")
+ return cls.collapse_space(cls.expand_br(text))
-class Genius(Backend):
- """Fetch lyrics from Genius via genius-api.
+ @classmethod
+ def remove_ads(cls, text: str) -> str:
+ return cls.remove_adslot(cls.remove_aside(text))
- Simply adapted from
- bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
- """
+ @classmethod
+ def merge_paragraphs(cls, text: str) -> str:
+ return cls.merge_blocks(cls.merge_lines(cls.remove_empty_tags(text)))
- REQUIRES_BS = True
- base_url = "https://api.genius.com"
+class SoupMixin:
+ @classmethod
+ def pre_process_html(cls, html: str) -> str:
+ """Pre-process the HTML content before scraping."""
+ return Html.normalize_space(html)
- def __init__(self, config, log):
- super().__init__(config, log)
- self.api_key = config["genius_api_key"].as_str()
- self.headers = {
- "Authorization": "Bearer %s" % self.api_key,
- "User-Agent": USER_AGENT,
- }
+ @classmethod
+ def get_soup(cls, html: str) -> BeautifulSoup:
+ return BeautifulSoup(cls.pre_process_html(html), "html.parser")
- def fetch(self, artist: str, title: str, *_) -> str | None:
- """Fetch lyrics from genius.com
- Because genius doesn't allow accessing lyrics via the api,
- we first query the api for a url matching our artist & title,
- then attempt to scrape that url for the lyrics.
- """
- json = self._search(artist, title)
- if not json:
- self._log.debug("Genius API request returned invalid JSON")
- return None
+class SearchResult(NamedTuple):
+ artist: str
+ title: str
+ url: str
- # find a matching artist in the json
- for hit in json["response"]["hits"]:
- hit_artist = hit["result"]["primary_artist"]["name"]
+ @property
+ def source(self) -> str:
+ return urlparse(self.url).netloc
- if slug(hit_artist) == slug(artist):
- html = self.fetch_url(hit["result"]["url"])
- if not html:
- return None
- return self._scrape_lyrics_from_html(html)
- self._log.debug(
- "Genius failed to find a matching artist for '{0}'", artist
+class SearchBackend(SoupMixin, Backend):
+ @cached_property
+ def dist_thresh(self) -> float:
+ return self.config["dist_thresh"].get(float)
+
+ def check_match(
+ self, target_artist: str, target_title: str, result: SearchResult
+ ) -> bool:
+ """Check if the given search result is a 'good enough' match."""
+ max_dist = max(
+ string_dist(target_artist, result.artist),
+ string_dist(target_title, result.title),
)
- return None
-
- def _search(self, artist, title):
- """Searches the genius api for a given artist and title
- https://docs.genius.com/#search-h2
+ if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
+ return True
- :returns: json response
- """
- search_url = self.base_url + "/search"
- data = {"q": title + " " + artist.lower()}
- try:
- response = requests.get(
- search_url,
- params=data,
- headers=self.headers,
- timeout=10,
+ if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4):
+ # log out the candidate that did not make it but was close.
+ # This may show a matching candidate with some noise in the name
+ self.debug(
+ "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
+ result.artist,
+ result.title,
+ target_artist,
+ target_title,
+ max_dist,
)
- except requests.RequestException as exc:
- self._log.debug("Genius API request failed: {0}", exc)
- return None
- try:
- return response.json()
- except ValueError:
- return None
+ return False
- def replace_br(self, lyrics_div):
- for br in lyrics_div.find_all("br"):
- br.replace_with("\n")
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ """Search for the given query and yield search results."""
+ raise NotImplementedError
- def _scrape_lyrics_from_html(self, html):
- """Scrape lyrics from a given genius.com html"""
+ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+ check_match = partial(self.check_match, artist, title)
+ for candidate in self.search(artist, title):
+ if check_match(candidate):
+ yield candidate
+
+ def fetch(self, artist: str, title: str, *_) -> tuple[str, str] | None:
+ """Fetch lyrics for the given artist and title."""
+ for result in self.get_results(artist, title):
+ if (html := self.fetch_text(result.url)) and (
+ lyrics := self.scrape(html)
+ ):
+ return lyrics, result.url
- soup = try_parse_html(html)
- if not soup:
- return
+ return None
- # Remove script tags that they put in the middle of the lyrics.
- [h.extract() for h in soup("script")]
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ """Scrape the lyrics from the given HTML."""
+ raise NotImplementedError
- # Most of the time, the page contains a div with class="lyrics" where
- # all of the lyrics can be found already correctly formatted
- # Sometimes, though, it packages the lyrics into separate divs, most
- # likely for easier ad placement
- lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
- if not lyrics_divs:
- self._log.debug("Received unusual song page html")
- return self._try_extracting_lyrics_from_non_data_lyrics_container(
- soup
- )
- lyrics = ""
- for lyrics_div in lyrics_divs:
- self.replace_br(lyrics_div)
- lyrics += lyrics_div.get_text() + "\n\n"
- while lyrics[-1] == "\n":
- lyrics = lyrics[:-1]
- return lyrics
-
- def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
- """Extract lyrics from a div without attribute data-lyrics-container
- This is the second most common layout on genius.com
- """
- verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
- if not verse_div:
- if soup.find(
- "div",
- class_=re.compile("LyricsPlaceholder__Message"),
- string="This song is an instrumental",
- ):
- self._log.debug("Detected instrumental")
- return INSTRUMENTAL_LYRICS
- else:
- self._log.debug("Couldn't scrape page using known layouts")
- return None
+class Genius(SearchBackend):
+ """Fetch lyrics from Genius via genius-api.
- lyrics_div = verse_div.parent
- self.replace_br(lyrics_div)
+ Because genius doesn't allow accessing lyrics via the api, we first query
+ the api for a url matching our artist & title, then scrape the HTML text
+ for the JSON data containing the lyrics.
- ads = lyrics_div.find_all(
- "div", class_=re.compile("InreadAd__Container")
- )
- for ad in ads:
- ad.replace_with("\n")
+ Adapted from
+ bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping
+ """
+
+ SEARCH_URL = "https://api.genius.com/search"
+ LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")')
+ remove_backslash = partial(re.compile(r"\\(?=[^\\])").sub, "")

- footers = lyrics_div.find_all(
- "div", class_=re.compile("Lyrics__Footer")
+ @cached_property
+ def headers(self) -> dict[str, str]:
+ return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
+
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ search_data: GeniusAPI.Search = self.fetch_json(
+ self.SEARCH_URL,
+ params={"q": f"{artist} {title}"},
+ headers=self.headers,
)
- for footer in footers:
- footer.replace_with("")
- return lyrics_div.get_text()
+ for r in (hit["result"] for hit in search_data["response"]["hits"]):
+ yield SearchResult(r["artist_names"], r["title"], r["url"])
+
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ if m := cls.LYRICS_IN_JSON_RE.search(html):
+ html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
+ return cls.get_soup(html_text).get_text().strip()
+
+ return None
-class Tekstowo(DirectBackend):
+class Tekstowo(SoupMixin, DirectBackend):
"""Fetch lyrics from Tekstowo.pl."""
- REQUIRES_BS = True
URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"
non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")
@@ -635,17 +600,21 @@ class Tekstowo(DirectBackend):
def encode(cls, text: str) -> str:
return cls.non_alpha_to_underscore(unidecode(text.lower()))
- def fetch(self, artist: str, title: str, *_) -> str | None:
- if html := self.fetch_url(self.build_url(artist, title)):
- return self.extract_lyrics(html)
+ def fetch(self, artist: str, title: str, *_) -> tuple[str, str] | None:
+ url = self.build_url(artist, title)
+ # We are expecting to receive a 404 since we are guessing the URL.
+ # Thus suppress the error so that it does not end up in the logs.
+ with suppress(NotFoundError):
+ if lyrics := self.scrape(self.fetch_text(url)):
+ return lyrics, url
return None
- def extract_lyrics(self, html: str) -> str | None:
- html = _scrape_strip_cruft(html)
- html = _scrape_merge_paragraphs(html)
+ return None
- soup = try_parse_html(html)
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ soup = cls.get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
@@ -653,210 +622,153 @@ def extract_lyrics(self, html: str) -> str | None:
return None
-def remove_credits(text):
- """Remove first/last line of text if it contains the word 'lyrics'
- eg 'Lyrics by songsdatabase.com'
- """
- textlines = text.split("\n")
- credits = None
- for i in (0, -1):
- if textlines and "lyrics" in textlines[i].lower():
- credits = textlines.pop(i)
- if credits:
- text = "\n".join(textlines)
- return text
-
-
-def _scrape_strip_cruft(html, plain_text_out=False):
- """Clean up HTML"""
- html = unescape(html)
-
- html = html.replace("\r", "\n") # Normalize EOL.
- html = re.sub(r" +", " ", html) # Whitespaces collapse.
- html = BREAK_RE.sub("\n", html) # <br> eats up surrounding '\n'.
- html = re.sub(r"(?s)<(script).*?\1>", "", html) # Strip script tags.
- html = re.sub("\u2005", " ", html) # replace unicode with regular space
- html = re.sub("
", "", html) # remove Google Ads tags
- html = re.sub(r"?(em|strong)[^>]*>", "", html) # remove italics / bold
-
- if plain_text_out: # Strip remaining HTML tags
- html = COMMENT_RE.sub("", html)
- html = TAG_RE.sub("", html)
-
- html = "\n".join([x.strip() for x in html.strip().split("\n")])
- html = re.sub(r"\n{3,}", r"\n\n", html)
- return html
-
-
-def _scrape_merge_paragraphs(html):
- html = re.sub(r"\s*]*)>", "\n", html)
- return re.sub(r"
\s*
", "\n", html)
-
-
-def scrape_lyrics_from_html(html):
- """Scrape lyrics from a URL. If no lyrics can be found, return None
- instead.
- """
-
- def is_text_notcode(text):
- if not text:
- return False
- length = len(text)
- return (
- length > 20
- and text.count(" ") > length / 25
- and (text.find("{") == -1 or text.find(";") == -1)
- )
-
- html = _scrape_strip_cruft(html)
- html = _scrape_merge_paragraphs(html)
-
- # extract all long text blocks that are not code
- soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
- if not soup:
- return None
-
- # Get the longest text element (if any).
- strings = sorted(soup.stripped_strings, key=len, reverse=True)
- if strings:
- return strings[0]
- else:
- return None
-
-
-class Google(Backend):
+class Google(SearchBackend):
"""Fetch lyrics from Google search results."""
- REQUIRES_BS = True
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
- def is_lyrics(self, text, artist=None):
- """Determine whether the text seems to be valid lyrics."""
- if not text:
- return False
- bad_triggers_occ = []
- nb_lines = text.count("\n")
- if nb_lines <= 1:
- self._log.debug("Ignoring too short lyrics '{0}'", text)
- return False
- elif nb_lines < 5:
- bad_triggers_occ.append("too_short")
- else:
- # Lyrics look legit, remove credits to avoid being penalized
- # further down
- text = remove_credits(text)
+ #: Exclude some letras.mus.br pages which do not contain lyrics.
+ EXCLUDE_PAGES = [
+ "significado.html",
+ "traduccion.html",
+ "traducao.html",
+ "significados.html",
+ ]
- bad_triggers = ["lyrics", "copyright", "property", "links"]
- if artist:
- bad_triggers += [artist]
+ #: Regular expression to match noise in the URL title.
+ URL_TITLE_NOISE_RE = re.compile(
+ r"""
+\b
+(
+ paroles(\ et\ traduction|\ de\ chanson)?
+ | letras?(\ de)?
+ | liedtexte
+ | dainų\ žodžiai
+ | original\ song\ full\ text\.
+ | official
+ | 20[12]\d\ version
+ | (absolute\ |az)?lyrics(\ complete)?
+ | www\S+
+ | \S+\.(com|net|mus\.br)
+)
+([^\w.]|$)
+""",
+ re.IGNORECASE | re.VERBOSE,
+ )
+ #: Split cleaned up URL title into artist and title parts.
+ URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
- for item in bad_triggers:
- bad_triggers_occ += [item] * len(
- re.findall(r"\W%s\W" % item, text, re.I)
- )
+ SOURCE_DIST_FACTOR = {"www.azlyrics.com": 0.5, "www.songlyrics.com": 0.6}
- if bad_triggers_occ:
- self._log.debug("Bad triggers detected: {0}", bad_triggers_occ)
- return len(bad_triggers_occ) < 2
+ ignored_domains: set[str] = set()
- def slugify(self, text):
- """Normalize a string and remove non-alphanumeric characters."""
- text = re.sub(r"[-'_\s]", "_", text)
- text = re.sub(r"_+", "_", text).strip("_")
- pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses
- text = re.sub(pat, r"\g<1>", text).strip()
- try:
- text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
- text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
- except UnicodeDecodeError:
- self._log.exception("Failing to normalize '{0}'", text)
- return text
+ @classmethod
+ def pre_process_html(cls, html: str) -> str:
+ """Pre-process the HTML content before scraping."""
+ html = Html.remove_ads(super().pre_process_html(html))
+ return Html.remove_formatting(Html.merge_paragraphs(html))
+
+ def fetch_text(self, *args, **kwargs) -> str:
+ """Handle an error so that we can continue with the next URL."""
+ kwargs.setdefault("allow_redirects", False)
+ with self.handle_request():
+ try:
+ return super().fetch_text(*args, **kwargs)
+ except CaptchaError:
+ self.ignored_domains.add(urlparse(args[0]).netloc)
+ raise
- BY_TRANS = ["by", "par", "de", "von"]
- LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
+ @staticmethod
+ def get_part_dist(artist: str, title: str, part: str) -> float:
+ """Return the distance between the given part and the artist and title.
- def is_page_candidate(self, url_link, url_title, title, artist):
- """Return True if the URL title makes it a good candidate to be a
- page that contains lyrics of title by artist.
+ A number between -1 and 1 is returned, where -1 means the part is
+ closer to the artist and 1 means it is closer to the title.
"""
- title = self.slugify(title.lower())
- artist = self.slugify(artist.lower())
- sitename = re.search(
- "//([^/]+)/.*", self.slugify(url_link.lower())
- ).group(1)
- url_title = self.slugify(url_title.lower())
-
- # Check if URL title contains song title (exact match)
- if url_title.find(title) != -1:
- return True
+ return string_dist(artist, part) - string_dist(title, part)
- # or try extracting song title from URL title and check if
- # they are close enough
- tokens = (
- [by + "_" + artist for by in self.BY_TRANS]
- + [artist, sitename, sitename.replace("www.", "")]
- + self.LYRICS_TRANS
+ @classmethod
+ def make_search_result(
+ cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
+ ) -> SearchResult:
+ """Parse artist and title from the URL title and return a search result."""
+ url_title = (
+ # get full title from metatags if available
+ item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
+ # default to the display title
+ or item["title"]
)
- tokens = [re.escape(t) for t in tokens]
- song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
+ clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
+ # split it into parts which may be part of the artist or the title
+ # `dict.fromkeys` removes duplicates keeping the order
+ parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))
+
+ if len(parts) == 1:
+ part = parts[0]
+ if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
+ # artist and title may not have a separator
+ result_title = m[1]
+ result_artist = part.replace(m[0], "")
+ else:
+ # assume that this is the title
+ result_artist, result_title = "", parts[0]
+ else:
+ # sort parts by their similarity to the artist
+ parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
+ result_artist, result_title = parts[0], " ".join(parts[1:])
- song_title = song_title.strip("_|")
- typo_ratio = 0.9
- ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
- return ratio >= typo_ratio
+ return SearchResult(result_artist, result_title, item["link"])
- def fetch(self, artist: str, title: str, *_) -> str | None:
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = {
"key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}",
+ "siteSearch": "www.musixmatch.com",
+ "siteSearchFilter": "e",
+ "excludeTerms": ", ".join(self.EXCLUDE_PAGES),
}
- data = self.fetch_url(self.SEARCH_URL, params=params)
- if not data:
- self._log.debug("google backend returned no data")
- return None
- try:
- data = json.loads(data)
- except ValueError as exc:
- self._log.debug("google backend returned malformed JSON: {}", exc)
- if "error" in data:
- reason = data["error"]["errors"][0]["reason"]
- self._log.debug("google backend error: {0}", reason)
- return None
+ data: GoogleCustomSearchAPI.Response = self.fetch_json(
+ self.SEARCH_URL, params=params
+ )
+ for item in data.get("items", []):
+ yield self.make_search_result(artist, title, item)
+
+ def get_results(self, *args) -> Iterable[SearchResult]:
+ """Try results from preferred sources first."""
+ for result in sorted(
+ super().get_results(*args),
+ key=lambda r: self.SOURCE_DIST_FACTOR.get(r.source, 1),
+ ):
+ if result.source not in self.ignored_domains:
+ yield result
- if "items" in data.keys():
- for item in data["items"]:
- url_link = item["link"]
- url_title = item.get("title", "")
- if not self.is_page_candidate(
- url_link, url_title, title, artist
- ):
- continue
- html = self.fetch_url(url_link)
- if not html:
- continue
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
-
- if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
- return lyrics
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ # Get the longest text element (if any).
+ if strings := sorted(cls.get_soup(html).stripped_strings, key=len):
+ return strings[-1]
return None
-class LyricsPlugin(plugins.BeetsPlugin):
- SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
- SOURCE_BACKENDS = {
- "google": Google,
- "musixmatch": MusiXmatch,
- "genius": Genius,
- "tekstowo": Tekstowo,
- "lrclib": LRCLib,
+class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
+ BACKEND_BY_NAME = {
+ b.name: b for b in [LRCLib, Google, Genius, Tekstowo, MusiXmatch]
}
+ @cached_property
+ def backends(self) -> list[Backend]:
+ user_sources = self.config["sources"].get()
+
+ chosen = plugins.sanitize_choices(user_sources, self.BACKEND_BY_NAME)
+ if "google" in chosen and not self.config["google_API_key"].get():
+ self.warn("Disabling Google source: no API key configured.")
+ chosen.remove("google")
+
+ return [self.BACKEND_BY_NAME[c](self.config, self._log) for c in chosen]
+
def __init__(self):
super().__init__()
self.import_stages = [self.imported]
@@ -866,18 +778,22 @@ def __init__(self):
"bing_client_secret": None,
"bing_lang_from": [],
"bing_lang_to": None,
+ "dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
- "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
- "76V-uFL5jks5dNvcGCdarqFjDhP9c",
+ "genius_api_key": (
+ "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
+ "76V-uFL5jks5dNvcGCdarqFjDhP9c"
+ ),
"fallback": None,
"force": False,
"local": False,
"synced": False,
# Musixmatch is disabled by default as they are currently blocking
# requests with the beets user agent.
- "sources": [s for s in self.SOURCES if s != "musixmatch"],
- "dist_thresh": 0.1,
+ "sources": [
+ n for n in self.BACKEND_BY_NAME if n != "musixmatch"
+ ],
}
)
self.config["bing_client_secret"].redact = True
@@ -894,57 +810,12 @@ def __init__(self):
# open yet.
self.rest = None
- available_sources = list(self.SOURCES)
- sources = plugins.sanitize_choices(
- self.config["sources"].as_str_seq(), available_sources
- )
-
- if not HAS_BEAUTIFUL_SOUP:
- sources = self.sanitize_bs_sources(sources)
-
- if "google" in sources:
- if not self.config["google_API_key"].get():
- # We log a *debug* message here because the default
- # configuration includes `google`. This way, the source
- # is silent by default but can be enabled just by
- # setting an API key.
- self._log.debug(
- "Disabling google source: " "no API key configured."
- )
- sources.remove("google")
-
self.config["bing_lang_from"] = [
x.lower() for x in self.config["bing_lang_from"].as_str_seq()
]
- self.bing_auth_token = None
-
- if not HAS_LANGDETECT and self.config["bing_client_secret"].get():
- self._log.warning(
- "To use bing translations, you need to "
- "install the langdetect module. See the "
- "documentation for further details."
- )
-
- self.backends = [
- self.SOURCE_BACKENDS[source](self.config, self._log)
- for source in sources
- ]
-
- def sanitize_bs_sources(self, sources):
- enabled_sources = []
- for source in sources:
- if self.SOURCE_BACKENDS[source].REQUIRES_BS:
- self._log.debug(
- "To use the %s lyrics source, you must "
- "install the beautifulsoup4 module. See "
- "the documentation for further details." % source
- )
- else:
- enabled_sources.append(source)
-
- return enabled_sources
- def get_bing_access_token(self):
+ @cached_property
+ def bing_access_token(self) -> str | None:
params = {
"client_id": "beets",
"client_secret": self.config["bing_client_secret"],
@@ -953,20 +824,9 @@ def get_bing_access_token(self):
}
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
- oauth_token = json.loads(
- requests.post(
- oauth_url,
- data=urlencode(params),
- timeout=10,
- ).content
- )
- if "access_token" in oauth_token:
- return "Bearer " + oauth_token["access_token"]
- else:
- self._log.warning(
- "Could not get Bing Translate API access token."
- ' Check your "bing_client_secret" password'
- )
+ with self.handle_request():
+ r = r_session.post(oauth_url, params=params)
+ return r.json()["access_token"]
def commands(self):
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
@@ -1112,7 +972,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- self._log.info("lyrics already present: {0}", item)
+ self.info("🔵 Lyrics already present: {}", item)
return
lyrics_matches = []
@@ -1128,8 +988,8 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
if lyrics:
- self._log.info("fetched lyrics: {0}", item)
- if HAS_LANGDETECT and self.config["bing_client_secret"].get():
+ self.info("🟢 Found lyrics: {0}", item)
+ if self.config["bing_client_secret"].get():
lang_from = langdetect.detect(lyrics)
if self.config["bing_lang_to"].get() != lang_from and (
not self.config["bing_lang_from"]
@@ -1139,62 +999,51 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics, self.config["bing_lang_to"]
)
else:
- self._log.info("lyrics not found: {0}", item)
- fallback = self.config["fallback"].get()
- if fallback:
- lyrics = fallback
- else:
- return
- item.lyrics = lyrics
- if write:
- item.try_write()
- item.store()
+ self.info("🔴 Lyrics not found: {}", item)
+ lyrics = self.config["fallback"].get()
+
+ if lyrics not in {None, item.lyrics}:
+ item.lyrics = lyrics
+ if write:
+ item.try_write()
+ item.store()
def get_lyrics(self, artist: str, title: str, *args) -> str | None:
"""Fetch lyrics, trying each source in turn. Return a string or
None if no lyrics were found.
"""
+ self.info("Fetching lyrics for {} - {}", artist, title)
for backend in self.backends:
- lyrics = backend.fetch(artist, title, *args)
- if lyrics:
- self._log.debug(
- "got lyrics from backend: {0}", backend.__class__.__name__
- )
- return _scrape_strip_cruft(lyrics, True)
+ with backend.handle_request():
+ if lyrics_info := backend.fetch(artist, title, *args):
+ lyrics, url = lyrics_info
+ return f"{lyrics}\n\nSource: {url}"
return None
def append_translation(self, text, to_lang):
from xml.etree import ElementTree
- if not self.bing_auth_token:
- self.bing_auth_token = self.get_bing_access_token()
- if self.bing_auth_token:
- # Extract unique lines to limit API request size per song
- text_lines = set(text.split("\n"))
- url = (
- "https://api.microsofttranslator.com/v2/Http.svc/"
- "Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang)
+ if not (token := self.bing_access_token):
+ self.warn(
+ "Could not get Bing Translate API access token. "
+ "Check your 'bing_client_secret' password."
)
- r = requests.get(
+ return text
+
+ # Extract unique lines to limit API request size per song
+ lines = text.split("\n")
+ unique_lines = set(lines)
+ url = "https://api.microsofttranslator.com/v2/Http.svc/Translate"
+ with self.handle_request():
+ text = self.fetch_text(
url,
- headers={"Authorization ": self.bing_auth_token},
- timeout=10,
+ headers={"Authorization": f"Bearer {token}"},
+ params={"text": "|".join(unique_lines), "to": to_lang},
)
- if r.status_code != 200:
- self._log.debug(
- "translation API error {}: {}", r.status_code, r.text
- )
- if "token has expired" in r.text:
- self.bing_auth_token = None
- return self.append_translation(text, to_lang)
- return text
- lines_translated = ElementTree.fromstring(
- r.text.encode("utf-8")
- ).text
- # Use a translation mapping dict to build resulting lyrics
- translations = dict(zip(text_lines, lines_translated.split("|")))
- result = ""
- for line in text.split("\n"):
- result += "{} / {}\n".format(line, translations[line])
- return result
+ if translated := ElementTree.fromstring(text.encode("utf-8")).text:
+ # Use a translation mapping dict to build resulting lyrics
+ translations = dict(zip(unique_lines, translated.split("|")))
+ return "".join(f"{ln} / {translations[ln]}\n" for ln in lines)
+
+ return text
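
In summary, Backend.fetch now returns a (lyrics, url) pair instead of a bare
string, and get_lyrics appends the source URL to the stored text. A toy
sketch with made-up data (the backend and values are hypothetical):

    def fetch(artist: str, title: str) -> "tuple[str, str] | None":
        # Every backend now reports where the lyrics came from.
        return "La la la", "https://example.com/lyrics/42"

    if lyrics_info := fetch("Artist", "Title"):
        lyrics, url = lyrics_info
        stored = f"{lyrics}\n\nSource: {url}"  # what get_lyrics stores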
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 48b91c44c8..737631971f 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -11,6 +11,10 @@ New features:
* :doc:`/plugins/substitute`: Allow the replacement string to use capture groups
from the match. It is thus possible to create more general rules, applying to
many different artists at once.
+* :doc:`/plugins/lyrics`: Add a new configuration option ``dist_thresh`` to
+ control the maximum allowed distance between the lyrics search result and the
+ tagged item's artist and title. This is useful for preventing false positives
+ when fetching lyrics.
Bug fixes:
@@ -28,6 +32,9 @@ Bug fixes:
``lrclib`` over other sources since it returns reliable results quicker than
others.
:bug:`5102`
+* :doc:`/plugins/lyrics`: Fix the issue where the ``genius`` backend could
+  not match lyrics when there is a slight variation in the artist name.
+ :bug:`4791`
For packagers:
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d1f434d70f..f034cf47a1 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -2,25 +2,27 @@ Lyrics Plugin
=============
The ``lyrics`` plugin fetches and stores song lyrics from databases on the Web.
-Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_, `LRCLIB`_
-and, optionally, the Google custom search API.
+Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_,
+`LRCLIB`_ and, optionally, the Google Custom Search API.
.. _Genius.com: https://genius.com/
.. _Tekstowo.pl: https://www.tekstowo.pl/
.. _LRCLIB: https://lrclib.net/
-Fetch Lyrics During Import
---------------------------
+Install
+-------
-To automatically fetch lyrics for songs you import, first enable it in your
-configuration (see :ref:`using-plugins`). Then, install ``beets`` with
-``lyrics`` extra
+First, enable the ``lyrics`` plugin in your configuration (see
+:ref:`using-plugins`). Then install ``beets`` with the ``lyrics`` extra:
.. code-block:: bash
pip install "beets[lyrics]"
+Fetch Lyrics During Import
+--------------------------
+
When importing new files, beets will now fetch lyrics for files that don't
already have them. The lyrics will be stored in the beets database. If the
``import.write`` config option is on, then the lyrics will also be written to
@@ -29,46 +31,52 @@ the files' tags.
Configuration
-------------
-To configure the plugin, make a ``lyrics:`` section in your
-configuration file. The available options are:
+To configure the plugin, make a ``lyrics:`` section in your configuration file.
+Default configuration:
+
+.. code-block:: yaml
+
+ lyrics:
+ auto: yes
+ bing_client_secret: null
+ bing_lang_from: []
+ bing_lang_to: null
+ dist_thresh: 0.11
+ fallback: null
+ force: no
+ google_API_key: null
+ google_engine_ID: 009217259823014548361:lndtuqkycfu
+ sources: [lrclib, google, genius, tekstowo]
+ synced: no
+
+The available options are:
- **auto**: Fetch lyrics automatically during import.
- Default: ``yes``.
- **bing_client_secret**: Your Bing Translation application password
- (to :ref:`lyrics-translation`)
+ (see :ref:`lyrics-translation`)
- **bing_lang_from**: By default all lyrics with a language other than
``bing_lang_to`` are translated. Use a list of lang codes to restrict the set
of source languages to translate.
- Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
- Default: None.
+- **dist_thresh**: The maximum distance between the artist and title
+ combination of the music file and lyrics candidate to consider them a match.
+ Lower values will make the plugin more strict, higher values will make it
+ more lenient. This does not apply to the ``lrclib`` backend as it matches
+ durations.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
- Default: None.
- **force**: By default, beets won't fetch lyrics if the files already have
ones. To instead always fetch lyrics, set the ``force`` option to ``yes``.
- Default: ``no``.
- **google_API_key**: Your Google API key (to enable the Google Custom Search
backend).
- Default: None.
- **google_engine_ID**: The custom search engine to use.
Default: The `beets custom search engine`_, which gathers an updated list of
sources known to be scrapeable.
- **sources**: List of sources to search for lyrics. An asterisk ``*`` expands
- to all available sources.
- Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The
- ``google`` source will be automatically deactivated if no ``google_API_key``
- is setup.
- The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if
- BeautifulSoup is installed.
-- **synced**: Prefer synced lyrics over plain lyrics if a source offers them. Currently `lrclib` is the only source that provides them. Default: `no`.
-
-Here's an example of ``config.yaml``::
-
- lyrics:
- fallback: ''
- google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
- google_engine_ID: 009217259823014548361:lndtuqkycfu
+ to all available sources. The ``google`` source will be automatically
+ deactivated if no ``google_API_key`` is setup.
+- **synced**: Prefer synced lyrics over plain lyrics if a source offers them.
+ Currently ``lrclib`` is the only source that provides them.
.. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu
@@ -83,74 +91,74 @@ by that band, and ``beet lyrics`` will get lyrics for my entire library. The
lyrics will be added to the beets database and, if ``import.write`` is on,
embedded into files' metadata.
-The ``-p`` option to the ``lyrics`` command makes it print lyrics out to the
-console so you can view the fetched (or previously-stored) lyrics.
+The ``-p, --print`` option to the ``lyrics`` command makes it print lyrics out
+to the console so you can view the fetched (or previously-stored) lyrics.
-The ``-f`` option forces the command to fetch lyrics, even for tracks that
-already have lyrics. Inversely, the ``-l`` option restricts operations
-to lyrics that are locally available, which show lyrics faster without using
-the network at all.
+The ``-f, --force`` option forces the command to fetch lyrics, even for tracks
+that already have lyrics.
+
+Inversely, the ``-l, --local`` option restricts operations to lyrics that are
+locally available, which show lyrics faster without using the network at all.
Rendering Lyrics into Other Formats
-----------------------------------
-The ``-r directory`` option renders all lyrics as `reStructuredText`_ (ReST)
-documents in ``directory`` (by default, the current directory). That
-directory, in turn, can be parsed by tools like `Sphinx`_ to generate HTML,
-ePUB, or PDF documents.
+The ``-r directory, --write-rest directory`` option renders all lyrics as
+`reStructuredText`_ (ReST) documents in ``directory`` (by default, the current
+directory). That directory, in turn, can be parsed by tools like `Sphinx`_ to
+generate HTML, ePUB, or PDF documents.
-A minimal ``conf.py`` and ``index.rst`` files are created the first time the
+Minimal ``conf.py`` and ``index.rst`` files are created the first time the
command is run. They are not overwritten on subsequent runs, so you can safely
modify these files to customize the output.
-.. _Sphinx: https://www.sphinx-doc.org/
-.. _reStructuredText: http://docutils.sourceforge.net/rst.html
+Sphinx supports various `builders`_, see a few suggestions:
-Sphinx supports various `builders
-`_, but here are a
-few suggestions.
- * Build an HTML version::
+.. admonition:: Build an HTML version
- sphinx-build -b html . _build/html
+ ::
- * Build an ePUB3 formatted file, usable on ebook readers::
+ sphinx-build -b html . _build/html
- sphinx-build -b epub3 . _build/epub
+.. admonition:: Build an ePUB3 formatted file, usable on ebook readers
- * Build a PDF file, which incidentally also builds a LaTeX file::
+ ::
- sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+ sphinx-build -b epub3 . _build/epub
-.. _activate-google-custom-search:
+.. admonition:: Build a PDF file, which incidentally also builds a LaTeX file
+
+ ::
+
+ sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+
+
+.. _Sphinx: https://www.sphinx-doc.org/
+.. _reStructuredText: http://docutils.sourceforge.net/rst.html
+.. _builders: https://www.sphinx-doc.org/en/stable/builders.html
Activate Google Custom Search
------------------------------
You need to `register for a Google API key`_. Set the ``google_API_key``
configuration option to your key.
+
Then add ``google`` to the list of sources in your configuration (or use
default list, which includes it as long as you have an API key).
If you use default ``google_engine_ID``, we recommend limiting the sources to
``google`` as the other sources are already included in the Google results.
-.. _register for a Google API key: https://console.developers.google.com/
-
Optionally, you can `define a custom search engine`_. Get your search engine's
token and use it for your ``google_engine_ID`` configuration option. By
default, beets use a list of sources known to be scrapeable.
-.. _define a custom search engine: https://www.google.com/cse/all
-
Note that the Google custom search API is limited to 100 queries per day.
After that, the lyrics plugin will fall back on other declared data sources.
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
-
-Activate Genius and Tekstowo.pl Lyrics
---------------------------------------
+.. _register for a Google API key: https://console.developers.google.com/
+.. _define a custom search engine: https://www.google.com/cse/all
-These backends are enabled by default.
.. _lyrics-translation:
@@ -161,6 +169,6 @@ You need to register for a Microsoft Azure Marketplace free account and
to the `Microsoft Translator API`_. Follow the four steps process, specifically
at step 3 enter ``beets`` as *Client ID* and copy/paste the generated
*Client secret* into your ``bing_client_secret`` configuration, alongside
-``bing_lang_to`` target `language code`.
+``bing_lang_to`` target language code.
.. _Microsoft Translator API: https://docs.microsoft.com/en-us/azure/cognitive-services/translator/translator-how-to-signup
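
The ``dist_thresh`` matching described above works roughly like this
simplified sketch of SearchBackend.check_match (string_dist is the existing
beets helper):

    from beets.autotag.hooks import string_dist

    def is_good_match(target_artist, target_title, artist, title, thresh=0.11):
        # The worse of the artist and title distances must stay at or
        # below the threshold.
        max_dist = max(
            string_dist(target_artist, artist),
            string_dist(target_title, title),
        )
        return round(max_dist, 2) <= thresh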
diff --git a/setup.cfg b/setup.cfg
index 15ca23f658..8e3d7e3b82 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,8 +21,8 @@ omit = beets/test/*
precision = 2
skip_empty = true
show_missing = true
-exclude_lines =
- pragma: no cover
+exclude_also =
+ @atexit.register
if TYPE_CHECKING
if typing.TYPE_CHECKING
raise AssertionError
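
Note on the coverage change: ``exclude_also`` (coverage.py 7.2+) extends the
default exclusion patterns rather than replacing them, so ``pragma: no cover``
keeps working without being restated.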
diff --git a/test/plugins/lyrics_pages.py b/test/plugins/lyrics_pages.py
index 84c2457ba4..84c9e24410 100644
--- a/test/plugins/lyrics_pages.py
+++ b/test/plugins/lyrics_pages.py
@@ -147,6 +147,27 @@ def backend(self) -> str:
""",
url_title="The Beatles Lady Madonna lyrics",
),
+ LyricsPage.make(
+ "https://www.dainuzodziai.lt/m/mergaites-nori-mylet-atlanta/",
+ """
+ Jos nesuspėja skriet paskui vėją
+ Bangos į krantą grąžina jas vėl
+ Jos karštą saulę paliesti norėjo
+ Ant kranto palikę visas negandas
+
+ Bet jos nori mylėt
+ Jos nenori liūdėt
+ Leisk mergaitėms mylėt
+ Kaip jos moka mylėt
+ Koks vakaras šiltas ir nieko nestinga
+ Veidus apšviečia žaisminga šviesa
+ Jos buvo laimingos prie jūros kur liko
+ Tik vėjas išmokęs visas jų dainas
+ """,
+ artist="Atlanta",
+ track_title="Mergaitės Nori Mylėt",
+ url_title="Mergaitės nori mylėt – Atlanta | Dainų Žodžiai",
+ ),
LyricsPage.make(
"https://genius.com/The-beatles-lady-madonna-lyrics",
"""
@@ -223,6 +244,20 @@ def backend(self) -> str:
Mademoiselle Madonna, couchée sur votre lit
Listen to the music playing in your head.
Vous écoutez la musique qui joue dans votre tête
+
+ Tuesday afternoon is never ending.
+ Le mardi après-midi n'en finit pas
+ Wednesday morning papers didn't come.
+ Le mercredi matin les journaux ne sont pas arrivés
+ Thursday night you stockings needed mending.
+ Jeudi soir, vos bas avaient besoin d'être réparés
+ See how they run.
+ Regardez comme ils filent
+
+ Lady Madonna, children at your feet.
+ Mademoiselle Madonna, les enfants à vos pieds
+ Wonder how you manage to make ends meet.
+ Je me demande comment vous vous débrouillez pour joindre les deux bouts
""",
url_title="Paroles et traduction The Beatles : Lady Madonna - paroles de chanson", # noqa: E501
),
@@ -235,29 +270,35 @@ def backend(self) -> str:
Children at your feet
Wonder how you manage
To make ends meet
+
Who finds the money
When you pay the rent?
Did you think that money
Was Heaven sent?
+
Friday night arrives without a suitcase
Sunday morning creeping like a nun
Monday's child has learned
To tie his bootlace
See how they run
+
Lady Madonna
Baby at your breast
Wonders how you manage
To feed the rest
See how they run
+
Lady Madonna
Lying on the bed
Listen to the music
Playing in your head
+
Tuesday afternoon is neverending
Wednesday morning papers didn't come
Thursday night your stockings
Needed mending
See how they run
+
Lady Madonna
Children at your feet
Wonder how you manage
@@ -415,24 +456,29 @@ def backend(self) -> str:
LyricsPage.make(
"https://www.musica.com/letras.asp?letra=59862",
"""
- Lady Madonna
Lady Madonna, children at your feet
Wonder how you manage to make ends meet
Who finds the money when you pay the rent?
Did you think that money was heaven sent?
+
Friday night arrives without a suitcase
Sunday morning creeping like a nun
Monday's child has learned to tie his bootlace
See how they run
+
Lady Madonna, baby at your breast
Wonders how you manage to feed the rest
+
See how they run
+
Lady Madonna lying on the bed
Listen to the music playing in your head
+
Tuesday afternoon is never ending
Wednesday morning papers didn't come
Thursday night your stockings needed mending
See how they run
+
Lady Madonna, children at your feet
Wonder how you manage to make ends meet
""",
@@ -457,6 +503,14 @@ def backend(self) -> str:
See how they run.
Lady Madonna, lying on the bed,
Listen to the music playing in your head.
+
+ Tuesday afternoon is never ending.
+ Wednesday morning papers didn't come.
+ Thursday night your stockings needed mending.
+ See how they run.
+
+ Lady Madonna, children at your feet.
+ Wonder how you manage to make ends meet.
""",
url_title="Paroles Lady Madonna par The Beatles - Lyrics - Paroles.net",
),
@@ -489,6 +543,7 @@ def backend(self) -> str:
Wonder how you manage to make ends meet
""",
url_title="THE BEATLES - LADY MADONNA LYRICS",
+ marks=[xfail_on_ci("Songlyrics is blocked by Cloudflare")],
),
LyricsPage.make(
"https://sweetslyrics.com/the-beatles/lady-madonna-lyrics",
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 0dee427ec3..b33368b44d 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -101,47 +101,6 @@ def test_search_pairs_titles(self, title, expected_extra_titles):
assert list(actual_titles) == [title, *expected_extra_titles]
- @pytest.mark.parametrize(
- "initial_lyrics, expected",
- [
- ("Verse\nLyrics credit in the last line", "Verse"),
- ("Lyrics credit in the first line\nVerse", "Verse"),
- (
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- ),
- ],
- )
- def test_remove_credits(self, initial_lyrics, expected):
- assert lyrics.remove_credits(initial_lyrics) == expected
-
- @pytest.mark.parametrize(
- "initial_text, expected",
- [
- (
- """
- one
-
- two !
-
- four """,
- "one\ntwo !\n\nfour",
- ),
- ("foobaz", "foobaz"),
- ("fooqux", "fooqux"),
- ],
- )
- def test_scrape_strip_cruft(self, initial_text, expected):
- assert lyrics._scrape_strip_cruft(initial_text, True) == expected
-
- def test_scrape_merge_paragraphs(self):
- text = "one two
three"
- assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree"
-
@pytest.mark.parametrize(
"text, expected",
[
@@ -161,12 +120,67 @@ def test_slug(self, text, expected):
assert lyrics.slug(text) == expected
+class TestHtml:
+ def test_scrape_strip_cruft(self):
+ initial = """
+ one
+
+ two !
+
+ four """
+ expected = "\none\ntwo !\n\nfour "
+
+ assert lyrics.Html.normalize_space(initial) == expected
+
+ def test_scrape_merge_paragraphs(self):
+ text = "one
two
three"
+ expected = "one\ntwo\n\nthree"
+
+ assert lyrics.Html.merge_paragraphs(text) == expected
+
+
+class TestSearchBackend:
+ @pytest.fixture
+ def backend(self, dist_thresh):
+ plugin = lyrics.LyricsPlugin()
+ plugin.config.set({"dist_thresh": dist_thresh})
+ return lyrics.SearchBackend(plugin.config, plugin._log)
+
+ @pytest.mark.parametrize(
+ "dist_thresh, target_artist, artist, should_match",
+ [
+ (0.11, "Target Artist", "Target Artist", True),
+ (0.11, "Target Artist", "Target Artis", True),
+ (0.11, "Target Artist", "Target Arti", False),
+ (0.11, "Psychonaut", "Psychonaut (BEL)", True),
+ (0.11, "beets song", "beats song", True),
+ (0.10, "beets song", "beats song", False),
+ (
+ 0.11,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ False,
+ ),
+ (
+ 0.12,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ True,
+ ),
+ ],
+ )
+ def test_check_match(self, backend, target_artist, artist, should_match):
+ result = lyrics.SearchResult(artist, "", "")
+
+ assert backend.check_match(target_artist, "", result) == should_match
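+
+ # These cases pin down the matching semantics assumed here: names are
+ # compared by a normalized string distance checked against dist_thresh
+ # (dropping one character of "Target Artist" stays within 0.11, dropping
+ # two does not), with extra leniency for parenthesized suffixes such as
+ # "(BEL)".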
+
+
@pytest.fixture(scope="module")
def lyrics_root_dir(pytestconfig: pytest.Config):
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
-class LyricsBackendTest(PluginMixin):
+class LyricsPluginMixin(PluginMixin):
plugin = "lyrics"
@pytest.fixture
@@ -182,6 +196,42 @@ def lyrics_plugin(self, backend_name, plugin_config):
return lyrics.LyricsPlugin()
+
+class TestLyricsPlugin(LyricsPluginMixin):
+ @pytest.fixture
+ def backend_name(self):
+ """Return lyrics configuration to test."""
+ return "lrclib"
+
+ @pytest.mark.parametrize(
+ "request_kwargs, expected_log_match",
+ [
+ (
+ {"status_code": HTTPStatus.BAD_GATEWAY},
+ r"LRCLib: Request error: 502",
+ ),
+ ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
+ ],
+ )
+ def test_error_handling(
+ self,
+ requests_mock,
+ lyrics_plugin,
+ caplog,
+ request_kwargs,
+ expected_log_match,
+ ):
+ """Errors are logged with the backend name."""
+ requests_mock.get(lyrics.LRCLib.GET_URL, **request_kwargs)
+
+ assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None
+ assert caplog.messages
+ last_log = caplog.messages[-1]
+ assert last_log
+ assert re.search(expected_log_match, last_log, re.I)
+
+
+class LyricsBackendTest(LyricsPluginMixin):
@pytest.fixture
def backend(self, lyrics_plugin):
"""Return a lyrics backend instance."""
@@ -229,24 +279,23 @@ def _patch_google_search(self, requests_mock, lyrics_page):
def test_backend_source(self, lyrics_plugin, lyrics_page: LyricsPage):
"""Test parsed lyrics from each of the configured lyrics pages."""
- lyrics = lyrics_plugin.get_lyrics(
+ lyrics_info = lyrics_plugin.get_lyrics(
lyrics_page.artist, lyrics_page.track_title, "", 186
)
- assert lyrics
+ assert lyrics_info
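+ # get_lyrics() now appends a provenance footer ("\n\nSource: <url>");
+ # strip it off before comparing against the fixture lyrics.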
+ lyrics, _ = lyrics_info.split("\n\nSource: ")
assert lyrics == lyrics_page.lyrics
class TestGoogleLyrics(LyricsBackendTest):
"""Test scraping heuristics on a fake html page."""
- TITLE = "Beets song"
-
@pytest.fixture(scope="class")
def backend_name(self):
return "google"
- @pytest.fixture(scope="class")
+ @pytest.fixture
def plugin_config(self):
return {"google_API_key": "test"}
@@ -254,54 +303,59 @@ def plugin_config(self):
def file_name(self):
return "examplecom/beetssong"
+ @pytest.fixture
+ def search_item(self, url_title, url):
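+ """Return a minimal Google Custom Search result item (title and link)."""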
+ return {"title": url_title, "link": url}
+
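+ # Parametrizing `plugin_config` overrides the class fixture above: with
+ # no google_API_key configured, the backend should not be set up at all.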
+ @pytest.mark.parametrize("plugin_config", [{}])
+ def test_disabled_without_api_key(self, lyrics_plugin):
+ assert not lyrics_plugin.backends
+
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
- result = lyrics.scrape_lyrics_from_html(lyrics_html).lower()
+ result = backend.scrape(lyrics_html).lower()
assert result
- assert backend.is_lyrics(result)
- assert PHRASE_BY_TITLE[self.TITLE] in result
+ assert PHRASE_BY_TITLE["Beets song"] in result
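+
+ # The cases below document the URL-title parsing heuristics the backend
+ # is expected to handle when splitting a search-result title into artist
+ # and track: "Artist - Title" separators, "<title> by <artist>" phrasing,
+ # site prefixes/suffixes ("AZLyrics", "Letra de"), "lyrics" noise words,
+ # and Google's "..." truncation of long titles.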
@pytest.mark.parametrize(
- "url_title, artist, should_be_candidate",
+ "url_title, expected_artist, expected_title",
[
- ("John Doe - beets song Lyrics", "John Doe", True),
- ("example.com | Beats song by John doe", "John Doe", True),
- ("example.com | seets bong lyrics by John doe", "John Doe", False),
- ("foo", "Sun O)))", False),
+ ("Artist - beets song Lyrics", "Artist", "beets song"),
+ ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
+ ("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
+ ("foo", "", "foo"),
+ ("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets Song", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets ...", "Artist", "Beets"),
+ ("Artist Beets Song", "Artist", "Beets Song"),
+ ("BeetsSong - Artist", "Artist", "BeetsSong"),
+ ("Artist - BeetsSong", "Artist", "BeetsSong"),
+ ("Beets Song", "", "Beets Song"),
+ ("Beets Song Artist", "Artist", "Beets Song"),
+ (
+ "BeetsSong (feat. Other & Another) - Artist",
+ "Artist",
+ "BeetsSong (feat. Other & Another)",
+ ),
+ (
+ (
+ "Beets song lyrics by Artist - original song full text. "
+ "Official Beets song lyrics, 2024 version | LyricsMode.com"
+ ),
+ "Artist",
+ "Beets song",
+ ),
],
)
- def test_is_page_candidate(
- self, backend, lyrics_html, url_title, artist, should_be_candidate
+ @pytest.mark.parametrize("url", ["http://doesntmatter.com"])
+ def test_make_search_result(
+ self, backend, search_item, expected_artist, expected_title
):
- result = backend.is_page_candidate(
- "http://www.example.com/lyrics/beetssong",
- url_title,
- self.TITLE,
- artist,
- )
- assert bool(result) == should_be_candidate
-
- @pytest.mark.parametrize(
- "lyrics",
- [
- "LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
- """All material found on this site is property\n
- of mywickedsongtext brand""",
- """
-Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
-as they'll be released by $ARTIST, check back soon!
-In case you have the lyrics to $TITLE and want to send them to us, fill out
-the following form.
-""",
- ],
- )
- def test_bad_lyrics(self, backend, lyrics):
- assert not backend.is_lyrics(lyrics)
+ result = backend.make_search_result("Artist", "Beets song", search_item)
- def test_slugify(self, backend):
- text = "http://site.com/\xe7afe-au_lait(boisson)"
- assert backend.slugify(text) == "http://site.com/cafe_au_lait"
+ assert result.artist == expected_artist
+ assert result.title == expected_title
class TestGeniusLyrics(LyricsBackendTest):
@@ -312,13 +366,13 @@ def backend_name(self):
@pytest.mark.parametrize(
"file_name, expected_line_count",
[
- ("geniuscom/2pacalleyezonmelyrics", 134),
+ ("geniuscom/2pacalleyezonmelyrics", 131),
("geniuscom/Ttngchinchillalyrics", 29),
("geniuscom/sample", 0), # see https://github.com/beetbox/beets/issues/3535
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
- result = backend._scrape_lyrics_from_html(lyrics_html) or ""
+ result = backend.scrape(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@@ -339,7 +393,7 @@ def backend_name(self):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
- assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics
+ assert bool(backend.scrape(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
@@ -347,6 +401,7 @@ def test_scrape(self, backend, lyrics_html, expecting_lyrics):
def lyrics_match(**overrides):
return {
+ "id": 1,
"instrumental": False,
"duration": LYRICS_DURATION,
"syncedLyrics": "synced",
@@ -363,13 +418,9 @@ def backend_name(self):
return "lrclib"
@pytest.fixture
- def request_kwargs(self, response_data):
- return {"json": response_data}
-
- @pytest.fixture
- def fetch_lyrics(self, backend, requests_mock, request_kwargs):
+ def fetch_lyrics(self, backend, requests_mock, response_data):
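+ """Return a ready-to-call fetch, with the direct GET endpoint mocked
+ to 404 so the backend falls back to its search endpoint, which
+ serves `response_data`."""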
requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND)
- requests_mock.get(backend.SEARCH_URL, **request_kwargs)
+ requests_mock.get(backend.SEARCH_URL, json=response_data)
return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION)
@@ -379,7 +430,9 @@ def fetch_lyrics(self, backend, requests_mock, request_kwargs):
[({"synced": True}, "synced"), ({"synced": False}, "plain")],
)
def test_synced_config_option(self, fetch_lyrics, expected_lyrics):
- assert fetch_lyrics() == expected_lyrics
+ lyrics, _ = fetch_lyrics()
+
+ assert lyrics == expected_lyrics
@pytest.mark.parametrize(
"response_data, expected_lyrics",
@@ -426,20 +479,10 @@ def test_synced_config_option(self, fetch_lyrics, expected_lyrics):
)
@pytest.mark.parametrize("plugin_config", [{"synced": True}])
def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics):
- assert fetch_lyrics() == expected_lyrics
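+ # fetch() now returns None (no match) or a (lyrics, source-info) pair,
+ # so unpack before comparing.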
+ lyrics_info = fetch_lyrics()
+ if lyrics_info is None:
+ assert expected_lyrics is None
+ else:
+ lyrics, _ = lyrics_info
- @pytest.mark.parametrize(
- "request_kwargs, expected_log_match",
- [
- (
- {"status_code": HTTPStatus.BAD_GATEWAY},
- r"LRCLib: Request error: 502",
- ),
- ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
- ],
- )
- def test_error(self, caplog, fetch_lyrics, expected_log_match):
- assert fetch_lyrics() is None
- assert caplog.messages
- assert (last_log := caplog.messages[-1])
- assert re.search(expected_log_match, last_log, re.I)
+ assert lyrics == expected_lyrics