Skip to content

Commit

Permalink
Merge pull request #4633 from JOJ0/refactor_id_extraction
Browse files Browse the repository at this point in the history
Refactor metadata source ID extraction utilities
  • Loading branch information
JOJ0 authored Mar 8, 2023
2 parents 8bbaefb + c6746ed commit 40d27f5
Show file tree
Hide file tree
Showing 8 changed files with 161 additions and 50 deletions.
15 changes: 10 additions & 5 deletions beets/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,22 +705,27 @@ def get_artist(artists, id_key='id', name_key='name', join_key=None):

return artist_string, artist_id

@staticmethod
def _get_id(url_type, id_, id_regex):
    """Parse an ID from its URL if necessary.

    :param url_type: Type of URL. Either 'album' or 'track'.
    :type url_type: str
    :param id_: Album/track ID or URL.
    :type id_: str
    :param id_regex: A dictionary containing a regular expression
        extracting an ID from a URL (if it's not an ID already) in
        'pattern' and the number of the match group in 'match_group'.
    :type id_regex: dict
    :return: Album/track ID, or None if no ID could be extracted.
    :rtype: str or None
    """
    log.debug(
        "Extracting {} ID from '{}'", url_type, id_
    )
    # The URL type is substituted into the pattern with str.format();
    # a pattern without a '{}' placeholder passes through unchanged.
    match = re.search(id_regex['pattern'].format(url_type), str(id_))
    if not match:
        return None
    # An empty capture is reported the same way as no match at all.
    return match.group(id_regex['match_group']) or None
Expand Down
65 changes: 65 additions & 0 deletions beets/util/id_extractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# This file is part of beets.
# Copyright 2016, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

"""Helpers around the extraction of album/track ID's from metadata sources."""

import re

# A Spotify ID is 22 alphanumeric characters (the zero-left-padded base62
# representation of a randomly generated UUID4). The '{}' placeholder is
# filled in through str.format(), which is why the repetition count is
# written with doubled braces.
spotify_id_regex = dict(
    pattern=r'(^|open\.spotify\.com/{}/)([0-9A-Za-z]{{22}})',
    match_group=2,
)

# Accepts either digits at the start of the string or a deezer.com URL.
# The '{}' placeholder is filled in through str.format(); the ID itself
# is the fourth capture group.
deezer_id_regex = dict(
    pattern=r'(^|deezer\.com/)([a-z]*/)?({}/)?(\d+)',
    match_group=4,
)

# Accepts a purely numeric string or a beatport.com release URL that ends
# in the numeric ID. There is no '{}' placeholder in this pattern; the ID
# is the second capture group.
beatport_id_regex = dict(
    pattern=r'(^|beatport\.com/release/.+/)(\d+)$',
    match_group=2,
)

# A note on Bandcamp: There is no such thing as a Bandcamp album or artist ID,
# the URL can be used as the identifier. The Bandcamp metadata source plugin
# works that way - https://github.com/unrblt/beets-bandcamp. Bandcamp album
# URLs usually look like: https://nameofartist.bandcamp.com/album/nameofalbum


def extract_discogs_id_regex(album_id):
    """Extract a Discogs release ID from an ID string or a release URL.

    :param album_id: Release ID or Discogs release URL. Values that are
        not strings are converted with ``str()`` before matching.
    :return: The release ID, or None if ``album_id`` is None or matches
        none of the supported formats.
    :rtype: int or None
    """
    # A missing value (for example the result of a failed ``dict.get``
    # lookup) means "no ID"; re.search would raise TypeError on it.
    if album_id is None:
        return None
    candidate = str(album_id)

    # Discogs-IDs are simple integers. In order to avoid confusion with
    # other metadata plugins, we only look for very specific formats of the
    # input string:
    # - plain integer, optionally wrapped in brackets and prefixed by an
    #   'r', as this is how discogs displays the release ID on its webpage.
    # - legacy url format: discogs.com/<name of release>/release/<id>
    # - legacy url short format: discogs.com/release/<id>
    # - current url format: discogs.com/release/<id>-<name of release>
    # See #291, #4080 and #4085 for the discussions leading up to these
    # patterns.
    # Regex has been tested here https://regex101.com/r/TOu7kw/1
    for pattern in (
        r'^\[?r?(?P<id>\d+)\]?$',
        r'discogs\.com/release/(?P<id>\d+)-?',
        r'discogs\.com/[^/]+/release/(?P<id>\d+)',
    ):
        match = re.search(pattern, candidate)
        if match:
            return int(match.group('id'))

    return None
10 changes: 7 additions & 3 deletions beetsplug/beatport.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from beets.autotag.hooks import AlbumInfo, TrackInfo
from beets.plugins import BeetsPlugin, MetadataSourcePlugin, get_distance
import confuse
from beets.util.id_extractors import beatport_id_regex


AUTH_ERRORS = (TokenRequestDenied, TokenMissing, VerifierMissing)
Expand Down Expand Up @@ -267,6 +268,7 @@ def __init__(self, data):

class BeatportPlugin(BeetsPlugin):
data_source = 'Beatport'
id_regex = beatport_id_regex

def __init__(self):
super().__init__()
Expand Down Expand Up @@ -380,11 +382,13 @@ def album_for_id(self, release_id):
or None if the query is not a valid ID or release is not found.
"""
self._log.debug('Searching for release {0}', release_id)
match = re.search(r'(^|beatport\.com/release/.+/)(\d+)$', release_id)
if not match:

release_id = self._get_id('album', release_id, self.id_regex)
if release_id is None:
self._log.debug('Not a valid Beatport release ID.')
return None
release = self.client.get_release(match.group(2))

release = self.client.get_release(release_id)
if release:
return self._get_album_info(release)
return None
Expand Down
10 changes: 4 additions & 6 deletions beetsplug/deezer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from beets import ui
from beets.autotag import AlbumInfo, TrackInfo
from beets.plugins import MetadataSourcePlugin, BeetsPlugin
from beets.util.id_extractors import deezer_id_regex


class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
Expand All @@ -34,10 +35,7 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
album_url = 'https://api.deezer.com/album/'
track_url = 'https://api.deezer.com/track/'

id_regex = {
'pattern': r'(^|deezer\.com/)([a-z]*/)?({}/)?(\d+)',
'match_group': 4,
}
id_regex = deezer_id_regex

def __init__(self):
super().__init__()
Expand All @@ -51,7 +49,7 @@ def album_for_id(self, album_id):
:return: AlbumInfo object for album.
:rtype: beets.autotag.hooks.AlbumInfo or None
"""
deezer_id = self._get_id('album', album_id)
deezer_id = self._get_id('album', album_id, self.id_regex)
if deezer_id is None:
return None

Expand Down Expand Up @@ -154,7 +152,7 @@ def track_for_id(self, track_id=None, track_data=None):
:rtype: beets.autotag.hooks.TrackInfo or None
"""
if track_data is None:
deezer_id = self._get_id('track', track_id)
deezer_id = self._get_id('track', track_id, self.id_regex)
if deezer_id is None:
return None
track_data = requests.get(self.track_url + deezer_id).json()
Expand Down
30 changes: 3 additions & 27 deletions beetsplug/discogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import beets.ui
from beets import config
from beets.util.id_extractors import extract_discogs_id_regex
from beets.autotag.hooks import AlbumInfo, TrackInfo
from beets.plugins import MetadataSourcePlugin, BeetsPlugin, get_distance
import confuse
Expand Down Expand Up @@ -218,31 +219,6 @@ def item_candidates(self, item, artist, title):
# first 10 results, don't overwhelm with options
return candidates[:10]

@staticmethod
def extract_release_id_regex(album_id):
"""Returns the Discogs_id or None."""
# Discogs-IDs are simple integers. In order to avoid confusion with
# other metadata plugins, we only look for very specific formats of the
# input string:
# - plain integer, optionally wrapped in brackets and prefixed by an
# 'r', as this is how discogs displays the release ID on its webpage.
# - legacy url format: discogs.com/<name of release>/release/<id>
# - current url format: discogs.com/release/<id>-<name of release>
# See #291, #4080 and #4085 for the discussions leading up to these
# patterns.
# Regex has been tested here https://regex101.com/r/wyLdB4/2

for pattern in [
r'^\[?r?(?P<id>\d+)\]?$',
r'discogs\.com/release/(?P<id>\d+)-',
r'discogs\.com/[^/]+/release/(?P<id>\d+)',
]:
match = re.search(pattern, album_id)
if match:
return int(match.group('id'))

return None

def album_for_id(self, album_id):
"""Fetches an album by its Discogs ID and returns an AlbumInfo object
or None if the album is not found.
Expand All @@ -252,7 +228,7 @@ def album_for_id(self, album_id):

self._log.debug('Searching for release {0}', album_id)

discogs_id = self.extract_release_id_regex(album_id)
discogs_id = extract_discogs_id_regex(album_id)

if not discogs_id:
return None
Expand Down Expand Up @@ -365,7 +341,7 @@ def get_album_info(self, result):
else:
genre = base_genre

discogs_albumid = self.extract_release_id_regex(result.data.get('uri'))
discogs_albumid = extract_discogs_id_regex(result.data.get('uri'))

# Extract information for the optional AlbumInfo fields that are
# contained on nested discogs fields.
Expand Down
12 changes: 4 additions & 8 deletions beetsplug/spotify.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from beets.dbcore import types
from beets.library import DateType
from beets.plugins import BeetsPlugin, MetadataSourcePlugin
from beets.util.id_extractors import spotify_id_regex

DEFAULT_WAITING_TIME = 5

Expand Down Expand Up @@ -69,12 +70,7 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin):
track_url = 'https://api.spotify.com/v1/tracks/'
audio_features_url = 'https://api.spotify.com/v1/audio-features/'

# Spotify IDs consist of 22 alphanumeric characters
# (zero-left-padded base62 representation of randomly generated UUID4)
id_regex = {
'pattern': r'(^|open\.spotify\.com/{}/)([0-9A-Za-z]{{22}})',
'match_group': 2,
}
id_regex = spotify_id_regex

spotify_audio_features = {
'acousticness': 'spotify_acousticness',
Expand Down Expand Up @@ -216,7 +212,7 @@ def album_for_id(self, album_id):
:return: AlbumInfo object for album
:rtype: beets.autotag.hooks.AlbumInfo or None
"""
spotify_id = self._get_id('album', album_id)
spotify_id = self._get_id('album', album_id, self.id_regex)
if spotify_id is None:
return None

Expand Down Expand Up @@ -330,7 +326,7 @@ def track_for_id(self, track_id=None, track_data=None):
:rtype: beets.autotag.hooks.TrackInfo or None
"""
if track_data is None:
spotify_id = self._get_id('track', track_id)
spotify_id = self._get_id('track', track_id, self.id_regex)
if spotify_id is None:
return None
track_data = self._handle_response(
Expand Down
3 changes: 2 additions & 1 deletion test/test_discogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from test.helper import capture_log

from beets import config
from beets.util.id_extractors import extract_discogs_id_regex

from beetsplug.discogs import DiscogsPlugin

Expand Down Expand Up @@ -371,7 +372,7 @@ def test_album_for_id(self):
('005b84a0-ecd6-39f1-b2f6-6eb48756b268', ''),
]
for test_pattern, expected in test_patterns:
match = DiscogsPlugin.extract_release_id_regex(test_pattern)
match = extract_discogs_id_regex(test_pattern)
if not match:
match = ''
self.assertEqual(match, expected)
Expand Down
66 changes: 66 additions & 0 deletions test/test_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
from beets.dbcore import types
from mediafile import MediaFile
from beets.util import displayable_path, bytestring_path, syspath
from beets.plugins import MetadataSourcePlugin
from beets.util.id_extractors import spotify_id_regex, deezer_id_regex, \
beatport_id_regex

from test.test_importer import ImportHelper, AutotagStub
from test.test_ui_importer import TerminalImportSessionSetup
Expand Down Expand Up @@ -558,6 +561,69 @@ def foo(self, session, task):
require=ANY)


class ParseSpotifyIDTest(unittest.TestCase):
    # Exercises MetadataSourcePlugin._get_id with the Spotify ID regex.

    def test_parse_id_correct(self):
        # A bare ID comes back unchanged.
        id_string = "39WqpoPgZxygo6YQjehLJJ"
        out = MetadataSourcePlugin._get_id(
            "album", id_string, spotify_id_regex)
        self.assertEqual(out, id_string)

    def test_parse_id_non_id_returns_none(self):
        # Text that is neither an ID nor a URL yields None.
        id_string = "blah blah"
        out = MetadataSourcePlugin._get_id(
            "album", id_string, spotify_id_regex)
        self.assertIsNone(out)

    def test_parse_id_url_finds_id(self):
        # The ID is pulled out of a full album URL.
        id_string = "39WqpoPgZxygo6YQjehLJJ"
        id_url = "https://open.spotify.com/album/%s" % id_string
        out = MetadataSourcePlugin._get_id(
            "album", id_url, spotify_id_regex)
        self.assertEqual(out, id_string)


class ParseDeezerIDTest(unittest.TestCase):
    # Exercises MetadataSourcePlugin._get_id with the Deezer ID regex.

    def test_parse_id_correct(self):
        # A bare numeric ID comes back unchanged.
        id_string = "176356382"
        out = MetadataSourcePlugin._get_id(
            "album", id_string, deezer_id_regex)
        self.assertEqual(out, id_string)

    def test_parse_id_non_id_returns_none(self):
        # Text that is neither an ID nor a URL yields None.
        id_string = "blah blah"
        out = MetadataSourcePlugin._get_id(
            "album", id_string, deezer_id_regex)
        self.assertIsNone(out)

    def test_parse_id_url_finds_id(self):
        # The ID is pulled out of a full album URL.
        id_string = "176356382"
        id_url = "https://www.deezer.com/album/%s" % id_string
        out = MetadataSourcePlugin._get_id(
            "album", id_url, deezer_id_regex)
        self.assertEqual(out, id_string)


class ParseBeatportIDTest(unittest.TestCase):
    # Exercises MetadataSourcePlugin._get_id with the Beatport ID regex.

    def test_parse_id_correct(self):
        # A bare numeric ID comes back unchanged.
        id_string = "3089651"
        out = MetadataSourcePlugin._get_id(
            "album", id_string, beatport_id_regex)
        self.assertEqual(out, id_string)

    def test_parse_id_non_id_returns_none(self):
        # Text that is neither an ID nor a URL yields None.
        id_string = "blah blah"
        out = MetadataSourcePlugin._get_id(
            "album", id_string, beatport_id_regex)
        self.assertIsNone(out)

    def test_parse_id_url_finds_id(self):
        # The ID is pulled out of a full release URL.
        id_string = "3089651"
        id_url = "https://www.beatport.com/release/album-name/%s" % id_string
        out = MetadataSourcePlugin._get_id(
            "album", id_url, beatport_id_regex)
        self.assertEqual(out, id_string)


def suite():
    """Return a test suite loaded from this module by name."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromName(__name__)

Expand Down

0 comments on commit 40d27f5

Please sign in to comment.