Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: use data from MinIO hosted s3 buckets #289

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
acrclient==0.3.0
ConfigArgParse==1.5.3
iso3901==0.3.0.post1
openpyxl==3.1.2
Expand Down
56 changes: 24 additions & 32 deletions suisa_sendemeldung/acrclient.py
Original file line number Diff line number Diff line change
@@ -1,80 +1,74 @@
"""module containing the ACRCloud client."""
import logging
from datetime import date, datetime, timedelta

import pytz
from acrclient import Client
import requests
from tqdm import tqdm

logger = logging.getLogger(__name__)

class ACRClient(Client):
"""ACRCloud client to fetch metadata.

class ACRClient:
"""Fetches cached metadata from MinIO.

Args:
bearer_token: The bearer token for ACRCloud.
minio_url: URL to a public MinIO bucket containing raw per-day JSON files.
timezone (optional): The timezone to use for localization.
"""

# format of timestamp in api answer
TS_FMT = "%Y-%m-%d %H:%M:%S"
# timezone of ACRCloud
ACR_TIMEZONE = "UTC"

def __init__(self, bearer_token, base_url="https://eu-api-v2.acrcloud.com"):
super().__init__(bearer_token=bearer_token, base_url=base_url)
def __init__(self, minio_url: str, timezone=ACR_TIMEZONE):
self.minio_url = minio_url
self.timezone = timezone
self.default_date = date.today() - timedelta(days=1)

def get_data(
self, project_id, stream_id, requested_date=None, timezone=ACR_TIMEZONE
):
"""Fetch metadata from ACRCloud for `stream_id`.
def get_data(self, requested_date=None):
"""Fetch ACRCloud metadata from MinIO.

Args:
project_id: The Project ID of the stream.
stream_id: The ID of the stream.
requested_date (optional): The date of the entries you want (default: yesterday).
timezone (optional): The timezone to use for localization.

Returns:
json: The ACR data from date
"""
if requested_date is None:
requested_date = self.default_date
data = self.get_bm_cs_projects_results(
project_id=project_id,
stream_id=stream_id,
params={
"date": requested_date.strftime("%Y%m%d"),
},
)
url = f"{self.minio_url}{requested_date.strftime('%Y-%m-%d')}.json"
resp = requests.get(url, timeout=10)
if resp.ok:
data = resp.json()
else: # pragma: no cover
raise RuntimeError(f"💀 failed to load data from {url}")
for entry in data:
metadata = entry.get("metadata")
ts_utc = pytz.utc.localize(
datetime.strptime(metadata.get("timestamp_utc"), ACRClient.TS_FMT)
)
ts_local = ts_utc.astimezone(pytz.timezone(timezone))
ts_local = ts_utc.astimezone(pytz.timezone(self.timezone))
metadata.update({"timestamp_local": ts_local.strftime(ACRClient.TS_FMT)})

return data

def get_interval_data(
self, project_id, stream_id, start, end, timezone=ACR_TIMEZONE
): # pylint: disable-msg=too-many-locals,too-many-arguments
def get_interval_data(self, start, end):
"""Get data specified by interval from start to end.

Args:
project_id: The ID of the project.
stream_id: The ID of the stream.
start: The start date of the interval.
end: The end date of the interval.
timezone (optional): will be passed to `get_data()`.

Returns:
json: The ACR data from start to end.
"""
trim = False
# if we have to localize the timestamps we may need more data
if timezone != ACRClient.ACR_TIMEZONE:
if self.timezone != ACRClient.ACR_TIMEZONE:
# compute utc offset
offset = pytz.timezone(timezone).utcoffset(datetime.now())
offset = pytz.timezone(self.timezone).utcoffset(datetime.now())
# decrease start by 1 day if we're ahead of utc
if offset > timedelta(seconds=1):
computed_start = start - timedelta(days=1)
Expand All @@ -98,9 +92,7 @@ def get_interval_data(
# make the prefix longer by this amount so tqdm lines up with the one in the main code
ljust_amount: int = 27
for ptr in tqdm(dates, desc="load ACRCloud data".ljust(ljust_amount)):
data += self.get_data(
project_id, stream_id, requested_date=ptr, timezone=timezone
)
data += self.get_data(requested_date=ptr)

    # if timestamps are localized we will have to remove the unneeded entries.
if trim:
Expand Down
67 changes: 31 additions & 36 deletions suisa_sendemeldung/suisa_sendemeldung.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import cridlib
import pytz
import requests
from babel.dates import format_date
from configargparse import ArgumentParser
from dateutil.relativedelta import relativedelta
Expand Down Expand Up @@ -89,21 +90,6 @@ def validate_arguments(parser, args):
args: the arguments to validate
"""
msgs = []
# check length of bearer_token
if not len(args.bearer_token) >= 32:
msgs.append(
"".join(
(
"wrong format on bearer_token, ",
f"expected larger than 32 characters but got {len(args.bearer_token)}",
)
)
)
# check length of stream_id
if not len(args.stream_id) == 9:
msgs.append(
f"wrong format on stream_id, expected 9 characters but got {len(args.stream_id)}"
)
# one output option has to be set
if not (args.file or args.email or args.stdout):
msgs.append(
Expand All @@ -130,22 +116,25 @@ def get_arguments(parser: ArgumentParser): # pragma: no cover
args: the parsed args from the parser
"""
parser.add_argument(
"--bearer-token",
env_var="BEARER_TOKEN",
help="the bearer token for ACRCloud (required)",
required=True,
"--minio",
dest="minio",
env_var="MINIO",
help="URL to MinIO",
default="https://minio.service.int.rabe.ch:9000",
)
parser.add_argument(
"--project-id",
env_var="PROJECT_ID",
help="the id of the project at ACRCloud (required)",
required=True,
"--minio-raw-bucket",
dest="minio_raw",
env_var="MINIO_RAW_BUCKET",
help="world readable bucket with daily data exports from ACRCloud",
default="acrcloud.raw",
)
parser.add_argument(
"--stream-id",
env_var="STREAM_ID",
help="the id of the stream at ACRCloud (required)",
required=True,
"--minio-music-bucket",
dest="minio_music",
env_var="MINIO_MUSIC_BUCKET",
help="world readable bucket with deduplicated music info",
default="acrcloud.music",
)
parser.add_argument(
"--station-name",
Expand Down Expand Up @@ -444,7 +433,7 @@ def get_isrc(music):

# all local vars are required, eight are already used for the csv entries
# pylint: disable-msg=too-many-locals
def get_csv(data, station_name=""):
def get_csv(data, station_name="", minio_url=""):
"""Create SUISA compatible csv data.

Arguments:
Expand Down Expand Up @@ -498,6 +487,12 @@ def get_csv(data, station_name=""):

try:
music = metadata.get("music")[0]
url = f"{minio_url}{music.get('acrid')}"
resp = requests.get(url, timeout=10)
if resp.ok:
music = resp.json()
else: # pragma: no cover
raise RuntimeError(f"💀 failed to load data from {url}")
except TypeError:
music = metadata.get("custom_files")[0]
title = music.get("title")
Expand Down Expand Up @@ -574,7 +569,7 @@ def get_csv(data, station_name=""):
return csv.getvalue()


def get_xlsx(data, station_name=""):
def get_xlsx(data, station_name="", minio_url=""):
"""Create SUISA compatible xlsx data.

Arguments:
Expand All @@ -583,7 +578,7 @@ def get_xlsx(data, station_name=""):
Returns:
xlsx: The converted data as BytesIO object
"""
csv = get_csv(data, station_name=station_name)
csv = get_csv(data, station_name=station_name, minio_url=minio_url)
csv_reader = reader(StringIO(csv))

xlsx = BytesIO()
Expand Down Expand Up @@ -745,16 +740,16 @@ def main(): # pragma: no cover

start_date, end_date = parse_date(args)
filename = parse_filename(args, start_date)
minio_raw_url = f"{args.minio}/{args.minio_raw}/"
minio_music_url = f"{args.minio}/{args.minio_music}/"

client = ACRClient(bearer_token=args.bearer_token)
data = client.get_interval_data(
args.project_id, args.stream_id, start_date, end_date, timezone=args.timezone
)
client = ACRClient(minio_url=minio_raw_url, timezone=args.timezone)
data = client.get_interval_data(start_date, end_date)
data = merge_duplicates(data)
if args.filetype == "xlsx":
data = get_xlsx(data, station_name=args.station_name)
data = get_xlsx(data, station_name=args.station_name, minio_url=minio_music_url)
elif args.filetype == "csv":
data = get_csv(data, station_name=args.station_name)
data = get_csv(data, station_name=args.station_name, minio_url=minio_music_url)
if args.email:
email_subject = Template(args.email_subject).substitute(
{
Expand Down
47 changes: 17 additions & 30 deletions tests/test_acrclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,73 +6,60 @@

from suisa_sendemeldung import acrclient

_ACR_URL = "https://eu-api-v2.acrcloud.com/api/bm-cs-projects/project-id/streams/stream-id/results"
_MINIO_RAW_URL = "http://minio.example.com/acrcloud.raw/"


def test_init():
"""Test ACRClient.__init__."""
bearer_token = "secret-key"
with freeze_time("1993-03-02"):
acr = acrclient.ACRClient(bearer_token)
acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)

assert acr.default_date == date(1993, 3, 1)


def test_get_data():
"""Test ACRClient.get_data."""
bearer_token = "secret-key"
project_id = "project-id"
stream_id = "stream-id"
data = {"data": [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]}
data = [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]
with freeze_time("1993-03-02"):
acr = acrclient.ACRClient(bearer_token)
acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)
with requests_mock.Mocker() as mock:
mock.get(
_ACR_URL,
f"{_MINIO_RAW_URL}1993-03-01.json",
json=data,
)
acr.get_data(project_id, stream_id)
acr.get_data()


def test_get_interval_data():
"""Test ACRClient.get_interval_data."""
bearer_token = "secret-key"
project_id = "project-id"
stream_id = "stream-id"
data = {"data": [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]}
data = [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]

with freeze_time("1993-03-02"):
acr = acrclient.ACRClient(bearer_token)
acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)
with requests_mock.Mocker() as mock:
mock.get(
_ACR_URL,
requests_mock.ANY,
json=data,
)
acr.get_interval_data(
project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31)
)
acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31))

# ahead of UTC
with freeze_time("1993-03-02"):
acr = acrclient.ACRClient(bearer_token)
acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL, timezone="Europe/Zurich")
with requests_mock.Mocker() as mock:
data["data"][0]["metadata"]["timestamp_utc"] = "1993-03-01 00:00:00"
data[0]["metadata"]["timestamp_utc"] = "1993-03-01 00:00:00"
mock.get(
_ACR_URL,
requests_mock.ANY,
json=data,
)
acr.get_interval_data(
project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31), "Europe/Zurich"
)
acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31))

# behind UTC
with freeze_time("1993-03-02"):
acr = acrclient.ACRClient(bearer_token)
acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL, timezone="America/Nuuk")
with requests_mock.Mocker() as mock:
mock.get(
_ACR_URL,
requests_mock.ANY,
json=data,
)
acr.get_interval_data(
project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31), "America/Nuuk"
)
acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31))
Loading