From 6f16e1afa704af56c21d2a258030dea159755698 Mon Sep 17 00:00:00 2001
From: Braden Hilton
Date: Sat, 4 Nov 2023 16:38:44 +0000
Subject: [PATCH] [weverse] add extractors

---
 docs/configuration.rst           |  48 +++
 gallery_dl/extractor/__init__.py |   1 +
 gallery_dl/extractor/weverse.py  | 511 +++++++++++++++++++++++++++++++
 3 files changed, 560 insertions(+)
 create mode 100644 gallery_dl/extractor/weverse.py

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 0ea2eaeda25..f42a781e60c 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -3599,6 +3599,54 @@ Description
     Download video files.
 
 
+extractor.weverse.access-token
+------------------------------
+Type
+    ``string``
+Default
+    ``null``
+Description
+    Your Weverse account access token.
+
+    The token can be found in the ``we2_access_token`` cookie in the
+    ``.weverse.io`` cookie domain after logging in to your account.
+
+    An invalid or not up-to-date value
+    will result in ``401 Unauthorized`` errors.
+
+    If this option is unset, and the cookie is not used, an extra HTTP
+    request will be sent with your ``username`` and ``password`` to
+    attempt to fetch a new token.
+
+
+extractor.weverse.skip-embeds
+-----------------------------
+Type
+    ``bool``
+Default
+    ``false``
+Description
+    Control behavior on embedded content from external sites.
+
+    * ``true``: Ignore embeds.
+    * ``false``: Download embed URLs if supported.
+
+
+extractor.weverse.videos
+------------------------
+Type
+    * ``bool``
+    * ``string``
+Default
+    ``true``
+Description
+    Control video download behavior.
+
+    * ``true``: Download videos
+    * ``"ytdl"``: Download videos using `youtube-dl`_
+    * ``false``: Skip videos
+
+
 extractor.ytdl.enabled
 ----------------------
 Type
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 22e4fe34123..08b56155220 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -168,6 +168,7 @@
     "webmshare",
     "webtoons",
     "weibo",
+    "weverse",
     "wikiart",
     "wikifeet",
     "xhamster",
diff --git a/gallery_dl/extractor/weverse.py b/gallery_dl/extractor/weverse.py
new file mode 100644
index 00000000000..b702fefef44
--- /dev/null
+++ b/gallery_dl/extractor/weverse.py
@@ -0,0 +1,511 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://weverse.io/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+import binascii
+import hashlib
+import hmac
+import time
+import urllib.parse
+import uuid
+from collections import OrderedDict
+
+BASE_PATTERN = r"(?:https?://)?(?:m\.)?weverse\.io"
+COMMUNITY_PATTERN = BASE_PATTERN + r"/(\w+)"
+
+MEMBER_ID_PATTERN = r"/([a-f0-9]+)"
+POST_ID_PATTERN = r"/(\d-\d+)"
+
+
+class WeverseExtractor(Extractor):
+    """Base class for weverse extractors"""
+    category = "weverse"
+    cookies_domain = ".weverse.io"
+    cookies_names = ("we2_access_token",)
+    root = "https://weverse.io"
+    directory_fmt = ("{category}", "{community[communityName]}", "{postId}")
+    filename_fmt = "weverse_{file_id}.{extension}"
+    # one archive entry per file; "{postId}" alone would mark every
+    # remaining file of a multi-file post as already downloaded
+    archive_fmt = "{postId}_{file_id}"
+    request_interval = 1.0
+
+    def _init(self):
+        """Read options, obtain an access token, and set up the API client"""
+        self.skip_embeds = self.config("skip-embeds", False)
+        self.videos = self.config("videos", True)
+
+        self.login()
+        if not self.access_token:
+            # every API request needs a bearer token; fail here instead of
+            # with an AttributeError on the first 'self.api' access
+            raise exception.AuthenticationError()
+        self.api = WeverseAPI(self, self.access_token)
+
+    def login(self):
+        """Set 'self.access_token' from config, cookies, or credentials"""
+        token = self.config("access-token")
+        if token:
+            self.access_token = token
+            return
+
+        if not self.cookies_check(self.cookies_names):
+            username, password = self._get_auth_info()
+            if username:
+                self.cookies_update(
+                    self._login_impl(username, password), self.cookies_domain)
+
+        self.access_token = self.cookies.get(self.cookies_names[0])
+
+    @cache(maxage=365*24*3600, keyarg=1)
+    def _login_impl(self, username, password):
+        """Request an access token with account credentials
+
+        Returns a cookie dict holding the token;
+        raises AuthenticationError if the response contains none.
+        """
+        endpoint = ("https://accountapi.weverse.io"
+                    "/web/api/v2/auth/token/by-credentials")
+        data = {"email": username, "password": password}
+        headers = {
+            "x-acc-app-secret": "5419526f1c624b38b10787e5c10b2a7a",
+            "x-acc-app-version": "2.2.20-alpha.0",
+            "x-acc-language": "en",
+            "x-acc-service-id": "weverse",
+            "x-acc-trace-id": str(uuid.uuid4())
+        }
+        res = self.request(
+            endpoint, method="POST", json=data, headers=headers).json()
+        if "accessToken" not in res:
+            raise exception.AuthenticationError()
+        return {self.cookies_names[0]: res["accessToken"]}
+
+    def metadata(self, data):
+        """Add 'date' and 'author_name' to 'data' and drop bulky fields"""
+        delete = ("attachment", "authorMomentPosts",
+                  "body", "extension", "plainBody")
+
+        if "date" not in data and "publishedAt" in data:
+            data["date"] = text.parse_timestamp(data["publishedAt"] / 1000)
+
+        if "author_name" not in data and "author" in data:
+            author = data["author"]
+            data["author_name"] = author.get("artistOfficialProfile", {}).get(
+                "officialName") or author.get("profileName")
+
+        for key in delete:
+            data.pop(key, None)
+
+    def has_media(self, data):
+        """Return True if 'data' has a non-empty media-bearing field"""
+        return any(
+            data.get(key)
+            for key in ("extension", "attachment", "photo", "video"))
+
+
+class WeversePostExtractor(WeverseExtractor):
+    """Extractor for weverse posts"""
+    subcategory = "post"
+    directory_fmt = ("{category}", "{community[communityName]}",
+                     "{author_name}", "{postId}")
+    pattern = (COMMUNITY_PATTERN +
+               r"/(?:artist|fanpost)" + POST_ID_PATTERN)
+    example = "https://weverse.io/abcdef/artist/1-123456789"
+
+    def __init__(self, match):
+        WeverseExtractor.__init__(self, match)
+        self.post_id = match.group(2)
+
+    def items(self):
+        data = self.api.post(self.post_id)
+
+        # skip posts with no media
+        if not self.has_media(data):
+            self.log.debug("Skipping %s (no media)", self.url)
+            return
+
+        # 'attachment' may be absent when has_media() matched another key
+        attachments = data.get("attachment") or {}
+        self.metadata(data)
+
+        yield Message.Directory, data
+        for attachment_type, attachment_data in attachments.items():
+            for attachment in attachment_data.values():
+                if attachment_type == "photo":
+                    url = attachment["url"]
+                    file_id = attachment["photoId"]
+                elif attachment_type == "video":
+                    if not self.videos:
+                        continue
+                    file_id = attachment["videoId"]
+                    url = self.api.post_video(file_id)["url"]
+                else:
+                    # unknown attachment type: there is no URL to download
+                    self.log.debug(
+                        "Skipping unsupported attachment type '%s'",
+                        attachment_type)
+                    continue
+
+                data["file_id"] = file_id
+                data["extension"] = text.ext_from_url(url)
+                yield Message.Url, url, data
+
+
+class WeverseProfileExtractor(WeverseExtractor):
+    """Extractor for weverse community profiles"""
+    subcategory = "profile"
+    pattern = COMMUNITY_PATTERN + "/profile" + MEMBER_ID_PATTERN
+    example = ("https://weverse.io/abcdef"
+               "/profile/a0b1c2d3e4f5a6b7c8d9e0f1a2b3c4d5")
+
+    def __init__(self, match):
+        WeverseExtractor.__init__(self, match)
+        self.member_id = match.group(2)
+
+    def items(self):
+        data = {"_extractor": WeversePostExtractor}
+        posts = self.api.profile(self.member_id)
+        for post in posts:
+            yield Message.Queue, post["shareUrl"], data
+
+
+class WeverseArtistTabExtractor(WeverseExtractor):
+    """Extractor for all artists in a weverse community"""
+    subcategory = "artist-tab"
+    pattern = COMMUNITY_PATTERN + "/artist$"
+    example = "https://weverse.io/abcdef/artist"
+
+    def __init__(self, match):
+        WeverseExtractor.__init__(self, match)
+        self.community_keyword = match.group(1)
+
+    def items(self):
+        data = {"_extractor": WeversePostExtractor}
+        posts = self.api.artist_tab(self.community_keyword)
+        for post in posts:
+            yield Message.Queue, post["shareUrl"], data
+
+
+class WeverseMomentExtractor(WeverseExtractor):
+    """Extractor for moments from a weverse community artist"""
+    subcategory = "moment"
+    pattern = (COMMUNITY_PATTERN +
+               "/moment" + MEMBER_ID_PATTERN +
+               "/post" + POST_ID_PATTERN)
+    example = ("https://weverse.io/abcdef"
+               "/moment/a0b1c2d3e4f5a6b7c8d9e0f1a2b3c4d5"
+               "/post/1-123456789")
+
+    def __init__(self, match):
+        WeverseExtractor.__init__(self, match)
+        self.post_id = match.group(3)
+
+    def items(self):
+        data = self.api.post(self.post_id)
+
+        extension = data.get("extension") or {}
+        moment = extension.get("moment") or extension.get("momentW1") or {}
+
+        # skip moments with no media
+        if not self.has_media(moment):
+            self.log.debug("Skipping %s (no media)", self.url)
+            return
+
+        self.metadata(data)
+
+        yield Message.Directory, data
+        url = ""
+        data["extension"] = None
+
+        if "photo" in moment:
+            url = moment["photo"]["url"]
+            data["file_id"] = moment["photo"]["photoId"]
+            data["extension"] = text.ext_from_url(url)
+        if "video" in moment:
+            if not self.videos:
+                return
+            data["file_id"] = file_id = moment["video"]["videoId"]
+            if self.videos == "ytdl":
+                # let ytdl resolve the video; no downloadInfo request needed
+                url = "ytdl:" + data["shareUrl"]
+                data["extension"] = None
+            else:
+                url = self.api.post_video(file_id)["url"]
+                data["extension"] = text.ext_from_url(url)
+
+        if not url:
+            # has_media() matched a field other than 'photo' / 'video'
+            self.log.debug("Skipping %s (unsupported media)", self.url)
+            return
+        yield Message.Url, url, data
+
+
+class WeverseMomentsExtractor(WeverseExtractor):
+    """Extractor for all moments from a weverse community artist"""
+    subcategory = "moments"
+    pattern = COMMUNITY_PATTERN + "/moment" + MEMBER_ID_PATTERN + "$"
+    example = ("https://weverse.io/abcdef"
+               "/moment/a0b1c2d3e4f5a6b7c8d9e0f1a2b3c4d5")
+
+    def __init__(self, match):
+        WeverseExtractor.__init__(self, match)
+        self.member_id = match.group(2)
+
+    def items(self):
+        data = {"_extractor": WeverseMomentExtractor}
+        moments = self.api.moments(self.member_id)
+        for moment in moments:
+            yield Message.Queue, moment["shareUrl"], data
+
+
+class WeverseMediaExtractor(WeverseExtractor):
+    """Extractor for weverse media"""
+    subcategory = "media"
+    directory_fmt = ("{category}", "{community[communityName]}",
+                     "media", "{postId}")
+    pattern = COMMUNITY_PATTERN + "/media" + POST_ID_PATTERN
+    example = "https://weverse.io/abcdef/media/1-123456789"
+
+    def __init__(self, match):
+        WeverseExtractor.__init__(self, match)
+        self.post_id = match.group(2)
+
+    def items(self):
+        data = self.api.post(self.post_id)
+
+        extensions = data.get("extension") or {}
+
+        self.metadata(data)
+
+        yield Message.Directory, data
+        for extension_type, extension_data in extensions.items():
+            if extension_type == "youtube":
+                if self.skip_embeds:
+                    continue
+                url = extension_data["videoPath"]
+                data["file_id"] = extension_data["youtubeVideoId"]
+                data["extension"] = None
+                yield Message.Url, "ytdl:" + url, data
+            elif extension_type == "image":
+                for photo in extension_data["photos"]:
+                    url = photo["url"]
+                    data["file_id"] = photo["photoId"]
+                    data["extension"] = text.ext_from_url(url)
+                    yield Message.Url, url, data
+            elif extension_type == "video":
+                if not self.videos:
+                    continue
+                data["file_id"] = file_id = extension_data["videoId"]
+                if self.videos == "ytdl":
+                    # let ytdl resolve the video; skip the inKey/VOD requests
+                    url = "ytdl:" + data["shareUrl"]
+                    data["extension"] = None
+                else:
+                    master_id = extension_data["infraVideoId"]
+                    url = self.api.media_video(file_id, master_id)["source"]
+                    data["extension"] = text.ext_from_url(url)
+                yield Message.Url, url, data
+
+
+class WeverseMediaTabExtractor(WeverseExtractor):
+    """Extractor for the media tab of a weverse community"""
+    subcategory = "media-tab"
+    pattern = COMMUNITY_PATTERN + r"/media(?:/(?:all|new))?$"
+    example = "https://weverse.io/abcdef/media"
+
+    def __init__(self, match):
+        WeverseExtractor.__init__(self, match)
+        self.community_keyword = match.group(1)
+
+    def items(self):
+        data = {"_extractor": WeverseMediaExtractor}
+        medias = self.api.media_tab(self.community_keyword)
+        for media in medias:
+            yield Message.Queue, media["shareUrl"], data
+
+
+class WeverseMediaCategoryExtractor(WeverseExtractor):
+    """Extractor for media by category of a weverse community"""
+    subcategory = "media-category"
+    pattern = COMMUNITY_PATTERN + r"/media/category/(\d+)"
+    example = "https://weverse.io/abcdef/media/category/1234"
+
+    def __init__(self, match):
+        WeverseExtractor.__init__(self, match)
+        self.community_keyword = match.group(1)
+        self.media_category = match.group(2)
+
+    def items(self):
+        data = {"_extractor": WeverseMediaExtractor}
+        medias = self.api.media_category(self.media_category)
+        for media in medias:
+            yield Message.Queue, media["shareUrl"], data
+
+
+class WeverseAPI():
+    """Interface for the Weverse API"""
+    BASE_API_URL = "https://global.apis.naver.com"
+    WMD_API_URL = BASE_API_URL + "/weverse/wevweb"
+    VOD_API_URL = BASE_API_URL + "/rmcnmv/rmcnmv"
+
+    def __init__(self, extractor, access_token):
+        self.extractor = extractor
+        self.headers = {
+            "Authorization": "Bearer " + access_token,
+            "Referer": extractor.root + "/"
+        }
+
+    def _endpoint_with_params(self, endpoint, params):
+        """Append 'params' to 'endpoint' as a URL query string"""
+        delimiter = "&" if "?" in endpoint else "?"
+        return endpoint + delimiter + urllib.parse.urlencode(params)
+
+    def _message_digest(self, endpoint, params, timestamp):
+        """Return the base64 HMAC-SHA1 signature sent as 'wmd'"""
+        key = "1b9cb6378d959b45714bec49971ade22e6e24e42".encode()
+        url = self._endpoint_with_params(endpoint, params)
+        # the signed message is the first 255 characters of the
+        # endpoint + query string, followed by the timestamp
+        message = "{}{}".format(url[:255], timestamp).encode()
+        digest = hmac.new(key, message, hashlib.sha1).digest()
+        return binascii.b2a_base64(digest).rstrip().decode()
+
+    def _in_key(self, video_id):
+        endpoint = "/video/v1.1/vod/{}/inKey".format(video_id)
+        return self._call_wmd(endpoint, method="POST")["inKey"]
+
+    def community_id(self, community_keyword):
+        endpoint = "/community/v1.0/communityIdUrlPathByUrlPathArtistCode"
+        params = {"keyword": community_keyword}
+        return self._call_wmd(endpoint, params)["communityId"]
+
+    def post(self, post_id):
+        endpoint = "/post/v1.0/post-{}".format(post_id)
+        params = {"fieldSet": "postV1"}
+        return self._call_wmd(endpoint, params)
+
+    def post_video(self, video_id):
+        """Return the download info entry with the highest resolution"""
+        endpoint = "/cvideo/v1.0/cvideo-{}/downloadInfo".format(video_id)
+        videos = self._call_wmd(endpoint)["downloadInfo"]
+        return max(videos, key=lambda video:
+                   text.parse_int(video["resolution"].rstrip("P")))
+
+    def profile(self, member_id):
+        endpoint = "/post/v1.0/member-{}/posts".format(member_id)
+        params = {
+            "fieldSet": "postsV1",
+            "filterType": "DEFAULT",
+            "limit": 20,
+            "sortType": "LATEST"
+        }
+        yield from self._pagination(endpoint, params)
+
+    def artist_tab(self, community_keyword):
+        community_id = self.community_id(community_keyword)
+        endpoint = "/post/v1.0/community-{}/artistTabPosts".format(
+            community_id)
+        params = {
+            "fieldSet": "postsV1",
+            "limit": 20,
+            "pagingType": "CURSOR"
+        }
+        yield from self._pagination(endpoint, params)
+
+    def media_video(self, video_id, master_id):
+        """Return the VOD variant with the largest frame size"""
+        in_key = self._in_key(video_id)
+        endpoint = "/vod/play/v2.0/{}".format(master_id)
+        params = {"key": in_key}
+        videos = self._call(self.VOD_API_URL + endpoint,
+                            params=params)["videos"]["list"]
+        return max(videos, key=lambda video:
+                   video["encodingOption"]["width"] *
+                   video["encodingOption"]["height"])
+
+    def media_tab(self, community_keyword):
+        community_id = self.community_id(community_keyword)
+        endpoint = "/media/v1.0/community-{}/searchAllMedia".format(
+            community_id)
+        params = {
+            "fieldSet": "postsV1",
+            "sortOrder": "DESC"
+        }
+        yield from self._pagination(endpoint, params)
+
+    def media_category(self, category_id):
+        endpoint = "/media/v1.0/category-{}/mediaPosts".format(category_id)
+        params = {
+            "fieldSet": "postsV1",
+            "sortOrder": "DESC"
+        }
+        yield from self._pagination(endpoint, params)
+
+    def moments(self, member_id):
+        endpoint = "/post/v1.0/member-{}/posts".format(member_id)
+        params = {
+            "fieldSet": "postsV1",
+            "filterType": "MOMENT",
+            "limit": 1
+        }
+        yield from self._pagination(endpoint, params)
+
+    def _call_wmd(self, endpoint, params=None, **kwargs):
+        """Call a signed Weverse endpoint and return the decoded JSON"""
+        # work on a copy so the caller's dict is never modified
+        params = dict(params) if params else {}
+        params.update({
+            "appId": "be4d79eb8fc7bd008ee82c8ec4ff6fd4",
+            "language": "en",
+            "platform": "WEB",
+            "wpf": "pc",
+        })
+        # the signature is computed over the key-sorted query parameters
+        params = OrderedDict(sorted(params.items()))
+        timestamp = int(time.time() * 1000)
+        message_digest = self._message_digest(endpoint, params, timestamp)
+        params.update({
+            "wmsgpad": timestamp,
+            "wmd": message_digest
+        })
+        return self._call(self.WMD_API_URL + endpoint, params=params,
+                          headers=self.headers, **kwargs)
+
+    def _call(self, url, **kwargs):
+        """Request 'url' and return the decoded JSON response
+
+        401, 403 and 404 are raised as Authentication-, Authorization-
+        and NotFoundError; any other HTTP error is logged and re-raised.
+        """
+        try:
+            return self.extractor.request(url, **kwargs).json()
+        except exception.HttpError as exc:
+            if exc.status == 401:
+                raise exception.AuthenticationError()
+            if exc.status == 403:
+                raise exception.AuthorizationError(
+                    "Post requires membership")
+            if exc.status == 404:
+                raise exception.NotFoundError(self.extractor.subcategory)
+            # every caller indexes the result, so returning None here
+            # would only surface later as an unrelated TypeError
+            self.extractor.log.debug(exc)
+            raise
+
+    def _pagination(self, endpoint, params=None):
+        """Yield all items of a cursor-paginated endpoint"""
+        params = dict(params) if params else {}
+        while True:
+            res = self._call_wmd(endpoint, params)
+            yield from res["data"]
+            next_params = (res.get("paging") or {}).get("nextParams")
+            if not next_params:
+                return
+            params["after"] = next_params["after"]