Support downloading videos and audio

Respond to comments. Improve archiving and file naming.
mikf · Dec 23, 2024 · 33954fd
1 parent 451de93
commit 33954fd
Show file tree

Hide file tree

Showing 4 changed files with 147 additions and 70 deletions.
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
@@ -22,69 +22,104 @@ class TiktokExtractor(Extractor):
 
     category = "tiktok"
     directory_fmt = ("{category}", "{user}")
-    filename_fmt = "{title} [{id}] [{index}].{extension}"
-    archive_fmt = "{id}_{img_id}"
+    filename_fmt = "{title} [{id}{index:?_//}{img_id:?_//}].{extension}"
+    archive_fmt = "{id}_{index}_{img_id}"
     root = "https://www.tiktok.com/"
     cookies_domain = ".tiktok.com"
 
     def urls(self):
         return [self.url]
 
     def items(self):
+        videos = self.config("videos", True)
         for tiktok_url in self.urls():
             # If we can recognise that this is a /photo/ link, preemptively
             # replace it with /video/ to prevent a needless second request.
             # See below.
-            tiktok_url = compile(
+            tiktok_url_to_use = compile(
                 escape("/photo/"),
                 IGNORECASE
             ).sub("/video/", tiktok_url)
             video_detail = util.json_loads(text.extr(
-                self.request(tiktok_url).text,
+                self.request(tiktok_url_to_use).text,
                 '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
                 'type="application/json">',
                 '</script>'
             ))["__DEFAULT_SCOPE__"]
             if "webapp.video-detail" not in video_detail:
                 # Only /video/ links result in the video-detail dict we need.
                 # Try again using that form of link.
-                tiktok_url = video_detail["seo.abtest"]["canonical"] \
+                tiktok_url_to_use = video_detail["seo.abtest"]["canonical"] \
                     .replace("/photo/", "/video/")
                 video_detail = util.json_loads(text.extr(
-                    self.request(tiktok_url).text,
+                    self.request(tiktok_url_to_use).text,
                     '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
                     'type="application/json">',
                     '</script>'
                 ))["__DEFAULT_SCOPE__"]
             video_detail = video_detail["webapp.video-detail"]
-            has_status = "statusMsg" in video_detail
-            if has_status and video_detail["statusMsg"] == "author_secret":
-                raise exception.AuthorizationError("Login required to access "
-                                                   "this post")
+            if "statusCode" in video_detail:
+                if video_detail["statusCode"] == 10222:
+                    raise exception.AuthorizationError(
+                        tiktok_url + ": Login required to access this post"
+                    )
+                elif video_detail["statusCode"] == 10204:
+                    raise exception.NotFoundError(tiktok_url)
+                elif video_detail["statusCode"] == 10231:
+                    raise exception.ExtractionError(
+                        tiktok_url + " is region locked, try downloading with "
+                        "a VPN/proxy connection"
+                    )
+                elif video_detail["statusCode"] != 0:
+                    raise exception.ExtractionError(
+                        tiktok_url + ": Received unknown error code " +
+                        str(video_detail['statusCode']) + " with message " +
+                        (video_detail['statusMsg'] if
+                            "statusMsg" in video_detail else "")
+                    )
             post_info = video_detail["itemInfo"]["itemStruct"]
+            id = post_info["id"]
+            original_title = title = post_info["desc"]
+            if len(original_title) == 0:
+                title = "TikTok photo #{}".format(id)
+            title = title[:150]
             user = post_info["author"]["uniqueId"]
             if "imagePost" in post_info:
                 yield Message.Directory, {"user": user}
                 img_list = post_info["imagePost"]["images"]
                 for i, img in enumerate(img_list):
                     url = img["imageURL"]["urlList"][0]
                     name_and_ext = text.nameext_from_url(url)
-                    id = post_info["id"]
-                    title = post_info["desc"]
-                    if len(title) == 0:
-                        title = "TikTok photo #{}".format(id)
                     yield Message.Url, url, {
-                        "title"     : text.sanitize_for_filename(title)[:170],
+                        "title"     : title,
                         "id"        : id,
-                        "index"     : i,
+                        "index"     : i + 1,
                         "img_id"    : name_and_ext["filename"].split("~")[0],
                         "extension" : name_and_ext["extension"],
                         "width"     : img["imageWidth"],
                         "height"    : img["imageHeight"]
                     }
+            elif videos:
+                # It's probably obvious but I thought it was worth noting
+                # because I got stuck on this for a while: make sure to emit
+                # a Directory message before attempting to download anything
+                # with yt-dlp! Otherwise you'll run into NoneType, set_filename
+                # errors since the download job doesn't get initialized.
+                yield Message.Directory, {"user": user}
+                if len(original_title) == 0:
+                    title = "TikTok video #{}".format(id)
+                    title = title[:150]
             else:
-                # TODO: Not a slide show. Should pass this to yt-dlp.
-                pass
+                self.log.info("Skipping video post %s", tiktok_url)
+            if videos:
+                yield Message.Url, "ytdl:" + tiktok_url_to_use, {
+                    "filename"  : "",
+                    "extension" : "",
+                    "title"     : title,
+                    "id"        : id,
+                    "index"     : "",
+                    "img_id"    : ""
+                }
 
 
 class TiktokPostExtractor(TiktokExtractor):

diff --git a/gallery_dl/text.py b/gallery_dl/text.py
@@ -51,21 +51,6 @@ def slugify(value):
     return re.sub(r"[-\s]+", "-", value).strip("-_")
 
 
-def sanitize_for_filename(string):
-    """Removes characters from a string that would be illegal to have in
-    a filename
-
-    This function is similar to slugify(), except it retains more
-    characters (notably characters such as # and @).
-
-    Note that the length of the string is not capped!
-
-    Inspiration:
-    https://stackoverflow.com/a/71199182
-    """
-    return re.sub(r"[/\\?%*:|\"<>\x7F\x00-\x1F]", " ", str(string))
-
-
 def ensure_http_scheme(url, scheme="https://"):
     """Prepend 'scheme' to 'url' if it doesn't have one"""
     if url and not url.startswith(("https://", "http://")):

diff --git a/test/results/tiktok.py b/test/results/tiktok.py
@@ -5,66 +5,147 @@
 # published by the Free Software Foundation.
 
 from gallery_dl.extractor import tiktok
+from gallery_dl import exception
 
 PATTERN = r"https://p1[69]-.*\.tiktokcdn.*\.com/.*/[0-9a-fA-F]+~.*\.jpeg"
+PATTERN_WITH_AUDIO = r"(?:" + PATTERN + r")|(?:ytdl\:)"
 
 
 __tests__ = (
-# Test many photos.
 {
     "#url"      : "https://www.tiktok.com/@chillezy/photo/7240568259186019630",
+    "#comment"  : "/photo/ link: many photos",
     "#category" : ("", "tiktok", "post"),
     "#class"    : tiktok.TiktokPostExtractor,
-    "#pattern"  : PATTERN
+    "#pattern"  : PATTERN,
+    "#options"  : {"videos": False}
 },
 {
     "#url"      : "https://www.tiktok.com/@chillezy/video/7240568259186019630",
+    "#comment"  : "/video/ link: many photos",
     "#category" : ("", "tiktok", "post"),
     "#class"    : tiktok.TiktokPostExtractor,
-    "#pattern"  : PATTERN
+    "#pattern"  : PATTERN,
+    "#options"  : {"videos": False}
 },
 {
     "#url"      : "https://vm.tiktok.com/ZGdh4WUhr/",
+    "#comment"  : "vm.tiktok.com link: many photos",
     "#category" : ("", "tiktok", "vmpost"),
     "#class"    : tiktok.TiktokVmpostExtractor,
-    "#pattern"  : PATTERN
+    "#pattern"  : PATTERN,
+    "#options"  : {"videos": False}
 },
-# Test one photo.
 {
     "#url"      : "https://www.tiktok.com/@d4vinefem/photo/7449575367024626974",
+    "#comment"  : "/photo/ link: single photo",
     "#category" : ("", "tiktok", "post"),
     "#class"    : tiktok.TiktokPostExtractor,
-    "#pattern"  : PATTERN
+    "#pattern"  : PATTERN,
+    "#options"  : {"videos": False}
 },
 {
     "#url"      : "https://www.tiktok.com/@d4vinefem/video/7449575367024626974",
+    "#comment"  : "/video/ link: single photo",
     "#category" : ("", "tiktok", "post"),
     "#class"    : tiktok.TiktokPostExtractor,
-    "#pattern"  : PATTERN
+    "#pattern"  : PATTERN,
+    "#options"  : {"videos": False}
 },
 {
     "#url"      : "https://vm.tiktok.com/ZGdhVtER2/",
+    "#comment"  : "vm.tiktok.com link: single photo",
     "#category" : ("", "tiktok", "vmpost"),
     "#class"    : tiktok.TiktokVmpostExtractor,
-    "#pattern"  : PATTERN
+    "#pattern"  : PATTERN,
+    "#options"  : {"videos": False}
 },
-# Test a few photos.
 {
     "#url"      : "https://www.tiktok.com/@.mcfc.central/photo/7449701420934122785",
+    "#comment"  : "/photo/ link: few photos",
     "#category" : ("", "tiktok", "post"),
     "#class"    : tiktok.TiktokPostExtractor,
-    "#pattern"  : PATTERN
+    "#pattern"  : PATTERN,
+    "#options"  : {"videos": False}
 },
 {
     "#url"      : "https://www.tiktok.com/@.mcfc.central/video/7449701420934122785",
+    "#comment"  : "/video/ link: few photos",
     "#category" : ("", "tiktok", "post"),
     "#class"    : tiktok.TiktokPostExtractor,
-    "#pattern"  : PATTERN
+    "#pattern"  : PATTERN,
+    "#options"  : {"videos": False}
 },
 {
     "#url"      : "https://vm.tiktok.com/ZGdhVW3cu/",
+    "#comment"  : "vm.tiktok.com link: few photos",
     "#category" : ("", "tiktok", "vmpost"),
     "#class"    : tiktok.TiktokVmpostExtractor,
-    "#pattern"  : PATTERN
-}
+    "#pattern"  : PATTERN,
+    "#options"  : {"videos": False}
+},
+{
+    "#url"       : "https://www.tiktok.com/@ughuwhguweghw/video/1",
+    "#comment"   : "deleted post",
+    "#category"  : ("", "tiktok", "post"),
+    "#class"     : tiktok.TiktokPostExtractor,
+    "#exception" : exception.NotFoundError,
+    "#options"  : {"videos": False}
+},
+{
+    "#url"      : "https://www.tiktok.com/@memezar/video/7449708266168274208",
+    "#comment"  : "Video post",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#urls"     : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208",
+    "#options"  : {"videos": True}
+},
+{
+    "#url"      : "https://www.tiktok.com/@memezar/photo/7449708266168274208",
+    "#comment"  : "Video post as a /photo/ link",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#urls"     : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208",
+    "#options"  : {"videos": True}
+},
+{
+    "#url"      : "https://vm.tiktok.com/ZGdht7cjp/",
+    "#comment"  : "Video post as a VM link",
+    "#category" : ("", "tiktok", "vmpost"),
+    "#class"    : tiktok.TiktokVmpostExtractor,
+    "#urls"     : "ytdl:https://vm.tiktok.com/ZGdht7cjp/",
+    "#options"  : {"videos": True}
+},
+{
+    "#url"      : "https://www.tiktok.com/@memezar/video/7449708266168274208",
+    "#comment"  : "Skipping video post",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#urls"     : [],
+    "#options"  : {"videos": False}
+},
+{
+    "#url"      : "https://www.tiktok.com/@chillezy/photo/7240568259186019630",
+    "#comment"  : "/photo/ link: many photos with audio",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#pattern"  : PATTERN_WITH_AUDIO,
+    "#options"  : {"videos": True}
+},
+{
+    "#url"      : "https://www.tiktok.com/@chillezy/video/7240568259186019630",
+    "#comment"  : "/video/ link: many photos with audio",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#pattern"  : PATTERN_WITH_AUDIO,
+    "#options"  : {"videos": True}
+},
+{
+    "#url"      : "https://vm.tiktok.com/ZGdh4WUhr/",
+    "#comment"  : "vm.tiktok.com link: many photos with audio",
+    "#category" : ("", "tiktok", "vmpost"),
+    "#class"    : tiktok.TiktokVmpostExtractor,
+    "#pattern"  : PATTERN_WITH_AUDIO,
+    "#options"  : {"videos": True}
+},
 )
diff --git a/test/test_text.py b/test/test_text.py
@@ -92,30 +92,6 @@ def test_slugify(self, f=text.slugify):
         self.assertEqual(f(1), "1")
         self.assertEqual(f(2.3), "23")
 
-    def test_sanitize_for_filename(self, f=text.sanitize_for_filename):
-        self.assertEqual(f("Hello World"), "Hello World")
-        self.assertEqual(f("-HeLLo---World-"), "-HeLLo---World-")
-        self.assertEqual(
-            f("_-H#e:l#l:o+\t+W?o!rl=d-_"),
-            "_-H#e l#l o+ +W o!rl=d-_"
-        )
-        self.assertEqual(f("_Hello_World_"), "_Hello_World_")
-        self.assertEqual(
-            f("/\\?%*:|\"<>\x7F\x00\x0B\x1F"),
-            "              "
-        )
-
-        self.assertEqual(f(""), "")
-        self.assertEqual(f("-"), "-")
-        self.assertEqual(f("--"), "--")
-
-        self.assertEqual(f(()), "()")
-        self.assertEqual(f([]), "[]")
-        self.assertEqual(f({}), "{}")
-        self.assertEqual(f(None), "None")
-        self.assertEqual(f(1), "1")
-        self.assertEqual(f(2.3), "2.3")
-
     def test_ensure_http_scheme(self, f=text.ensure_http_scheme):
         result = "https://example.org/filename.ext"