mikf · CasualYT31 · Dec 22, 2024 · Dec 22, 2024 · Dec 22, 2024 · Dec 22, 2024
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -1,6 +1,7 @@
 # Supported Sites
 
 <!-- auto-generated by scripts/supportedsites.py -->
+
 Consider all listed sites to potentially be NSFW.
 
 <table>
@@ -925,6 +926,12 @@ Consider all listed sites to potentially be NSFW.
     <td>Galleries</td>
     <td></td>
 </tr>
+<tr>
+    <td>TikTok</td>
+    <td>https://www.tiktok.com/</td>
+    <td>Photos</td>
+    <td><a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a></td>
+</tr>
 <tr>
     <td>TMOHentai</td>
     <td>https://tmohentai.com/</td>

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -168,6 +168,7 @@
     "tapas",
     "tcbscans",
     "telegraph",
+    "tiktok",
     "tmohentai",
     "toyhouse",
     "tsumino",

diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.tiktok.com/"""
+
+from .common import Extractor, Message
+from .. import exception, text, util
+from re import compile, escape, IGNORECASE
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?tiktok\.com"
+USER_PATTERN = BASE_PATTERN + r"/+@([\w.]{0,23}\w)(?:/\S*)?"
+POST_PATTERN = (USER_PATTERN + r"/+(?:[pP][hH][oO][tT][oO]|"
+                r"[vV][iI][dD][eE][oO])/+(?:[0-9]+)/*")
+VM_POST_PATTERN = r"(?:https?://)?(?:vm\.)?tiktok\.com/+.*/*"
+
+
+class TiktokExtractor(Extractor):
+    """Base class for TikTok extractors
+
+    Downloads the images of TikTok photo posts; posts without an
+    "imagePost" entry are currently skipped (see TODO in items()).
+    """
+
+    category = "tiktok"
+    directory_fmt = ("{category}", "{user}")
+    filename_fmt = "{title} [{id}] [{index}].{extension}"
+    archive_fmt = "{id}_{img_id}"
+    root = "https://www.tiktok.com/"
+    cookies_domain = ".tiktok.com"
+    # Case-insensitive "/photo/" matcher; compiled once, not per URL.
+    _photo_re = compile(escape("/photo/"), IGNORECASE)
+
+    def urls(self):
+        """Return the list of post URLs that items() should process"""
+        return [self.url]
+
+    def _default_scope(self, url):
+        """Fetch the /video/ form of 'url' and return the
+        "__DEFAULT_SCOPE__" dict of its embedded rehydration JSON"""
+        # /photo/ links are rewritten up front, since only /video/ links
+        # result in the video-detail dict we need.
+        url = self._photo_re.sub("/video/", url)
+        return util.json_loads(text.extr(
+            self.request(url).text,
+            '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
+            'type="application/json">',
+            '</script>'
+        ))["__DEFAULT_SCOPE__"]
+
+    def items(self):
+        """Yield a Directory and then one Url message per image of each
+        photo post; raise AuthorizationError on an "author_secret" status"""
+        for tiktok_url in self.urls():
+            scope = self._default_scope(tiktok_url)
+            if "webapp.video-detail" not in scope:
+                # No video-detail for this URL.
+                # Try again using the canonical link of the post.
+                scope = self._default_scope(
+                    scope["seo.abtest"]["canonical"])
+            video_detail = scope["webapp.video-detail"]
+            if video_detail.get("statusMsg") == "author_secret":
+                raise exception.AuthorizationError("Login required to access "
+                                                   "this post")
+            post_info = video_detail["itemInfo"]["itemStruct"]
+            if "imagePost" not in post_info:
+                # TODO: Not a slide show. Should pass this to yt-dlp.
+                continue
+            post_id = post_info["id"]
+            # Posts with an empty description get a generic title.
+            title = post_info["desc"] or "TikTok photo #{}".format(post_id)
+            title = text.sanitize_for_filename(title)[:170]
+            yield Message.Directory, {"user": post_info["author"]["uniqueId"]}
+            for index, img in enumerate(post_info["imagePost"]["images"]):
+                url = img["imageURL"]["urlList"][0]
+                name_and_ext = text.nameext_from_url(url)
+                yield Message.Url, url, {
+                    "title"     : title,
+                    "id"        : post_id,
+                    "index"     : index,
+                    "img_id"    : name_and_ext["filename"].split("~")[0],
+                    "extension" : name_and_ext["extension"],
+                    "width"     : img["imageWidth"],
+                    "height"    : img["imageHeight"]
+                }
+
+
+class TiktokPostExtractor(TiktokExtractor):
+    """Extractor for a single TikTok post given by its photo or video URL"""
+
+    subcategory = "post"
+    pattern = POST_PATTERN
+    example = "https://www.tiktok.com/@chillezy/photo/7240568259186019630"
+
+
+class TiktokVmpostExtractor(TiktokExtractor):
+    """Extractor for a single TikTok post given by a vm.tiktok.com link"""
+
+    subcategory = "vmpost"
+    pattern = VM_POST_PATTERN
+    example = "https://vm.tiktok.com/ZGdh4WUhr/"
+
+
+# TODO: Write profile extractor.
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
@@ -51,6 +51,21 @@ def slugify(value):
     return re.sub(r"[-\s]+", "-", value).strip("-_")
 
 
+def sanitize_for_filename(string):
+    """Replace characters that are illegal in filenames with spaces
+
+    Every occurrence of / \\ ? % * : | " < > as well as DEL and the
+    control characters 0x00-0x1F is swapped for a single space.
+    Unlike slugify(), all other characters (e.g. # and @) are kept.
+
+    'string' is converted with str() first; its length is not capped.
+    Inspiration: https://stackoverflow.com/a/71199182
+    """
+    illegal = r"[/\\?%*:|\"<>\x7F\x00-\x1F]"
+    value = str(string)
+    return re.sub(illegal, " ", value)
+
+
 def ensure_http_scheme(url, scheme="https://"):
     """Prepend 'scheme' to 'url' if it doesn't have one"""
     if url and not url.startswith(("https://", "http://")):

diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
@@ -141,10 +141,11 @@
     "tbib"           : "The Big ImageBoard",
     "tcbscans"       : "TCB Scans",
     "tco"            : "Twitter t.co",
-    "tmohentai"      : "TMOHentai",
     "thatpervert"    : "ThatPervert",
     "thebarchive"    : "The /b/ Archive",
     "thecollection"  : "The /co/llection",
+    "tiktok"         : "TikTok",
+    "tmohentai"      : "TMOHentai",
     "tumblrgallery"  : "TumblrGallery",
     "vanillarock"    : "もえぴりあ",
     "vidyart2"       : "/v/idyart2",

diff --git a/test/results/tiktok.py b/test/results/tiktok.py
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import tiktok
+
+PATTERN = r"https://p1[69]-.*\.tiktokcdn.*\.com/.*/[0-9a-fA-F]+~.*\.jpeg"
+
+
+__tests__ = (
+# Post with many photos: /photo/, /video/ and vm.tiktok.com forms.
+{
+    "#url"      : "https://www.tiktok.com/@chillezy/photo/7240568259186019630",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#pattern"  : PATTERN
+},
+{
+    "#url"      : "https://www.tiktok.com/@chillezy/video/7240568259186019630",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#pattern"  : PATTERN
+},
+{
+    "#url"      : "https://vm.tiktok.com/ZGdh4WUhr/",
+    "#category" : ("", "tiktok", "vmpost"),
+    "#class"    : tiktok.TiktokVmpostExtractor,
+    "#pattern"  : PATTERN
+},
+# Post with a single photo, again in all three URL forms.
+{
+    "#url"      : "https://www.tiktok.com/@d4vinefem/photo/7449575367024626974",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#pattern"  : PATTERN
+},
+{
+    "#url"      : "https://www.tiktok.com/@d4vinefem/video/7449575367024626974",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#pattern"  : PATTERN
+},
+{
+    "#url"      : "https://vm.tiktok.com/ZGdhVtER2/",
+    "#category" : ("", "tiktok", "vmpost"),
+    "#class"    : tiktok.TiktokVmpostExtractor,
+    "#pattern"  : PATTERN
+},
+# Post with a few photos, again in all three URL forms.
+{
+    "#url"      : "https://www.tiktok.com/@.mcfc.central/photo/7449701420934122785",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#pattern"  : PATTERN
+},
+{
+    "#url"      : "https://www.tiktok.com/@.mcfc.central/video/7449701420934122785",
+    "#category" : ("", "tiktok", "post"),
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#pattern"  : PATTERN
+},
+{
+    "#url"      : "https://vm.tiktok.com/ZGdhVW3cu/",
+    "#category" : ("", "tiktok", "vmpost"),
+    "#class"    : tiktok.TiktokVmpostExtractor,
+    "#pattern"  : PATTERN
+}
+)
diff --git a/test/test_text.py b/test/test_text.py
@@ -92,6 +92,30 @@ def test_slugify(self, f=text.slugify):
         self.assertEqual(f(1), "1")
         self.assertEqual(f(2.3), "23")
 
+    def test_sanitize_for_filename(self, f=text.sanitize_for_filename):
+        # (input, expected) pairs; illegal characters turn into spaces
+        cases = (
+            ("Hello World", "Hello World"),
+            ("-HeLLo---World-", "-HeLLo---World-"),
+            ("_-H#e:l#l:o+\t+W?o!rl=d-_", "_-H#e l#l o+ +W o!rl=d-_"),
+            ("_Hello_World_", "_Hello_World_"),
+            ("/\\?%*:|\"<>\x7F\x00\x0B\x1F", "              "),
+            # strings without any illegal characters stay untouched
+            ("", ""),
+            ("-", "-"),
+            ("--", "--"),
+            # non-string arguments are converted with str() first
+            ((), "()"),
+            ([], "[]"),
+            ({}, "{}"),
+            (None, "None"),
+            (1, "1"),
+            (2.3, "2.3"),
+        )
+
+        for value, expected in cases:
+            self.assertEqual(f(value), expected)
+
     def test_ensure_http_scheme(self, f=text.ensure_http_scheme):
         result = "https://example.org/filename.ext"