diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 3a704cf454..53c8833509 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW.
 
 
 
+
+ 2ch
+ https://2ch.hk/
+ Boards, Threads
+
+
 
 2chen
 https://sturdychan.help/
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
new file mode 100644
index 0000000000..dbbf21b635
--- /dev/null
+++ b/gallery_dl/extractor/2ch.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://2ch.hk/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+
+class _2chThreadExtractor(Extractor):
+    """Extractor for the files attached to posts of a 2ch thread"""
+    category = "2ch"
+    subcategory = "thread"
+    root = "https://2ch.hk"
+    directory_fmt = ("{category}", "{board}", "{thread} {title}")
+    filename_fmt = "{tim}{filename:? //}.{extension}"
+    archive_fmt = "{board}_{thread}_{tim}"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
+    example = "https://2ch.hk/a/res/12345.html"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board, self.thread = match.groups()
+
+    def items(self):
+        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
+        posts = self.request(url).json()["threads"][0]["posts"]
+
+        op = posts[0]  # first post; provides the thread title
+        title = op.get("subject") or text.remove_html(op["comment"])
+
+        thread = {
+            "board" : self.board,
+            "thread": self.thread,
+            "title" : text.unescape(title)[:50],  # capped at 50 characters
+        }
+
+        yield Message.Directory, thread  # sent once, before any file
+        for post in posts:
+            files = post.get("files")
+            if files:  # posts without files are skipped
+                post["post_name"] = post["name"]  # moved to a new key
+                post["date"] = text.parse_timestamp(post["timestamp"])
+                del post["files"]
+                del post["name"]  # so update(post) keeps file["name"]
+
+                for file in files:
+                    file.update(thread)
+                    file.update(post)  # post fields win on key clashes
+
+                    file["filename"] = file["fullname"].rpartition(".")[0]
+                    file["tim"], _, file["extension"] = \
+                        file["name"].rpartition(".")
+
+                    yield Message.Url, self.root + file["path"], file
+
+
+class _2chBoardExtractor(Extractor):
+    """Extractor that queues the threads listed on a 2ch board"""
+    category = "2ch"
+    subcategory = "board"
+    root = "https://2ch.hk"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
+    example = "https://2ch.hk/a/"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board = match.group(1)
+
+    def items(self):
+        # index page: yield one Queue message per listed thread
+        url = "{}/{}/index.json".format(self.root, self.board)
+        index = self.request(url).json()
+        index["_extractor"] = _2chThreadExtractor
+        for thread in index["threads"]:
+            url = "{}/{}/res/{}.html".format(
+                self.root, self.board, thread["thread_num"])
+            yield Message.Queue, url, index
+
+        # pages 1..n (util.advance presumably skips the first entry)
+        for n in util.advance(index["pages"], 1):
+            url = "{}/{}/{}.json".format(self.root, self.board, n)
+            page = self.request(url).json()
+            page["_extractor"] = _2chThreadExtractor
+            for thread in page["threads"]:
+                url = "{}/{}/res/{}.html".format(
+                    self.root, self.board, thread["thread_num"])
+                yield Message.Queue, url, page
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 13d7b38b65..8e7129618a 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -10,6 +10,7 @@
 import re
 
 modules = [
+    "2ch",
     "2chan",
     "2chen",
     "35photo",
diff --git a/test/results/2ch.py b/test/results/2ch.py
new file mode 100644
index 0000000000..5400292cf4
--- /dev/null
+++ b/test/results/2ch.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+gallery_dl = __import__("gallery_dl.extractor.2ch")
+_2ch = getattr(gallery_dl.extractor, "2ch")  # "2ch" is not a valid identifier
+
+
+__tests__ = (
+{
+    "#url"     : "https://2ch.hk/a/res/6202876.html",
+    "#category": ("", "2ch", "thread"),
+    "#class"   : _2ch._2chThreadExtractor,
+    "#pattern" : r"https://2ch\.hk/a/src/6202876/\d+\.\w+",
+    "#count"   : range(450, 1000),
+
+    "banned"   : 0,
+    "board"    : "a",
+    "closed"   : 0,
+    "comment"  : str,
+    "date"     : "type:datetime",
+    "displayname": str,
+    "email"    : "",
+    "endless"  : 1,
+    "extension": str,
+    "filename" : str,
+    "fullname" : str,
+    "height"   : int,
+    "lasthit"  : 1705273977,
+    "md5"      : r"re:[0-9a-f]{32}",
+    "name"     : r"re:\d+\.\w+",
+    "num"      : int,
+    "number"   : range(1, 1000),
+    "op"       : 0,
+    "parent"   : int,
+    "path"     : r"re:/a/src/6202876/\d+\.\w+",
+    "post_name": "Аноним",
+    "size"     : int,
+    "sticky"   : 0,
+    "subject"  : str,
+    "thread"   : "6202876",
+    "thumbnail": str,
+    "tim"      : r"re:\d+",
+    "timestamp": int,
+    "title"    : "MP4/WEBM",
+    "tn_height": int,
+    "tn_width" : int,
+    "trip"     : "",
+    "type"     : int,
+    "views"    : int,
+    "width"    : int,
+},
+
+{
+    "#url"     : "https://2ch.hk/a/",
+    "#category": ("", "2ch", "board"),
+    "#class"   : _2ch._2chBoardExtractor,
+    "#pattern" : _2ch._2chThreadExtractor.pattern,
+    "#count"   : range(200, 300),
+},
+
+)