Skip to content

Commit

Permalink
[2ch] add 'thread' and 'board' extractors
Browse files Browse the repository at this point in the history
- [2ch] add thread extractor
- [2ch] add board extractor
- [2ch] add new entry to supported sites
  • Loading branch information
hunter-gatherer8 authored and mikf committed Jan 15, 2024
1 parent 69726fc commit 6c4abc9
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 0 deletions.
6 changes: 6 additions & 0 deletions docs/supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW.
</tr>
</thead>
<tbody valign="top">
<tr>
<td>2ch</td>
<td>https://2ch.hk/</td>
<td>Boards, Threads</td>
<td></td>
</tr>
<tr>
<td>2chen</td>
<td>https://sturdychan.help/</td>
Expand Down
84 changes: 84 additions & 0 deletions gallery_dl/extractor/2ch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.2ch.hk/"""

from .common import Extractor, Message
from .. import text


class _2chThreadExtractor(Extractor):
    """Extractor for 2ch threads"""
    category = "2ch"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{file_id} - {filename}.{extension}"
    archive_fmt = "{board}_{thread}_{file_id}"
    pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html"

    def __init__(self, match):
        """Store the board name and thread number captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    @staticmethod
    def _stem(name):
        """Return 'name' without its last '.suffix' (unchanged if no dot)"""
        stem, dot, _ = name.rpartition(".")
        return stem if dot else name

    @staticmethod
    def _suffix(name):
        """Return the text after the last dot in 'name' ('' if no dot)"""
        _, dot, suffix = name.rpartition(".")
        return suffix if dot else ""

    def items(self):
        """Yield one Directory message, then one Url message per file

        The thread's JSON document is requested once. The directory
        metadata holds 'board', 'thread' and a 'title' limited to 50
        characters, taken from the first post's subject or, failing
        that, its HTML-stripped comment. Every file metadata dict
        additionally holds 'post_num', 'file_id', 'filename' and
        'extension'.
        """
        url = f"https://2ch.hk/{self.board}/res/{self.thread}.json"
        thread_data = self.request(url).json()

        posts = thread_data["threads"][0]["posts"]
        first_post = posts[0]
        # Fall back to an empty title instead of raising KeyError when
        # the first post has neither a subject nor a comment.
        title = (first_post.get("subject") or
                 text.remove_html(first_post.get("comment") or ""))

        thread_metadata = {
            "board": self.board,
            "thread": self.thread,
            "title": text.unescape(title)[:50],
        }

        yield Message.Directory, thread_metadata
        for post in posts:
            # 'files' may be absent or empty; both mean "nothing to do"
            for file in post.get("files") or ():
                name = file["name"]
                # NOTE(review): 'fullname' is assumed to be the original
                # upload name; use 'name' when it is missing or empty.
                fullname = file.get("fullname") or name

                file_metadata = {
                    "post_num": post["num"],
                    "file_id": name.partition(".")[0],
                    # A dotless name stays intact instead of collapsing
                    # to "" ...
                    "filename": self._stem(fullname),
                    # ... and does not get duplicated as its extension.
                    "extension": self._suffix(name),
                }
                file_metadata.update(thread_metadata)

                # Drop any leading slash from 'path' so the URL never
                # contains "//" directly after the host name.
                url = "https://2ch.hk/" + file["path"].lstrip("/")
                yield Message.Url, url, file_metadata


class _2chBoardExtractor(Extractor):
    """Extractor for 2ch boards"""
    category = "2ch"
    subcategory = "board"
    pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$"

    def __init__(self, match):
        """Remember the board name captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board = match.group(1)

    def get_pages(self):
        """Yield the board's index page followed by pages 1..n-1

        'n' is the length of the index page's 'pages' entry. Each
        further page is requested only when the consumer asks for it.
        """
        board_root = f"https://2ch.hk/{self.board}"
        first_page = self.request(f"{board_root}/index.json").json()
        page_count = len(first_page['pages'])

        yield first_page
        yield from (
            self.request(f"{board_root}/{number}.json").json()
            for number in range(1, page_count)
        )

    def get_thread_nums(self):
        """Yield the 'thread_num' of every thread on every board page"""
        for board_page in self.get_pages():
            yield from (
                entry["thread_num"] for entry in board_page["threads"]
            )

    def items(self):
        """Queue every thread of the board for _2chThreadExtractor"""
        for number in self.get_thread_nums():
            # a fresh dict per message, as each one is handed off separately
            queue_data = {"_extractor": _2chThreadExtractor}
            thread_url = f"https://2ch.hk/{self.board}/res/{number}.html"
            yield Message.Queue, thread_url, queue_data
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import re

modules = [
"2ch",
"2chan",
"2chen",
"35photo",
Expand Down

0 comments on commit 6c4abc9

Please sign in to comment.