Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement video-length filter using ffprobe #6246

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
archive/
.venv

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Optional
--------

- yt-dlp_ or youtube-dl_: HLS/DASH video downloads, ``ytdl`` integration
- FFmpeg_: Pixiv Ugoira conversion
- FFmpeg_: Video length filters & Pixiv Ugoira conversion
- mkvmerge_: Accurate Ugoira frame timecodes
- PySocks_: SOCKS proxy support
- brotli_ or brotlicffi_: Brotli compression support
Expand Down
34 changes: 34 additions & 0 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5295,6 +5295,40 @@ Description
These suffixes are case-insensitive.


downloader.*.videolength-min & .videolength-max
-----------------------------------------------
Type
``string``
Default
``null``
Example
``"1min"``, ``"1m30s"``, ``"1h21min31s"``
Description
Minimum/Maximum allowed video length.
Any video shorter/longer than this limit will not be downloaded.

A file qualifies as a video if it contains more than 10 frames. If a file contains multiple video streams, the shortest video will be used for comparison.

This option requires ``ffprobe`` to be available. Additionally, ``downloader.*.ffprobe-location`` can be configured.

Possible values are valid integer numbers followed by one of the following suffixes:
* Hours: ``hours``, ``hour``, ``h``
* Minutes: ``minutes``, ``minute``, ``min``, ``m``
* Seconds: ``seconds``, ``second``, ``sec``, ``s``

Multiple values can be combined. e.g. ``2hours30min2s``


downloader.*.ffprobe-location
-----------------------------
Type
``string``
Default
``ffprobe``
Description
Path/Location of ``ffprobe``. Used for the ``downloader.*.videolength-min`` & ``.videolength-max`` options.


downloader.*.mtime
------------------
Type
Expand Down
40 changes: 39 additions & 1 deletion gallery_dl/downloader/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import mimetypes
from requests.exceptions import RequestException, ConnectionError, Timeout
from .common import DownloaderBase
from .. import text, util
from .. import text, util, ffprobe
from ssl import SSLError


Expand All @@ -32,6 +32,8 @@ def __init__(self, job):
self.headers = self.config("headers")
self.minsize = self.config("filesize-min")
self.maxsize = self.config("filesize-max")
self.minlength = self.config("videolength-min")
self.maxlength = self.config("videolength-max")
self.retries = self.config("retries", extractor._retries)
self.retry_codes = self.config("retry-codes", extractor._retry_codes)
self.timeout = self.config("timeout", extractor._timeout)
Expand Down Expand Up @@ -59,6 +61,20 @@ def __init__(self, job):
self.log.warning(
"Invalid maximum file size (%r)", self.maxsize)
self.maxsize = maxsize
if self.minlength:
minlength = text.parse_duration(self.minlength)
if not minlength:
self.log.warning(
"Invalid maximum videolength duration (%r)",
self.minlength)
self.minlength = minlength
if self.maxlength:
maxlength = text.parse_duration(self.maxlength)
if not maxlength:
self.log.warning(
"Invalid maximum videolength duration (%r)",
self.maxlength)
self.maxlength = maxlength
if isinstance(self.chunk_size, str):
chunk_size = text.parse_bytes(self.chunk_size)
if not chunk_size:
Expand Down Expand Up @@ -219,6 +235,28 @@ def _download_impl(self, url, pathfmt):
kwdict[metadata] = util.extract_headers(response)
build_path = True

# check video length using ffprobe request
if (self.minlength or self.maxlength):
length = ffprobe.get_video_length(self, url)

if length and self.minlength and length < self.minlength:
self.release_conn(response)
self.log.warning(
"Video length is shorter than allowed minimum "
"(%s < %s)",
length, self.minlength)
pathfmt.temppath = ""
return True

if length and self.maxlength and length > self.maxlength:
self.release_conn(response)
self.log.warning(
"Video length is longer than allowed maximum "
"(%s > %s)",
length, self.maxlength)
pathfmt.temppath = ""
return True

# build and check file path
if build_path:
pathfmt.build_path()
Expand Down
95 changes: 95 additions & 0 deletions gallery_dl/ffprobe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-

# Copyright 2014-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Fetch Video Length before actually downloading a whole file"""

import subprocess
import json
import time
from datetime import timedelta
from . import util


def get_video_length(obj, url):
    """Probe 'url' with ffprobe and return the shortest video duration

    'obj' is expected to provide 'config()', 'headers', 'retries',
    and 'log', as the HTTP downloader calling this function does.

    Returns a 'timedelta' for the shortest qualifying video stream,
    'None' if no stream qualifies as a video, and 'False' if ffprobe
    could not be run or kept failing after all retries.
    """
    minimum_frames = 10

    ffprobe = util.expand_path(obj.config("ffprobe-location", "ffprobe"))

    command = [
        ffprobe,
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        "-show_streams",
    ]

    if obj.headers:
        # ffmpeg's '-headers' takes ONE string of CRLF-terminated header
        # lines; repeating the option would only keep the last header
        command.append("-headers")
        command.append("".join(
            "%s: %s\r\n" % (key, value)
            for key, value in obj.headers.items()
        ))

    command.append(url)

    data = None
    tries = 0
    msg = ""

    while True:
        if tries:
            obj.log.warning("%s (%s/%s)", msg, tries, obj.retries + 1)
            if tries > obj.retries:
                return False
            # back off a little longer after every failed attempt
            time.sleep(tries)
        tries += 1

        try:
            result = subprocess.run(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
            data = json.loads(result.stdout)
        except OSError as exc:
            # executable missing or not runnable; retrying cannot help
            obj.log.warning("Unable to run ffprobe (%s)", exc)
            return False
        except subprocess.CalledProcessError as exc:
            msg = "ffprobe failed: " + str(exc)
            continue
        except json.JSONDecodeError:
            msg = "Failed to decode ffprobe output as JSON"
            continue
        break

    # A file typically contains multiple streams (video, audio, subtitle).
    # Here we filter out everything that is not considered a video
    video_streams = [
        float(stream["duration"])
        for stream in data.get("streams") or ()
        if stream.get("codec_type") == "video" and
        "duration" in stream and
        "avg_frame_rate" in stream and
        frame_count(stream) >= minimum_frames
    ]

    if not video_streams:
        obj.log.info(
            "No video streams found or none with a valid duration "
            "and minimum frames."
        )
        return None

    return timedelta(seconds=min(video_streams))


def frame_count(stream):
    """Calculates the number of frames in the video stream.

    'stream' is a mapping with a "duration" value (seconds) and an
    "avg_frame_rate" value given as a number or a "num/den" ratio.

    Returns 'duration * avg_frame_rate' truncated to an integer,
    or 0 if either value cannot be interpreted as a number.
    """
    from fractions import Fraction

    try:
        duration = float(stream["duration"])
        # Fraction parses "num/den" strings without executing them,
        # unlike eval(), which would run whatever ffprobe reports
        avg_frame_rate = float(Fraction(stream["avg_frame_rate"]))
        return int(duration * avg_frame_rate)
    except (ValueError, TypeError, ZeroDivisionError, OverflowError):
        return 0
22 changes: 22 additions & 0 deletions gallery_dl/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,28 @@ def parse_timestamp(ts, default=None):
return default


def parse_duration(duration_string, default=None):
    """Convert a duration string like '1h21min31s' into a timedelta object

    Recognizes one hours, one minutes, and one seconds component, each
    an integer followed by a case-insensitive unit suffix
    (h/hour/hours, m/min/minute/minutes, s/sec/second/seconds).

    Returns 'default' if 'duration_string' is not a string or
    contains none of these components.
    """
    patterns = (
        ("hours", r'(\d+)\s*h(our(s)?)?'),
        ("minutes", r'(\d+)\s*m(in(ute)?(s)?)?'),
        ("seconds", r'(\d+)\s*s(ec(ond)?(s)?)?'),
    )
    parsed_values = {}

    try:
        for unit, pattern in patterns:
            match = re.search(pattern, duration_string, re.IGNORECASE)
            if match:
                parsed_values[unit] = int(match.group(1))

        # nothing recognizable: report this as invalid input
        # instead of silently treating it as a zero-length duration
        if not parsed_values:
            return default

        return datetime.timedelta(**parsed_values)
    except (TypeError, ValueError, OverflowError):
        return default


def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0):
"""Create a datetime object by parsing 'date_string'"""
try:
Expand Down
Loading