From 1e7457b619524e303376603f06baf36171e86877 Mon Sep 17 00:00:00 2001
From: Niklas Wagner
Date: Sat, 28 Sep 2024 16:54:46 +0200
Subject: [PATCH] Implement videolength filter with ffprobe

---
 docs/configuration.rst        | 34 +++++++++++++
 gallery_dl/downloader/http.py | 96 +++++++++++++++++++++++++++++++++++
 gallery_dl/text.py            | 22 ++++++++
 3 files changed, 152 insertions(+)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index dbc1d1ef45..7098249450 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -5018,6 +5018,40 @@ Description
     These suffixes are case-insensitive.
 
 
+downloader.*.videolength-min & .videolength-max
+-----------------------------------------------
+Type
+    ``string``
+Default
+    ``null``
+Example
+    ``"1min"``, ``"1m30s"``, ``"1h21min31s"``
+Description
+    Minimum/Maximum allowed video length.
+    Any video shorter/longer than this limit will not be downloaded.
+
+    A file qualifies as a video if it contains at least 10 frames. If a file contains multiple video streams, the shortest video stream will be used for comparison.
+
+    This option requires ``ffprobe`` to be available. Additionally, ``downloader.*.ffprobe-location`` can be configured.
+
+    Possible values are valid integer numbers followed by one of the following suffixes:
+    * Hours: ``hours``, ``hour``, ``h``
+    * Minutes: ``minutes``, ``minute``, ``min``, ``m``
+    * Seconds: ``seconds``, ``second``, ``sec``, ``s``
+
+    Multiple values can be combined, e.g. ``2hours30min2s``
+
+
+downloader.*.ffprobe-location
+-----------------------------
+Type
+    ``string``
+Default
+    ``ffprobe``
+Description
+    Path/Location of ``ffprobe``. Used for the ``downloader.*.videolength-min & .videolength-max`` option.
+
+
 downloader.*.mtime
 ------------------
 Type
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 54750ac733..62e38cd51c 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -10,6 +10,9 @@
 
 import time
 import mimetypes
+import subprocess
+import json
+from datetime import timedelta
 from requests.exceptions import RequestException, ConnectionError, Timeout
 from .common import DownloaderBase
 from .. import text, util
@@ -32,6 +35,10 @@ def __init__(self, job):
         self.headers = self.config("headers")
         self.minsize = self.config("filesize-min")
         self.maxsize = self.config("filesize-max")
+        self.minlength = self.config("videolength-min")
+        self.maxlength = self.config("videolength-max")
+        ffprobe = self.config("ffprobe-location")
+        self.ffprobe = util.expand_path(ffprobe) if ffprobe else "ffprobe"
         self.retries = self.config("retries", extractor._retries)
         self.retry_codes = self.config("retry-codes", extractor._retry_codes)
         self.timeout = self.config("timeout", extractor._timeout)
@@ -59,6 +66,18 @@ def __init__(self, job):
                 self.log.warning(
                     "Invalid maximum file size (%r)", self.maxsize)
             self.maxsize = maxsize
+        if self.minlength:
+            minlength = text.parse_duration(self.minlength)
+            if not minlength:
+                self.log.warning(
+                    "Invalid minimum video length (%r)", self.minlength)
+            self.minlength = minlength
+        if self.maxlength:
+            maxlength = text.parse_duration(self.maxlength)
+            if not maxlength:
+                self.log.warning(
+                    "Invalid maximum video length (%r)", self.maxlength)
+            self.maxlength = maxlength
         if isinstance(self.chunk_size, str):
             chunk_size = text.parse_bytes(self.chunk_size)
             if not chunk_size:
@@ -219,6 +238,26 @@ def _download_impl(self, url, pathfmt):
                 kwdict[metadata] = util.extract_headers(response)
                 build_path = True
 
+            # check video length using ffprobe request
+            if self.minlength or self.maxlength:
+                length = self._fetch_videolength(url)
+
+                if length and self.minlength and length < self.minlength:
+                    self.release_conn(response)
+                    self.log.warning(
+                        "Video length is shorter than allowed minimum "
+                        "(%s < %s)", length, self.minlength)
+                    pathfmt.temppath = ""
+                    return True
+
+                if length and self.maxlength and length > self.maxlength:
+                    self.release_conn(response)
+                    self.log.warning(
+                        "Video length is longer than allowed maximum "
+                        "(%s > %s)", length, self.maxlength)
+                    pathfmt.temppath = ""
+                    return True
+
             # build and check file path
             if build_path:
                 pathfmt.build_path()
@@ -376,6 +415,75 @@ def _adjust_extension(pathfmt, file_header):
                     return True
         return False
 
+    def _fetch_videolength(self, url):
+        """Return the length of the shortest video stream of 'url'
+
+        Runs ffprobe on 'url' and returns a timedelta, or None when
+        ffprobe fails or does not report a usable video stream.
+        """
+        minimum_frames = 10
+        args = [
+            self.ffprobe,
+            # "error" instead of "quiet", so that stderr is not empty
+            # when ffprobe fails and its output gets logged below
+            "-v", "error",
+            "-print_format", "json",
+            "-show_format",
+            "-show_streams",
+            url,
+        ]
+
+        try:
+            result = subprocess.run(
+                args,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+                check=True,
+            )
+            data = json.loads(result.stdout)
+        except OSError as exc:
+            # ffprobe is not installed or not executable
+            self.log.error("Unable to run ffprobe: %s", exc)
+            return None
+        except subprocess.CalledProcessError as exc:
+            self.log.error("ffprobe failed: %s", exc.stderr)
+            return None
+        except ValueError:
+            # json.JSONDecodeError is a subclass of ValueError
+            self.log.error("Failed to decode ffprobe output as JSON")
+            return None
+
+        # _frame_count() returns 0 for streams without a usable
+        # 'duration', so float() cannot fail for the remaining ones
+        durations = [
+            float(stream["duration"])
+            for stream in data.get("streams") or ()
+            if stream.get("codec_type") == "video"
+            and self._frame_count(stream) >= minimum_frames
+        ]
+
+        if not durations:
+            self.log.info(
+                "No video streams found or none with a valid duration "
+                "and minimum frames.")
+            return None
+
+        # the shortest video stream is the one used for comparison
+        return timedelta(seconds=min(durations))
+
+    def _frame_count(self, stream):
+        """Calculates the number of frames in the video stream."""
+        try:
+            duration = float(stream["duration"])
+            # 'avg_frame_rate' is a fraction such as "30000/1001";
+            # parse it by hand instead of eval()-ing external output
+            num, _, den = stream["avg_frame_rate"].partition("/")
+            return int(duration * (float(num) / float(den or 1)))
+        except (KeyError, TypeError, ValueError, AttributeError,
+                ZeroDivisionError, OverflowError):
+            return 0
+
 
 MIME_TYPES = {
     "image/jpeg"    : "jpg",
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 8517cdf5dd..de8f872cae 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -268,6 +268,33 @@ def parse_timestamp(ts, default=None):
         return default
 
 
+def parse_duration(duration_string, default=None):
+    """Convert a duration string like '1h30min' into a timedelta"""
+    # one '<number><unit>' component, with optional surrounding whitespace
+    pattern = re.compile(
+        r"\s*(\d+)\s*"
+        r"(h(?:ours?)?|m(?:in(?:ute)?s?)?|s(?:ec(?:ond)?s?)?)\s*",
+        re.IGNORECASE)
+    units = {"h": "hours", "m": "minutes", "s": "seconds"}
+
+    try:
+        values = {}
+        pos = 0
+        while pos < len(duration_string):
+            match = pattern.match(duration_string, pos)
+            if not match:
+                # reject input containing anything but valid components
+                return default
+            unit = units[match.group(2)[0].lower()]
+            values[unit] = values.get(unit, 0) + int(match.group(1))
+            pos = match.end()
+        if not values:
+            return default
+        return datetime.timedelta(**values)
+    except (TypeError, OverflowError):
+        return default
+
+
 def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0):
     """Create a datetime object by parsing 'date_string'"""
     try: