Merge pull request #4830 from voxel51/builtin-operators
Adding builtin operators for more of the FO interface
brimoor authored Sep 24, 2024
2 parents 75e8e87 + 160f7bd commit 77de164
Showing 6 changed files with 2,167 additions and 287 deletions.
2 changes: 1 addition & 1 deletion docs/source/brain.rst
@@ -5,7 +5,7 @@ FiftyOne Brain
 
 .. default-role:: code
 
-The `FiftyOne Brain <https://github.com/voxel51/fiftyone-brain>` provides
+The `FiftyOne Brain <https://github.com/voxel51/fiftyone-brain>`_ provides
 powerful machine learning techniques that are designed to transform how you
 curate your data from an art into a measurable science.
 
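The one-character fix matters in reStructuredText: an inline external link of the form `text <url>`_ requires the trailing underscore to become a hyperlink reference; without it, the construct renders as plain interpreted text instead of a link.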
63 changes: 44 additions & 19 deletions fiftyone/core/collections.py
@@ -643,33 +643,58 @@ def _sync_samples_last_modified_at(self):
 
     def _sync_dataset_last_modified_at(self):
         dataset = self._root_dataset
-        if self.media_type == fom.GROUP:
-            samples = self.select_group_slices(media_type=fom.VIDEO)
-        else:
-            samples = self
+        curr_lma = dataset.last_modified_at
+        lma = self._get_last_modified_at()
 
+        if lma is not None and (curr_lma is None or lma > curr_lma):
+            dataset._doc.last_modified_at = lma
+            dataset._doc.save(virtual=True)
+
+    def _get_last_modified_at(self, frames=False):
+        if frames and not self._contains_videos(any_slice=True):
+            return
+
-        results = samples._aggregate(
-            post_pipeline=[
+        if isinstance(self, fod.Dataset):
+            # pylint:disable=no-member
+            dataset = self
+            if frames:
+                coll = dataset._frame_collection
+            else:
+                coll = dataset._sample_collection
+
+            pipeline = [
                 {"$sort": {"last_modified_at": -1}},
                 {"$limit": 1},
                 {"$project": {"last_modified_at": True}},
             ]
-        )
 
-        try:
-            last_modified_at = next(iter(results))["last_modified_at"]
-        except:
-            last_modified_at = None
+            results = foo.aggregate(coll, pipeline)
+        else:
+            if self.media_type == fom.GROUP:
+                if frames:
+                    view = self.select_group_slices(media_type=fom.VIDEO)
+                else:
+                    view = self.select_group_slices(_allow_mixed=True)
+            else:
+                view = self
 
-        if last_modified_at is None:
-            return
+            pipeline = [
+                {
+                    "$group": {
+                        "_id": None,
+                        "last_modified_at": {"$max": "$last_modified_at"},
+                    }
+                }
+            ]
 
-        if (
-            dataset.last_modified_at is None
-            or last_modified_at > dataset.last_modified_at
-        ):
-            dataset._doc.last_modified_at = last_modified_at
-            dataset._doc.save(virtual=True)
+            results = view._aggregate(
+                frames_only=frames, post_pipeline=pipeline
+            )
+
+        try:
+            return next(iter(results))["last_modified_at"]
+        except:
+            return None
 
     def stats(
         self,
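The new `_get_last_modified_at()` helper chooses between two aggregation shapes: on a full dataset it sorts the backing collection by the indexed `last_modified_at` field and keeps the top document, while on a view it reduces whatever samples match to a single `$max`. A minimal pymongo sketch of both strategies, assuming a local MongoDB and illustrative database/collection names (not FiftyOne's actual internals):

from pymongo import MongoClient

client = MongoClient()
coll = client["fiftyone"]["samples"]

# Full dataset: sort on the indexed field, keep the single newest document
pipeline = [
    {"$sort": {"last_modified_at": -1}},
    {"$limit": 1},
    {"$project": {"last_modified_at": True}},
]
doc = next(coll.aggregate(pipeline), None)
print(doc["last_modified_at"] if doc else None)

# Filtered view: fold the matching documents into a single $max accumulator
pipeline = [
    {"$match": {"tags": "validation"}},  # stand-in for a view's pipeline
    {
        "$group": {
            "_id": None,
            "last_modified_at": {"$max": "$last_modified_at"},
        }
    },
]
doc = next(coll.aggregate(pipeline), None)
print(doc["last_modified_at"] if doc else None)

The sorted form can use the index on `last_modified_at` and stop after one document, which is why it is reserved for whole collections; a view must run its own pipeline first, so a `$group` fold is the natural fit there.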
62 changes: 32 additions & 30 deletions fiftyone/core/dataset.py
@@ -1716,9 +1716,9 @@ def create_summary_field(
                 given ``path``
             sidebar_group (None): the name of a
                 :ref:`App sidebar group <app-sidebar-groups>` to which to add
-                the summary field, if necessary. By default, all summary fields
-                are added to a ``"summaries"`` group. You can pass ``False`` to
-                skip sidebar group modification
+                the summary field. By default, all summary fields are added to
+                a ``"summaries"`` group. You can pass ``False`` to skip sidebar
+                group modification
             include_counts (False): whether to include per-value counts when
                 summarizing categorical fields
             group_by (None): an optional attribute to group by when ``path``
@@ -1752,27 +1752,8 @@ def create_summary_field(
                 f"undeclared field '{path}'"
             )
 
-        _path, is_frame_field, list_fields, _, _ = self._parse_field_name(path)
-
         if field_name is None:
-            _chunks = _path.split(".")
-
-            chunks = []
-            if is_frame_field:
-                chunks.append("frames")
-
-            found_list = False
-            for i, _chunk in enumerate(_chunks, 1):
-                if ".".join(_chunks[:i]) in list_fields:
-                    found_list = True
-                    break
-                else:
-                    chunks.append(_chunk)
-
-            if found_list:
-                chunks.append(_chunks[-1])
-
-            field_name = "_".join(chunks)
+            field_name = self._get_default_summary_field_name(path)
 
         index_fields = []
         summary_info = {"path": path, "field_type": field_type}
@@ -1891,6 +1872,27 @@ def create_summary_field(
 
         return field_name
 
+    def _get_default_summary_field_name(self, path):
+        _path, is_frame_field, list_fields, _, _ = self._parse_field_name(path)
+        _chunks = _path.split(".")
+
+        chunks = []
+        if is_frame_field:
+            chunks.append("frames")
+
+        found_list = False
+        for i, _chunk in enumerate(_chunks, 1):
+            if ".".join(_chunks[:i]) in list_fields:
+                found_list = True
+                break
+            else:
+                chunks.append(_chunk)
+
+        if found_list:
+            chunks.append(_chunks[-1])
+
+        return "_".join(chunks)
+
     def _populate_summary_field(self, field_name, summary_info):
         path = summary_info["path"]
         field_type = summary_info["field_type"]
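The naming rule that `_get_default_summary_field_name()` now encapsulates joins the non-list components of the path with underscores, appends the leaf name once a list component is crossed, and prefixes frame fields with `frames`. A self-contained sketch under stated assumptions: `default_summary_field_name` is a hypothetical standalone port, and the real method derives `is_frame_field` and `list_fields` via `_parse_field_name()`:

def default_summary_field_name(path, is_frame_field, list_fields):
    _chunks = path.split(".")

    chunks = []
    if is_frame_field:
        chunks.append("frames")

    # Walk path prefixes until one names a list field, collecting components
    found_list = False
    for i, _chunk in enumerate(_chunks, 1):
        if ".".join(_chunks[:i]) in list_fields:
            found_list = True
            break
        else:
            chunks.append(_chunk)

    # A list component is collapsed: only the leaf name is kept after it
    if found_list:
        chunks.append(_chunks[-1])

    return "_".join(chunks)

# e.g. a label attribute inside a detections list collapses to "<root>_<leaf>"
print(default_summary_field_name(
    "ground_truth.detections.label",
    is_frame_field=False,
    list_fields={"ground_truth.detections"},
))  # ground_truth_label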
@@ -2066,17 +2068,15 @@ def check_summary_fields(self):
                     update_indexes.append(path)
             elif self._is_frame_field(source_path):
                 if frames_last_modified_at is None:
-                    frames_last_modified_at, _ = self.bounds(
-                        "frames.last_modified_at"
+                    frames_last_modified_at = self._get_last_modified_at(
+                        frames=True
                     )
 
                 if frames_last_modified_at > last_modified_at:
                     update_indexes.append(path)
             else:
                 if samples_last_modified_at is None:
-                    _, samples_last_modified_at = self.bounds(
-                        "last_modified_at"
-                    )
+                    samples_last_modified_at = self._get_last_modified_at()
 
                 if samples_last_modified_at > last_modified_at:
                     update_indexes.append(path)
@@ -4324,7 +4324,8 @@ def save_view(
         self._doc.reload("saved_views")
 
         self._doc.saved_views.append(view_doc)
-        self.save()
+        self._doc.last_modified_at = now
+        self._doc.save(virtual=True)
 
     def get_saved_view_info(self, name):
         """Loads the editable information about the saved view with the given
@@ -4625,7 +4626,8 @@ def save_workspace(
         self._doc.reload("workspaces")
 
         self._doc.workspaces.append(workspace_doc)
-        self.save()
+        self._doc.last_modified_at = now
+        self._doc.save(virtual=True)
 
     def load_workspace(self, name):
         """Loads the saved workspace with the given name.
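Both `save_view()` and `save_workspace()` now stamp `last_modified_at` explicitly and persist it with a targeted `_doc.save(virtual=True)` rather than a full `self.save()`. A brief usage sketch of the public API this touches; the dataset and its `uniqueness` field follow the zoo quickstart, so treat those names as assumptions:

import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone import ViewField as F

dataset = foz.load_zoo_dataset("quickstart")

# Saving a view now also updates the dataset's `last_modified_at`
view = dataset.match(F("uniqueness") > 0.5)
dataset.save_view("unique", view)
print(dataset.last_modified_at)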
123 changes: 89 additions & 34 deletions fiftyone/migrations/revisions/v1_0_0.py
@@ -5,62 +5,84 @@
 | `voxel51.com <https://voxel51.com/>`_
 |
 """
-import logging
-
-
-logger = logging.getLogger(__name__)
+from datetime import datetime
 
 
 def up(db, dataset_name):
     match_d = {"name": dataset_name}
     dataset_dict = db.datasets.find_one(match_d)
 
-    # Add `last_modified_at` property
-    if "last_modified_at" not in dataset_dict:
-        dataset_dict["last_modified_at"] = None
+    now = datetime.utcnow()
 
-    add_samples_created_at = False
+    # Populate `Dataset.last_modified_at`
+    if dataset_dict.get("last_modified_at", None) is None:
+        dataset_dict["last_modified_at"] = now
+
+    added_created_at_samples = False
+    added_last_modified_at_samples = False
     sample_fields = dataset_dict.get("sample_fields", [])
     if sample_fields:
-        add_samples_created_at = _up_fields(sample_fields)
+        (
+            added_created_at_samples,
+            added_last_modified_at_samples,
+        ) = _up_fields(dataset_name, sample_fields)
 
-    add_frames_created_at = False
+    added_created_at_frames = False
+    added_last_modified_at_frames = False
     frame_fields = dataset_dict.get("frame_fields", [])
     if frame_fields:
-        add_frames_created_at = _up_fields(frame_fields)
+        (
+            added_created_at_frames,
+            added_last_modified_at_frames,
+        ) = _up_fields(dataset_name, frame_fields)
 
     db.datasets.replace_one(match_d, dataset_dict)
 
-    # Populate `Sample.created_at` values
-    if add_samples_created_at:
-        sample_collection_name = dataset_dict.get(
-            "sample_collection_name", None
+    sample_collection_name = dataset_dict.get("sample_collection_name", None)
+    if sample_collection_name:
+        _up_field_values(
+            db,
+            dataset_name,
+            sample_collection_name,
+            added_created_at_samples,
+            added_last_modified_at_samples,
+            now,
         )
-        if sample_collection_name:
-            _add_created_at(db, dataset_name, sample_collection_name)
 
-    # Populate `Frame.created_at` values
-    if add_frames_created_at:
-        frame_collection_name = dataset_dict.get("frame_collection_name", None)
-        if frame_collection_name:
-            _add_created_at(db, dataset_name, frame_collection_name)
+    frame_collection_name = dataset_dict.get("frame_collection_name", None)
+    if frame_collection_name:
+        _up_field_values(
+            db,
+            dataset_name,
+            frame_collection_name,
+            added_created_at_frames,
+            added_last_modified_at_frames,
+            now,
+        )


 def down(db, dataset_name):
     pass
 
 
-def _up_fields(fields):
+def _up_fields(dataset_name, fields):
     found_created_at = False
     found_last_modified_at = False
 
     for field in fields:
         name = field.get("name", None)
-        found_created_at |= name == "created_at"
-        found_last_modified_at |= name == "last_modified_at"
-
-        # Add `read_only` property
-        if "read_only" not in field:
+        if name == "created_at":
+            # Existing 'created_at' field must be read-only DateTimeField
+            found_created_at = True
+            _up_read_only_datetime_field(dataset_name, field)
+        elif name == "last_modified_at":
+            # Existing 'last_modified_at' field must be read-only DateTimeField
+            found_last_modified_at = True
+            _up_read_only_datetime_field(dataset_name, field)
+        elif "read_only" not in field:
+            # Add `read_only` property
             field["read_only"] = False
 
     # Add `created_at` field
@@ -95,16 +117,49 @@ def _up_fields(fields):
             }
         )
 
-    return not found_created_at
+    added_created_at = not found_created_at
+    added_last_modified_at = not found_last_modified_at
+
+    return added_created_at, added_last_modified_at
 
 
+def _up_read_only_datetime_field(dataset_name, field):
+    field_name = field.get("name", None)
+    ftype = field.get("ftype", None)
+    expected_ftype = "fiftyone.core.fields.DateTimeField"
+
+    if ftype != expected_ftype:
+        raise ValueError(
+            f"Cannot migrate dataset '{dataset_name}' to v1.0.0 because it "
+            f"has an existing '{field_name}' field of type "
+            f"{ftype} != {expected_ftype}. Please rename or delete the field "
+            "and try again"
+        )
+
+    field["read_only"] = True
+
+
+def _up_field_values(
+    db,
+    dataset_name,
+    collection_name,
+    set_created_at,
+    set_last_modified_at,
+    now,
+):
+    set_expr = {}
+    if set_created_at:
+        set_expr["created_at"] = {"$toDate": "$_id"}
+    if set_last_modified_at:
+        set_expr["last_modified_at"] = now
+
+    if not set_expr:
+        return
+
-def _add_created_at(db, dataset_name, collection_name):
     try:
-        pipeline = [{"$set": {"created_at": {"$toDate": "$_id"}}}]
-        db[collection_name].update_many({}, pipeline)
+        db[collection_name].update_many({}, [{"$set": set_expr}])
    except Exception as e:
-        logger.warning(
-            "Failed to populate 'created_at' field for dataset %s. Reason: %s",
-            dataset_name,
-            e,
+        raise RuntimeError(
+            "Failed to populate 'created_at' and/or 'last_modified_at' fields "
+            f"for dataset '{dataset_name}'. Reason: {e}"
         )
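The consolidated `_up_field_values()` helper uses MongoDB's update-with-aggregation-pipeline form (MongoDB 4.2+): `{"$toDate": "$_id"}` recovers each document's creation time from the timestamp embedded in its ObjectId, and `last_modified_at` is stamped with a single migration-time value. A minimal pymongo sketch of the same backfill pattern, with illustrative connection and collection names:

from datetime import datetime

from bson import ObjectId
from pymongo import MongoClient

client = MongoClient()
coll = client["fiftyone"]["samples"]
now = datetime.utcnow()

# ObjectIds embed their creation time, so `$toDate: "$_id"` backfills
# `created_at` without any previously stored value
coll.update_many(
    {},
    [{"$set": {"created_at": {"$toDate": "$_id"}, "last_modified_at": now}}],
)

# The same timestamp is recoverable client-side from any ObjectId
print(ObjectId().generation_time)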
