Skip to content

Commit

Permalink
Allow non-dimension keys in dataId if they are records
Browse files Browse the repository at this point in the history
This allows raft and name_in_raft to be used for detector,
or seq_num and day_obs to be used for exposure.

Also includes disambiguation logic to handle the case where a key
such as day_obs matches records in multiple dimensions -- the most
popular dimension is chosen.
  • Loading branch information
timj committed Nov 20, 2020
1 parent 3efb8ce commit 9c6a0ed
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 1 deletion.
89 changes: 88 additions & 1 deletion python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
)


from collections import defaultdict
from collections import defaultdict, Counter
import contextlib
import logging
import os
Expand Down Expand Up @@ -648,6 +648,93 @@ def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
"Could not find matching alternative (primary key has type %s).",
value, dimensionName, dimension.primaryKey.getPythonType())

# If we have some unrecognized dimensions we have to try to connect
# them to records in other dimensions. This is made more complicated
# by some dimensions having records with clashing names. A mitigation
# is that we can tell by this point which dimensions are missing
# for the DatasetType but this does not work for calibrations
# where additional dimensions can be used to constrain the temporal
# axis.
if not_dimensions:
# Calculate missing dimensions
provided = set(newDataId) | set(kwds) | set(byRecord)
missingDimensions = datasetType.dimensions.names - provided

# For calibrations we may well be needing temporal dimensions
# so rather than always including all dimensions in the scan
# restrict things a little. It is still possible for there
# to be confusion over day_obs in visit vs exposure for example.
# If we are not searching calibration collections things may
# fail but they are going to fail anyway because of the
# ambiguousness of the dataId...
candidateDimensions = set()
candidateDimensions.update(missingDimensions)
if datasetType.isCalibration():
for dim in self.registry.dimensions.getStaticDimensions():
if dim.temporal:
candidateDimensions.add(str(dim))

# Look up table for the first association with a dimension
guessedAssociation: dict[Any, dict[str, Any]] = defaultdict(dict)

# Keep track of whether an item is associated with multiple
# dimensions.
counter = Counter()
assigned: dict[Any, Set[str]] = defaultdict(set)

# Go through the missing dimensions and associate the
# given names with records within those dimensions
for dimensionName in candidateDimensions:
dimension = self.registry.dimensions[dimensionName]
fields = dimension.metadata | dimension.uniqueKeys
for field in not_dimensions:
if field in fields:
guessedAssociation[dimensionName][field] = not_dimensions[field]
counter[dimensionName] += 1
assigned[field].add(dimensionName)

# There is a chance we have allocated a single dataId item
# to multiple dimensions. Need to decide which should be retained.
# For now assume that the most popular alternative wins.
# This means that day_obs with seq_num will result in
# exposure.day_obs and not visit.day_obs
# Also prefer an explicitly missing dimension over an inferred
# temporal dimension.
for fieldName, assignedDimensions in assigned.items():
if len(assignedDimensions) > 1:
# Pick the most popular (preferring mandatory dimensions)
requiredButMissing = assignedDimensions.intersection(missingDimensions)
if requiredButMissing:
candidateDimensions = requiredButMissing
else:
candidateDimensions = assignedDimensions

# Select the relevant items and get a new restricted
# counter.
theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
duplicatesCounter = Counter()
duplicatesCounter.update(theseCounts)

# Choose the most common. If they are equally common
# we will pick the one that was found first.
# Returns a list of tuples
selected = duplicatesCounter.most_common(1)[0][0]

log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
" Removed ambiguity by choosing dimension %s.",
fieldName, ", ".join(assignedDimensions), selected)

for candidateDimension in assignedDimensions:
if candidateDimension != selected:
del guessedAssociation[candidateDimension][fieldName]

# Update the record look up dict with the new associations
for dimensionName, values in guessedAssociation.items():
if values: # A dict might now be empty
log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
dimensionName, values)
byRecord[dimensionName].update(values)

if byRecord:
# Some record specifiers were found so we need to convert
# them to the Id form
Expand Down
10 changes: 10 additions & 0 deletions tests/test_simpleButler.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,17 @@ def testGetCalibration(self):
"obs_id": "three",
"timespan": Timespan(t1, t2),
"physical_filter": "Cam1-G",
"day_obs": 20201114,
"seq_num": 55,
},
{
"instrument": "Cam1",
"id": 4,
"obs_id": "four",
"timespan": Timespan(t2, t3),
"physical_filter": "Cam1-G",
"day_obs": 20211114,
"seq_num": 42,
},
)
# Get some biases from raw-like data IDs.
Expand Down Expand Up @@ -297,6 +301,12 @@ def testGetCalibration(self):
collections="calibs", instrument="Cam1")
self.assertEqual(bias3b_id, bias3b.id)

# Now with implied record columns
bias3b_id, _ = butler.get("bias", day_obs=20211114, seq_num=42,
raft="B", name_in_raft="a",
collections="calibs", instrument="Cam1")
self.assertEqual(bias3b_id, bias3b.id)


if __name__ == "__main__":
unittest.main()

0 comments on commit 9c6a0ed

Please sign in to comment.