Skip to content

Commit

Permalink
Allow non-dimension keys in dataId if they are records
Browse files Browse the repository at this point in the history
This allows raft and name_in_raft to be used for detector,
or seq_num and day_obs to be used for exposure.

Also includes disambiguation logic to handle the case where a key
such as day_obs matches records in multiple dimensions -- the most
popular dimension is chosen.
  • Loading branch information
timj committed Nov 20, 2020
1 parent 3efb8ce commit 9c6a0ed
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 1 deletion.
89 changes: 88 additions & 1 deletion python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
)


from collections import defaultdict
from collections import defaultdict, Counter
import contextlib
import logging
import os
Expand Down Expand Up @@ -648,6 +648,93 @@ def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
"Could not find matching alternative (primary key has type %s).",
value, dimensionName, dimension.primaryKey.getPythonType())

# If we have some unrecognized dimensions we have to try to connect
# them to records in other dimensions. This is made more complicated
# by some dimensions having records with clashing names. A mitigation
# is that we can tell by this point which dimensions are missing
# for the DatasetType but this does not work for calibrations
# where additional dimensions can be used to constrain the temporal
# axis.
if not_dimensions:
# Calculate missing dimensions
provided = set(newDataId) | set(kwds) | set(byRecord)
missingDimensions = datasetType.dimensions.names - provided

# For calibrations we may well be needing temporal dimensions
# so rather than always including all dimensions in the scan
# restrict things a little. It is still possible for there
# to be confusion over day_obs in visit vs exposure for example.
# If we are not searching calibration collections things may
# fail but they are going to fail anyway because of the
# ambiguousness of the dataId...
candidateDimensions = set()
candidateDimensions.update(missingDimensions)
if datasetType.isCalibration():
for dim in self.registry.dimensions.getStaticDimensions():
if dim.temporal:
candidateDimensions.add(str(dim))

# Look up table for the first association with a dimension
guessedAssociation: dict[Any, dict[str, Any]] = defaultdict(dict)

# Keep track of whether an item is associated with multiple
# dimensions.
counter = Counter()
assigned: dict[Any, Set[str]] = defaultdict(set)

# Go through the missing dimensions and associate the
# given names with records within those dimensions
for dimensionName in candidateDimensions:
dimension = self.registry.dimensions[dimensionName]
fields = dimension.metadata | dimension.uniqueKeys
for field in not_dimensions:
if field in fields:
guessedAssociation[dimensionName][field] = not_dimensions[field]
counter[dimensionName] += 1
assigned[field].add(dimensionName)

# There is a chance we have allocated a single dataId item
# to multiple dimensions. Need to decide which should be retained.
# For now assume that the most popular alternative wins.
# This means that day_obs with seq_num will result in
# exposure.day_obs and not visit.day_obs
# Also prefer an explicitly missing dimension over an inferred
# temporal dimension.
for fieldName, assignedDimensions in assigned.items():
if len(assignedDimensions) > 1:
# Pick the most popular (preferring mandatory dimensions)
requiredButMissing = assignedDimensions.intersection(missingDimensions)
if requiredButMissing:
candidateDimensions = requiredButMissing
else:
candidateDimensions = assignedDimensions

# Select the relevant items and get a new restricted
# counter.
theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
duplicatesCounter = Counter()
duplicatesCounter.update(theseCounts)

# Choose the most common. If they are equally common
# we will pick the one that was found first.
# Returns a list of tuples
selected = duplicatesCounter.most_common(1)[0][0]

log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
" Removed ambiguity by choosing dimension %s.",
fieldName, ", ".join(assignedDimensions), selected)

for candidateDimension in assignedDimensions:
if candidateDimension != selected:
del guessedAssociation[candidateDimension][fieldName]

# Update the record look up dict with the new associations
for dimensionName, values in guessedAssociation.items():
if values: # A dict might now be empty
log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
dimensionName, values)
byRecord[dimensionName].update(values)

if byRecord:
# Some record specifiers were found so we need to convert
# them to the Id form
Expand Down
10 changes: 10 additions & 0 deletions tests/test_simpleButler.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,17 @@ def testGetCalibration(self):
"obs_id": "three",
"timespan": Timespan(t1, t2),
"physical_filter": "Cam1-G",
"day_obs": 20201114,
"seq_num": 55,
},
{
"instrument": "Cam1",
"id": 4,
"obs_id": "four",
"timespan": Timespan(t2, t3),
"physical_filter": "Cam1-G",
"day_obs": 20211114,
"seq_num": 42,
},
)
# Get some biases from raw-like data IDs.
Expand Down Expand Up @@ -297,6 +301,12 @@ def testGetCalibration(self):
collections="calibs", instrument="Cam1")
self.assertEqual(bias3b_id, bias3b.id)

# Now with implied record columns
bias3b_id, _ = butler.get("bias", day_obs=20211114, seq_num=42,
raft="B", name_in_raft="a",
collections="calibs", instrument="Cam1")
self.assertEqual(bias3b_id, bias3b.id)


if __name__ == "__main__":
unittest.main()

0 comments on commit 9c6a0ed

Please sign in to comment.