Skip to content

Commit

Permalink
[internal] Make more explicit module mapping code for Python, Shell, …
Browse files Browse the repository at this point in the history
…and Protobuf (#13981)

Because we know that a `SingleSourceField` has exactly one file—and we can ban `*` so can compute its file path without using the engine—we can simplify dependency inference when creating the global module mapping. There's no need to actually look up the file system.

A downside of this PR is that we have lazier validation that the `source` field matches files on disk, i.e. that the file is there. But there are still plenty of places this gets validated, such as when computing the dependencies of a target because we have to inspect its source code. (Compared to creating a global module mapping for dep inference in general.)

There's a marginal speed up, too:

Before:

```
❯ hyperfine -w 1 -r 25 './pants --no-pantsd dependencies src/python/pants/util/strutil.py:util'
Benchmark #1: ./pants --no-pantsd dependencies src/python/pants/util/strutil.py:util
  Time (mean ± σ):      4.576 s ±  0.139 s    [User: 3.180 s, System: 0.657 s]
  Range (min … max):    4.373 s …  5.003 s    25 runs
```

After:

```
❯ hyperfine -w 1 -r 25 './pants --no-pantsd dependencies src/python/pants/util/strutil.py:util'
Benchmark #1: ./pants --no-pantsd dependencies src/python/pants/util/strutil.py:util
  Time (mean ± σ):      4.487 s ±  0.172 s    [User: 3.019 s, System: 0.628 s]
  Range (min … max):    4.224 s …  5.094 s    25 runs
```

[ci skip-rust]
[ci skip-build-wheels]
  • Loading branch information
Eric-Arellano authored Dec 27, 2021
1 parent dc6373e commit 67b6fa2
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from pants.backend.codegen.protobuf.protoc import Protoc
from pants.backend.codegen.protobuf.target_types import AllProtobufTargets, ProtobufSourceField
from pants.core.util_rules.stripped_source_files import StrippedSourceFileNames
from pants.core.util_rules.stripped_source_files import StrippedFileName, StrippedFileNameRequest
from pants.engine.addresses import Address
from pants.engine.fs import Digest, DigestContents
from pants.engine.rules import Get, MultiGet, collect_rules, rule
Expand All @@ -22,7 +22,6 @@
HydrateSourcesRequest,
InferDependenciesRequest,
InferredDependencies,
SourcesPathsRequest,
WrappedTarget,
)
from pants.engine.unions import UnionRule
Expand All @@ -41,22 +40,20 @@ class ProtobufMapping:

@rule(desc="Creating map of Protobuf file names to Protobuf targets", level=LogLevel.DEBUG)
async def map_protobuf_files(protobuf_targets: AllProtobufTargets) -> ProtobufMapping:
stripped_sources_per_target = await MultiGet(
Get(StrippedSourceFileNames, SourcesPathsRequest(tgt[ProtobufSourceField]))
stripped_file_per_target = await MultiGet(
Get(StrippedFileName, StrippedFileNameRequest(tgt[ProtobufSourceField].file_path))
for tgt in protobuf_targets
)

stripped_files_to_addresses: dict[str, Address] = {}
stripped_files_with_multiple_owners: DefaultDict[str, set[Address]] = defaultdict(set)
for tgt, stripped_sources in zip(protobuf_targets, stripped_sources_per_target):
assert len(stripped_sources) == 1
stripped_f = stripped_sources[0]
if stripped_f in stripped_files_to_addresses:
stripped_files_with_multiple_owners[stripped_f].update(
{stripped_files_to_addresses[stripped_f], tgt.address}
for tgt, stripped_file in zip(protobuf_targets, stripped_file_per_target):
if stripped_file.value in stripped_files_to_addresses:
stripped_files_with_multiple_owners[stripped_file.value].update(
{stripped_files_to_addresses[stripped_file.value], tgt.address}
)
else:
stripped_files_to_addresses[stripped_f] = tgt.address
stripped_files_to_addresses[stripped_file.value] = tgt.address

# Remove files with ambiguous owners.
for ambiguous_stripped_f in stripped_files_with_multiple_owners:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
FirstPartyPythonMappingImpl,
FirstPartyPythonMappingImplMarker,
)
from pants.core.util_rules.stripped_source_files import StrippedSourceFileNames
from pants.core.util_rules.stripped_source_files import StrippedFileName, StrippedFileNameRequest
from pants.engine.addresses import Address
from pants.engine.rules import Get, MultiGet, collect_rules, rule
from pants.engine.target import SourcesPathsRequest, Target
from pants.engine.target import Target
from pants.engine.unions import UnionRule
from pants.util.frozendict import FrozenDict
from pants.util.logging import LogLevel
Expand All @@ -36,8 +36,8 @@ async def map_protobuf_to_python_modules(
protobuf_targets: AllProtobufTargets,
_: PythonProtobufMappingMarker,
) -> FirstPartyPythonMappingImpl:
stripped_sources_per_target = await MultiGet(
Get(StrippedSourceFileNames, SourcesPathsRequest(tgt[ProtobufSourceField]))
stripped_file_per_target = await MultiGet(
Get(StrippedFileName, StrippedFileNameRequest(tgt[ProtobufSourceField].file_path))
for tgt in protobuf_targets
)

Expand All @@ -53,14 +53,13 @@ def add_module(module: str, tgt: Target) -> None:
else:
modules_to_addresses[module] = (tgt.address,)

for tgt, stripped_sources in zip(protobuf_targets, stripped_sources_per_target):
for stripped_f in stripped_sources:
# NB: We don't consider the MyPy plugin, which generates `_pb2.pyi`. The stubs end up
# sharing the same module as the implementation `_pb2.py`. Because both generated files
# come from the same original Protobuf target, we're covered.
add_module(proto_path_to_py_module(stripped_f, suffix="_pb2"), tgt)
if tgt.get(ProtobufGrpcToggleField).value:
add_module(proto_path_to_py_module(stripped_f, suffix="_pb2_grpc"), tgt)
for tgt, stripped_file in zip(protobuf_targets, stripped_file_per_target):
# NB: We don't consider the MyPy plugin, which generates `_pb2.pyi`. The stubs end up
# sharing the same module as the implementation `_pb2.py`. Because both generated files
# come from the same original Protobuf target, we're covered.
add_module(proto_path_to_py_module(stripped_file.value, suffix="_pb2"), tgt)
if tgt.get(ProtobufGrpcToggleField).value:
add_module(proto_path_to_py_module(stripped_file.value, suffix="_pb2_grpc"), tgt)

# Remove modules with ambiguous owners.
for ambiguous_module in modules_with_multiple_owners:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
PythonRequirementTypeStubModulesField,
PythonSourceField,
)
from pants.core.util_rules.stripped_source_files import StrippedSourceFileNames
from pants.core.util_rules.stripped_source_files import StrippedFileName, StrippedFileNameRequest
from pants.engine.addresses import Address
from pants.engine.rules import Get, MultiGet, collect_rules, rule
from pants.engine.target import AllTargets, SourcesPathsRequest, Target
from pants.engine.target import AllTargets, Target
from pants.engine.unions import UnionMembership, UnionRule, union
from pants.util.frozendict import FrozenDict
from pants.util.logging import LogLevel
Expand Down Expand Up @@ -205,18 +205,16 @@ class FirstPartyPythonTargetsMappingMarker(FirstPartyPythonMappingImplMarker):
async def map_first_party_python_targets_to_modules(
_: FirstPartyPythonTargetsMappingMarker, all_python_targets: AllPythonTargets
) -> FirstPartyPythonMappingImpl:
stripped_sources_per_target = await MultiGet(
Get(StrippedSourceFileNames, SourcesPathsRequest(tgt[PythonSourceField]))
stripped_file_per_target = await MultiGet(
Get(StrippedFileName, StrippedFileNameRequest(tgt[PythonSourceField].file_path))
for tgt in all_python_targets.first_party
)

modules_to_addresses: DefaultDict[str, list[Address]] = defaultdict(list)
modules_with_type_stub: set[str] = set()
modules_with_multiple_implementations: DefaultDict[str, set[Address]] = defaultdict(set)
for tgt, stripped_sources in zip(all_python_targets.first_party, stripped_sources_per_target):
# `PythonSourceFile` validates that each target has exactly one file.
assert len(stripped_sources) == 1
stripped_f = PurePath(stripped_sources[0])
for tgt, stripped_file in zip(all_python_targets.first_party, stripped_file_per_target):
stripped_f = PurePath(stripped_file.value)
is_type_stub = stripped_f.suffix == ".pyi"

module = PythonModule.create_from_stripped_path(stripped_f).module
Expand Down
13 changes: 5 additions & 8 deletions src/python/pants/backend/python/goals/setup_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
from pants.base.specs import AddressSpecs, AscendantAddresses
from pants.core.goals.package import BuiltPackage, BuiltPackageArtifact, PackageFieldSet
from pants.core.target_types import FileSourceField, ResourceSourceField
from pants.core.util_rules.stripped_source_files import StrippedSourceFileNames
from pants.core.util_rules.stripped_source_files import StrippedFileName, StrippedFileNameRequest
from pants.engine.addresses import Address, UnparsedAddressInputs
from pants.engine.collection import Collection, DeduplicatedCollection
from pants.engine.fs import (
Expand All @@ -68,7 +68,6 @@
Dependencies,
DependenciesRequest,
SourcesField,
SourcesPaths,
Target,
Targets,
TransitiveTargets,
Expand Down Expand Up @@ -502,15 +501,13 @@ async def generate_chroot(
# is just a dummy string, required because our source root stripping mechanism assumes
# that paths are files and starts searching from the parent dir. It doesn't correspond
# to an actual file on disk, so there are no collision issues.
# TODO: Add source root stripping functionality for directories.
stripped = await Get(
StrippedSourceFileNames,
SourcesPaths(
files=(os.path.join(request.exported_target.target.address.spec_path, "dummy.py"),),
dirs=(),
StrippedFileName,
StrippedFileNameRequest(
os.path.join(request.exported_target.target.address.spec_path, "dummy.py")
),
)
working_directory = os.path.dirname(stripped[0])
working_directory = os.path.dirname(stripped.value)
chroot_digest = sources.digest
return DistBuildChroot(chroot_digest, working_directory)

Expand Down
13 changes: 3 additions & 10 deletions src/python/pants/backend/shell/dependency_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@
HydrateSourcesRequest,
InferDependenciesRequest,
InferredDependencies,
SourcesPaths,
SourcesPathsRequest,
Targets,
WrappedTarget,
)
Expand Down Expand Up @@ -60,16 +58,11 @@ class ShellMapping:


@rule(desc="Creating map of Shell file names to Shell targets", level=LogLevel.DEBUG)
async def map_shell_files(tgts: AllShellTargets) -> ShellMapping:
sources_per_target = await MultiGet(
Get(SourcesPaths, SourcesPathsRequest(tgt[ShellSourceField])) for tgt in tgts
)

def map_shell_files(tgts: AllShellTargets) -> ShellMapping:
files_to_addresses: dict[str, Address] = {}
files_with_multiple_owners: DefaultDict[str, set[Address]] = defaultdict(set)
for tgt, sources in zip(tgts, sources_per_target):
assert len(sources.files) == 1
fp = sources.files[0]
for tgt in tgts:
fp = tgt[ShellSourceField].file_path
if fp in files_to_addresses:
files_with_multiple_owners[fp].update({files_to_addresses[fp], tgt.address})
else:
Expand Down
26 changes: 26 additions & 0 deletions src/python/pants/core/util_rules/stripped_source_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pants.core.util_rules.source_files import SourceFiles
from pants.core.util_rules.source_files import rules as source_files_rules
from pants.engine.collection import Collection
from pants.engine.engine_aware import EngineAwareParameter
from pants.engine.fs import Digest, DigestSubset, MergeDigests, PathGlobs, RemovePrefix, Snapshot
from pants.engine.rules import Get, MultiGet, collect_rules, rule
from pants.engine.target import SourcesPaths
Expand Down Expand Up @@ -89,6 +90,31 @@ async def strip_source_roots(source_files: SourceFiles) -> StrippedSourceFiles:
return StrippedSourceFiles(resulting_snapshot)


@dataclass(frozen=True)
class StrippedFileName:
value: str


@dataclass(frozen=True)
class StrippedFileNameRequest(EngineAwareParameter):
file_path: str

def debug_hint(self) -> str:
return self.file_path


@rule
async def strip_file_name(request: StrippedFileNameRequest) -> StrippedFileName:
source_root = await Get(
SourceRoot, SourceRootRequest, SourceRootRequest.for_file(request.file_path)
)
return StrippedFileName(
request.file_path
if source_root.path == "."
else fast_relpath(request.file_path, source_root.path)
)


class StrippedSourceFileNames(Collection[str]):
"""The file names from a target's `sources` field, with source roots stripped.
Expand Down
15 changes: 14 additions & 1 deletion src/python/pants/core/util_rules/stripped_source_files_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@

from pants.core.util_rules import stripped_source_files
from pants.core.util_rules.source_files import SourceFiles, SourceFilesRequest
from pants.core.util_rules.stripped_source_files import StrippedSourceFileNames, StrippedSourceFiles
from pants.core.util_rules.stripped_source_files import (
StrippedFileName,
StrippedFileNameRequest,
StrippedSourceFileNames,
StrippedSourceFiles,
)
from pants.engine.addresses import Address
from pants.engine.fs import EMPTY_SNAPSHOT
from pants.engine.internals.scheduler import ExecutionError
Expand All @@ -30,6 +35,7 @@ def rule_runner() -> RuleRunner:
QueryRule(SourceFiles, [SourceFilesRequest]),
QueryRule(StrippedSourceFiles, [SourceFiles]),
QueryRule(StrippedSourceFileNames, [SourcesPathsRequest]),
QueryRule(StrippedFileName, [StrippedFileNameRequest]),
],
target_types=[TargetWithSources],
)
Expand Down Expand Up @@ -142,3 +148,10 @@ def assert_stripped_source_file_names(
assert_stripped_source_file_names(
Address("", target_name="empty"), source_root="/", expected=[]
)


@pytest.mark.parametrize("source_root,expected", [("root", "f.txt"), ("/", "root/f.txt")])
def test_strip_file_name(rule_runner: RuleRunner, source_root: str, expected: str) -> None:
rule_runner.set_options([f"--source-root-patterns=['{source_root}']"])
result = rule_runner.request(StrippedFileName, [StrippedFileNameRequest("root/f.txt")])
assert result.value == expected

0 comments on commit 67b6fa2

Please sign in to comment.