Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent util.execute from reading output files #296

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
21 changes: 17 additions & 4 deletions qcengine/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import sys
import time
import pathlib

import pytest
from qcelemental.models import AtomicInput
Expand Down Expand Up @@ -64,17 +65,29 @@ def test_tmpdir():
assert str(tmpdir).split(os.path.sep)[-1].endswith("this")


def test_disk_files():
@pytest.mark.parametrize("outfiles_track", [[], ["thing*"], ["thing*", "other"]])
def test_disk_files(outfiles_track):

infiles = {"thing1": "hello", "thing2": "world", "other": "everyone"}
outfiles = {"thing*": None, "other": None}
with util.temporary_directory(suffix="this") as tmpdir:
with util.disk_files(infiles=infiles, outfiles=outfiles, cwd=tmpdir):
with util.disk_files(infiles=infiles, outfiles=outfiles, cwd=tmpdir, outfiles_track=outfiles_track):
pass

assert outfiles.keys() == {"thing*", "other"}
assert outfiles["thing*"]["thing1"] == "hello"
assert outfiles["other"] == "everyone"
for ofile, ofile_val in outfiles.items():
if isinstance(ofile_val, dict):
if ofile in outfiles_track:
for fpath in ofile_val.values():
assert isinstance(fpath, pathlib.PurePath)
else:
for key in ofile_val.keys():
print(key)
assert ofile_val[key] == infiles[key]
elif ofile in outfiles_track:
assert isinstance(ofile_val, pathlib.PurePath)
else:
assert ofile_val == infiles[ofile]


def test_popen_tee_output(capsys):
Expand Down
57 changes: 41 additions & 16 deletions qcengine/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,7 @@ def execute(
infiles: Optional[Dict[str, str]] = None,
outfiles: Optional[List[str]] = None,
*,
outfiles_track: Optional[List[str]] = [],
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point but shouldn't matter here since outfiles_track should never be modified, but if you're concerned someone might I could change its type to an immutable object like a Tuple OR if you prefer a list for consistency that is by default None, I could add a single line to disk_files:

outfiles_track = outfiles_track or []

that would make the current code work with no additional changes.

as_binary: Optional[List[str]] = None,
scratch_name: Optional[str] = None,
scratch_directory: Optional[str] = None,
Expand Down Expand Up @@ -404,6 +405,11 @@ def execute(
outfiles : List[str] = None
Output file names to be collected after execution into
values. May be {}.
outfiles_track: List[str], optional
Keys of `outfiles` to keep track of without loading their contents in memory.
For specified filename in `outfiles_track`, the file path instead of contents
is stored in `outfiles`. To ensure tracked files are not deleted after execution,
you must set `scratch_messy=True`.
as_binary : List[str] = None
Keys of `infiles` or `outfiles` to be treated as bytes.
scratch_name : str, optional
Expand All @@ -428,7 +434,6 @@ def execute(
Run command through the shell.
exit_code: int, optional
The exit code above which the process is considered failure.

Raises
------
FileExistsError
Expand Down Expand Up @@ -471,7 +476,9 @@ def execute(
) as scrdir:
popen_kwargs["cwd"] = scrdir
popen_kwargs["shell"] = shell
with disk_files(infiles, outfiles, cwd=scrdir, as_binary=as_binary) as extrafiles:
with disk_files(
infiles, outfiles, cwd=scrdir, as_binary=as_binary, outfiles_track=outfiles_track
) as extrafiles:
with popen(command, popen_kwargs=popen_kwargs) as proc:
# Wait for the subprocess to complete or the timeout to expire
if interupt_after is None:
Expand Down Expand Up @@ -562,7 +569,8 @@ def disk_files(
*,
cwd: Optional[str] = None,
as_binary: Optional[List[str]] = None,
) -> Dict[str, Union[str, bytes]]:
outfiles_track: Optional[List[str]] = [],
) -> Dict[str, Union[str, bytes, Path]]:
"""Write and collect files.

Parameters
Expand All @@ -577,10 +585,13 @@ def disk_files(
Directory to which to write and read files.
as_binary : List[str] = None
Keys in `infiles` (`outfiles`) to be written (read) as bytes, not decoded.

outfiles_track: List[str], optional
Keys of `outfiles` to keep track of (i.e. file contents not loaded in memory).
For specified filename in `outfiles_track`, the file path instead of contents
is stored in `outfiles`.
Yields
------
Dict[str] = str
Dict[str, Union[str, bytes, Path]]
outfiles with RHS filled in.

"""
Expand All @@ -604,19 +615,33 @@ def disk_files(

finally:
for fl in outfiles.keys():
omode = "rb" if fl in as_binary else "r"
try:
filename = lwd / fl
with open(filename, omode) as fp:
outfiles[fl] = fp.read()
LOGGER.info(f"... Writing ({omode}): {filename}")
except (OSError, FileNotFoundError):
if "*" in fl:
filename = lwd / fl
if fl not in outfiles_track:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you handling the case when outfiles=["subdir/*"], outfiles_track=["subdir/bigfile"]?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, this will not work (bigfile will be loaded in memory). For outfiles_track to work, it has to be a subset of outfiles. So for the following input:

{
    ...
    "outfiles": ["*"],
    "outfiles_track": ["bigfile"],
    ...
}

or the opposite:

{
    ...
    "outfiles": ["bigfile", ...],
    "outfiles_track": ["*"],
    ...
}

outfiles_track will be ignored. There are also no sanity checks being done on outfiles_track so if one specifies a filename not part of outfiles, it will be ignored without any warnings.

I guess the main advantage in setting outfiles_load: bool is we don't have to worry about individual cases like these.

omode = "rb" if fl in as_binary else "r"
try:
with open(filename, omode) as fp:
outfiles[fl] = fp.read()
LOGGER.info(f"... Reading ({omode}): {filename}")
except (OSError, FileNotFoundError):
if "*" in fl:
gfls = {}
for gfl in lwd.glob(fl):
with open(gfl, omode) as fp:
gfls[gfl.name] = fp.read()
LOGGER.info(f"... Reading ({omode}): {gfl}")
if not gfls:
gfls = None
outfiles[fl] = gfls
else:
outfiles[fl] = None
else:
if filename.is_file():
outfiles[fl] = filename
elif "*" in fl:
gfls = {}
for gfl in lwd.glob(fl):
with open(gfl, omode) as fp:
gfls[gfl.name] = fp.read()
LOGGER.info(f"... Writing ({omode}): {gfl}")
if gfl.is_file():
gfls[gfl.name] = gfl
if not gfls:
gfls = None
outfiles[fl] = gfls
Expand Down