Skip to content

Commit

Permalink
feat: add zstd support (#351)
Browse files Browse the repository at this point in the history
* Add zst support

* Remove redundant calls

* Update dependencies in meta.yaml

* Add news

* Install zstandard in CI

* Fix version specifier in meta.yaml

* Fix lint

---------

Co-authored-by: Uwe L. Korn <xhochy@users.noreply.github.com>
Co-authored-by: Uwe L. Korn <uwe.korn@quantco.com>
  • Loading branch information
3 people authored Aug 26, 2024
1 parent 3470236 commit 96ffc23
Show file tree
Hide file tree
Showing 10 changed files with 90 additions and 11 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
conda_pack/_version.py export-subst
# GitHub syntax highlighting
pixi.lock linguist-language=YAML linguist-generated=true
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ jobs:
source $CONDA_ROOT/etc/profile.d/conda.sh
conda info -a
mv conda-bld $CONDA_ROOT/conda-bld
conda create -n cptest local::conda-pack conda-forge::pytest conda-forge::pytest-cov defaults::python=${{ matrix.pyver }} ${{ matrix.conda_deps }}
conda create -n cptest local::conda-pack conda-forge::pytest conda-forge::pytest-cov defaults::python=${{ matrix.pyver }} zstandard>=0.23.0 ${{ matrix.conda_deps }}
conda activate cptest
pytest -v -ss --cov=conda_pack --cov-branch --cov-report=xml conda_pack/tests
- uses: codecov/codecov-action@v4
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@ dist
htmlcov/
.pytest_cache/
__pycache__/
# pixi environments
.pixi
*.egg-info
3 changes: 3 additions & 0 deletions conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ requirements:
run:
- python >=3.8
- setuptools
run_constrained:
- zstandard >=0.23.0

test:
source_files:
Expand All @@ -32,6 +34,7 @@ test:
- pytest
- squashfs-tools
- squashfuse
- zstandard >=0.23.0
commands:
- bash testing/setup_envs.sh
- pytest -s -vv conda_pack/tests
Expand Down
3 changes: 2 additions & 1 deletion conda_pack/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def build_parser():
help="(Parcels only) The distribution type for the parcel. The "
"default value is 'el7'. This value cannot have any hyphens.")
parser.add_argument("--format",
choices=['infer', 'zip', 'tar.gz', 'tgz', 'tar.bz2',
choices=['infer', 'zip', 'tar.gz', 'tgz', 'tar.bz2', 'tar.zst', 'tzst',
'tbz2', 'tar.xz', 'txz', 'tar', 'parcel', 'squashfs',
'no-archive'],
default='infer',
Expand All @@ -75,6 +75,7 @@ def build_parser():
type=int,
default=4,
help=("The compression level to use, from 0 to 9. "
"If ZSTD is used, compression is supported up to 19. "
"Higher numbers decrease output file size at "
"the expense of compression time. Default is 4."))
parser.add_argument("--n-threads", "-j",
Expand Down
16 changes: 10 additions & 6 deletions conda_pack/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ def _output_and_format(self, output=None, format="infer"):
format = "tar.bz2"
elif output.endswith(".tar.xz") or output.endswith(".txz"):
format = "tar.xz"
elif output.endswith(".tar.zst") or output.endswith(".tzst"):
format = "tar.zst"
elif output.endswith(".tar"):
format = "tar"
elif output.endswith(".squashfs"):
Expand All @@ -257,6 +259,8 @@ def _output_and_format(self, output=None, format="infer"):
"tar.xz",
"txz",
"tar",
"tar.zst",
"tzst",
"parcel",
"squashfs",
"no-archive",
Expand Down Expand Up @@ -362,9 +366,9 @@ def pack(
Whether to overwrite any existing archive at the output path if present, or
create the output directory structure if it's missing. Default is False.
compress_level : int, optional
The compression level to use, from 0 to 9. Higher numbers decrease
output file size at the expense of compression time. Ignored for
``format='zip'``. Default is 4.
The compression level to use, from 0 to 9. If ZSTD is used, compression is
supported up to 19. Higher numbers decrease output file size at the expense of
compression time. Ignored for ``format='zip'``. Default is 4.
n_threads : int, optional
The number of threads to use. Set to -1 to use the number of cpus
on this machine. If a file format doesn't support threaded
Expand Down Expand Up @@ -547,9 +551,9 @@ def pack(
Whether to overwrite any existing archive at the output path if present, or
create the output directory structure if it's missing. Default is False.
compress_level : int, optional
The compression level to use, from 0 to 9. Higher numbers decrease
output file size at the expense of compression time. Ignored for
``format='zip'``. Default is 4.
The compression level to use, from 0 to 9. If ZSTD is used, compression is
supported up to 19. Higher numbers decrease output file size at the expense
of compression time. Ignored for ``format='zip'``. Default is 4.
zip_symlinks : bool, optional
(``zip`` format only) Symbolic links aren't supported by the Zip standard,
but are supported by *many* common Zip implementations. If ``True``, symbolic
Expand Down
24 changes: 24 additions & 0 deletions conda_pack/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ def archive(
close_file = True
fileobj = ParallelXZFileWriter(fileobj, compresslevel=compress_level,
n_threads=n_threads)
elif format in ("tar.zst", "tzst"):
# python's tarfile doesn't support zstd natively yet
mode = "w"
close_file = True
fileobj = ParallelZstdFileWriter(fileobj)
elif format == "squashfs":
return SquashFSArchive(fileobj, path, arcroot, n_threads, verbose=verbose,
compress_level=compress_level)
Expand All @@ -99,6 +104,25 @@ def archive(
)


class ParallelZstdFileWriter:
    """Write-only, file-like wrapper that zstd-compresses data into ``fileobj``.

    Everything passed to :meth:`write` is fed through a ``zstandard`` stream
    writer that emits the compressed bytes to ``fileobj``.

    Parameters
    ----------
    fileobj : file-like
        Destination for the compressed bytes. It is *not* closed by
        :meth:`close`; the caller remains responsible for it.
    compresslevel : int, optional
        Passed to ``zstandard.ZstdCompressor`` as ``level``. Default is 9.
    n_threads : int, optional
        Passed to ``zstandard.ZstdCompressor`` as ``threads``. Default is 1.
    mtime : optional
        Unused; accepted so the signature matches ``ParallelFileWriter``.
    """

    def __init__(self, fileobj, compresslevel=9, n_threads=1, mtime=None):
        # Imported lazily so that ``zstandard`` is only required when this
        # writer is actually instantiated.
        import zstandard

        # Remember the flush flag so close() does not need a second import.
        self._flush_frame = zstandard.FLUSH_FRAME
        self._closed = False

        self.cctx = zstandard.ZstdCompressor(level=compresslevel, threads=n_threads)
        self.compressor = self.cctx.stream_writer(fileobj)

    def write(self, data: bytes):
        """Compress ``data`` and return the underlying writer's result.

        Propagating the return value (instead of dropping it) keeps this
        object usable where a byte count is expected from ``write``.
        """
        return self.compressor.write(data)

    def tell(self):
        """Return the position reported by the underlying stream writer."""
        return self.compressor.tell()

    def close(self):
        """Finish the zstd frame; safe to call more than once.

        Only the frame is flushed -- the wrapped ``fileobj`` is left open.
        Repeated calls are no-ops so that a second ``close()`` cannot issue
        another frame flush against an already finalized stream.
        """
        if self._closed:
            return
        self.compressor.flush(self._flush_frame)
        self._closed = True


class ParallelFileWriter:
def __init__(self, fileobj, compresslevel=9, n_threads=1, mtime=None):
self.fileobj = fileobj
Expand Down
2 changes: 1 addition & 1 deletion conda_pack/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def test_output_and_format(py37_env):
assert output == "py37.tar.gz"
assert format == "tar.gz"

for format in ["tar.gz", "tar.bz2", "tar.xz", "tar", "zip", "parcel"]:
for format in ["tar.gz", "tar.bz2", "tar.xz", "tar.zst", "tar", "zip", "parcel"]:
output = os.extsep.join([py37_env.name, format])

o, f = py37_env._output_and_format(format=format)
Expand Down
27 changes: 25 additions & 2 deletions conda_pack/tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,18 @@ def symlink(path, target):
return root, paths


# Need to open twice, to make sure excess is truncated
def decompress_zstd_inplace(path):
    """Replace the zstd-compressed file at ``path`` with its decompressed content.

    The data is decompressed with the streaming API instead of a one-shot
    ``decompress(..., max_output_size=2**32)`` call, so there is no hard-coded
    4 GiB output cap and no need to size an output buffer up front for frames
    that do not record their content size.

    Parameters
    ----------
    path : str
        Path of the ``.zst`` file to decompress in place.
    """
    import io

    import zstandard

    decomp = zstandard.ZstdDecompressor()
    decompressed = io.BytesIO()

    # First pass: read the compressed file and stream-decompress into memory.
    with open(path, "rb") as src:
        decomp.copy_stream(src, decompressed)

    # Second pass: reopening with "wb" truncates the file before writing, so
    # no compressed bytes are left behind.
    with open(path, "wb") as dst:
        dst.write(decompressed.getvalue())


def check(out_dir, root=None, links=False):
assert exists(join(out_dir, "empty_dir"))
assert isdir(join(out_dir, "empty_dir"))
Expand Down Expand Up @@ -137,7 +149,7 @@ def has_tar_cli():

@pytest.mark.parametrize('format, zip_symlinks', [
('zip', True), ('zip', False),
('tar.gz', False), ('tar.bz2', False), ('tar.xz', False), ('tar', False),
('tar.gz', False), ('tar.bz2', False), ('tar.xz', False), ('tar', False), ('tar.zst', False),
('squashfs', False), ('no-archive', False),
])
def test_format(tmpdir, format, zip_symlinks, root_and_paths):
Expand Down Expand Up @@ -195,6 +207,10 @@ def test_format(tmpdir, format, zip_symlinks, root_and_paths):
else:
cmd = ["squashfuse", packed_env_path, spill_dir]
subprocess.check_output(cmd)
elif format == "tar.zst":
decompress_zstd_inplace(packed_env_path)
with tarfile.open(packed_env_path) as out:
out.extractall(spill_dir)
elif format != "no-archive":
with tarfile.open(packed_env_path) as out:
out.extractall(spill_dir)
Expand All @@ -219,7 +235,7 @@ def test_n_threads():
_parse_n_threads(n)


@pytest.mark.parametrize('format', ['tar.gz', 'tar.bz2', 'tar.xz'])
@pytest.mark.parametrize('format', ['tar.gz', 'tar.bz2', 'tar.xz', 'tar.zst'])
def test_format_parallel(tmpdir, format, root_and_paths):
# Python 2's bzip doesn't support reading multipart files :(
if format == 'tar.bz2' and PY2:
Expand All @@ -229,6 +245,9 @@ def test_format_parallel(tmpdir, format, root_and_paths):
else:
use_cli_to_extract = False

if format == "tar.zst" and PY2:
pytest.skip("Unable to test parallel tar.zst support on this platform")

root, paths = root_and_paths

out_path = join(str(tmpdir), 'test.' + format)
Expand All @@ -246,6 +265,10 @@ def test_format_parallel(tmpdir, format, root_and_paths):
timeout -= 0.1
assert timeout > 0, "Threads failed to shutdown in sufficient time"

# python's tarfile doesn't support decompressing zstd natively
if format == "tar.zst":
decompress_zstd_inplace(out_path)

if use_cli_to_extract:
check_output(['tar', '-xf', out_path, '-C', out_dir])
else:
Expand Down
19 changes: 19 additions & 0 deletions news/351-add-zstd-support
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
### Enhancements

* Add support for the .tar.zst format

### Bug fixes

* <news item>

### Deprecations

* <news item>

### Docs

* <news item>

### Other

* <news item>

0 comments on commit 96ffc23

Please sign in to comment.