xz_auto: use pixz for both compression and decompression
On my machine, `xz -T0 foo.tar && pixz -d foo.tar.xz` results in
the multi-threaded compression of `foo.tar`, and the single-threaded
decompression thereof. If you replace `xz` with `pixz` for compression,
both compression and decompression are multi-threaded.
This also makes us use a consistent number of threads. Before this
change, we used ncpu/2 for pixz and ncpu (`-T0`) for xz; now both use
ncpu/2.
This also adds tests for xz_auto, since we had none before.
BUG=None
TEST=./run_tests
Change-Id: Ia2d60d3fdee5ba34af9a75903cd8127e0b82c493
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/chromite/+/3180382
Commit-Queue: George Burgess <gbiv@chromium.org>
Tested-by: George Burgess <gbiv@chromium.org>
Reviewed-by: Mike Frysinger <vapier@chromium.org>
diff --git a/scripts/xz_auto.py b/scripts/xz_auto.py
index 6a84575..4d6902e 100644
--- a/scripts/xz_auto.py
+++ b/scripts/xz_auto.py
@@ -6,7 +6,7 @@
from __future__ import division
-import multiprocessing
+import getopt
import os
from chromite.lib import commandline
@@ -14,34 +14,99 @@
from chromite.utils import memoize
+PIXZ_DISABLE_VAR = 'FOR_TEST_XZ_AUTO_NO_PIXZ'
+
+
@memoize.Memoize
def HasPixz():
"""Returns path to pixz if it's on PATH or None otherwise."""
- return osutils.Which('pixz')
+ return osutils.Which('pixz') and not os.environ.get(PIXZ_DISABLE_VAR)
-@memoize.Memoize
-def GetJobCount():
- """Returns half of the total number of the machine's CPUs as a string.
-
- Returns half rather than all of them to avoid starving out other parallel
- processes on the same machine.
- """
- return str(int(max(1, multiprocessing.cpu_count() / 2)))
+def BasePixzCommand(jobs):
+ """Returns a command that invokes pixz with the given job count."""
+ return ['pixz', '-p', str(jobs)]
-def GetDecompressCommand(stdout):
+def BaseXzCommand(jobs):
+ """Returns a command that invokes xz with the given job count."""
+ return ['xz', f'-T{jobs}']
+
+
+def DetermineFilesPassedToPixz(argv):
+ """Attempt to figure out what file we're trying to compress."""
+ # Glancing at docs, the following opts are supported. -i and -o are ignored,
+ # since we assert in `main` that they're not present, but include parsing for
+ # them anyway.
+ _, args = getopt.gnu_getopt(
+ args=argv,
+ shortopts='dlxi:o:0123456789p:tkch',
+ )
+ if not args:
+ file_to_compress = None
+ target = None
+ elif len(args) == 1:
+ file_to_compress = args[0]
+ target = None
+ else:
+ file_to_compress = args[0]
+ target = args[1]
+
+ return file_to_compress, target
+
+
+def GetCompressCommand(stdout, jobs, argv):
+ """Returns compression command."""
+ # It appears that in order for pixz to do parallel decompression, compression
+ # needs to be done with pixz. xz itself is only capable of parallel
+ # compression.
+ if HasPixz():
+ cmd = BasePixzCommand(jobs)
+
+ compressed_file_name, specifies_output_file = DetermineFilesPassedToPixz(
+ argv)
+
+ if compressed_file_name:
+ if not (stdout or specifies_output_file):
+ # Pixz defaults to a `.pxz` suffix (or `.tpxz` if it's compressing a
+ # tar file). We need the suffix to be consistent, so force it here.
+ cmd += ['-o', f'{compressed_file_name}.xz']
+ else:
+ cmd += ['-i', '/dev/stdin']
+ return cmd
+
+ cmd = BaseXzCommand(jobs)
+
+ if stdout:
+ cmd.append('-zc')
+ else:
+ cmd.append('-z')
+ return cmd
+
+
+def GetDecompressCommand(stdout, jobs, argv):
"""Returns decompression command."""
if HasPixz():
- cmd = ['pixz', '-d', '-p', GetJobCount()]
+ cmd = BasePixzCommand(jobs)
+ cmd.append('-d')
+
+ compressed_file_name, _ = DetermineFilesPassedToPixz(argv)
if stdout:
# Explicitly tell pixz the file is the input, so it will dump the output
# to stdout, instead of automatically choosing an output name.
cmd.append('-i')
+ if not compressed_file_name:
+ cmd.append('/dev/stdin')
+ elif not compressed_file_name:
+ cmd += ['-i', '/dev/stdin']
return cmd
+
+ cmd = BaseXzCommand(jobs)
if stdout:
- return ['xz', '-dc']
- return ['xz', '-d']
+ cmd.append('-dc')
+ else:
+ cmd.append('-d')
+ return cmd
def GetParser():
@@ -67,12 +132,12 @@
if '-i' in argv or '-o' in argv:
parser.error('It is invalid to use -i or -o with xz_auto')
- # xz doesn't support multi-threaded decompression, so try using pixz for that.
+ # Use half of our CPUs to avoid starving other processes.
+ jobs = max(1, os.cpu_count() // 2)
+
if known_args.decompress:
- args = GetDecompressCommand(known_args.stdout)
- os.execvp(args[0], args + argv)
+ args = GetDecompressCommand(known_args.stdout, jobs, argv)
else:
- cmd = ['xz', '-T0']
- if known_args.stdout:
- cmd.append('-c')
- os.execvp(cmd[0], cmd + argv)
+ args = GetCompressCommand(known_args.stdout, jobs, argv)
+
+ os.execvp(args[0], args + argv)