xz_auto: use pixz for either compression or decompression

On my machine, `xz -T0 foo.tar && pixz -d foo.tar.xz` results in
the multi-threaded compression of `foo.tar`, and the single-threaded
decompression thereof. If you replace `xz` with `pixz` for compression,
both compression and decompression are multi-threaded.

This also makes us use a consistent number of threads: ncpu/2 for both
pixz and xz. Previously, we used ncpu/2 for pixz, and ncpu for xz.

This also adds tests for xz_auto, since we had none before.

BUG=None
TEST=./run_tests

Change-Id: Ia2d60d3fdee5ba34af9a75903cd8127e0b82c493
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/chromite/+/3180382
Commit-Queue: George Burgess <gbiv@chromium.org>
Tested-by: George Burgess <gbiv@chromium.org>
Reviewed-by: Mike Frysinger <vapier@chromium.org>
diff --git a/scripts/xz_auto.py b/scripts/xz_auto.py
index 6a84575..4d6902e 100644
--- a/scripts/xz_auto.py
+++ b/scripts/xz_auto.py
@@ -6,7 +6,7 @@
 
 from __future__ import division
 
-import multiprocessing
+import getopt
 import os
 
 from chromite.lib import commandline
@@ -14,34 +14,99 @@
 from chromite.utils import memoize
 
 
+PIXZ_DISABLE_VAR = 'FOR_TEST_XZ_AUTO_NO_PIXZ'
+
+
 @memoize.Memoize
 def HasPixz():
   """Returns path to pixz if it's on PATH or None otherwise."""
-  return osutils.Which('pixz')
+  return osutils.Which('pixz') and not os.environ.get(PIXZ_DISABLE_VAR)
 
 
-@memoize.Memoize
-def GetJobCount():
-  """Returns half of the total number of the machine's CPUs as a string.
-
-  Returns half rather than all of them to avoid starving out other parallel
-  processes on the same machine.
-  """
-  return str(int(max(1, multiprocessing.cpu_count() / 2)))
+def BasePixzCommand(jobs):
+  """Returns a command that invokes pixz with the given job count."""
+  return ['pixz', '-p', str(jobs)]
 
 
-def GetDecompressCommand(stdout):
+def BaseXzCommand(jobs):
+  """Returns a command that invokes xz with the given job count."""
+  return ['xz', f'-T{jobs}']
+
+
+def DetermineFilesPassedToPixz(argv):
+  """Attempt to figure out the input and output files passed to pixz."""
+  # Glancing at docs, the following opts are supported. -i and -o should never
+  # appear here, since `main` rejects them, but we include parsing for them
+  # anyway.
+  _, args = getopt.gnu_getopt(
+      args=argv,
+      shortopts='dlxi:o:0123456789p:tkch',
+  )
+  if not args:
+    file_to_compress = None
+    target = None
+  elif len(args) == 1:
+    file_to_compress = args[0]
+    target = None
+  else:
+    file_to_compress = args[0]
+    target = args[1]
+
+  return file_to_compress, target
+
+
+def GetCompressCommand(stdout, jobs, argv):
+  """Returns compression command."""
+  # It appears that in order for pixz to do parallel decompression, compression
+  # needs to be done with pixz. xz itself is only capable of parallel
+  # compression.
+  if HasPixz():
+    cmd = BasePixzCommand(jobs)
+
+    compressed_file_name, specifies_output_file = DetermineFilesPassedToPixz(
+        argv)
+
+    if compressed_file_name:
+      if not (stdout or specifies_output_file):
+        # Pixz defaults to a `.pxz` suffix (or `.tpxz` if it's compressing a
+        # tar file). We need the suffix to be consistent, so force it here.
+        cmd += ['-o', f'{compressed_file_name}.xz']
+    else:
+      cmd += ['-i', '/dev/stdin']
+    return cmd
+
+  cmd = BaseXzCommand(jobs)
+
+  if stdout:
+    cmd.append('-zc')
+  else:
+    cmd.append('-z')
+  return cmd
+
+
+def GetDecompressCommand(stdout, jobs, argv):
   """Returns decompression command."""
   if HasPixz():
-    cmd = ['pixz', '-d', '-p', GetJobCount()]
+    cmd = BasePixzCommand(jobs)
+    cmd.append('-d')
+
+    compressed_file_name, _ = DetermineFilesPassedToPixz(argv)
     if stdout:
       # Explicitly tell pixz the file is the input, so it will dump the output
       # to stdout, instead of automatically choosing an output name.
       cmd.append('-i')
+      if not compressed_file_name:
+        cmd.append('/dev/stdin')
+    elif not compressed_file_name:
+      cmd += ['-i', '/dev/stdin']
     return cmd
+
+  cmd = BaseXzCommand(jobs)
   if stdout:
-    return ['xz', '-dc']
-  return ['xz', '-d']
+    cmd.append('-dc')
+  else:
+    cmd.append('-d')
+  return cmd
 
 
 def GetParser():
@@ -67,12 +132,12 @@
   if '-i' in argv or '-o' in argv:
     parser.error('It is invalid to use -i or -o with xz_auto')
 
-  # xz doesn't support multi-threaded decompression, so try using pixz for that.
+  # Use half of our CPUs to avoid starving other processes.
+  jobs = max(1, os.cpu_count() // 2)
+
   if known_args.decompress:
-    args = GetDecompressCommand(known_args.stdout)
-    os.execvp(args[0], args + argv)
+    args = GetDecompressCommand(known_args.stdout, jobs, argv)
   else:
-    cmd = ['xz', '-T0']
-    if known_args.stdout:
-      cmd.append('-c')
-    os.execvp(cmd[0], cmd + argv)
+    args = GetCompressCommand(known_args.stdout, jobs, argv)
+
+  os.execvp(args[0], args + argv)