GS Cache: list members of a tar file

This change adds a feature to the GS cache server that lists all members of a tar file as lines of CSV, e.g.
  <filename>,<record start>,<record size>,<record end>,<content start>,<content size>,<content end>
  ...

This feature is useful when we extract files from a tar by using the HTTP Range header, e.g.
  curl -r <content start>-<content end> http://gs-cache/download/path/to/file.tar

Another use case is extracting some of the files and re-creating another tar file:
  for f in file_info_list:
    reader.seek(f.record_start)
    writer.write(reader.read(f.record_size))

BUG=chromium:824580
TEST=Ran unit tests.

Change-Id: I2630a04795e16eb35dcdee46c17db64ba380ca09
Reviewed-on: https://chromium-review.googlesource.com/1047959
Commit-Ready: ChromeOS CL Exonerator Bot <chromiumos-cl-exonerator@appspot.gserviceaccount.com>
Tested-by: Congbin Guo <guocb@chromium.org>
Reviewed-by: Congbin Guo <guocb@chromium.org>

commit: d9506f01f3e409325a01429aab71bd5e26cc1348 [log] [tgz]
author: Congbin Guo <guocb@google.com> Mon May 07 14:56:55 2018 -0700
committer: chrome-bot <chrome-bot@chromium.org> Tue May 22 18:45:41 2018 -0700
tree: 74242e5515c34eb6daec89a011c2692f5d3d8454
parent: f6bd5742fe5235a79e439d5e0f022b5e940c7625 [diff] [blame]
diff --git a/gs_cache/tarfile_utils.py b/gs_cache/tarfile_utils.py
new file mode 100644
index 0000000..200ceb5
--- /dev/null
+++ b/gs_cache/tarfile_utils.py

@@ -0,0 +1,162 @@
+# Copyright 2018 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Utils for manipulating tar format archives.
+
+We use the tar command to manipulate tar files rather than the Python tarfile
+module because that module is very slow on large files.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from chromite.lib import cros_logging as logging
+
+_logger = logging.getLogger(__name__)
+
+
+def _round_up_to_512(number):
+  """Round the given |number| up to the nearest multiple of 512.
+
+  Examples:
+    >>> for n in (0, 1, 512, 1025):
+    ...   _round_up_to_512(n)
+    0
+    512
+    512
+    1536
+
+  Args:
+    number: Zero or positive integer.
+
+  Returns:
+    The smallest multiple of 512 that is greater than or equal to |number|.
+  """
+  return (number + 511) & -512
+
+
+def _get_command_result_from_tar_tvR(an_output_line):
+  """Get an object of _TarListCommandResult from one line of `tar tvR` output.
+
+  Args:
+    an_output_line: One line of `tar tvR` output. Trailing '\n' is acceptable.
+      The last line of `tar tvR` is acceptable.
+
+  Returns:
+    An object of _TarListCommandResult.
+  """
+  separators = re.compile('[ \t:]+')
+  fields_num = len(_TarListCommandResult._fields)
+  fields = re.split(separators, an_output_line.rstrip('\n'),
+                    maxsplit=fields_num - 1)
+  try:
+    return _TarListCommandResult._make(fields)
+  except TypeError:
+    # The last line of `tar tvR` has too few fields. Pad with field names.
+    _logger.debug('This should be the last line of `tar tvR`: %s',
+                  an_output_line)
+    fields.extend(_TarListCommandResult._fields[len(fields):])
+    return _TarListCommandResult._make(fields)
+
+
+def _block_to_bytes(block_num):
+  """Get offset of the block |block_num| in bytes, i.e. times 512"""
+  return block_num << 9  # * 512
+
+
+# The tuple of tar member information to be returned to caller.
+# Fields:
+#   filename: The file name of the tar member.
+#   record_start: The zero-based start offset of the file record, in bytes.
+#   record_size: The size of the file record, in bytes.
+#   content_start: The zero-based start offset of the file content, in bytes.
+#   size: The size of the file content, in bytes.
+TarMemberInfo = collections.namedtuple(
+    'TarMemberInfo', ('filename', 'record_start', 'record_size',
+                      'content_start', 'size'))
+
+
+class _TarListCommandResult(collections.namedtuple(
+    '_TarListCommandResult', ('block', 'block_num', 'mode', 'ownership',
+                              'size_str', 'date', 'hour', 'min', 'filename'))):
+  """Information of each member in a Tar archive.
+
+  This class uses the output of the `tar tvR` command to compute the additional
+  information we need, e.g. file record start offset, etc.
+
+  The output of `tar tvR` is like:
+  block 0: -rw-r--r-- user/group <size> <date> <time> <file name>
+  ...
+  block 7: ** Block of NULs **
+  """
+
+  @property
+  def record_start(self):
+    """Start offset of the file record, in bytes."""
+    return _block_to_bytes(int(self.block_num))
+
+  @property
+  def size(self):
+    return int(self.size_str)
+
+
+def _get_prev_content_start(cur_record_start, prev_file):
+  """Deduce prev file content information from current file record information.
+
+  In tar format, each file record has a header followed by the file content.
+  Both header and file content are padded to a multiple of 512 bytes. The header
+  length is variable, but we can get the previous file's content start offset by
+  subtracting its rounded-up size from the current file's record start, i.e.
+
+  prev_content_start = cur_record_start - round_up_to_512(prev_size)
+
+  |********|************************.......|********|****
+  | header |         content               | header |
+  |        |<----- prev_size ----->|
+  |        |<- prev_size round up to 512 ->|
+           ^prev_content_start             ^cur_record_start
+
+  Args:
+    cur_record_start: The zero-based start position of current file record, in
+        bytes.
+    prev_file: An instance of _TarListCommandResult which has size of the
+        previous file.
+
+  Returns:
+    The zero-based start position of previous file content, in bytes.
+  """
+  return cur_record_start - _round_up_to_512(prev_file.size)
+
+
+def list_tar_members(tar_tvR_output):
+  """List the members of a tar with information.
+
+  Yield each member of the tar archive with information of record start/size,
+  content start/size, etc.
+
+  Args:
+    tar_tvR_output: A file-like object of `tar tvR` output. Option 'R' prints the
+        starting block number of each file record.
+
+  Yields:
+    A TarMemberInfo for each member, in the order listed by `tar tvR`.
+  """
+  prev_file = _get_command_result_from_tar_tvR(tar_tvR_output.readline())
+
+  for line in tar_tvR_output:
+    cur_file = _get_command_result_from_tar_tvR(line)
+
+    prev_content_start = _get_prev_content_start(cur_file.record_start,
+                                                 prev_file)
+    prev_record_size = cur_file.record_start - prev_file.record_start
+
+    yield TarMemberInfo(prev_file.filename,
+                        prev_file.record_start, prev_record_size,
+                        prev_content_start, prev_file.size)
+
+    prev_file = cur_file
commit	d9506f01f3e409325a01429aab71bd5e26cc1348	[log] [tgz]
author	Congbin Guo <guocb@google.com>	Mon May 07 14:56:55 2018 -0700
committer	chrome-bot <chrome-bot@chromium.org>	Tue May 22 18:45:41 2018 -0700
tree	74242e5515c34eb6daec89a011c2692f5d3d8454
parent	f6bd5742fe5235a79e439d5e0f022b5e940c7625 [diff] [blame]