GS Cache: list members of a tar file
This change adds a feature to the GS cache server that lists all members of a
tar file as lines of CSV, e.g.
<filename>,<record start>,<record size>,<record end>,<content start>,<content size>,<content end>
...
This feature is useful when we extract files from a tar by using HTTP
Range header, e.g.
curl -r <content start>-<content end> http://gs-cache/download/path/to/file.tar
Another use case is extracting some of the files and re-creating another tar
file:
for f in file_info_list:
reader.seek(f.record_start)
writer.write(reader.read(f.record_size))
BUG=chromium:824580
TEST=Ran unit tests.
Change-Id: I2630a04795e16eb35dcdee46c17db64ba380ca09
Reviewed-on: https://chromium-review.googlesource.com/1047959
Commit-Ready: ChromeOS CL Exonerator Bot <chromiumos-cl-exonerator@appspot.gserviceaccount.com>
Tested-by: Congbin Guo <guocb@chromium.org>
Reviewed-by: Congbin Guo <guocb@chromium.org>
diff --git a/gs_cache/tarfile_utils.py b/gs_cache/tarfile_utils.py
new file mode 100644
index 0000000..200ceb5
--- /dev/null
+++ b/gs_cache/tarfile_utils.py
@@ -0,0 +1,162 @@
+# Copyright 2018 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Utils for manipulating tar format archives.
+
+We use the tar command to manipulate tar files rather than the Python tarfile
+module because that module is very slow in the case of large files.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from chromite.lib import cros_logging as logging
+
+_logger = logging.getLogger(__name__)
+
+
+def _round_up_to_512(number):
+ """Round the given |number| up to the nearest multiple of 512.
+
+ Examples:
+ >>> for n in (0, 1, 512, 1025):
+ ... _round_up_to_512(n)
+ 0
+ 512
+ 512
+ 1536
+
+ Args:
+ number: Zero or positive integer.
+
+ Returns:
+ The smallest multiple of 512 that is greater than or equal to |number|.
+ """
+ return (number + 511) & -512  # Masking with -512 clears the low 9 bits.
+
+
+def _get_command_result_from_tar_tvR(an_output_line):
+ """Parse one line of `tar tvR` output into a _TarListCommandResult.
+
+ Args:
+ an_output_line: One line of `tar tvR` output. Trailing '\n' is acceptable.
+ The last line of `tar tvR` (which has fewer fields) is acceptable too.
+
+ Returns:
+ An instance of _TarListCommandResult.
+ """
+ separators = re.compile('[ \t:]+')
+ fields_num = len(_TarListCommandResult._fields)
+ fields = re.split(separators, an_output_line.rstrip('\n'),
+ maxsplit=fields_num - 1)
+ try:
+ return _TarListCommandResult._make(fields)
+ except TypeError:
+ # The last line of `tar tvR` has too few fields. Pad with the field names.
+ _logger.debug('This should be the last line of `tar tvR`: %s',
+ an_output_line)
+ fields.extend(_TarListCommandResult._fields[len(fields):])
+ return _TarListCommandResult._make(fields)
+
+
+def _block_to_bytes(block_num):
+ """Get the byte offset of block |block_num|, i.e. |block_num| * 512."""
+ return block_num << 9 # * 512
+
+
+# The tuple of tar member information yielded by list_tar_members().
+# Fields:
+# filename: The file name of the tar member.
+# record_start: The zero-based start offset of the file record, in bytes.
+# record_size: The size of the file record, in bytes.
+# content_start: The zero-based start offset of the file content, in bytes.
+# size: The size of the file content, in bytes.
+TarMemberInfo = collections.namedtuple(
+ 'TarMemberInfo', ('filename', 'record_start', 'record_size',
+ 'content_start', 'size'))
+
+
+class _TarListCommandResult(collections.namedtuple(
+ '_TarListCommandResult', ('block', 'block_num', 'mode', 'ownership',
+ 'size_str', 'date', 'hour', 'min', 'filename'))):
+ """Information about each member of a tar archive.
+
+ This class uses the output of command `tar tvR` to compute more information
+ we need, e.g. the start offset of the file record, etc.
+
+ The output of `tar tvR` is like:
+ block 0: -rw-r--r-- user/group <size> <date> <time> <file name>
+ ...
+ block 7: ** Block of NULs **
+ """
+
+ @property
+ def record_start(self):
+ """Start offset of the file record, in bytes."""
+ return _block_to_bytes(int(self.block_num))
+
+ @property
+ def size(self):
+ return int(self.size_str)
+
+def _get_prev_content_start(cur_record_start, prev_file):
+ """Deduce previous file content start from current file record start.
+
+ In tar format, each file record has a header followed by file content.
+ Both header and file content are rounded up to 512 bytes. The header length is
+ variable, but we can get the current file content starting offset by
+ subtracting rounded-up file size from next file header starting offset, i.e.
+
+ current_offset = block(next_file) * 512 - round_up_to_512(current_size)
+
+ |********|************************.......|********|****
+ | header | content | header |
+ | |<----- prev_size ----->|
+ | |<- prev_size round up to 512 ->|
+ ^prev_content_start ^cur_record_start
+
+ Args:
+ cur_record_start: The zero-based start position of current file record, in
+ bytes.
+ prev_file: An instance of _TarListCommandResult which has the size of the
+ previous file.
+
+ Returns:
+ The zero-based start position of previous file content, in bytes.
+ """
+ return cur_record_start - _round_up_to_512(prev_file.size)
+
+
+def list_tar_members(tar_tvR_output):
+ """List the members of a tar with information.
+
+ Yield each member of the tar archive with information of record start/size,
+ content start/size, etc.
+
+ Args:
+ tar_tvR_output: The output of command `tar tvR`, read via readline() and
+ line iteration. Option 'R' prints the start block of each file record.
+
+ Yields:
+ A TarMemberInfo per member. The final line only marks the last member's end.
+ """
+ prev_file = _get_command_result_from_tar_tvR(tar_tvR_output.readline())
+
+ for line in tar_tvR_output:
+ cur_file = _get_command_result_from_tar_tvR(line)
+
+ prev_content_start = _get_prev_content_start(cur_file.record_start,
+ prev_file)
+ prev_record_size = cur_file.record_start - prev_file.record_start
+
+ yield TarMemberInfo(prev_file.filename,
+ prev_file.record_start, prev_record_size,
+ prev_content_start, prev_file.size)
+
+ prev_file = cur_file