Congbin Guo | d9506f0 | 2018-05-07 14:56:55 -0700 | [diff] [blame] | 1 | # Copyright 2018 The Chromium OS Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | |
| 5 | """Utils for manipulating tar format archives. |
| 6 | |
| 7 | We use the tar command to manipulate tar files rather than the Python tarfile |
| 8 | module because that module is very slow on large files. |
| 9 | """ |
| 10 | |
| 11 | from __future__ import absolute_import |
| 12 | from __future__ import division |
| 13 | from __future__ import print_function |
| 14 | |
| 15 | import collections |
| 16 | import re |
| 17 | |
| 18 | from chromite.lib import cros_logging as logging |
| 19 | |
| 20 | _logger = logging.getLogger(__name__) |
| 21 | |
| 22 | |
| 23 | def _round_up_to_512(number): |
| 24 | """Up round the given |number| to smallest multiple of 512. |
| 25 | |
| 26 | Examples: |
| 27 | >>> for n in (0, 1, 512, 1025): |
| 28 | ... _round_up_to_512(n) |
| 29 | 0 |
| 30 | 512 |
| 31 | 512 |
| 32 | 1536 |
| 33 | |
| 34 | Args: |
| 35 | number: Zero or positive integer. |
| 36 | |
| 37 | Returns: |
| 38 | The smallest multiple of 512. |
| 39 | """ |
| 40 | return (number + 511) & -512 |
| 41 | |
| 42 | |
def _get_command_result_from_tar_tvR(an_output_line):
  """Parse a single line of `tar tvR` output into a _TarListCommandResult.

  The line is split on runs of spaces, tabs and colons into at most as many
  pieces as _TarListCommandResult has fields, so whatever follows the final
  split point is kept intact as the last field.

  Args:
    an_output_line: One line of `tar tvR` output. A trailing newline is
      stripped. The last line of `tar tvR`, which has fewer columns, is also
      accepted.

  Returns:
    An object of _TarListCommandResult. When the line has too few columns, the
    missing trailing fields are filled with their own field names as fake
    data.
  """
  expected = _TarListCommandResult._fields
  splitter = re.compile('[ \t:]+')
  columns = splitter.split(an_output_line.rstrip('\n'),
                           maxsplit=len(expected) - 1)

  # |maxsplit| caps the number of columns at len(expected), so a length
  # mismatch always means "too few columns".
  if len(columns) == len(expected):
    return _TarListCommandResult._make(columns)

  # The last line of `tar tvR` hasn't enough fields. Fill with fake data.
  _logger.debug('This should be the last line of `tar tvR`: %s',
                an_output_line)
  columns.extend(expected[len(columns):])
  return _TarListCommandResult._make(columns)
| 65 | |
| 66 | |
| 67 | def _block_to_bytes(block_num): |
| 68 | """Get offset of the block |block_num| in bytes, i.e. times 512""" |
| 69 | return block_num << 9 # * 512 |
| 70 | |
| 71 | |
# Per-member information handed back to callers of this module.
# Fields:
#   filename: The file name of the tar member.
#   record_start: Zero-based byte offset at which the file record starts.
#   record_size: Size of the file record, in bytes.
#   content_start: Zero-based byte offset at which the file content starts.
#   size: Size of the file content, in bytes.
TarMemberInfo = collections.namedtuple(
    'TarMemberInfo',
    ['filename', 'record_start', 'record_size', 'content_start', 'size'])
| 82 | |
| 83 | |
| 84 | class _TarListCommandResult(collections.namedtuple( |
| 85 | '_TarListCommandResult', ('block', 'block_num', 'mode', 'ownership', |
| 86 | 'size_str', 'date', 'hour', 'min', 'filename'))): |
| 87 | """Information of each member in a Tar archive. |
| 88 | |
| 89 | This class using the output of command `tar tvR` to compute more information |
| 90 | we need, e.g. file content start offset, etc. |
| 91 | |
| 92 | The output of `tar tvR` is like: |
| 93 | block 0: -rw-r--r-- user/group <size> <date> <time> <file name> |
| 94 | ... |
| 95 | block 7: ** Block of NULs ** |
| 96 | """ |
| 97 | |
| 98 | @property |
| 99 | def record_start(self): |
| 100 | """Start offset of the file record, in bytes.""" |
| 101 | return _block_to_bytes(int(self.block_num)) |
| 102 | |
| 103 | @property |
| 104 | def size(self): |
| 105 | return int(self.size_str) |
| 106 | |
| 107 | |
def _get_prev_content_start(cur_record_start, prev_file):
  """Deduce where the previous file's content starts from the current record.

  In tar format, each file record has a header followed by the file content.
  Both header and file content are rounded up to 512 Bytes. The header length
  is variable, but the content of a file ends (after padding) exactly where
  the next file record starts, so its start offset is obtained by subtracting
  the rounded-up file size from the next record's start offset, i.e.

    prev_content_start = cur_record_start - round_up_to_512(prev_size)

    |********|************************.......|********|****
    | header |            content            | header |
    |        |<----- prev_size ----->|
    |        |<- prev_size round up to 512 ->|
             ^prev_content_start             ^cur_record_start

  Args:
    cur_record_start: The zero-based start position of current file record, in
      bytes.
    prev_file: An instance of _TarListCommandResult which has size of the
      previous file.

  Returns:
    The zero-based start position of previous file content, in bytes.
  """
  padded_prev_size = _round_up_to_512(prev_file.size)
  return cur_record_start - padded_prev_size
| 134 | |
| 135 | |
def list_tar_members(tar_tvR_output):
  """List the members of a tar with information.

  Yield each member of the tar archive with information of record start/size,
  content start/size, etc.

  Each line of `tar tvR` output only gives the starting block of one record,
  so a member's record size and content offset are computed once the following
  line is known. Members are therefore yielded one line behind the input, and
  the last line of the output only serves as the end boundary of the last
  member; it is never yielded itself.

  Args:
    tar_tvR_output: The output of command 'tar tvR' as an iterable of lines,
      e.g. an open file-like object or a list of lines. Option 'R' print out
      the starting block number of the file record.

  Yields:
    A TarMemberInfo (filename, record_start, record_size, content_start, size)
    for each member, in the order of the input lines. Nothing is yielded when
    the input has fewer than two lines.
  """
  # Use the iterator protocol only (no readline()), so any iterable of lines
  # works, not just file-like objects.
  lines = iter(tar_tvR_output)
  first_line = next(lines, None)
  if first_line is None:
    # Empty output: there is no member to report.
    return

  prev_file = _get_command_result_from_tar_tvR(first_line)

  for line in lines:
    cur_file = _get_command_result_from_tar_tvR(line)

    prev_content_start = _get_prev_content_start(cur_file.record_start,
                                                 prev_file)
    # A record spans from its own start to the start of the next record.
    prev_record_size = cur_file.record_start - prev_file.record_start

    yield TarMemberInfo(prev_file.filename,
                        prev_file.record_start, prev_record_size,
                        prev_content_start, prev_file.size)

    prev_file = cur_file