blob: 200ceb51e63ce80660cb9bf16863f18224a4942d [file] [log] [blame]
Congbin Guod9506f02018-05-07 14:56:55 -07001# Copyright 2018 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Utils for manipulating tar format archives.
6
7We use the tar command to manipulate tar files rather than the Python tarfile
8module because that module is very slow on large files.
9"""
10
11from __future__ import absolute_import
12from __future__ import division
13from __future__ import print_function
14
15import collections
16import re
17
18from chromite.lib import cros_logging as logging
19
20_logger = logging.getLogger(__name__)
21
22
23def _round_up_to_512(number):
24 """Up round the given |number| to smallest multiple of 512.
25
26 Examples:
27 >>> for n in (0, 1, 512, 1025):
28 ... _round_up_to_512(n)
29 0
30 512
31 512
32 1536
33
34 Args:
35 number: Zero or positive integer.
36
37 Returns:
38 The smallest multiple of 512.
39 """
40 return (number + 511) & -512
41
42
def _get_command_result_from_tar_tvR(an_output_line):
  """Parse one line of `tar tvR` output into a _TarListCommandResult.

  Args:
    an_output_line: One line of `tar tvR` output. A trailing newline is
      accepted, and so is the last line of the listing, which has fewer fields
      than a regular member line.

  Returns:
    An object of _TarListCommandResult. When the line is short of fields, the
    missing trailing fields are filled with placeholder values (the names of
    those fields).
  """
  expected = _TarListCommandResult._fields
  splitter = re.compile('[ \t:]+')
  # Cap the number of splits so that separators appearing inside the last
  # field (the file name) do not break it apart.
  parts = splitter.split(an_output_line.rstrip('\n'),
                         maxsplit=len(expected) - 1)

  if len(parts) < len(expected):
    # The last line of `tar tvR` hasn't enough fields. Fill with fake data.
    _logger.debug('This should be the last line of `tar tvR`: %s',
                  an_output_line)
    parts.extend(expected[len(parts):])

  return _TarListCommandResult._make(parts)
65
66
67def _block_to_bytes(block_num):
68 """Get offset of the block |block_num| in bytes, i.e. times 512"""
69 return block_num << 9 # * 512
70
71
# Information about one tar member, as handed back to the caller.
# Fields:
#   filename: Name of the member file.
#   record_start: Zero-based offset, in bytes, where the file record starts.
#   record_size: Size of the whole file record, in bytes.
#   content_start: Zero-based offset, in bytes, where the file content starts.
#   size: Size of the file content, in bytes.
TarMemberInfo = collections.namedtuple(
    'TarMemberInfo',
    ['filename', 'record_start', 'record_size', 'content_start', 'size'])
82
83
84class _TarListCommandResult(collections.namedtuple(
85 '_TarListCommandResult', ('block', 'block_num', 'mode', 'ownership',
86 'size_str', 'date', 'hour', 'min', 'filename'))):
87 """Information of each member in a Tar archive.
88
89 This class using the output of command `tar tvR` to compute more information
90 we need, e.g. file content start offset, etc.
91
92 The output of `tar tvR` is like:
93 block 0: -rw-r--r-- user/group <size> <date> <time> <file name>
94 ...
95 block 7: ** Block of NULs **
96 """
97
98 @property
99 def record_start(self):
100 """Start offset of the file record, in bytes."""
101 return _block_to_bytes(int(self.block_num))
102
103 @property
104 def size(self):
105 return int(self.size_str)
106
107
def _get_prev_content_start(cur_record_start, prev_file):
  """Deduce where the previous file's content starts.

  In tar format, each file record consists of a header followed by the file
  content, and both are rounded up to 512 bytes. The header length is variable,
  so the content start cannot be computed from the record start. Instead, walk
  backwards from the start of the next record by the rounded up content size:

    prev_content_start = cur_record_start - round_up_to_512(prev_size)

    |********|************************.......|********|****
    | header |            content            | header |
    |        |<----- prev_size ----->|
    |        |<- prev_size round up to 512 ->|
             ^prev_content_start             ^cur_record_start

  Args:
    cur_record_start: The zero-based start position of the current file record,
      in bytes.
    prev_file: An instance of _TarListCommandResult which has the size of the
      previous file.

  Returns:
    The zero-based start position of the previous file content, in bytes.
  """
  padded_prev_size = _round_up_to_512(prev_file.size)
  return cur_record_start - padded_prev_size
134
135
def list_tar_members(tar_tvR_output):
  """List the members of a tar archive with their offsets and sizes.

  A line of `tar tvR` output only tells where a record starts, so a member can
  be described only once the following line has been read: its record size is
  the distance to the next record start, and its content start is derived from
  the next record start and its own content size. Consequently the final line
  of the listing (normally the `** Block of NULs **` line) only marks the end
  of the last member and is never yielded itself.

  Args:
    tar_tvR_output: The output of command `tar tvR` as an iterable of lines,
      e.g. an open file object or a list of lines. Option 'R' prints out the
      starting block number of each file record.

  Yields:
    A TarMemberInfo (filename, record_start, record_size, content_start, size)
    for each listed member, in listing order. Nothing is yielded when the
    listing has fewer than two lines.
  """
  lines = iter(tar_tvR_output)
  # Use iteration for the first line as well, so that any iterable of lines
  # works, not only objects with readline(). The '' default mirrors what
  # readline() returns at end of file, so an empty listing behaves as before.
  prev_file = _get_command_result_from_tar_tvR(next(lines, ''))

  for line in lines:
    cur_file = _get_command_result_from_tar_tvR(line)
    # The start of the current record is also the end of the previous one.
    cur_record_start = cur_file.record_start

    yield TarMemberInfo(
        filename=prev_file.filename,
        record_start=prev_file.record_start,
        record_size=cur_record_start - prev_file.record_start,
        content_start=_get_prev_content_start(cur_record_start, prev_file),
        size=prev_file.size)

    prev_file = cur_file
162 prev_file = cur_file