blob: 7eae38e5dedab4a1bab96803ffbcc89763abfa8c [file] [log] [blame]
Amin Hassanic3e6b532017-03-07 17:47:25 -08001// Copyright 2017 The Chromium OS Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "puffin/src/include/puffin/utils.h"
6
7#include <inttypes.h>
8
Sen Jiang5eb33e82018-05-01 15:01:11 -07009#include <algorithm>
10#include <set>
Amin Hassanic3e6b532017-03-07 17:47:25 -080011#include <string>
12#include <vector>
13
Tianjie Xu11942232018-01-18 18:31:42 -080014#include <zlib.h>
15
Amin Hassanic3e6b532017-03-07 17:47:25 -080016#include "puffin/src/bit_reader.h"
Amin Hassani00f08322017-10-18 11:55:10 -070017#include "puffin/src/file_stream.h"
Amin Hassanic3e6b532017-03-07 17:47:25 -080018#include "puffin/src/include/puffin/common.h"
Amin Hassani26bcfdd2017-09-29 17:54:15 -070019#include "puffin/src/include/puffin/puffer.h"
Amin Hassanie2e9cb02018-03-15 14:14:58 -070020#include "puffin/src/logging.h"
Tianjie Xu11942232018-01-18 18:31:42 -080021#include "puffin/src/memory_stream.h"
Amin Hassani26bcfdd2017-09-29 17:54:15 -070022#include "puffin/src/puff_writer.h"
Amin Hassanic3e6b532017-03-07 17:47:25 -080023
Sen Jiang5eb33e82018-05-01 15:01:11 -070024using std::set;
Amin Hassani10b869c2018-03-15 13:22:32 -070025using std::string;
26using std::vector;
27
Tianjie Xu11942232018-01-18 18:31:42 -080028namespace {
29// Use memcpy to access the unaligned data of type |T|.
30template <typename T>
31inline T get_unaligned(const void* address) {
32 T result;
33 memcpy(&result, address, sizeof(T));
34 return result;
35}
36
37// Calculate both the compressed size and uncompressed size of the deflate
38// block that starts from the offset |start| of buffer |data|.
39bool CalculateSizeOfDeflateBlock(const puffin::Buffer& data,
Amin Hassanid7768d52018-02-28 15:34:21 -080040 uint64_t start,
41 uint64_t* compressed_size,
42 uint64_t* uncompressed_size) {
Tianjie Xu11942232018-01-18 18:31:42 -080043 TEST_AND_RETURN_FALSE(compressed_size != nullptr &&
44 uncompressed_size != nullptr);
45
46 TEST_AND_RETURN_FALSE(start < data.size());
47
48 z_stream strm = {};
49 strm.avail_in = data.size() - start;
50 strm.next_in = data.data() + start;
51
52 // -15 means we are decoding a 'raw' stream without zlib headers.
53 if (inflateInit2(&strm, -15)) {
54 LOG(ERROR) << "Failed to initialize inflate: " << strm.msg;
55 return false;
56 }
57
58 const unsigned int kBufferSize = 32768;
59 std::vector<uint8_t> uncompressed_data(kBufferSize);
60 *uncompressed_size = 0;
61 int status = Z_OK;
62 do {
63 // Overwrite the same buffer since we don't need the uncompressed data.
64 strm.avail_out = kBufferSize;
65 strm.next_out = uncompressed_data.data();
66 status = inflate(&strm, Z_NO_FLUSH);
67 if (status < 0) {
68 LOG(ERROR) << "Inflate failed: " << strm.msg << ", has decompressed "
69 << *uncompressed_size << " bytes.";
70 return false;
71 }
72 *uncompressed_size += kBufferSize - strm.avail_out;
73 } while (status != Z_STREAM_END);
74
75 *compressed_size = data.size() - start - strm.avail_in;
Amin Hassanida664102018-01-30 16:32:43 -080076 TEST_AND_RETURN_FALSE(inflateEnd(&strm) == Z_OK);
Tianjie Xu11942232018-01-18 18:31:42 -080077 return true;
78}
79
Sen Jiang5eb33e82018-05-01 15:01:11 -070080struct ExtentData {
81 puffin::BitExtent extent;
82 uint64_t byte_offset;
83 uint64_t byte_length;
84 const puffin::Buffer& data;
85
86 ExtentData(const puffin::BitExtent& in_extent, const puffin::Buffer& in_data)
87 : extent(in_extent), data(in_data) {
88 // Round start offset up and end offset down to exclude bits not in this
89 // extent. We simply ignore the bits at start and end that's not on byte
90 // boundary because as long as the majority of the bytes are the same,
91 // bsdiff will be able to reference it.
92 byte_offset = (extent.offset + 7) / 8;
93 uint64_t byte_end_offset = (extent.offset + extent.length) / 8;
94 CHECK(byte_end_offset <= data.size());
95 if (byte_end_offset > byte_offset) {
96 byte_length = byte_end_offset - byte_offset;
97 } else {
98 byte_length = 0;
99 }
100 }
101
102 int Compare(const ExtentData& other) const {
103 if (extent.length != other.extent.length) {
104 return extent.length < other.extent.length ? -1 : 1;
105 }
106 return memcmp(data.data() + byte_offset,
107 other.data.data() + other.byte_offset,
108 std::min(byte_length, other.byte_length));
109 }
110 bool operator<(const ExtentData& other) const { return Compare(other) < 0; }
111 bool operator==(const ExtentData& other) const { return Compare(other) == 0; }
112};
113
Tianjie Xu11942232018-01-18 18:31:42 -0800114} // namespace
115
Amin Hassanic3e6b532017-03-07 17:47:25 -0800116namespace puffin {
117
Amin Hassanic3e6b532017-03-07 17:47:25 -0800118// This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800119// definition of a zlib stream. For finding the deflate blocks, we relying on
120// the proper size of the zlib stream in |data|. Basically the size of the zlib
121// stream should be known before hand. Otherwise we need to parse the stream and
122// find the location of compressed blocks using CalculateSizeOfDeflateBlock().
123bool LocateDeflatesInZlib(const Buffer& data,
124 std::vector<ByteExtent>* deflate_blocks) {
125 // A zlib stream has the following format:
126 // 0 1 compression method and flag
127 // 1 1 flag
128 // 2 4 preset dictionary (optional)
129 // 2 or 6 n compressed data
130 // n+(2 or 6) 4 Adler-32 checksum
131 TEST_AND_RETURN_FALSE(data.size() >= 6 + 4); // Header + Footer
132 uint16_t cmf = data[0];
133 auto compression_method = cmf & 0x0F;
134 // For deflate compression_method should be 8.
135 TEST_AND_RETURN_FALSE(compression_method == 8);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800136
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800137 auto cinfo = (cmf & 0xF0) >> 4;
138 // Value greater than 7 is not allowed in deflate.
139 TEST_AND_RETURN_FALSE(cinfo <= 7);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800140
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800141 auto flag = data[1];
142 TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800143
Amin Hassanid7768d52018-02-28 15:34:21 -0800144 uint64_t header_len = 2;
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800145 if (flag & 0x20) {
146 header_len += 4; // 4 bytes for the preset dictionary.
Amin Hassani7074da62017-09-30 17:14:06 -0700147 }
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800148
149 // 4 is for ADLER32.
150 deflate_blocks->emplace_back(header_len, data.size() - header_len - 4);
Amin Hassani7074da62017-09-30 17:14:06 -0700151 return true;
152}
153
154bool FindDeflateSubBlocks(const UniqueStreamPtr& src,
155 const vector<ByteExtent>& deflates,
156 vector<BitExtent>* subblock_deflates) {
157 Puffer puffer;
158 Buffer deflate_buffer;
159 for (const auto& deflate : deflates) {
160 TEST_AND_RETURN_FALSE(src->Seek(deflate.offset));
161 // Read from src into deflate_buffer.
162 deflate_buffer.resize(deflate.length);
163 TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length));
164
165 // Find all the subblocks.
166 BufferBitReader bit_reader(deflate_buffer.data(), deflate.length);
Amin Hassanib8325c22018-05-22 14:57:22 -0700167 // The uncompressed blocks will be ignored since we are passing a null
168 // buffered puff writer and a valid deflate locations output array. This
169 // should not happen in the puffdiff or anywhere else by default.
Amin Hassani7074da62017-09-30 17:14:06 -0700170 BufferPuffWriter puff_writer(nullptr, 0);
Amin Hassani7074da62017-09-30 17:14:06 -0700171 vector<BitExtent> subblocks;
172 TEST_AND_RETURN_FALSE(
Amin Hassanie2e9cb02018-03-15 14:14:58 -0700173 puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks));
Amin Hassani7074da62017-09-30 17:14:06 -0700174 TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset());
175 for (const auto& subblock : subblocks) {
176 subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8,
177 subblock.length);
178 }
Amin Hassanic3e6b532017-03-07 17:47:25 -0800179 }
180 return true;
181}
182
Amin Hassani00f08322017-10-18 11:55:10 -0700183bool LocateDeflatesInZlibBlocks(const string& file_path,
184 const vector<ByteExtent>& zlibs,
185 vector<BitExtent>* deflates) {
186 auto src = FileStream::Open(file_path, true, false);
187 TEST_AND_RETURN_FALSE(src);
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800188
189 Buffer buffer;
190 for (auto& zlib : zlibs) {
191 buffer.resize(zlib.length);
192 TEST_AND_RETURN_FALSE(src->Seek(zlib.offset));
193 TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size()));
194
195 vector<ByteExtent> deflate_blocks;
196 TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &deflate_blocks));
197
198 vector<BitExtent> deflate_subblocks;
199 auto zlib_blc_src = MemoryStream::CreateForRead(buffer);
200 TEST_AND_RETURN_FALSE(
201 FindDeflateSubBlocks(zlib_blc_src, deflate_blocks, &deflate_subblocks));
202
203 // Relocated based on the offset of the zlib.
204 for (const auto& def : deflate_subblocks) {
205 deflates->emplace_back(zlib.offset * 8 + def.offset, def.length);
206 }
207 }
208 return true;
Amin Hassani00f08322017-10-18 11:55:10 -0700209}
210
Amin Hassani4a212ed2018-02-15 10:31:28 -0800211// For more information about gzip format, refer to RFC 1952 located at:
212// https://www.ietf.org/rfc/rfc1952.txt
213bool LocateDeflatesInGzip(const Buffer& data,
214 vector<ByteExtent>* deflate_blocks) {
Amin Hassanid7768d52018-02-28 15:34:21 -0800215 uint64_t member_start = 0;
Amin Hassani4a212ed2018-02-15 10:31:28 -0800216 while (member_start < data.size()) {
217 // Each member entry has the following format
218 // 0 1 0x1F
219 // 1 1 0x8B
220 // 2 1 compression method (8 denotes deflate)
221 // 3 1 set of flags
222 // 4 4 modification time
223 // 8 1 extra flags
224 // 9 1 operating system
225 TEST_AND_RETURN_FALSE(member_start + 10 <= data.size());
226 TEST_AND_RETURN_FALSE(data[member_start + 0] == 0x1F);
227 TEST_AND_RETURN_FALSE(data[member_start + 1] == 0x8B);
228 TEST_AND_RETURN_FALSE(data[member_start + 2] == 8);
229
Amin Hassanid7768d52018-02-28 15:34:21 -0800230 uint64_t offset = member_start + 10;
Amin Hassani4a212ed2018-02-15 10:31:28 -0800231 int flag = data[member_start + 3];
232 // Extra field
233 if (flag & 4) {
234 TEST_AND_RETURN_FALSE(offset + 2 <= data.size());
235 uint16_t extra_length = data[offset++];
236 extra_length |= static_cast<uint16_t>(data[offset++]) << 8;
237 TEST_AND_RETURN_FALSE(offset + extra_length <= data.size());
238 offset += extra_length;
239 }
240 // File name field
241 if (flag & 8) {
242 while (true) {
243 TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
244 if (data[offset++] == 0) {
245 break;
246 }
247 }
248 }
249 // File comment field
250 if (flag & 16) {
251 while (true) {
252 TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
253 if (data[offset++] == 0) {
254 break;
255 }
256 }
257 }
258 // CRC16 field
259 if (flag & 2) {
260 offset += 2;
261 }
262
Amin Hassanid7768d52018-02-28 15:34:21 -0800263 uint64_t compressed_size, uncompressed_size;
Amin Hassani4a212ed2018-02-15 10:31:28 -0800264 TEST_AND_RETURN_FALSE(CalculateSizeOfDeflateBlock(
265 data, offset, &compressed_size, &uncompressed_size));
266 TEST_AND_RETURN_FALSE(offset + compressed_size <= data.size());
267 deflate_blocks->push_back(ByteExtent(offset, compressed_size));
268 offset += compressed_size;
269
270 // Ignore CRC32;
271 TEST_AND_RETURN_FALSE(offset + 8 <= data.size());
272 offset += 4;
273 uint32_t u_size = 0;
274 for (size_t i = 0; i < 4; i++) {
275 u_size |= static_cast<uint32_t>(data[offset++]) << (i * 8);
276 }
277 TEST_AND_RETURN_FALSE(uncompressed_size % (1 << 31) == u_size);
278 member_start = offset;
279 }
280 return true;
281}
282
Tianjie Xu11942232018-01-18 18:31:42 -0800283// For more information about the zip format, refer to
284// https://support.pkware.com/display/PKZIP/APPNOTE
285bool LocateDeflatesInZipArchive(const Buffer& data,
286 vector<ByteExtent>* deflate_blocks) {
Amin Hassanid7768d52018-02-28 15:34:21 -0800287 uint64_t pos = 0;
Tianjie Xu11942232018-01-18 18:31:42 -0800288 while (pos <= data.size() - 30) {
289 // TODO(xunchang) add support for big endian system when searching for
290 // magic numbers.
291 if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) {
292 pos++;
293 continue;
294 }
295
296 // local file header format
297 // 0 4 0x04034b50
298 // 4 2 minimum version needed to extract
299 // 6 2 general purpose bit flag
300 // 8 2 compression method
301 // 10 4 file last modification date & time
302 // 14 4 CRC-32
303 // 18 4 compressed size
304 // 22 4 uncompressed size
305 // 26 2 file name length
306 // 28 2 extra field length
307 // 30 n file name
308 // 30+n m extra field
309 auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8);
310 if (compression_method != 8) { // non-deflate type
311 pos += 4;
312 continue;
313 }
314
315 auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18);
316 auto uncompressed_size = get_unaligned<uint32_t>(data.data() + pos + 22);
317 auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26);
318 auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28);
319 uint64_t header_size = 30 + file_name_length + extra_field_length;
320
321 // sanity check
322 if (static_cast<uint64_t>(header_size) + compressed_size > data.size() ||
323 pos > data.size() - header_size - compressed_size) {
324 pos += 4;
325 continue;
326 }
327
Amin Hassanid7768d52018-02-28 15:34:21 -0800328 uint64_t calculated_compressed_size;
329 uint64_t calculated_uncompressed_size;
Tianjie Xu11942232018-01-18 18:31:42 -0800330 if (!CalculateSizeOfDeflateBlock(data, pos + header_size,
331 &calculated_compressed_size,
332 &calculated_uncompressed_size)) {
333 LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos
334 << ", skip adding deflates for this entry.";
335 pos += 4;
336 continue;
337 }
338
339 // Double check the compressed size and uncompressed size if they are
340 // available in the file header.
341 if (compressed_size > 0 && compressed_size != calculated_compressed_size) {
342 LOG(WARNING) << "Compressed size in the file header: " << compressed_size
343 << " doesn't equal the real size: "
344 << calculated_compressed_size;
345 }
346
347 if (uncompressed_size > 0 &&
348 uncompressed_size != calculated_uncompressed_size) {
349 LOG(WARNING) << "Uncompressed size in the file header: "
350 << uncompressed_size << " doesn't equal the real size: "
351 << calculated_uncompressed_size;
352 }
353
354 deflate_blocks->emplace_back(pos + header_size, calculated_compressed_size);
355 pos += header_size + calculated_compressed_size;
356 }
357
358 return true;
359}
360
361bool LocateDeflateSubBlocksInZipArchive(const Buffer& data,
362 vector<BitExtent>* deflates) {
363 vector<ByteExtent> deflate_blocks;
364 if (!LocateDeflatesInZipArchive(data, &deflate_blocks)) {
365 return false;
366 }
367
368 auto src = MemoryStream::CreateForRead(data);
369 return FindDeflateSubBlocks(src, deflate_blocks, deflates);
370}
371
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700372bool FindPuffLocations(const UniqueStreamPtr& src,
Amin Hassani7074da62017-09-30 17:14:06 -0700373 const vector<BitExtent>& deflates,
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700374 vector<ByteExtent>* puffs,
Amin Hassanid7768d52018-02-28 15:34:21 -0800375 uint64_t* out_puff_size) {
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700376 Puffer puffer;
377 Buffer deflate_buffer;
378
379 // Here accumulate the size difference between each corresponding deflate and
380 // puff. At the end we add this cummulative size difference to the size of the
381 // deflate stream to get the size of the puff stream. We use signed size
382 // because puff size could be smaller than deflate size.
Amin Hassanid7768d52018-02-28 15:34:21 -0800383 int64_t total_size_difference = 0;
Amin Hassani7074da62017-09-30 17:14:06 -0700384 for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) {
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700385 // Read from src into deflate_buffer.
Amin Hassani7074da62017-09-30 17:14:06 -0700386 auto start_byte = deflate->offset / 8;
387 auto end_byte = (deflate->offset + deflate->length + 7) / 8;
388 deflate_buffer.resize(end_byte - start_byte);
389 TEST_AND_RETURN_FALSE(src->Seek(start_byte));
390 TEST_AND_RETURN_FALSE(
391 src->Read(deflate_buffer.data(), deflate_buffer.size()));
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700392 // Find the size of the puff.
Amin Hassani7074da62017-09-30 17:14:06 -0700393 BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size());
Amin Hassanid7768d52018-02-28 15:34:21 -0800394 uint64_t bits_to_skip = deflate->offset % 8;
Amin Hassani7074da62017-09-30 17:14:06 -0700395 TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip));
396 bit_reader.DropBits(bits_to_skip);
397
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700398 BufferPuffWriter puff_writer(nullptr, 0);
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700399 TEST_AND_RETURN_FALSE(
Amin Hassanie2e9cb02018-03-15 14:14:58 -0700400 puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr));
Amin Hassani7074da62017-09-30 17:14:06 -0700401 TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset());
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700402
Amin Hassani7074da62017-09-30 17:14:06 -0700403 // 1 if a deflate ends at the same byte that the next deflate starts and
404 // there is a few bits gap between them. In practice this may never happen,
405 // but it is a good idea to support it anyways. If there is a gap, the value
406 // of the gap will be saved as an integer byte to the puff stream. The parts
407 // of the byte that belogs to the deflates are shifted out.
408 int gap = 0;
409 if (deflate != deflates.begin()) {
410 auto prev_deflate = std::prev(deflate);
411 if ((prev_deflate->offset + prev_deflate->length == deflate->offset)
412 // If deflates are on byte boundary the gap will not be counted later,
413 // so we won't worry about it.
414 && (deflate->offset % 8 != 0)) {
415 gap = 1;
416 }
417 }
418
419 start_byte = ((deflate->offset + 7) / 8);
420 end_byte = (deflate->offset + deflate->length) / 8;
Amin Hassanid7768d52018-02-28 15:34:21 -0800421 int64_t deflate_length_in_bytes = end_byte - start_byte;
Amin Hassani7074da62017-09-30 17:14:06 -0700422
423 // If there was no gap bits between the current and previous deflates, there
424 // will be no extra gap byte, so the offset will be shifted one byte back.
425 auto puff_offset = start_byte - gap + total_size_difference;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700426 auto puff_size = puff_writer.Size();
Amin Hassani7074da62017-09-30 17:14:06 -0700427 // Add the location into puff.
428 puffs->emplace_back(puff_offset, puff_size);
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700429 total_size_difference +=
Amin Hassanid7768d52018-02-28 15:34:21 -0800430 static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700431 }
432
Amin Hassanid7768d52018-02-28 15:34:21 -0800433 uint64_t src_size;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700434 TEST_AND_RETURN_FALSE(src->GetSize(&src_size));
Amin Hassanid7768d52018-02-28 15:34:21 -0800435 auto final_size = static_cast<int64_t>(src_size) + total_size_difference;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700436 TEST_AND_RETURN_FALSE(final_size >= 0);
437 *out_puff_size = final_size;
438 return true;
439}
440
Sen Jiang5eb33e82018-05-01 15:01:11 -0700441void RemoveEqualBitExtents(const Buffer& data1,
442 const Buffer& data2,
443 std::vector<BitExtent>* extents1,
444 std::vector<BitExtent>* extents2) {
445 set<ExtentData> extent1_set, equal_extents;
446 for (const BitExtent& ext : *extents1) {
447 extent1_set.emplace(ext, data1);
448 }
449
450 auto new_extents2_end = extents2->begin();
451 for (const BitExtent& ext : *extents2) {
452 ExtentData extent_data(ext, data2);
453 if (extent1_set.find(extent_data) != extent1_set.end()) {
454 equal_extents.insert(extent_data);
455 } else {
456 *new_extents2_end++ = ext;
457 }
458 }
459 extents2->erase(new_extents2_end, extents2->end());
460 extents1->erase(
461 std::remove_if(extents1->begin(), extents1->end(),
462 [&equal_extents, &data1](const BitExtent& ext) {
463 return equal_extents.find(ExtentData(ext, data1)) !=
464 equal_extents.end();
465 }),
466 extents1->end());
467}
Amin Hassanic3e6b532017-03-07 17:47:25 -0800468} // namespace puffin