blob: ada816cceb809e861d9d8facd2a83bd2f1e697ef [file] [log] [blame]
Amin Hassanic3e6b532017-03-07 17:47:25 -08001// Copyright 2017 The Chromium OS Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "puffin/src/include/puffin/utils.h"
6
7#include <inttypes.h>
8
9#include <string>
10#include <vector>
11
Tianjie Xu11942232018-01-18 18:31:42 -080012#include <zlib.h>
13
Amin Hassanic3e6b532017-03-07 17:47:25 -080014#include "puffin/src/bit_reader.h"
Amin Hassani00f08322017-10-18 11:55:10 -070015#include "puffin/src/file_stream.h"
Amin Hassanic3e6b532017-03-07 17:47:25 -080016#include "puffin/src/include/puffin/common.h"
Amin Hassani26bcfdd2017-09-29 17:54:15 -070017#include "puffin/src/include/puffin/puffer.h"
Amin Hassanie2e9cb02018-03-15 14:14:58 -070018#include "puffin/src/logging.h"
Tianjie Xu11942232018-01-18 18:31:42 -080019#include "puffin/src/memory_stream.h"
Amin Hassani26bcfdd2017-09-29 17:54:15 -070020#include "puffin/src/puff_writer.h"
Amin Hassanic3e6b532017-03-07 17:47:25 -080021
Amin Hassani10b869c2018-03-15 13:22:32 -070022using std::string;
23using std::vector;
24
Tianjie Xu11942232018-01-18 18:31:42 -080025namespace {
26// Use memcpy to access the unaligned data of type |T|.
27template <typename T>
28inline T get_unaligned(const void* address) {
29 T result;
30 memcpy(&result, address, sizeof(T));
31 return result;
32}
33
34// Calculate both the compressed size and uncompressed size of the deflate
35// block that starts from the offset |start| of buffer |data|.
36bool CalculateSizeOfDeflateBlock(const puffin::Buffer& data,
Amin Hassanid7768d52018-02-28 15:34:21 -080037 uint64_t start,
38 uint64_t* compressed_size,
39 uint64_t* uncompressed_size) {
Tianjie Xu11942232018-01-18 18:31:42 -080040 TEST_AND_RETURN_FALSE(compressed_size != nullptr &&
41 uncompressed_size != nullptr);
42
43 TEST_AND_RETURN_FALSE(start < data.size());
44
45 z_stream strm = {};
46 strm.avail_in = data.size() - start;
47 strm.next_in = data.data() + start;
48
49 // -15 means we are decoding a 'raw' stream without zlib headers.
50 if (inflateInit2(&strm, -15)) {
51 LOG(ERROR) << "Failed to initialize inflate: " << strm.msg;
52 return false;
53 }
54
55 const unsigned int kBufferSize = 32768;
56 std::vector<uint8_t> uncompressed_data(kBufferSize);
57 *uncompressed_size = 0;
58 int status = Z_OK;
59 do {
60 // Overwrite the same buffer since we don't need the uncompressed data.
61 strm.avail_out = kBufferSize;
62 strm.next_out = uncompressed_data.data();
63 status = inflate(&strm, Z_NO_FLUSH);
64 if (status < 0) {
65 LOG(ERROR) << "Inflate failed: " << strm.msg << ", has decompressed "
66 << *uncompressed_size << " bytes.";
67 return false;
68 }
69 *uncompressed_size += kBufferSize - strm.avail_out;
70 } while (status != Z_STREAM_END);
71
72 *compressed_size = data.size() - start - strm.avail_in;
Amin Hassanida664102018-01-30 16:32:43 -080073 TEST_AND_RETURN_FALSE(inflateEnd(&strm) == Z_OK);
Tianjie Xu11942232018-01-18 18:31:42 -080074 return true;
75}
76
77} // namespace
78
Amin Hassanic3e6b532017-03-07 17:47:25 -080079namespace puffin {
80
Amin Hassanic3e6b532017-03-07 17:47:25 -080081// This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the
Amin Hassani75a7f2c2018-02-21 11:51:28 -080082// definition of a zlib stream. For finding the deflate blocks, we relying on
83// the proper size of the zlib stream in |data|. Basically the size of the zlib
84// stream should be known before hand. Otherwise we need to parse the stream and
85// find the location of compressed blocks using CalculateSizeOfDeflateBlock().
86bool LocateDeflatesInZlib(const Buffer& data,
87 std::vector<ByteExtent>* deflate_blocks) {
88 // A zlib stream has the following format:
89 // 0 1 compression method and flag
90 // 1 1 flag
91 // 2 4 preset dictionary (optional)
92 // 2 or 6 n compressed data
93 // n+(2 or 6) 4 Adler-32 checksum
94 TEST_AND_RETURN_FALSE(data.size() >= 6 + 4); // Header + Footer
95 uint16_t cmf = data[0];
96 auto compression_method = cmf & 0x0F;
97 // For deflate compression_method should be 8.
98 TEST_AND_RETURN_FALSE(compression_method == 8);
Amin Hassanic3e6b532017-03-07 17:47:25 -080099
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800100 auto cinfo = (cmf & 0xF0) >> 4;
101 // Value greater than 7 is not allowed in deflate.
102 TEST_AND_RETURN_FALSE(cinfo <= 7);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800103
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800104 auto flag = data[1];
105 TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800106
Amin Hassanid7768d52018-02-28 15:34:21 -0800107 uint64_t header_len = 2;
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800108 if (flag & 0x20) {
109 header_len += 4; // 4 bytes for the preset dictionary.
Amin Hassani7074da62017-09-30 17:14:06 -0700110 }
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800111
112 // 4 is for ADLER32.
113 deflate_blocks->emplace_back(header_len, data.size() - header_len - 4);
Amin Hassani7074da62017-09-30 17:14:06 -0700114 return true;
115}
116
117bool FindDeflateSubBlocks(const UniqueStreamPtr& src,
118 const vector<ByteExtent>& deflates,
119 vector<BitExtent>* subblock_deflates) {
120 Puffer puffer;
121 Buffer deflate_buffer;
122 for (const auto& deflate : deflates) {
123 TEST_AND_RETURN_FALSE(src->Seek(deflate.offset));
124 // Read from src into deflate_buffer.
125 deflate_buffer.resize(deflate.length);
126 TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length));
127
128 // Find all the subblocks.
129 BufferBitReader bit_reader(deflate_buffer.data(), deflate.length);
130 BufferPuffWriter puff_writer(nullptr, 0);
Amin Hassani7074da62017-09-30 17:14:06 -0700131 vector<BitExtent> subblocks;
132 TEST_AND_RETURN_FALSE(
Amin Hassanie2e9cb02018-03-15 14:14:58 -0700133 puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks));
Amin Hassani7074da62017-09-30 17:14:06 -0700134 TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset());
135 for (const auto& subblock : subblocks) {
136 subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8,
137 subblock.length);
138 }
Amin Hassanic3e6b532017-03-07 17:47:25 -0800139 }
140 return true;
141}
142
Amin Hassani00f08322017-10-18 11:55:10 -0700143bool LocateDeflatesInZlibBlocks(const string& file_path,
144 const vector<ByteExtent>& zlibs,
145 vector<BitExtent>* deflates) {
146 auto src = FileStream::Open(file_path, true, false);
147 TEST_AND_RETURN_FALSE(src);
Amin Hassani75a7f2c2018-02-21 11:51:28 -0800148
149 Buffer buffer;
150 for (auto& zlib : zlibs) {
151 buffer.resize(zlib.length);
152 TEST_AND_RETURN_FALSE(src->Seek(zlib.offset));
153 TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size()));
154
155 vector<ByteExtent> deflate_blocks;
156 TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &deflate_blocks));
157
158 vector<BitExtent> deflate_subblocks;
159 auto zlib_blc_src = MemoryStream::CreateForRead(buffer);
160 TEST_AND_RETURN_FALSE(
161 FindDeflateSubBlocks(zlib_blc_src, deflate_blocks, &deflate_subblocks));
162
163 // Relocated based on the offset of the zlib.
164 for (const auto& def : deflate_subblocks) {
165 deflates->emplace_back(zlib.offset * 8 + def.offset, def.length);
166 }
167 }
168 return true;
Amin Hassani00f08322017-10-18 11:55:10 -0700169}
170
Amin Hassani4a212ed2018-02-15 10:31:28 -0800171// For more information about gzip format, refer to RFC 1952 located at:
172// https://www.ietf.org/rfc/rfc1952.txt
173bool LocateDeflatesInGzip(const Buffer& data,
174 vector<ByteExtent>* deflate_blocks) {
Amin Hassanid7768d52018-02-28 15:34:21 -0800175 uint64_t member_start = 0;
Amin Hassani4a212ed2018-02-15 10:31:28 -0800176 while (member_start < data.size()) {
177 // Each member entry has the following format
178 // 0 1 0x1F
179 // 1 1 0x8B
180 // 2 1 compression method (8 denotes deflate)
181 // 3 1 set of flags
182 // 4 4 modification time
183 // 8 1 extra flags
184 // 9 1 operating system
185 TEST_AND_RETURN_FALSE(member_start + 10 <= data.size());
186 TEST_AND_RETURN_FALSE(data[member_start + 0] == 0x1F);
187 TEST_AND_RETURN_FALSE(data[member_start + 1] == 0x8B);
188 TEST_AND_RETURN_FALSE(data[member_start + 2] == 8);
189
Amin Hassanid7768d52018-02-28 15:34:21 -0800190 uint64_t offset = member_start + 10;
Amin Hassani4a212ed2018-02-15 10:31:28 -0800191 int flag = data[member_start + 3];
192 // Extra field
193 if (flag & 4) {
194 TEST_AND_RETURN_FALSE(offset + 2 <= data.size());
195 uint16_t extra_length = data[offset++];
196 extra_length |= static_cast<uint16_t>(data[offset++]) << 8;
197 TEST_AND_RETURN_FALSE(offset + extra_length <= data.size());
198 offset += extra_length;
199 }
200 // File name field
201 if (flag & 8) {
202 while (true) {
203 TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
204 if (data[offset++] == 0) {
205 break;
206 }
207 }
208 }
209 // File comment field
210 if (flag & 16) {
211 while (true) {
212 TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
213 if (data[offset++] == 0) {
214 break;
215 }
216 }
217 }
218 // CRC16 field
219 if (flag & 2) {
220 offset += 2;
221 }
222
Amin Hassanid7768d52018-02-28 15:34:21 -0800223 uint64_t compressed_size, uncompressed_size;
Amin Hassani4a212ed2018-02-15 10:31:28 -0800224 TEST_AND_RETURN_FALSE(CalculateSizeOfDeflateBlock(
225 data, offset, &compressed_size, &uncompressed_size));
226 TEST_AND_RETURN_FALSE(offset + compressed_size <= data.size());
227 deflate_blocks->push_back(ByteExtent(offset, compressed_size));
228 offset += compressed_size;
229
230 // Ignore CRC32;
231 TEST_AND_RETURN_FALSE(offset + 8 <= data.size());
232 offset += 4;
233 uint32_t u_size = 0;
234 for (size_t i = 0; i < 4; i++) {
235 u_size |= static_cast<uint32_t>(data[offset++]) << (i * 8);
236 }
237 TEST_AND_RETURN_FALSE(uncompressed_size % (1 << 31) == u_size);
238 member_start = offset;
239 }
240 return true;
241}
242
Tianjie Xu11942232018-01-18 18:31:42 -0800243// For more information about the zip format, refer to
244// https://support.pkware.com/display/PKZIP/APPNOTE
245bool LocateDeflatesInZipArchive(const Buffer& data,
246 vector<ByteExtent>* deflate_blocks) {
Amin Hassanid7768d52018-02-28 15:34:21 -0800247 uint64_t pos = 0;
Tianjie Xu11942232018-01-18 18:31:42 -0800248 while (pos <= data.size() - 30) {
249 // TODO(xunchang) add support for big endian system when searching for
250 // magic numbers.
251 if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) {
252 pos++;
253 continue;
254 }
255
256 // local file header format
257 // 0 4 0x04034b50
258 // 4 2 minimum version needed to extract
259 // 6 2 general purpose bit flag
260 // 8 2 compression method
261 // 10 4 file last modification date & time
262 // 14 4 CRC-32
263 // 18 4 compressed size
264 // 22 4 uncompressed size
265 // 26 2 file name length
266 // 28 2 extra field length
267 // 30 n file name
268 // 30+n m extra field
269 auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8);
270 if (compression_method != 8) { // non-deflate type
271 pos += 4;
272 continue;
273 }
274
275 auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18);
276 auto uncompressed_size = get_unaligned<uint32_t>(data.data() + pos + 22);
277 auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26);
278 auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28);
279 uint64_t header_size = 30 + file_name_length + extra_field_length;
280
281 // sanity check
282 if (static_cast<uint64_t>(header_size) + compressed_size > data.size() ||
283 pos > data.size() - header_size - compressed_size) {
284 pos += 4;
285 continue;
286 }
287
Amin Hassanid7768d52018-02-28 15:34:21 -0800288 uint64_t calculated_compressed_size;
289 uint64_t calculated_uncompressed_size;
Tianjie Xu11942232018-01-18 18:31:42 -0800290 if (!CalculateSizeOfDeflateBlock(data, pos + header_size,
291 &calculated_compressed_size,
292 &calculated_uncompressed_size)) {
293 LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos
294 << ", skip adding deflates for this entry.";
295 pos += 4;
296 continue;
297 }
298
299 // Double check the compressed size and uncompressed size if they are
300 // available in the file header.
301 if (compressed_size > 0 && compressed_size != calculated_compressed_size) {
302 LOG(WARNING) << "Compressed size in the file header: " << compressed_size
303 << " doesn't equal the real size: "
304 << calculated_compressed_size;
305 }
306
307 if (uncompressed_size > 0 &&
308 uncompressed_size != calculated_uncompressed_size) {
309 LOG(WARNING) << "Uncompressed size in the file header: "
310 << uncompressed_size << " doesn't equal the real size: "
311 << calculated_uncompressed_size;
312 }
313
314 deflate_blocks->emplace_back(pos + header_size, calculated_compressed_size);
315 pos += header_size + calculated_compressed_size;
316 }
317
318 return true;
319}
320
321bool LocateDeflateSubBlocksInZipArchive(const Buffer& data,
322 vector<BitExtent>* deflates) {
323 vector<ByteExtent> deflate_blocks;
324 if (!LocateDeflatesInZipArchive(data, &deflate_blocks)) {
325 return false;
326 }
327
328 auto src = MemoryStream::CreateForRead(data);
329 return FindDeflateSubBlocks(src, deflate_blocks, deflates);
330}
331
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700332bool FindPuffLocations(const UniqueStreamPtr& src,
Amin Hassani7074da62017-09-30 17:14:06 -0700333 const vector<BitExtent>& deflates,
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700334 vector<ByteExtent>* puffs,
Amin Hassanid7768d52018-02-28 15:34:21 -0800335 uint64_t* out_puff_size) {
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700336 Puffer puffer;
337 Buffer deflate_buffer;
338
339 // Here accumulate the size difference between each corresponding deflate and
340 // puff. At the end we add this cummulative size difference to the size of the
341 // deflate stream to get the size of the puff stream. We use signed size
342 // because puff size could be smaller than deflate size.
Amin Hassanid7768d52018-02-28 15:34:21 -0800343 int64_t total_size_difference = 0;
Amin Hassani7074da62017-09-30 17:14:06 -0700344 for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) {
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700345 // Read from src into deflate_buffer.
Amin Hassani7074da62017-09-30 17:14:06 -0700346 auto start_byte = deflate->offset / 8;
347 auto end_byte = (deflate->offset + deflate->length + 7) / 8;
348 deflate_buffer.resize(end_byte - start_byte);
349 TEST_AND_RETURN_FALSE(src->Seek(start_byte));
350 TEST_AND_RETURN_FALSE(
351 src->Read(deflate_buffer.data(), deflate_buffer.size()));
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700352 // Find the size of the puff.
Amin Hassani7074da62017-09-30 17:14:06 -0700353 BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size());
Amin Hassanid7768d52018-02-28 15:34:21 -0800354 uint64_t bits_to_skip = deflate->offset % 8;
Amin Hassani7074da62017-09-30 17:14:06 -0700355 TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip));
356 bit_reader.DropBits(bits_to_skip);
357
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700358 BufferPuffWriter puff_writer(nullptr, 0);
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700359 TEST_AND_RETURN_FALSE(
Amin Hassanie2e9cb02018-03-15 14:14:58 -0700360 puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr));
Amin Hassani7074da62017-09-30 17:14:06 -0700361 TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset());
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700362
Amin Hassani7074da62017-09-30 17:14:06 -0700363 // 1 if a deflate ends at the same byte that the next deflate starts and
364 // there is a few bits gap between them. In practice this may never happen,
365 // but it is a good idea to support it anyways. If there is a gap, the value
366 // of the gap will be saved as an integer byte to the puff stream. The parts
367 // of the byte that belogs to the deflates are shifted out.
368 int gap = 0;
369 if (deflate != deflates.begin()) {
370 auto prev_deflate = std::prev(deflate);
371 if ((prev_deflate->offset + prev_deflate->length == deflate->offset)
372 // If deflates are on byte boundary the gap will not be counted later,
373 // so we won't worry about it.
374 && (deflate->offset % 8 != 0)) {
375 gap = 1;
376 }
377 }
378
379 start_byte = ((deflate->offset + 7) / 8);
380 end_byte = (deflate->offset + deflate->length) / 8;
Amin Hassanid7768d52018-02-28 15:34:21 -0800381 int64_t deflate_length_in_bytes = end_byte - start_byte;
Amin Hassani7074da62017-09-30 17:14:06 -0700382
383 // If there was no gap bits between the current and previous deflates, there
384 // will be no extra gap byte, so the offset will be shifted one byte back.
385 auto puff_offset = start_byte - gap + total_size_difference;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700386 auto puff_size = puff_writer.Size();
Amin Hassani7074da62017-09-30 17:14:06 -0700387 // Add the location into puff.
388 puffs->emplace_back(puff_offset, puff_size);
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700389 total_size_difference +=
Amin Hassanid7768d52018-02-28 15:34:21 -0800390 static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700391 }
392
Amin Hassanid7768d52018-02-28 15:34:21 -0800393 uint64_t src_size;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700394 TEST_AND_RETURN_FALSE(src->GetSize(&src_size));
Amin Hassanid7768d52018-02-28 15:34:21 -0800395 auto final_size = static_cast<int64_t>(src_size) + total_size_difference;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700396 TEST_AND_RETURN_FALSE(final_size >= 0);
397 *out_puff_size = final_size;
398 return true;
399}
400
Amin Hassanic3e6b532017-03-07 17:47:25 -0800401} // namespace puffin