// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
#include "puffin/src/include/puffin/utils.h"

#include <inttypes.h>
#include <string.h>

#include <string>
#include <type_traits>
#include <vector>

#include <zlib.h>

#include "puffin/src/bit_reader.h"
#include "puffin/src/file_stream.h"
#include "puffin/src/include/puffin/common.h"
#include "puffin/src/include/puffin/errors.h"
#include "puffin/src/include/puffin/puffer.h"
#include "puffin/src/memory_stream.h"
#include "puffin/src/puff_writer.h"
#include "puffin/src/set_errors.h"
22
Tianjie Xu11942232018-01-18 18:31:42 -080023namespace {
// Reads a value of type |T| from |address|, which does not need to be aligned
// for |T|. The bytes are copied with memcpy instead of dereferencing a cast
// pointer. The value is returned exactly as stored in memory; no byte-order
// conversion is performed.
//
// |address| must point to at least sizeof(T) readable bytes.
template <typename T>
inline T get_unaligned(const void* address) {
  // Copying raw bytes into an object is only well-defined for trivially
  // copyable types, so reject anything else at compile time.
  static_assert(std::is_trivially_copyable<T>::value,
                "get_unaligned requires a trivially copyable type");
  T result;
  memcpy(&result, address, sizeof(T));
  return result;
}
31
32// Calculate both the compressed size and uncompressed size of the deflate
33// block that starts from the offset |start| of buffer |data|.
34bool CalculateSizeOfDeflateBlock(const puffin::Buffer& data,
35 size_t start,
36 size_t* compressed_size,
37 size_t* uncompressed_size) {
38 TEST_AND_RETURN_FALSE(compressed_size != nullptr &&
39 uncompressed_size != nullptr);
40
41 TEST_AND_RETURN_FALSE(start < data.size());
42
43 z_stream strm = {};
44 strm.avail_in = data.size() - start;
45 strm.next_in = data.data() + start;
46
47 // -15 means we are decoding a 'raw' stream without zlib headers.
48 if (inflateInit2(&strm, -15)) {
49 LOG(ERROR) << "Failed to initialize inflate: " << strm.msg;
50 return false;
51 }
52
53 const unsigned int kBufferSize = 32768;
54 std::vector<uint8_t> uncompressed_data(kBufferSize);
55 *uncompressed_size = 0;
56 int status = Z_OK;
57 do {
58 // Overwrite the same buffer since we don't need the uncompressed data.
59 strm.avail_out = kBufferSize;
60 strm.next_out = uncompressed_data.data();
61 status = inflate(&strm, Z_NO_FLUSH);
62 if (status < 0) {
63 LOG(ERROR) << "Inflate failed: " << strm.msg << ", has decompressed "
64 << *uncompressed_size << " bytes.";
65 return false;
66 }
67 *uncompressed_size += kBufferSize - strm.avail_out;
68 } while (status != Z_STREAM_END);
69
70 *compressed_size = data.size() - start - strm.avail_in;
Amin Hassanida664102018-01-30 16:32:43 -080071 TEST_AND_RETURN_FALSE(inflateEnd(&strm) == Z_OK);
Tianjie Xu11942232018-01-18 18:31:42 -080072 return true;
73}
74
75} // namespace
76
Amin Hassanic3e6b532017-03-07 17:47:25 -080077namespace puffin {
78
79using std::string;
80using std::vector;
81
Amin Hassanic3e6b532017-03-07 17:47:25 -080082size_t BytesInByteExtents(const vector<ByteExtent>& extents) {
83 size_t bytes = 0;
84 for (const auto& extent : extents) {
85 bytes += extent.length;
86 }
87 return bytes;
88}
89
90// This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the
91// definition of a zlib stream.
92bool LocateDeflatesInZlibBlocks(const UniqueStreamPtr& src,
93 const vector<ByteExtent>& zlibs,
Amin Hassani7074da62017-09-30 17:14:06 -070094 vector<BitExtent>* deflates) {
Amin Hassanic3e6b532017-03-07 17:47:25 -080095 for (auto& zlib : zlibs) {
96 TEST_AND_RETURN_FALSE(src->Seek(zlib.offset));
97 uint16_t zlib_header;
98 TEST_AND_RETURN_FALSE(src->Read(&zlib_header, 2));
Amin Hassani26bcfdd2017-09-29 17:54:15 -070099 BufferBitReader bit_reader(reinterpret_cast<uint8_t*>(&zlib_header), 2);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800100
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700101 TEST_AND_RETURN_FALSE(bit_reader.CacheBits(8));
102 auto cmf = bit_reader.ReadBits(8);
103 auto cm = bit_reader.ReadBits(4);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800104 if (cm != 8 && cm != 15) {
105 LOG(ERROR) << "Invalid compression method! cm: " << cm;
106 return false;
107 }
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700108 bit_reader.DropBits(4);
109 auto cinfo = bit_reader.ReadBits(4);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800110 if (cinfo > 7) {
111 LOG(ERROR) << "cinfo greater than 7 is not allowed in deflate";
112 return false;
113 }
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700114 bit_reader.DropBits(4);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800115
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700116 TEST_AND_RETURN_FALSE(bit_reader.CacheBits(8));
117 auto flg = bit_reader.ReadBits(8);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800118 if (((cmf << 8) + flg) % 31) {
119 LOG(ERROR) << "Invalid zlib header on offset: " << zlib.offset;
120 return false;
121 }
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700122 bit_reader.ReadBits(5); // FCHECK
123 bit_reader.DropBits(5);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800124
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700125 auto fdict = bit_reader.ReadBits(1);
126 bit_reader.DropBits(1);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800127
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700128 bit_reader.ReadBits(2); // FLEVEL
129 bit_reader.DropBits(2);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800130
131 auto header_len = 2;
132 if (fdict) {
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700133 TEST_AND_RETURN_FALSE(bit_reader.CacheBits(32));
134 bit_reader.DropBits(32);
Amin Hassanic3e6b532017-03-07 17:47:25 -0800135 header_len += 4;
136 }
137
Amin Hassani7074da62017-09-30 17:14:06 -0700138 ByteExtent deflate(zlib.offset + header_len, zlib.length - header_len - 4);
139 TEST_AND_RETURN_FALSE(FindDeflateSubBlocks(src, {deflate}, deflates));
140 }
141 return true;
142}
143
144bool FindDeflateSubBlocks(const UniqueStreamPtr& src,
145 const vector<ByteExtent>& deflates,
146 vector<BitExtent>* subblock_deflates) {
147 Puffer puffer;
148 Buffer deflate_buffer;
149 for (const auto& deflate : deflates) {
150 TEST_AND_RETURN_FALSE(src->Seek(deflate.offset));
151 // Read from src into deflate_buffer.
152 deflate_buffer.resize(deflate.length);
153 TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length));
154
155 // Find all the subblocks.
156 BufferBitReader bit_reader(deflate_buffer.data(), deflate.length);
157 BufferPuffWriter puff_writer(nullptr, 0);
158 Error error;
159 vector<BitExtent> subblocks;
160 TEST_AND_RETURN_FALSE(
161 puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks, &error));
162 TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset());
163 for (const auto& subblock : subblocks) {
164 subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8,
165 subblock.length);
166 }
Amin Hassanic3e6b532017-03-07 17:47:25 -0800167 }
168 return true;
169}
170
Amin Hassani00f08322017-10-18 11:55:10 -0700171bool LocateDeflatesInZlibBlocks(const string& file_path,
172 const vector<ByteExtent>& zlibs,
173 vector<BitExtent>* deflates) {
174 auto src = FileStream::Open(file_path, true, false);
175 TEST_AND_RETURN_FALSE(src);
176 return LocateDeflatesInZlibBlocks(src, zlibs, deflates);
177}
178
Amin Hassani4a212ed2018-02-15 10:31:28 -0800179// For more information about gzip format, refer to RFC 1952 located at:
180// https://www.ietf.org/rfc/rfc1952.txt
181bool LocateDeflatesInGzip(const Buffer& data,
182 vector<ByteExtent>* deflate_blocks) {
183 size_t member_start = 0;
184 while (member_start < data.size()) {
185 // Each member entry has the following format
186 // 0 1 0x1F
187 // 1 1 0x8B
188 // 2 1 compression method (8 denotes deflate)
189 // 3 1 set of flags
190 // 4 4 modification time
191 // 8 1 extra flags
192 // 9 1 operating system
193 TEST_AND_RETURN_FALSE(member_start + 10 <= data.size());
194 TEST_AND_RETURN_FALSE(data[member_start + 0] == 0x1F);
195 TEST_AND_RETURN_FALSE(data[member_start + 1] == 0x8B);
196 TEST_AND_RETURN_FALSE(data[member_start + 2] == 8);
197
198 size_t offset = member_start + 10;
199 int flag = data[member_start + 3];
200 // Extra field
201 if (flag & 4) {
202 TEST_AND_RETURN_FALSE(offset + 2 <= data.size());
203 uint16_t extra_length = data[offset++];
204 extra_length |= static_cast<uint16_t>(data[offset++]) << 8;
205 TEST_AND_RETURN_FALSE(offset + extra_length <= data.size());
206 offset += extra_length;
207 }
208 // File name field
209 if (flag & 8) {
210 while (true) {
211 TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
212 if (data[offset++] == 0) {
213 break;
214 }
215 }
216 }
217 // File comment field
218 if (flag & 16) {
219 while (true) {
220 TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
221 if (data[offset++] == 0) {
222 break;
223 }
224 }
225 }
226 // CRC16 field
227 if (flag & 2) {
228 offset += 2;
229 }
230
231 size_t compressed_size, uncompressed_size;
232 TEST_AND_RETURN_FALSE(CalculateSizeOfDeflateBlock(
233 data, offset, &compressed_size, &uncompressed_size));
234 TEST_AND_RETURN_FALSE(offset + compressed_size <= data.size());
235 deflate_blocks->push_back(ByteExtent(offset, compressed_size));
236 offset += compressed_size;
237
238 // Ignore CRC32;
239 TEST_AND_RETURN_FALSE(offset + 8 <= data.size());
240 offset += 4;
241 uint32_t u_size = 0;
242 for (size_t i = 0; i < 4; i++) {
243 u_size |= static_cast<uint32_t>(data[offset++]) << (i * 8);
244 }
245 TEST_AND_RETURN_FALSE(uncompressed_size % (1 << 31) == u_size);
246 member_start = offset;
247 }
248 return true;
249}
250
Tianjie Xu11942232018-01-18 18:31:42 -0800251// For more information about the zip format, refer to
252// https://support.pkware.com/display/PKZIP/APPNOTE
253bool LocateDeflatesInZipArchive(const Buffer& data,
254 vector<ByteExtent>* deflate_blocks) {
255 size_t pos = 0;
256 while (pos <= data.size() - 30) {
257 // TODO(xunchang) add support for big endian system when searching for
258 // magic numbers.
259 if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) {
260 pos++;
261 continue;
262 }
263
264 // local file header format
265 // 0 4 0x04034b50
266 // 4 2 minimum version needed to extract
267 // 6 2 general purpose bit flag
268 // 8 2 compression method
269 // 10 4 file last modification date & time
270 // 14 4 CRC-32
271 // 18 4 compressed size
272 // 22 4 uncompressed size
273 // 26 2 file name length
274 // 28 2 extra field length
275 // 30 n file name
276 // 30+n m extra field
277 auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8);
278 if (compression_method != 8) { // non-deflate type
279 pos += 4;
280 continue;
281 }
282
283 auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18);
284 auto uncompressed_size = get_unaligned<uint32_t>(data.data() + pos + 22);
285 auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26);
286 auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28);
287 uint64_t header_size = 30 + file_name_length + extra_field_length;
288
289 // sanity check
290 if (static_cast<uint64_t>(header_size) + compressed_size > data.size() ||
291 pos > data.size() - header_size - compressed_size) {
292 pos += 4;
293 continue;
294 }
295
296 size_t calculated_compressed_size;
297 size_t calculated_uncompressed_size;
298 if (!CalculateSizeOfDeflateBlock(data, pos + header_size,
299 &calculated_compressed_size,
300 &calculated_uncompressed_size)) {
301 LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos
302 << ", skip adding deflates for this entry.";
303 pos += 4;
304 continue;
305 }
306
307 // Double check the compressed size and uncompressed size if they are
308 // available in the file header.
309 if (compressed_size > 0 && compressed_size != calculated_compressed_size) {
310 LOG(WARNING) << "Compressed size in the file header: " << compressed_size
311 << " doesn't equal the real size: "
312 << calculated_compressed_size;
313 }
314
315 if (uncompressed_size > 0 &&
316 uncompressed_size != calculated_uncompressed_size) {
317 LOG(WARNING) << "Uncompressed size in the file header: "
318 << uncompressed_size << " doesn't equal the real size: "
319 << calculated_uncompressed_size;
320 }
321
322 deflate_blocks->emplace_back(pos + header_size, calculated_compressed_size);
323 pos += header_size + calculated_compressed_size;
324 }
325
326 return true;
327}
328
329bool LocateDeflateSubBlocksInZipArchive(const Buffer& data,
330 vector<BitExtent>* deflates) {
331 vector<ByteExtent> deflate_blocks;
332 if (!LocateDeflatesInZipArchive(data, &deflate_blocks)) {
333 return false;
334 }
335
336 auto src = MemoryStream::CreateForRead(data);
337 return FindDeflateSubBlocks(src, deflate_blocks, deflates);
338}
339
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700340bool FindPuffLocations(const UniqueStreamPtr& src,
Amin Hassani7074da62017-09-30 17:14:06 -0700341 const vector<BitExtent>& deflates,
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700342 vector<ByteExtent>* puffs,
343 size_t* out_puff_size) {
344 Puffer puffer;
345 Buffer deflate_buffer;
346
347 // Here accumulate the size difference between each corresponding deflate and
348 // puff. At the end we add this cummulative size difference to the size of the
349 // deflate stream to get the size of the puff stream. We use signed size
350 // because puff size could be smaller than deflate size.
351 ssize_t total_size_difference = 0;
Amin Hassani7074da62017-09-30 17:14:06 -0700352 for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) {
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700353 // Read from src into deflate_buffer.
Amin Hassani7074da62017-09-30 17:14:06 -0700354 auto start_byte = deflate->offset / 8;
355 auto end_byte = (deflate->offset + deflate->length + 7) / 8;
356 deflate_buffer.resize(end_byte - start_byte);
357 TEST_AND_RETURN_FALSE(src->Seek(start_byte));
358 TEST_AND_RETURN_FALSE(
359 src->Read(deflate_buffer.data(), deflate_buffer.size()));
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700360 // Find the size of the puff.
Amin Hassani7074da62017-09-30 17:14:06 -0700361 BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size());
362 size_t bits_to_skip = deflate->offset % 8;
363 TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip));
364 bit_reader.DropBits(bits_to_skip);
365
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700366 BufferPuffWriter puff_writer(nullptr, 0);
367 Error error;
368 TEST_AND_RETURN_FALSE(
Amin Hassani7074da62017-09-30 17:14:06 -0700369 puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr, &error));
370 TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset());
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700371
Amin Hassani7074da62017-09-30 17:14:06 -0700372 // 1 if a deflate ends at the same byte that the next deflate starts and
373 // there is a few bits gap between them. In practice this may never happen,
374 // but it is a good idea to support it anyways. If there is a gap, the value
375 // of the gap will be saved as an integer byte to the puff stream. The parts
376 // of the byte that belogs to the deflates are shifted out.
377 int gap = 0;
378 if (deflate != deflates.begin()) {
379 auto prev_deflate = std::prev(deflate);
380 if ((prev_deflate->offset + prev_deflate->length == deflate->offset)
381 // If deflates are on byte boundary the gap will not be counted later,
382 // so we won't worry about it.
383 && (deflate->offset % 8 != 0)) {
384 gap = 1;
385 }
386 }
387
388 start_byte = ((deflate->offset + 7) / 8);
389 end_byte = (deflate->offset + deflate->length) / 8;
390 ssize_t deflate_length_in_bytes = end_byte - start_byte;
391
392 // If there was no gap bits between the current and previous deflates, there
393 // will be no extra gap byte, so the offset will be shifted one byte back.
394 auto puff_offset = start_byte - gap + total_size_difference;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700395 auto puff_size = puff_writer.Size();
Amin Hassani7074da62017-09-30 17:14:06 -0700396 // Add the location into puff.
397 puffs->emplace_back(puff_offset, puff_size);
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700398 total_size_difference +=
Amin Hassani7074da62017-09-30 17:14:06 -0700399 static_cast<ssize_t>(puff_size) - deflate_length_in_bytes - gap;
Amin Hassani26bcfdd2017-09-29 17:54:15 -0700400 }
401
402 size_t src_size;
403 TEST_AND_RETURN_FALSE(src->GetSize(&src_size));
404 auto final_size = static_cast<ssize_t>(src_size) + total_size_difference;
405 TEST_AND_RETURN_FALSE(final_size >= 0);
406 *out_puff_size = final_size;
407 return true;
408}
409
Amin Hassanic3e6b532017-03-07 17:47:25 -0800410} // namespace puffin