Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 1 | // Copyright 2017 The Chromium OS Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "puffin/src/include/puffin/utils.h" |
| 6 | |
| 7 | #include <inttypes.h> |
| 8 | |
Sen Jiang | 5eb33e8 | 2018-05-01 15:01:11 -0700 | [diff] [blame] | 9 | #include <algorithm> |
Amin Hassani | 9c33d3b | 2020-02-19 13:58:33 -0800 | [diff] [blame] | 10 | #include <iterator> |
Sen Jiang | 5eb33e8 | 2018-05-01 15:01:11 -0700 | [diff] [blame] | 11 | #include <set> |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 12 | #include <string> |
| 13 | #include <vector> |
| 14 | |
| 15 | #include "puffin/src/bit_reader.h" |
Amin Hassani | 00f0832 | 2017-10-18 11:55:10 -0700 | [diff] [blame] | 16 | #include "puffin/src/file_stream.h" |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 17 | #include "puffin/src/include/puffin/common.h" |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 18 | #include "puffin/src/include/puffin/puffer.h" |
Amin Hassani | e2e9cb0 | 2018-03-15 14:14:58 -0700 | [diff] [blame] | 19 | #include "puffin/src/logging.h" |
Tianjie Xu | 1194223 | 2018-01-18 18:31:42 -0800 | [diff] [blame] | 20 | #include "puffin/src/memory_stream.h" |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 21 | #include "puffin/src/puff_writer.h" |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 22 | |
Sen Jiang | 5eb33e8 | 2018-05-01 15:01:11 -0700 | [diff] [blame] | 23 | using std::set; |
Amin Hassani | 10b869c | 2018-03-15 13:22:32 -0700 | [diff] [blame] | 24 | using std::string; |
| 25 | using std::vector; |
| 26 | |
Tianjie Xu | 1194223 | 2018-01-18 18:31:42 -0800 | [diff] [blame] | 27 | namespace { |
// Reads a value of type |T| from |address|, which does not have to be aligned
// for |T|. The bytes are copied out with memcpy instead of dereferencing a
// casted pointer.
template <typename T>
inline T get_unaligned(const void* address) {
  T value;
  memcpy(&value, address, sizeof(value));
  return value;
}
| 35 | |
Sen Jiang | 5eb33e8 | 2018-05-01 15:01:11 -0700 | [diff] [blame] | 36 | struct ExtentData { |
| 37 | puffin::BitExtent extent; |
| 38 | uint64_t byte_offset; |
| 39 | uint64_t byte_length; |
| 40 | const puffin::Buffer& data; |
| 41 | |
| 42 | ExtentData(const puffin::BitExtent& in_extent, const puffin::Buffer& in_data) |
| 43 | : extent(in_extent), data(in_data) { |
| 44 | // Round start offset up and end offset down to exclude bits not in this |
| 45 | // extent. We simply ignore the bits at start and end that's not on byte |
| 46 | // boundary because as long as the majority of the bytes are the same, |
| 47 | // bsdiff will be able to reference it. |
| 48 | byte_offset = (extent.offset + 7) / 8; |
| 49 | uint64_t byte_end_offset = (extent.offset + extent.length) / 8; |
| 50 | CHECK(byte_end_offset <= data.size()); |
| 51 | if (byte_end_offset > byte_offset) { |
| 52 | byte_length = byte_end_offset - byte_offset; |
| 53 | } else { |
| 54 | byte_length = 0; |
| 55 | } |
| 56 | } |
| 57 | |
| 58 | int Compare(const ExtentData& other) const { |
| 59 | if (extent.length != other.extent.length) { |
| 60 | return extent.length < other.extent.length ? -1 : 1; |
| 61 | } |
| 62 | return memcmp(data.data() + byte_offset, |
| 63 | other.data.data() + other.byte_offset, |
| 64 | std::min(byte_length, other.byte_length)); |
| 65 | } |
| 66 | bool operator<(const ExtentData& other) const { return Compare(other) < 0; } |
| 67 | bool operator==(const ExtentData& other) const { return Compare(other) == 0; } |
| 68 | }; |
| 69 | |
Tianjie Xu | 1194223 | 2018-01-18 18:31:42 -0800 | [diff] [blame] | 70 | } // namespace |
| 71 | |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 72 | namespace puffin { |
| 73 | |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 74 | bool LocateDeflatesInDeflateStream(const uint8_t* data, |
| 75 | uint64_t size, |
| 76 | uint64_t virtual_offset, |
| 77 | vector<BitExtent>* deflates, |
| 78 | uint64_t* compressed_size) { |
| 79 | Puffer puffer; |
| 80 | BufferBitReader bit_reader(data, size); |
| 81 | BufferPuffWriter puff_writer(nullptr, 0); |
| 82 | vector<BitExtent> sub_deflates; |
| 83 | TEST_AND_RETURN_FALSE( |
| 84 | puffer.PuffDeflate(&bit_reader, &puff_writer, &sub_deflates)); |
| 85 | for (const auto& deflate : sub_deflates) { |
| 86 | deflates->emplace_back(deflate.offset + virtual_offset * 8, deflate.length); |
| 87 | } |
| 88 | if (compressed_size) { |
| 89 | *compressed_size = bit_reader.Offset(); |
| 90 | } |
| 91 | return true; |
| 92 | } |
| 93 | |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 94 | // This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the |
Amin Hassani | 75a7f2c | 2018-02-21 11:51:28 -0800 | [diff] [blame] | 95 | // definition of a zlib stream. For finding the deflate blocks, we relying on |
| 96 | // the proper size of the zlib stream in |data|. Basically the size of the zlib |
| 97 | // stream should be known before hand. Otherwise we need to parse the stream and |
| 98 | // find the location of compressed blocks using CalculateSizeOfDeflateBlock(). |
Amin Hassani | 9f79094 | 2019-01-03 17:26:16 -0800 | [diff] [blame] | 99 | bool LocateDeflatesInZlib(const Buffer& data, vector<BitExtent>* deflates) { |
Amin Hassani | 75a7f2c | 2018-02-21 11:51:28 -0800 | [diff] [blame] | 100 | // A zlib stream has the following format: |
| 101 | // 0 1 compression method and flag |
| 102 | // 1 1 flag |
| 103 | // 2 4 preset dictionary (optional) |
| 104 | // 2 or 6 n compressed data |
| 105 | // n+(2 or 6) 4 Adler-32 checksum |
| 106 | TEST_AND_RETURN_FALSE(data.size() >= 6 + 4); // Header + Footer |
| 107 | uint16_t cmf = data[0]; |
| 108 | auto compression_method = cmf & 0x0F; |
| 109 | // For deflate compression_method should be 8. |
| 110 | TEST_AND_RETURN_FALSE(compression_method == 8); |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 111 | |
Amin Hassani | 75a7f2c | 2018-02-21 11:51:28 -0800 | [diff] [blame] | 112 | auto cinfo = (cmf & 0xF0) >> 4; |
| 113 | // Value greater than 7 is not allowed in deflate. |
| 114 | TEST_AND_RETURN_FALSE(cinfo <= 7); |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 115 | |
Amin Hassani | 75a7f2c | 2018-02-21 11:51:28 -0800 | [diff] [blame] | 116 | auto flag = data[1]; |
| 117 | TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0); |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 118 | |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 119 | uint64_t header_len = 2; |
Amin Hassani | 75a7f2c | 2018-02-21 11:51:28 -0800 | [diff] [blame] | 120 | if (flag & 0x20) { |
| 121 | header_len += 4; // 4 bytes for the preset dictionary. |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 122 | } |
Amin Hassani | 75a7f2c | 2018-02-21 11:51:28 -0800 | [diff] [blame] | 123 | |
| 124 | // 4 is for ADLER32. |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 125 | TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream( |
| 126 | data.data() + header_len, data.size() - header_len - 4, header_len, |
| 127 | deflates, nullptr)); |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 128 | return true; |
| 129 | } |
| 130 | |
| 131 | bool FindDeflateSubBlocks(const UniqueStreamPtr& src, |
| 132 | const vector<ByteExtent>& deflates, |
| 133 | vector<BitExtent>* subblock_deflates) { |
| 134 | Puffer puffer; |
| 135 | Buffer deflate_buffer; |
| 136 | for (const auto& deflate : deflates) { |
| 137 | TEST_AND_RETURN_FALSE(src->Seek(deflate.offset)); |
| 138 | // Read from src into deflate_buffer. |
| 139 | deflate_buffer.resize(deflate.length); |
| 140 | TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length)); |
| 141 | |
| 142 | // Find all the subblocks. |
| 143 | BufferBitReader bit_reader(deflate_buffer.data(), deflate.length); |
Amin Hassani | b8325c2 | 2018-05-22 14:57:22 -0700 | [diff] [blame] | 144 | // The uncompressed blocks will be ignored since we are passing a null |
| 145 | // buffered puff writer and a valid deflate locations output array. This |
| 146 | // should not happen in the puffdiff or anywhere else by default. |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 147 | BufferPuffWriter puff_writer(nullptr, 0); |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 148 | vector<BitExtent> subblocks; |
| 149 | TEST_AND_RETURN_FALSE( |
Amin Hassani | e2e9cb0 | 2018-03-15 14:14:58 -0700 | [diff] [blame] | 150 | puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks)); |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 151 | TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset()); |
| 152 | for (const auto& subblock : subblocks) { |
| 153 | subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8, |
| 154 | subblock.length); |
| 155 | } |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 156 | } |
| 157 | return true; |
| 158 | } |
| 159 | |
Amin Hassani | 00f0832 | 2017-10-18 11:55:10 -0700 | [diff] [blame] | 160 | bool LocateDeflatesInZlibBlocks(const string& file_path, |
| 161 | const vector<ByteExtent>& zlibs, |
| 162 | vector<BitExtent>* deflates) { |
| 163 | auto src = FileStream::Open(file_path, true, false); |
| 164 | TEST_AND_RETURN_FALSE(src); |
Amin Hassani | 75a7f2c | 2018-02-21 11:51:28 -0800 | [diff] [blame] | 165 | |
| 166 | Buffer buffer; |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 167 | for (const auto& zlib : zlibs) { |
Amin Hassani | 75a7f2c | 2018-02-21 11:51:28 -0800 | [diff] [blame] | 168 | buffer.resize(zlib.length); |
| 169 | TEST_AND_RETURN_FALSE(src->Seek(zlib.offset)); |
| 170 | TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size())); |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 171 | vector<BitExtent> tmp_deflates; |
| 172 | TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &tmp_deflates)); |
| 173 | for (const auto& deflate : tmp_deflates) { |
| 174 | deflates->emplace_back(deflate.offset + zlib.offset * 8, deflate.length); |
Amin Hassani | 75a7f2c | 2018-02-21 11:51:28 -0800 | [diff] [blame] | 175 | } |
| 176 | } |
| 177 | return true; |
Amin Hassani | 00f0832 | 2017-10-18 11:55:10 -0700 | [diff] [blame] | 178 | } |
| 179 | |
Amin Hassani | 9c33d3b | 2020-02-19 13:58:33 -0800 | [diff] [blame] | 180 | namespace { |
// Returns true if the |size| bytes at |header| are long enough to hold the
// fixed part of a gzip member header (10 bytes) and begin with the gzip magic
// for a deflate member. For more information about the gzip format, refer to
// RFC 1952 located at: https://www.ietf.org/rfc/rfc1952.txt
bool IsValidGzipHeader(const uint8_t* header, size_t size) {
  // Size of the fixed part of a gzip member header.
  constexpr size_t kFixedHeaderSize = 10;
  if (size < kFixedHeaderSize) {
    return false;
  }
  // Each gzip entry starts with:
  // 0      1     0x1F
  // 1      1     0x8B
  // 2      1     compression method (8 denotes deflate)
  return header[0] == 0x1F && header[1] == 0x8B && header[2] == 8;
}
| 191 | } // namespace |
| 192 | |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 193 | bool LocateDeflatesInGzip(const Buffer& data, vector<BitExtent>* deflates) { |
Amin Hassani | 9c33d3b | 2020-02-19 13:58:33 -0800 | [diff] [blame] | 194 | TEST_AND_RETURN_FALSE(IsValidGzipHeader(data.data(), data.size())); |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 195 | uint64_t member_start = 0; |
Amin Hassani | 9c33d3b | 2020-02-19 13:58:33 -0800 | [diff] [blame] | 196 | do { |
| 197 | // After the magic header, the gzip contains: |
Amin Hassani | 4a212ed | 2018-02-15 10:31:28 -0800 | [diff] [blame] | 198 | // 3 1 set of flags |
| 199 | // 4 4 modification time |
| 200 | // 8 1 extra flags |
| 201 | // 9 1 operating system |
Amin Hassani | 4a212ed | 2018-02-15 10:31:28 -0800 | [diff] [blame] | 202 | |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 203 | uint64_t offset = member_start + 10; |
Amin Hassani | 4a212ed | 2018-02-15 10:31:28 -0800 | [diff] [blame] | 204 | int flag = data[member_start + 3]; |
| 205 | // Extra field |
| 206 | if (flag & 4) { |
| 207 | TEST_AND_RETURN_FALSE(offset + 2 <= data.size()); |
| 208 | uint16_t extra_length = data[offset++]; |
| 209 | extra_length |= static_cast<uint16_t>(data[offset++]) << 8; |
| 210 | TEST_AND_RETURN_FALSE(offset + extra_length <= data.size()); |
| 211 | offset += extra_length; |
| 212 | } |
| 213 | // File name field |
| 214 | if (flag & 8) { |
| 215 | while (true) { |
| 216 | TEST_AND_RETURN_FALSE(offset + 1 <= data.size()); |
| 217 | if (data[offset++] == 0) { |
| 218 | break; |
| 219 | } |
| 220 | } |
| 221 | } |
| 222 | // File comment field |
| 223 | if (flag & 16) { |
| 224 | while (true) { |
| 225 | TEST_AND_RETURN_FALSE(offset + 1 <= data.size()); |
| 226 | if (data[offset++] == 0) { |
| 227 | break; |
| 228 | } |
| 229 | } |
| 230 | } |
| 231 | // CRC16 field |
| 232 | if (flag & 2) { |
| 233 | offset += 2; |
| 234 | } |
| 235 | |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 236 | uint64_t compressed_size = 0; |
| 237 | TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream( |
| 238 | data.data() + offset, data.size() - offset, offset, deflates, |
| 239 | &compressed_size)); |
Amin Hassani | 4a212ed | 2018-02-15 10:31:28 -0800 | [diff] [blame] | 240 | offset += compressed_size; |
| 241 | |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 242 | // Ignore CRC32 and uncompressed size. |
Amin Hassani | 4a212ed | 2018-02-15 10:31:28 -0800 | [diff] [blame] | 243 | TEST_AND_RETURN_FALSE(offset + 8 <= data.size()); |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 244 | offset += 8; |
Amin Hassani | 4a212ed | 2018-02-15 10:31:28 -0800 | [diff] [blame] | 245 | member_start = offset; |
Amin Hassani | 9c33d3b | 2020-02-19 13:58:33 -0800 | [diff] [blame] | 246 | } while (IsValidGzipHeader(&data[member_start], data.size() - member_start)); |
| 247 | return true; |
Amin Hassani | 4a212ed | 2018-02-15 10:31:28 -0800 | [diff] [blame] | 248 | } |
| 249 | |
Tianjie Xu | 1194223 | 2018-01-18 18:31:42 -0800 | [diff] [blame] | 250 | // For more information about the zip format, refer to |
| 251 | // https://support.pkware.com/display/PKZIP/APPNOTE |
| 252 | bool LocateDeflatesInZipArchive(const Buffer& data, |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 253 | vector<BitExtent>* deflates) { |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 254 | uint64_t pos = 0; |
Amin Hassani | 4effb50 | 2020-01-13 16:21:56 -0800 | [diff] [blame] | 255 | while (pos + 30 <= data.size()) { |
Tianjie Xu | 1194223 | 2018-01-18 18:31:42 -0800 | [diff] [blame] | 256 | // TODO(xunchang) add support for big endian system when searching for |
| 257 | // magic numbers. |
| 258 | if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) { |
| 259 | pos++; |
| 260 | continue; |
| 261 | } |
| 262 | |
| 263 | // local file header format |
| 264 | // 0 4 0x04034b50 |
| 265 | // 4 2 minimum version needed to extract |
| 266 | // 6 2 general purpose bit flag |
| 267 | // 8 2 compression method |
| 268 | // 10 4 file last modification date & time |
| 269 | // 14 4 CRC-32 |
| 270 | // 18 4 compressed size |
| 271 | // 22 4 uncompressed size |
| 272 | // 26 2 file name length |
| 273 | // 28 2 extra field length |
| 274 | // 30 n file name |
| 275 | // 30+n m extra field |
| 276 | auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8); |
| 277 | if (compression_method != 8) { // non-deflate type |
| 278 | pos += 4; |
| 279 | continue; |
| 280 | } |
| 281 | |
| 282 | auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18); |
Tianjie Xu | 1194223 | 2018-01-18 18:31:42 -0800 | [diff] [blame] | 283 | auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26); |
| 284 | auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28); |
| 285 | uint64_t header_size = 30 + file_name_length + extra_field_length; |
| 286 | |
| 287 | // sanity check |
| 288 | if (static_cast<uint64_t>(header_size) + compressed_size > data.size() || |
| 289 | pos > data.size() - header_size - compressed_size) { |
| 290 | pos += 4; |
| 291 | continue; |
| 292 | } |
| 293 | |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 294 | vector<BitExtent> tmp_deflates; |
| 295 | uint64_t offset = pos + header_size; |
| 296 | uint64_t calculated_compressed_size = 0; |
| 297 | if (!LocateDeflatesInDeflateStream( |
| 298 | data.data() + offset, data.size() - offset, offset, &tmp_deflates, |
| 299 | &calculated_compressed_size)) { |
Tianjie Xu | 1194223 | 2018-01-18 18:31:42 -0800 | [diff] [blame] | 300 | LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos |
| 301 | << ", skip adding deflates for this entry."; |
| 302 | pos += 4; |
| 303 | continue; |
| 304 | } |
| 305 | |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 306 | // Double check the compressed size if it is available in the file header. |
Tianjie Xu | 1194223 | 2018-01-18 18:31:42 -0800 | [diff] [blame] | 307 | if (compressed_size > 0 && compressed_size != calculated_compressed_size) { |
| 308 | LOG(WARNING) << "Compressed size in the file header: " << compressed_size |
| 309 | << " doesn't equal the real size: " |
| 310 | << calculated_compressed_size; |
| 311 | } |
| 312 | |
Amin Hassani | 8d0ec65 | 2018-06-05 16:02:28 -0700 | [diff] [blame] | 313 | deflates->insert(deflates->end(), tmp_deflates.begin(), tmp_deflates.end()); |
Tianjie Xu | 1194223 | 2018-01-18 18:31:42 -0800 | [diff] [blame] | 314 | pos += header_size + calculated_compressed_size; |
| 315 | } |
| 316 | |
| 317 | return true; |
| 318 | } |
| 319 | |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 320 | bool FindPuffLocations(const UniqueStreamPtr& src, |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 321 | const vector<BitExtent>& deflates, |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 322 | vector<ByteExtent>* puffs, |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 323 | uint64_t* out_puff_size) { |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 324 | Puffer puffer; |
| 325 | Buffer deflate_buffer; |
| 326 | |
| 327 | // Here accumulate the size difference between each corresponding deflate and |
| 328 | // puff. At the end we add this cummulative size difference to the size of the |
| 329 | // deflate stream to get the size of the puff stream. We use signed size |
| 330 | // because puff size could be smaller than deflate size. |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 331 | int64_t total_size_difference = 0; |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 332 | for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) { |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 333 | // Read from src into deflate_buffer. |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 334 | auto start_byte = deflate->offset / 8; |
| 335 | auto end_byte = (deflate->offset + deflate->length + 7) / 8; |
| 336 | deflate_buffer.resize(end_byte - start_byte); |
| 337 | TEST_AND_RETURN_FALSE(src->Seek(start_byte)); |
| 338 | TEST_AND_RETURN_FALSE( |
| 339 | src->Read(deflate_buffer.data(), deflate_buffer.size())); |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 340 | // Find the size of the puff. |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 341 | BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size()); |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 342 | uint64_t bits_to_skip = deflate->offset % 8; |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 343 | TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip)); |
| 344 | bit_reader.DropBits(bits_to_skip); |
| 345 | |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 346 | BufferPuffWriter puff_writer(nullptr, 0); |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 347 | TEST_AND_RETURN_FALSE( |
Amin Hassani | e2e9cb0 | 2018-03-15 14:14:58 -0700 | [diff] [blame] | 348 | puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr)); |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 349 | TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset()); |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 350 | |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 351 | // 1 if a deflate ends at the same byte that the next deflate starts and |
| 352 | // there is a few bits gap between them. In practice this may never happen, |
| 353 | // but it is a good idea to support it anyways. If there is a gap, the value |
| 354 | // of the gap will be saved as an integer byte to the puff stream. The parts |
| 355 | // of the byte that belogs to the deflates are shifted out. |
| 356 | int gap = 0; |
| 357 | if (deflate != deflates.begin()) { |
| 358 | auto prev_deflate = std::prev(deflate); |
| 359 | if ((prev_deflate->offset + prev_deflate->length == deflate->offset) |
| 360 | // If deflates are on byte boundary the gap will not be counted later, |
| 361 | // so we won't worry about it. |
| 362 | && (deflate->offset % 8 != 0)) { |
| 363 | gap = 1; |
| 364 | } |
| 365 | } |
| 366 | |
| 367 | start_byte = ((deflate->offset + 7) / 8); |
| 368 | end_byte = (deflate->offset + deflate->length) / 8; |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 369 | int64_t deflate_length_in_bytes = end_byte - start_byte; |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 370 | |
| 371 | // If there was no gap bits between the current and previous deflates, there |
| 372 | // will be no extra gap byte, so the offset will be shifted one byte back. |
| 373 | auto puff_offset = start_byte - gap + total_size_difference; |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 374 | auto puff_size = puff_writer.Size(); |
Amin Hassani | 7074da6 | 2017-09-30 17:14:06 -0700 | [diff] [blame] | 375 | // Add the location into puff. |
| 376 | puffs->emplace_back(puff_offset, puff_size); |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 377 | total_size_difference += |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 378 | static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap; |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 379 | } |
| 380 | |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 381 | uint64_t src_size; |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 382 | TEST_AND_RETURN_FALSE(src->GetSize(&src_size)); |
Amin Hassani | d7768d5 | 2018-02-28 15:34:21 -0800 | [diff] [blame] | 383 | auto final_size = static_cast<int64_t>(src_size) + total_size_difference; |
Amin Hassani | 26bcfdd | 2017-09-29 17:54:15 -0700 | [diff] [blame] | 384 | TEST_AND_RETURN_FALSE(final_size >= 0); |
| 385 | *out_puff_size = final_size; |
| 386 | return true; |
| 387 | } |
| 388 | |
Sen Jiang | 5eb33e8 | 2018-05-01 15:01:11 -0700 | [diff] [blame] | 389 | void RemoveEqualBitExtents(const Buffer& data1, |
| 390 | const Buffer& data2, |
Amin Hassani | 9f79094 | 2019-01-03 17:26:16 -0800 | [diff] [blame] | 391 | vector<BitExtent>* extents1, |
| 392 | vector<BitExtent>* extents2) { |
Sen Jiang | 5eb33e8 | 2018-05-01 15:01:11 -0700 | [diff] [blame] | 393 | set<ExtentData> extent1_set, equal_extents; |
| 394 | for (const BitExtent& ext : *extents1) { |
| 395 | extent1_set.emplace(ext, data1); |
| 396 | } |
| 397 | |
| 398 | auto new_extents2_end = extents2->begin(); |
| 399 | for (const BitExtent& ext : *extents2) { |
| 400 | ExtentData extent_data(ext, data2); |
| 401 | if (extent1_set.find(extent_data) != extent1_set.end()) { |
| 402 | equal_extents.insert(extent_data); |
| 403 | } else { |
| 404 | *new_extents2_end++ = ext; |
| 405 | } |
| 406 | } |
| 407 | extents2->erase(new_extents2_end, extents2->end()); |
| 408 | extents1->erase( |
| 409 | std::remove_if(extents1->begin(), extents1->end(), |
| 410 | [&equal_extents, &data1](const BitExtent& ext) { |
| 411 | return equal_extents.find(ExtentData(ext, data1)) != |
| 412 | equal_extents.end(); |
| 413 | }), |
| 414 | extents1->end()); |
| 415 | } |
Amin Hassani | dfec7fa | 2019-01-03 11:43:15 -0800 | [diff] [blame] | 416 | |
| 417 | bool RemoveDeflatesWithBadDistanceCaches(const Buffer& data, |
| 418 | vector<BitExtent>* deflates) { |
| 419 | Puffer puffer(true /* exclude_bad_distance_caches */); |
| 420 | for (auto def = deflates->begin(); def != deflates->end();) { |
| 421 | uint64_t offset = def->offset / 8; |
| 422 | uint64_t length = (def->offset + def->length + 7) / 8 - offset; |
| 423 | BufferBitReader br(&data[offset], length); |
| 424 | BufferPuffWriter pw(nullptr, 0); |
| 425 | |
| 426 | // Drop the first few bits in the buffer so we start exactly where the |
| 427 | // deflate starts. |
| 428 | uint64_t bits_to_drop = def->offset % 8; |
| 429 | TEST_AND_RETURN_FALSE(br.CacheBits(bits_to_drop)); |
| 430 | br.DropBits(bits_to_drop); |
| 431 | |
| 432 | vector<BitExtent> defs_out; |
| 433 | TEST_AND_RETURN_FALSE(puffer.PuffDeflate(&br, &pw, &defs_out)); |
| 434 | |
| 435 | TEST_AND_RETURN_FALSE(defs_out.size() <= 1); |
| 436 | if (defs_out.size() == 0) { |
| 437 | // This is a deflate we were looking for, remove it. |
| 438 | def = deflates->erase(def); |
| 439 | } else { |
| 440 | ++def; |
| 441 | } |
| 442 | } |
| 443 | return true; |
| 444 | } |
| 445 | |
Amin Hassani | c3e6b53 | 2017-03-07 17:47:25 -0800 | [diff] [blame] | 446 | } // namespace puffin |