blob: c49aa9a6330dcf856b027af44d33112aec0667af [file] [log] [blame]
Nigel Taod2075ce2018-04-25 15:26:29 +10001// Copyright 2018 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Nigel Tao670b45b2019-02-02 08:10:08 +110015// ----------------
16
Nigel Taod2075ce2018-04-25 15:26:29 +100017// This file contains a hand-written C benchmark of different strategies for
18// decoding PNG data.
19//
20// For a PNG image with width W and height H, the H rows can be decompressed
21// one-at-a-time or all-at-once. Roughly speaking, this corresponds to H versus
22// 1 call into the zlib decoder. The former (call it "fragmented dst") requires
23// less scratch-space memory than the latter ("full dst"): 2 * bytes_per_row
24// instead of H * bytes_per row, but the latter can be faster.
25//
26// The zlib-compressed data can be split into multiple IDAT chunks. Similarly,
27// these chunks can be decompressed separately ("fragmented IDAT") or together
28// ("full IDAT"), again providing a memory vs speed trade-off.
29//
30// This program reports the speed of combining the independent frag/full dst
31// and frag/full IDAT techniques.
32//
Nigel Tao95d02ee2019-01-12 12:34:40 +110033// For example, with gcc 7.3 (and -O3) as of January 2019:
Nigel Taod2075ce2018-04-25 15:26:29 +100034//
35// On ../test/data/hat.png (90 × 112 pixels):
36// name time/op relative
Nigel Taoc6a48b32019-01-19 10:50:40 +110037// FragDstFragIDAT/gcc 289µs ± 1% 1.00x
38// FragDstFullIDAT/gcc 288µs ± 0% 1.00x
39// FullDstFragIDAT/gcc 149µs ± 1% 1.93x
40// FullDstFullIDAT/gcc 148µs ± 1% 1.95x
Nigel Taod2075ce2018-04-25 15:26:29 +100041//
Nigel Tao95d02ee2019-01-12 12:34:40 +110042// On ../test/data/hibiscus.regular.png (312 × 442 pixels):
Nigel Taod2075ce2018-04-25 15:26:29 +100043// name time/op relative
Nigel Taoc6a48b32019-01-19 10:50:40 +110044// FragDstFragIDAT/gcc 2.49ms ± 0% 1.00x
45// FragDstFullIDAT/gcc 2.49ms ± 0% 1.00x
46// FullDstFragIDAT/gcc 2.08ms ± 0% 1.20x
47// FullDstFullIDAT/gcc 2.02ms ± 1% 1.23x
Nigel Taod2075ce2018-04-25 15:26:29 +100048//
49// On ../test/data/harvesters.png (1165 × 859 pixels):
50// name time/op relative
Nigel Taoc6a48b32019-01-19 10:50:40 +110051// FragDstFragIDAT/gcc 15.6ms ± 2% 1.00x
52// FragDstFullIDAT/gcc 15.4ms ± 0% 1.01x
53// FullDstFragIDAT/gcc 14.4ms ± 0% 1.08x
54// FullDstFullIDAT/gcc 14.1ms ± 0% 1.10x
Nigel Taod2075ce2018-04-25 15:26:29 +100055
56#include <errno.h>
57#include <inttypes.h>
58#include <stdio.h>
59#include <string.h>
60#include <sys/time.h>
61#include <unistd.h>
62
Nigel Taoa0bafff2018-07-14 08:59:36 +100063// Wuffs ships as a "single file C library" or "header file library" as per
64// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
65//
66// To use that single file as a "foo.c"-like implementation, instead of a
67// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or
68// compiling it.
69#define WUFFS_IMPLEMENTATION
70
Jimmy Casey3c50ada2018-07-26 17:12:19 +000071// If building this program in an environment that doesn't easily accommodate
Nigel Taod2075ce2018-04-25 15:26:29 +100072// relative includes, you can use the script/inline-c-relative-includes.go
73// program to generate a stand-alone C file.
Nigel Tao95d02ee2019-01-12 12:34:40 +110074#include "../release/c/wuffs-unsupported-snapshot.c"
Nigel Taod2075ce2018-04-25 15:26:29 +100075
76// The order matters here. Clang also defines "__GNUC__".
77#if defined(__clang__)
78const char* cc = "clang";
79const char* cc_version = __clang_version__;
80#elif defined(__GNUC__)
81const char* cc = "gcc";
82const char* cc_version = __VERSION__;
83#elif defined(_MSC_VER)
84const char* cc = "cl";
85const char* cc_version = "???";
86#else
87const char* cc = "cc";
88const char* cc_version = "???";
89#endif
90
91static inline uint32_t load_u32be(uint8_t* p) {
92 return ((uint32_t)(p[0]) << 24) | ((uint32_t)(p[1]) << 16) |
93 ((uint32_t)(p[2]) << 8) | ((uint32_t)(p[3]) << 0);
94}
95
96// Limit the input PNG image (and therefore its IDAT data) to (64 MiB - 1 byte)
97// compressed, in up to 1024 IDAT chunks, and 256 MiB and 16384 × 16384 pixels
98// uncompressed. This is a limitation of this program (which uses the Wuffs
99// standard library), not a limitation of Wuffs per se.
100#define DST_BUFFER_SIZE (256 * 1024 * 1024)
101#define SRC_BUFFER_SIZE (64 * 1024 * 1024)
102#define MAX_DIMENSION (16384)
103#define MAX_IDAT_CHUNKS (1024)
104
105uint8_t dst_buffer[DST_BUFFER_SIZE] = {0};
106size_t dst_len = 0;
107uint8_t src_buffer[SRC_BUFFER_SIZE] = {0};
108size_t src_len = 0;
109uint8_t idat_buffer[SRC_BUFFER_SIZE] = {0};
110// The n'th IDAT chunk data (where n is a zero-based count) is in
111// idat_buffer[i:j], where i = idat_splits[n+0] and j = idat_splits[n+1].
112size_t idat_splits[MAX_IDAT_CHUNKS + 1] = {0};
113uint32_t num_idat_chunks = 0;
114
Nigel Tao53760ce2019-02-16 14:18:25 +1100115#define WORK_BUFFER_SIZE WUFFS_ZLIB__DECODER_WORKBUF_LEN_MAX_INCL_WORST_CASE
116#if WORK_BUFFER_SIZE > 0
117uint8_t work_buffer[WORK_BUFFER_SIZE];
118#else
119// Not all C/C++ compilers support 0-length arrays.
120uint8_t work_buffer[1];
121#endif
122
Nigel Taod2075ce2018-04-25 15:26:29 +1000123uint32_t width = 0;
124uint32_t height = 0;
125uint64_t bytes_per_pixel = 0;
126uint64_t bytes_per_row = 0;
127uint64_t bytes_per_frame = 0;
128
129const char* read_stdin() {
130 while (src_len < SRC_BUFFER_SIZE) {
131 const int stdin_fd = 0;
132 ssize_t n = read(stdin_fd, src_buffer + src_len, SRC_BUFFER_SIZE - src_len);
133 if (n > 0) {
134 src_len += n;
135 } else if (n == 0) {
136 return NULL;
137 } else if (errno == EINTR) {
138 // No-op.
139 } else {
140 return strerror(errno);
141 }
142 }
143 return "input is too large";
144}
145
146const char* process_png_chunks(uint8_t* p, size_t n) {
147 while (n > 0) {
148 // Process the 8 byte chunk header.
149 if (n < 8) {
150 return "invalid PNG chunk";
151 }
152 uint32_t chunk_len = load_u32be(p + 0);
153 uint32_t chunk_type = load_u32be(p + 4);
154 p += 8;
155 n -= 8;
156
157 // Process the chunk payload.
158 if (n < chunk_len) {
159 return "short PNG chunk data";
160 }
161 switch (chunk_type) {
162 case 0x49484452: // "IHDR"
163 if (chunk_len != 13) {
164 return "invalid PNG IDAT chunk";
165 }
166 width = load_u32be(p + 0);
167 height = load_u32be(p + 4);
168 if ((width == 0) || (height == 0)) {
169 return "image dimensions are too small";
170 }
171 if ((width > MAX_DIMENSION) || (height > MAX_DIMENSION)) {
172 return "image dimensions are too large";
173 }
174 if (p[8] != 8) {
175 return "unsupported PNG bit depth";
176 }
177 if (bytes_per_pixel != 0) {
178 return "duplicate PNG IHDR chunk";
179 }
180 // Process the color type, as per the PNG spec table 11.1.
181 switch (p[9]) {
182 case 0:
183 bytes_per_pixel = 1;
184 break;
185 case 2:
186 bytes_per_pixel = 3;
187 break;
188 case 3:
189 bytes_per_pixel = 1;
190 break;
191 case 4:
192 bytes_per_pixel = 2;
193 break;
194 case 6:
195 bytes_per_pixel = 4;
196 break;
197 default:
198 return "unsupported PNG color type";
199 }
200 if (p[12] != 0) {
201 return "unsupported PNG interlacing";
202 }
203 break;
204
205 case 0x49444154: // "IDAT"
206 if (num_idat_chunks == MAX_IDAT_CHUNKS - 1) {
207 return "too many IDAT chunks";
208 }
209 memcpy(idat_buffer + idat_splits[num_idat_chunks], p, chunk_len);
210 idat_splits[num_idat_chunks + 1] =
211 idat_splits[num_idat_chunks] + chunk_len;
212 num_idat_chunks++;
213 break;
214 }
215 p += chunk_len;
216 n -= chunk_len;
217
218 // Process (and ignore) the 4 byte chunk footer (a checksum).
219 if (n < 4) {
220 return "invalid PNG chunk";
221 }
222 p += 4;
223 n -= 4;
224 }
225 return NULL;
226}
227
228const char* decode_once(bool frag_dst, bool frag_idat) {
Nigel Tao53760ce2019-02-16 14:18:25 +1100229 wuffs_zlib__decoder dec;
Nigel Tao95d02ee2019-01-12 12:34:40 +1100230 const char* status =
Nigel Tao53760ce2019-02-16 14:18:25 +1100231 wuffs_zlib__decoder__initialize(&dec, sizeof dec, WUFFS_VERSION, 0);
Nigel Tao95d02ee2019-01-12 12:34:40 +1100232 if (status) {
233 return status;
234 }
Nigel Taod2075ce2018-04-25 15:26:29 +1000235
Nigel Tao95d02ee2019-01-12 12:34:40 +1100236 wuffs_base__io_buffer dst = ((wuffs_base__io_buffer){
237 .data = ((wuffs_base__slice_u8){
238 .ptr = dst_buffer,
239 .len = bytes_per_frame,
240 }),
241 });
242 wuffs_base__io_buffer idat = ((wuffs_base__io_buffer){
243 .data = ((wuffs_base__slice_u8){
244 .ptr = idat_buffer,
245 .len = SRC_BUFFER_SIZE,
246 }),
247 .meta = ((wuffs_base__io_buffer_meta){
248 .wi = idat_splits[num_idat_chunks],
249 .ri = 0,
250 .pos = 0,
251 .closed = true,
252 }),
253 });
Nigel Tao8827e222018-04-27 20:54:44 +1000254 wuffs_base__io_writer dst_writer = wuffs_base__io_buffer__writer(&dst);
255 wuffs_base__io_reader idat_reader = wuffs_base__io_buffer__reader(&idat);
Nigel Taod2075ce2018-04-25 15:26:29 +1000256
257 uint32_t i = 0; // Number of dst fragments processed, if frag_dst.
258 if (frag_dst) {
Nigel Tao95d02ee2019-01-12 12:34:40 +1100259 dst.data.len = bytes_per_row;
Nigel Taod2075ce2018-04-25 15:26:29 +1000260 }
261
262 uint32_t j = 0; // Number of IDAT fragments processed, if frag_idat.
263 if (frag_idat) {
Nigel Tao95d02ee2019-01-12 12:34:40 +1100264 idat.meta.wi = idat_splits[1];
265 idat.meta.closed = (num_idat_chunks == 1);
Nigel Taod2075ce2018-04-25 15:26:29 +1000266 }
267
268 while (true) {
Nigel Tao95d02ee2019-01-12 12:34:40 +1100269 status =
Nigel Tao53760ce2019-02-16 14:18:25 +1100270 wuffs_zlib__decoder__decode_io_writer(&dec, dst_writer, idat_reader,
271 ((wuffs_base__slice_u8){
272 .ptr = work_buffer,
273 .len = WORK_BUFFER_SIZE,
274 }));
Nigel Taod2075ce2018-04-25 15:26:29 +1000275
Nigel Tao95d02ee2019-01-12 12:34:40 +1100276 if (!status) {
Nigel Taod2075ce2018-04-25 15:26:29 +1000277 break;
278 }
Nigel Tao95d02ee2019-01-12 12:34:40 +1100279 if ((status == wuffs_base__suspension__short_write) && frag_dst &&
Nigel Taod2075ce2018-04-25 15:26:29 +1000280 (i < height - 1)) {
281 i++;
Nigel Tao95d02ee2019-01-12 12:34:40 +1100282 dst.data.len = bytes_per_row * (i + 1);
Nigel Taod2075ce2018-04-25 15:26:29 +1000283 continue;
284 }
Nigel Tao95d02ee2019-01-12 12:34:40 +1100285 if ((status == wuffs_base__suspension__short_read) && frag_idat &&
Nigel Taod2075ce2018-04-25 15:26:29 +1000286 (j < num_idat_chunks - 1)) {
287 j++;
Nigel Tao95d02ee2019-01-12 12:34:40 +1100288 idat.meta.wi = idat_splits[j + 1];
289 idat.meta.closed = (num_idat_chunks == j + 1);
Nigel Taod2075ce2018-04-25 15:26:29 +1000290 continue;
291 }
Nigel Tao95d02ee2019-01-12 12:34:40 +1100292 return status;
Nigel Taod2075ce2018-04-25 15:26:29 +1000293 }
294
Nigel Tao95d02ee2019-01-12 12:34:40 +1100295 if (dst.meta.wi != bytes_per_frame) {
Nigel Taod2075ce2018-04-25 15:26:29 +1000296 return "unexpected number of bytes decoded";
297 }
298 return NULL;
299}
300
301const char* decode(bool frag_dst, bool frag_idat) {
302 int reps;
303 if (bytes_per_frame < 100000) {
304 reps = 1000;
305 } else if (bytes_per_frame < 1000000) {
306 reps = 100;
307 } else if (bytes_per_frame < 10000000) {
308 reps = 10;
309 } else {
310 reps = 1;
311 }
312
313 struct timeval bench_start_tv;
314 gettimeofday(&bench_start_tv, NULL);
315
316 int i;
317 for (i = 0; i < reps; i++) {
318 const char* msg = decode_once(frag_dst, frag_idat);
319 if (msg) {
320 return msg;
321 }
322 }
323
324 struct timeval bench_finish_tv;
325 gettimeofday(&bench_finish_tv, NULL);
326 int64_t micros =
327 (int64_t)(bench_finish_tv.tv_sec - bench_start_tv.tv_sec) * 1000000 +
328 (int64_t)(bench_finish_tv.tv_usec - bench_start_tv.tv_usec);
329 uint64_t nanos = 1;
330 if (micros > 0) {
331 nanos = (uint64_t)(micros)*1000;
332 }
333
334 printf("Benchmark%sDst%sIDAT/%s\t%8d\t%8" PRIu64 " ns/op\n",
335 frag_dst ? "Frag" : "Full", //
336 frag_idat ? "Frag" : "Full", //
337 cc, reps, nanos / reps);
338
339 return NULL;
340}
341
342int fail(const char* msg) {
343 const int stderr_fd = 2;
344 write(stderr_fd, msg, strnlen(msg, 4095));
345 write(stderr_fd, "\n", 1);
346 return 1;
347}
348
349int main(int argc, char** argv) {
350 const char* msg = read_stdin();
351 if (msg) {
352 return fail(msg);
353 }
Nigel Tao95d02ee2019-01-12 12:34:40 +1100354 if ((src_len < 8) ||
355 strncmp((const char*)(src_buffer), "\x89PNG\x0D\x0A\x1A\x0A", 8)) {
Nigel Taod2075ce2018-04-25 15:26:29 +1000356 return fail("invalid PNG");
357 }
358 msg = process_png_chunks(src_buffer + 8, src_len - 8);
359 if (msg) {
360 return fail(msg);
361 }
362 if (bytes_per_pixel == 0) {
363 return fail("missing PNG IHDR chunk");
364 }
365 if (num_idat_chunks == 0) {
366 return fail("missing PNG IDAT chunk");
367 }
368 // The +1 here is for the per-row filter byte.
369 bytes_per_row = (uint64_t)width * bytes_per_pixel + 1;
370 bytes_per_frame = (uint64_t)height * bytes_per_row;
371 if (bytes_per_frame > DST_BUFFER_SIZE) {
372 return fail("decompressed data is too large");
373 }
374
375 printf("# %s version %s\n#\n", cc, cc_version);
376 printf(
377 "# The output format, including the \"Benchmark\" prefixes, is "
378 "compatible with the\n"
379 "# https://godoc.org/golang.org/x/perf/cmd/benchstat tool. To install "
380 "it, first\n"
381 "# install Go, then run \"go get golang.org/x/perf/cmd/benchstat\".\n");
382
383 int i;
384 for (i = 0; i < 5; i++) {
385 msg = decode(true, true);
386 if (msg) {
387 return fail(msg);
388 }
389 msg = decode(true, false);
390 if (msg) {
391 return fail(msg);
392 }
393 msg = decode(false, true);
394 if (msg) {
395 return fail(msg);
396 }
397 msg = decode(false, false);
398 if (msg) {
399 return fail(msg);
400 }
401 }
402
403 return 0;
404}