blob: b6dde288d6772a8d25bd308c46c5f1e0b9a140db [file] [log] [blame]
Nigel Taod2075ce2018-04-25 15:26:29 +10001// Copyright 2018 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// This file contains a hand-written C benchmark of different strategies for
16// decoding PNG data.
17//
18// For a PNG image with width W and height H, the H rows can be decompressed
19// one-at-a-time or all-at-once. Roughly speaking, this corresponds to H versus
20// 1 call into the zlib decoder. The former (call it "fragmented dst") requires
21// less scratch-space memory than the latter ("full dst"): 2 * bytes_per_row
22// instead of H * bytes_per row, but the latter can be faster.
23//
24// The zlib-compressed data can be split into multiple IDAT chunks. Similarly,
25// these chunks can be decompressed separately ("fragmented IDAT") or together
26// ("full IDAT"), again providing a memory vs speed trade-off.
27//
28// This program reports the speed of combining the independent frag/full dst
29// and frag/full IDAT techniques.
30//
31// For example, with gcc 7.3 (and -O3) as of April 2018:
32//
33// On ../test/data/hat.png (90 × 112 pixels):
34// name time/op relative
35// FragDstFragIDAT/gcc 203µs ± 1% 1.00x
36// FragDstFullIDAT/gcc 203µs ± 0% 1.00x
37// FullDstFragIDAT/gcc 170µs ± 0% 1.19x
38// FullDstFullIDAT/gcc 147µs ± 0% 1.38x
39//
40// On ../test/data/hibiscus.png (312 × 442 pixels):
41// name time/op relative
42// FragDstFragIDAT/gcc 2.62ms ± 1% 1.00x
43// FragDstFullIDAT/gcc 2.61ms ± 1% 1.00x
44// FullDstFragIDAT/gcc 2.44ms ± 0% 1.07x
45// FullDstFullIDAT/gcc 2.03ms ± 1% 1.29x
46//
47// On ../test/data/harvesters.png (1165 × 859 pixels):
48// name time/op relative
49// FragDstFragIDAT/gcc 18.3ms ± 0% 1.00x
50// FragDstFullIDAT/gcc 18.3ms ± 1% 1.00x
51// FullDstFragIDAT/gcc 17.1ms ± 0% 1.07x
52// FullDstFullIDAT/gcc 13.9ms ± 0% 1.32x
53
54#include <errno.h>
55#include <inttypes.h>
56#include <stdio.h>
57#include <string.h>
58#include <sys/time.h>
59#include <unistd.h>
60
61// If building this program in an environment that doesn't easily accomodate
62// relative includes, you can use the script/inline-c-relative-includes.go
63// program to generate a stand-alone C file.
Nigel Tao035b1382018-04-27 17:20:10 +100064#include "../gen/c/std/adler32.c"
Nigel Taod2075ce2018-04-25 15:26:29 +100065#include "../gen/c/std/deflate.c"
66#include "../gen/c/std/zlib.c"
67
68// The order matters here. Clang also defines "__GNUC__".
69#if defined(__clang__)
70const char* cc = "clang";
71const char* cc_version = __clang_version__;
72#elif defined(__GNUC__)
73const char* cc = "gcc";
74const char* cc_version = __VERSION__;
75#elif defined(_MSC_VER)
76const char* cc = "cl";
77const char* cc_version = "???";
78#else
79const char* cc = "cc";
80const char* cc_version = "???";
81#endif
82
83static inline uint32_t load_u32be(uint8_t* p) {
84 return ((uint32_t)(p[0]) << 24) | ((uint32_t)(p[1]) << 16) |
85 ((uint32_t)(p[2]) << 8) | ((uint32_t)(p[3]) << 0);
86}
87
88// Limit the input PNG image (and therefore its IDAT data) to (64 MiB - 1 byte)
89// compressed, in up to 1024 IDAT chunks, and 256 MiB and 16384 × 16384 pixels
90// uncompressed. This is a limitation of this program (which uses the Wuffs
91// standard library), not a limitation of Wuffs per se.
92#define DST_BUFFER_SIZE (256 * 1024 * 1024)
93#define SRC_BUFFER_SIZE (64 * 1024 * 1024)
94#define MAX_DIMENSION (16384)
95#define MAX_IDAT_CHUNKS (1024)
96
97uint8_t dst_buffer[DST_BUFFER_SIZE] = {0};
98size_t dst_len = 0;
99uint8_t src_buffer[SRC_BUFFER_SIZE] = {0};
100size_t src_len = 0;
101uint8_t idat_buffer[SRC_BUFFER_SIZE] = {0};
102// The n'th IDAT chunk data (where n is a zero-based count) is in
103// idat_buffer[i:j], where i = idat_splits[n+0] and j = idat_splits[n+1].
104size_t idat_splits[MAX_IDAT_CHUNKS + 1] = {0};
105uint32_t num_idat_chunks = 0;
106
107uint32_t width = 0;
108uint32_t height = 0;
109uint64_t bytes_per_pixel = 0;
110uint64_t bytes_per_row = 0;
111uint64_t bytes_per_frame = 0;
112
113const char* read_stdin() {
114 while (src_len < SRC_BUFFER_SIZE) {
115 const int stdin_fd = 0;
116 ssize_t n = read(stdin_fd, src_buffer + src_len, SRC_BUFFER_SIZE - src_len);
117 if (n > 0) {
118 src_len += n;
119 } else if (n == 0) {
120 return NULL;
121 } else if (errno == EINTR) {
122 // No-op.
123 } else {
124 return strerror(errno);
125 }
126 }
127 return "input is too large";
128}
129
130const char* process_png_chunks(uint8_t* p, size_t n) {
131 while (n > 0) {
132 // Process the 8 byte chunk header.
133 if (n < 8) {
134 return "invalid PNG chunk";
135 }
136 uint32_t chunk_len = load_u32be(p + 0);
137 uint32_t chunk_type = load_u32be(p + 4);
138 p += 8;
139 n -= 8;
140
141 // Process the chunk payload.
142 if (n < chunk_len) {
143 return "short PNG chunk data";
144 }
145 switch (chunk_type) {
146 case 0x49484452: // "IHDR"
147 if (chunk_len != 13) {
148 return "invalid PNG IDAT chunk";
149 }
150 width = load_u32be(p + 0);
151 height = load_u32be(p + 4);
152 if ((width == 0) || (height == 0)) {
153 return "image dimensions are too small";
154 }
155 if ((width > MAX_DIMENSION) || (height > MAX_DIMENSION)) {
156 return "image dimensions are too large";
157 }
158 if (p[8] != 8) {
159 return "unsupported PNG bit depth";
160 }
161 if (bytes_per_pixel != 0) {
162 return "duplicate PNG IHDR chunk";
163 }
164 // Process the color type, as per the PNG spec table 11.1.
165 switch (p[9]) {
166 case 0:
167 bytes_per_pixel = 1;
168 break;
169 case 2:
170 bytes_per_pixel = 3;
171 break;
172 case 3:
173 bytes_per_pixel = 1;
174 break;
175 case 4:
176 bytes_per_pixel = 2;
177 break;
178 case 6:
179 bytes_per_pixel = 4;
180 break;
181 default:
182 return "unsupported PNG color type";
183 }
184 if (p[12] != 0) {
185 return "unsupported PNG interlacing";
186 }
187 break;
188
189 case 0x49444154: // "IDAT"
190 if (num_idat_chunks == MAX_IDAT_CHUNKS - 1) {
191 return "too many IDAT chunks";
192 }
193 memcpy(idat_buffer + idat_splits[num_idat_chunks], p, chunk_len);
194 idat_splits[num_idat_chunks + 1] =
195 idat_splits[num_idat_chunks] + chunk_len;
196 num_idat_chunks++;
197 break;
198 }
199 p += chunk_len;
200 n -= chunk_len;
201
202 // Process (and ignore) the 4 byte chunk footer (a checksum).
203 if (n < 4) {
204 return "invalid PNG chunk";
205 }
206 p += 4;
207 n -= 4;
208 }
209 return NULL;
210}
211
212const char* decode_once(bool frag_dst, bool frag_idat) {
213 wuffs_zlib__decoder dec;
214 wuffs_zlib__decoder__initialize(&dec, WUFFS_VERSION, 0);
215
216 wuffs_base__io_buffer dst = {.ptr = dst_buffer, .len = bytes_per_frame};
217 wuffs_base__io_buffer idat = {.ptr = idat_buffer,
218 .len = SRC_BUFFER_SIZE,
219 .wi = idat_splits[num_idat_chunks],
220 .closed = true};
Nigel Tao8827e222018-04-27 20:54:44 +1000221 wuffs_base__io_writer dst_writer = wuffs_base__io_buffer__writer(&dst);
222 wuffs_base__io_reader idat_reader = wuffs_base__io_buffer__reader(&idat);
Nigel Taod2075ce2018-04-25 15:26:29 +1000223
224 uint32_t i = 0; // Number of dst fragments processed, if frag_dst.
225 if (frag_dst) {
226 dst.len = bytes_per_row;
227 }
228
229 uint32_t j = 0; // Number of IDAT fragments processed, if frag_idat.
230 if (frag_idat) {
231 idat.wi = idat_splits[1];
232 idat.closed = (num_idat_chunks == 1);
233 }
234
235 while (true) {
236 wuffs_zlib__status s =
237 wuffs_zlib__decoder__decode(&dec, dst_writer, idat_reader);
238
239 if (s == WUFFS_ZLIB__STATUS_OK) {
240 break;
241 }
242 if ((s == WUFFS_ZLIB__SUSPENSION_SHORT_WRITE) && frag_dst &&
243 (i < height - 1)) {
244 i++;
245 dst.len = bytes_per_row * (i + 1);
246 continue;
247 }
248 if ((s == WUFFS_ZLIB__SUSPENSION_SHORT_READ) && frag_idat &&
249 (j < num_idat_chunks - 1)) {
250 j++;
251 idat.wi = idat_splits[j + 1];
252 idat.closed = (num_idat_chunks == j + 1);
253 continue;
254 }
255 return wuffs_zlib__status__string(s);
256 }
257
258 if (dst.wi != bytes_per_frame) {
259 return "unexpected number of bytes decoded";
260 }
261 return NULL;
262}
263
264const char* decode(bool frag_dst, bool frag_idat) {
265 int reps;
266 if (bytes_per_frame < 100000) {
267 reps = 1000;
268 } else if (bytes_per_frame < 1000000) {
269 reps = 100;
270 } else if (bytes_per_frame < 10000000) {
271 reps = 10;
272 } else {
273 reps = 1;
274 }
275
276 struct timeval bench_start_tv;
277 gettimeofday(&bench_start_tv, NULL);
278
279 int i;
280 for (i = 0; i < reps; i++) {
281 const char* msg = decode_once(frag_dst, frag_idat);
282 if (msg) {
283 return msg;
284 }
285 }
286
287 struct timeval bench_finish_tv;
288 gettimeofday(&bench_finish_tv, NULL);
289 int64_t micros =
290 (int64_t)(bench_finish_tv.tv_sec - bench_start_tv.tv_sec) * 1000000 +
291 (int64_t)(bench_finish_tv.tv_usec - bench_start_tv.tv_usec);
292 uint64_t nanos = 1;
293 if (micros > 0) {
294 nanos = (uint64_t)(micros)*1000;
295 }
296
297 printf("Benchmark%sDst%sIDAT/%s\t%8d\t%8" PRIu64 " ns/op\n",
298 frag_dst ? "Frag" : "Full", //
299 frag_idat ? "Frag" : "Full", //
300 cc, reps, nanos / reps);
301
302 return NULL;
303}
304
305int fail(const char* msg) {
306 const int stderr_fd = 2;
307 write(stderr_fd, msg, strnlen(msg, 4095));
308 write(stderr_fd, "\n", 1);
309 return 1;
310}
311
312int main(int argc, char** argv) {
313 const char* msg = read_stdin();
314 if (msg) {
315 return fail(msg);
316 }
317 if ((src_len < 8) || strncmp(src_buffer, "\x89PNG\x0D\x0A\x1A\x0A", 8)) {
318 return fail("invalid PNG");
319 }
320 msg = process_png_chunks(src_buffer + 8, src_len - 8);
321 if (msg) {
322 return fail(msg);
323 }
324 if (bytes_per_pixel == 0) {
325 return fail("missing PNG IHDR chunk");
326 }
327 if (num_idat_chunks == 0) {
328 return fail("missing PNG IDAT chunk");
329 }
330 // The +1 here is for the per-row filter byte.
331 bytes_per_row = (uint64_t)width * bytes_per_pixel + 1;
332 bytes_per_frame = (uint64_t)height * bytes_per_row;
333 if (bytes_per_frame > DST_BUFFER_SIZE) {
334 return fail("decompressed data is too large");
335 }
336
337 printf("# %s version %s\n#\n", cc, cc_version);
338 printf(
339 "# The output format, including the \"Benchmark\" prefixes, is "
340 "compatible with the\n"
341 "# https://godoc.org/golang.org/x/perf/cmd/benchstat tool. To install "
342 "it, first\n"
343 "# install Go, then run \"go get golang.org/x/perf/cmd/benchstat\".\n");
344
345 int i;
346 for (i = 0; i < 5; i++) {
347 msg = decode(true, true);
348 if (msg) {
349 return fail(msg);
350 }
351 msg = decode(true, false);
352 if (msg) {
353 return fail(msg);
354 }
355 msg = decode(false, true);
356 if (msg) {
357 return fail(msg);
358 }
359 msg = decode(false, false);
360 if (msg) {
361 return fail(msg);
362 }
363 }
364
365 return 0;
366}