blob: 5692060c7f7eb65403022df2ffcf7054590519d3 [file] [log] [blame]
//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This is a temporary copy of compiler-rt/include/fuzzer/FuzzedDataProvider.h.
// TODO(mmoroz@chromium.org): delete this copy.
// A single header library providing a utility class to break up an array of
// bytes. Whenever run on the same input, provides the same output, as long as
// its methods are called in the same order, with the same arguments.
//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
16#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
17
#include <limits.h>
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <initializer_list>
#include <limits>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
29
// Wraps a caller-owned byte array and hands it out piece by piece as typed
// values (byte vectors, strings, integers, bools, enum values). Raw byte and
// string reads are taken from the front of the buffer; integral reads are
// taken from the back. The class never copies or frees the wrapped buffer and
// is neither copyable nor assignable.
class FuzzedDataProvider {
 public:
  // |data| is an array of length |size| that the FuzzedDataProvider wraps to
  // provide more granular access. |data| must outlive the FuzzedDataProvider.
  FuzzedDataProvider(const uint8_t *data, size_t size)
      : data_ptr_(data), remaining_bytes_(size) {}
  ~FuzzedDataProvider() = default;

  // Returns a std::vector containing |num_bytes| of input data. If fewer than
  // |num_bytes| of data remain, returns a shorter std::vector containing all
  // of the data that's left. Can be used with any byte sized type, such as
  // char, unsigned char, uint8_t, etc.
  template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes) {
    num_bytes = std::min(num_bytes, remaining_bytes_);
    return ConsumeBytes<T>(num_bytes, num_bytes);
  }

  // Similar to |ConsumeBytes|, but also appends the terminator value at the end
  // of the resulting vector. Useful, when a mutable null-terminated C-string is
  // needed, for example. But that is a rare case. Better avoid it, if possible,
  // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods.
  // The returned vector always holds at least one element (the terminator).
  template <typename T>
  std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes,
                                            T terminator = 0) {
    num_bytes = std::min(num_bytes, remaining_bytes_);
    // Allocate one extra slot for the terminator, but only consume
    // |num_bytes| bytes of input.
    std::vector<T> result = ConsumeBytes<T>(num_bytes + 1, num_bytes);
    result.back() = terminator;
    return result;
  }

  // Returns a std::string containing |num_bytes| of input data. Using this and
  // |.c_str()| on the resulting string is the best way to get an immutable
  // null-terminated C string. If fewer than |num_bytes| of data remain, returns
  // a shorter std::string containing all of the data that's left.
  std::string ConsumeBytesAsString(size_t num_bytes) {
    static_assert(sizeof(std::string::value_type) == sizeof(uint8_t),
                  "ConsumeBytesAsString cannot convert the data to a string.");

    num_bytes = std::min(num_bytes, remaining_bytes_);
    std::string result(
        reinterpret_cast<const std::string::value_type *>(data_ptr_),
        num_bytes);
    Advance(num_bytes);
    return result;
  }

  // Returns a number in the range [min, max] by consuming bytes from the
  // input data. The value might not be uniformly distributed in the given
  // range. If there's no input data left, always returns |min|. |min| must
  // be less than or equal to |max|; otherwise the process is aborted.
  template <typename T> T ConsumeIntegralInRange(T min, T max) {
    static_assert(std::is_integral<T>::value, "An integral type is required.");
    static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type.");

    if (min > max)
      std::abort();

    // Use the biggest type possible to hold the range and the result. Both
    // bounds are converted explicitly so that the subtraction is performed in
    // well-defined modular unsigned arithmetic, even for negative |min|.
    const uint64_t range =
        static_cast<uint64_t>(max) - static_cast<uint64_t>(min);
    uint64_t result = 0;
    size_t offset = 0;

    // Consume at most sizeof(T) bytes, and no more bytes than are needed to
    // cover |range|.
    while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 &&
           remaining_bytes_ != 0) {
      // Pull bytes off the end of the seed data. Experimentally, this seems to
      // allow the fuzzer to more easily explore the input space. This makes
      // sense, since it works by modifying inputs that caused new code to run,
      // and this data is often used to encode length of data read by
      // |ConsumeBytes|. Separating out read lengths makes it easier modify the
      // contents of the data that is actually read.
      --remaining_bytes_;
      result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_];
      offset += CHAR_BIT;
    }

    // Avoid division by 0, in case |range + 1| results in overflow.
    if (range != std::numeric_limits<decltype(range)>::max())
      result = result % (range + 1);

    // The sum is computed modulo 2^64 and then narrowed back to |T|.
    return static_cast<T>(static_cast<uint64_t>(min) + result);
  }

  // Returns a std::string of length from 0 to |max_length|. When it runs out of
  // input data, returns what remains of the input. Designed to be more stable
  // with respect to a fuzzer inserting characters than just picking a random
  // length and then consuming that many bytes with |ConsumeBytes|.
  std::string ConsumeRandomLengthString(size_t max_length) {
    // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\"
    // followed by anything else to the end of the string. As a result of this
    // logic, a fuzzer can insert characters into the string, and the string
    // will be lengthened to include those new characters, resulting in a more
    // stable fuzzer than picking the length of a string independently from
    // picking its contents.
    std::string result;

    // Reserve the anticipated capacity to prevent several reallocations.
    result.reserve(std::min(max_length, remaining_bytes_));
    for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) {
      char next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
      Advance(1);
      if (next == '\\' && remaining_bytes_ != 0) {
        next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
        Advance(1);
        // Both bytes of the terminating pair are consumed, but neither is
        // added to the result.
        if (next != '\\')
          break;
      }
      result += next;
    }

    result.shrink_to_fit();
    return result;
  }

  // Returns a std::vector containing all remaining bytes of the input data.
  template <typename T> std::vector<T> ConsumeRemainingBytes() {
    return ConsumeBytes<T>(remaining_bytes_);
  }

  // Returns a std::string containing all remaining bytes of the input data.
  // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string
  // object.
  std::string ConsumeRemainingBytesAsString() {
    return ConsumeBytesAsString(remaining_bytes_);
  }

  // Returns a number in the range [Type's min, Type's max]. The value might
  // not be uniformly distributed in the given range. If there's no input data
  // left, always returns |min|.
  template <typename T> T ConsumeIntegral() {
    return ConsumeIntegralInRange<T>(std::numeric_limits<T>::min(),
                                     std::numeric_limits<T>::max());
  }

  // Reads one byte and returns a bool, or false when no data remains.
  bool ConsumeBool() { return 1 & ConsumeIntegral<uint8_t>(); }

  // Returns a copy of a value selected from a fixed-size |array|.
  template <typename T, size_t size>
  T PickValueInArray(const T (&array)[size]) {
    static_assert(size > 0, "The array must be non empty.");
    return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
  }

  // Returns a copy of a value selected from |list|. The list must be non
  // empty; an empty list aborts the process.
  template <typename T>
  T PickValueInArray(std::initializer_list<const T> list) {
    // |list.size()| is not usable in a static_assert here, so the emptiness
    // check has to happen at run time. Without it, |size() - 1| would wrap
    // around and the selected index could point past the end of the list.
    if (list.size() == 0)
      std::abort();

    return *(list.begin() + ConsumeIntegralInRange<size_t>(0, list.size() - 1));
  }

  // Return an enum value. The enum must start at 0 and be contiguous. It must
  // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as:
  // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue };
  template <typename T> T ConsumeEnum() {
    static_assert(std::is_enum<T>::value, "|T| must be an enum type.");
    return static_cast<T>(ConsumeIntegralInRange<uint32_t>(
        0, static_cast<uint32_t>(T::kMaxValue)));
  }

  // Reports the remaining bytes available for fuzzed input.
  size_t remaining_bytes() const { return remaining_bytes_; }

 private:
  FuzzedDataProvider(const FuzzedDataProvider &) = delete;
  FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete;

  // Drops |num_bytes| bytes from the front of the unread data. Aborts if fewer
  // than |num_bytes| bytes remain.
  void Advance(size_t num_bytes) {
    if (num_bytes > remaining_bytes_)
      std::abort();

    data_ptr_ += num_bytes;
    remaining_bytes_ -= num_bytes;
  }

  // Returns a vector of |size| elements whose first |num_bytes_to_consume|
  // elements are copied from the front of the unread data; any remaining
  // elements are value-initialized. Aborts if |num_bytes_to_consume| exceeds
  // either |size| or the number of unread bytes.
  template <typename T>
  std::vector<T> ConsumeBytes(size_t size, size_t num_bytes_to_consume) {
    static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type.");

    // Validate before touching memory, so that a bad request can never turn
    // into an out-of-bounds read or write in |memcpy|.
    if (num_bytes_to_consume > size || num_bytes_to_consume > remaining_bytes_)
      std::abort();

    // The point of using the size-based constructor below is to increase the
    // odds of having a vector object with capacity being equal to the length.
    // That part is always implementation specific, but at least both libc++ and
    // libstdc++ allocate the requested number of bytes in that constructor,
    // which seems to be a natural choice for other implementations as well.
    // To increase the odds even more, we also call |shrink_to_fit| below.
    std::vector<T> result(size);

    // Skip the copy when there is nothing to copy: |data_ptr_| and
    // |result.data()| may both be null in that case, and passing a null
    // pointer to |memcpy| is undefined even for a zero length.
    if (num_bytes_to_consume != 0) {
      std::memcpy(result.data(), data_ptr_, num_bytes_to_consume);
      Advance(num_bytes_to_consume);
    }

    // Even though |shrink_to_fit| is also implementation specific, we expect it
    // to provide an additional assurance in case vector's constructor allocated
    // a buffer which is larger than the actual amount of data we put inside it.
    result.shrink_to_fit();
    return result;
  }

  // Converts the unsigned |value| to the same-sized type |TS|, mapping values
  // above |TS|'s maximum onto its negative range without relying on an
  // out-of-range conversion.
  template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value) {
    static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types.");
    static_assert(!std::numeric_limits<TU>::is_signed,
                  "Source type must be unsigned.");

    // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream.
    if (std::numeric_limits<TS>::is_modulo)
      return static_cast<TS>(value);

    // Avoid using implementation-defined unsigned to signed conversions.
    // To learn more, see https://stackoverflow.com/questions/13150449.
    constexpr auto TS_min = std::numeric_limits<TS>::min();
    constexpr auto TS_max = std::numeric_limits<TS>::max();
    if (value <= static_cast<TU>(TS_max))
      return static_cast<TS>(value);

    // Here |value| > |TS_max|, so |value - TS_max - 1| lies in [0, TS_max] and
    // converts to |TS| without any loss. Adding |TS_min| then yields the
    // corresponding negative value.
    const TS offset = static_cast<TS>(value - static_cast<TU>(TS_max) - 1);
    return static_cast<TS>(TS_min + offset);
  }

  // Start of the unread data. Not owned.
  const uint8_t *data_ptr_;
  // Number of unread bytes starting at |data_ptr_|.
  size_t remaining_bytes_;
};
246
dor1sf1a25802019-06-18 20:29:11 +0000247#endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_