Piotr Pawliczek | cdd921f | 2020-05-06 17:35:58 -0700 | [diff] [blame] | 1 | // Copyright 2020 The Chromium OS Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "foomatic_shell/scanner.h" |
| 6 | |
Piotr Pawliczek | cdd921f | 2020-05-06 17:35:58 -0700 | [diff] [blame] | 7 | #include <string> |
| 8 | #include <vector> |
| 9 | |
Qijiang Fan | 713061e | 2021-03-08 15:45:12 +0900 | [diff] [blame] | 10 | #include <base/check.h> |
Piotr Pawliczek | 7bb6e51 | 2020-07-13 10:59:36 -0700 | [diff] [blame] | 11 | #include <base/logging.h> |
| 12 | |
Piotr Pawliczek | cdd921f | 2020-05-06 17:35:58 -0700 | [diff] [blame] | 13 | namespace foomatic_shell { |
| 14 | |
| 15 | // This class encapsulates an iterator representing the current position in the |
| 16 | // input string. |
| 17 | class Scanner::Input { |
| 18 | public: |
| 19 | // Constructor. |data| is a reference to the input string. The input string |
| 20 | // must remain constant and valid during the lifetime of the object. The |
| 21 | // current position is set to the first element in the |data|. |
| 22 | explicit Input(const std::string& data) |
| 23 | : data_(data), current_(data_.begin()) {} |
| 24 | |
| 25 | Input(const Input&) = delete; |
| 26 | Input(Input&&) = delete; |
| 27 | |
| 28 | // Returns the iterator to the current position. The iterator is from the |
| 29 | // input string given in the constructor and is always valid (but may be |
| 30 | // equal |data.end()|). |
| 31 | std::string::const_iterator GetCurrentPosition() const { return current_; } |
| 32 | |
| 33 | // Returns the value of the current character. If the current position is set |
| 34 | // to |data.end()|, this method returns '\0'. |
| 35 | char GetCurrentCharacter() const { |
| 36 | if (current_ == data_.end()) |
| 37 | return '\0'; |
| 38 | return *current_; |
| 39 | } |
| 40 | |
| 41 | // Returns true <=> a character at the current position equals |c|. If the |
| 42 | // current position is set to |data.end()|, it returns false. |
| 43 | bool CurrentCharIs(char c) const { |
| 44 | return (current_ != data_.end() && *current_ == c); |
| 45 | } |
| 46 | |
| 47 | // Returns true <=> a character at the current position is ByteNative (see |
| 48 | // grammar.h for details). If the current position is set to |data.end()|, |
| 49 | // it returns false. |
| 50 | bool CurrentCharIsByteNative() const { |
| 51 | if (current_ == data_.end()) |
| 52 | return false; |
| 53 | if (*current_ >= 'A' && *current_ <= 'Z') |
| 54 | return true; |
| 55 | if (*current_ >= 'a' && *current_ <= 'z') |
| 56 | return true; |
| 57 | if (*current_ >= '0' && *current_ <= '9') |
| 58 | return true; |
| 59 | return (std::string("./_+-@%").find(*current_) != std::string::npos); |
| 60 | } |
| 61 | |
| 62 | // Returns true <=> a character at the current position equals to one of the |
| 63 | // elements of |chars|. If the current position is set to |data.end()|, |
| 64 | // it returns false. |
| 65 | bool CurrentCharIsOneOf(const std::string& chars) const { |
| 66 | if (current_ == data_.end()) |
| 67 | return false; |
| 68 | return (chars.find(*current_) != std::string::npos); |
| 69 | } |
| 70 | |
| 71 | // Returns true <=> the current position is set to |data.end()|. |
| 72 | bool CurrentCharIsEOF() const { return (current_ == data_.end()); } |
| 73 | |
| 74 | // Move the current position to the next element. If the current position |
| 75 | // is set to |data.end()|, it does nothing. |
| 76 | void MoveToNext() { |
| 77 | if (current_ != data_.end()) |
| 78 | ++current_; |
| 79 | } |
| 80 | |
| 81 | private: |
| 82 | const std::string& data_; |
| 83 | std::string::const_iterator current_; |
| 84 | }; |
| 85 | |
| 86 | Scanner::Scanner(const std::string& data) |
| 87 | : data_(std::make_unique<Input>(data)) {} |
| 88 | Scanner::~Scanner() {} |
| 89 | |
| 90 | // Parses the following (see grammar.h for details): |
Piotr Pawliczek | 7bb6e51 | 2020-07-13 10:59:36 -0700 | [diff] [blame] | 91 | // LiteralString = "'" , { ByteCommon | '"' | "`" | "\" } , "'" ; |
Piotr Pawliczek | cdd921f | 2020-05-06 17:35:58 -0700 | [diff] [blame] | 92 | // The current position must be one the opening '. It moves cursor to the first |
| 93 | // character after the closing '. The resultant token is added to |tokens|. |
| 94 | // |tokens| must not be nullptr. Returns false in case of an error. |
| 95 | bool Scanner::ParseLiteralString(std::vector<Token>* tokens) { |
| 96 | DCHECK(tokens != nullptr); |
| 97 | DCHECK(data_->CurrentCharIs('\'')); |
| 98 | |
| 99 | // Skip the opening '. |
| 100 | data_->MoveToNext(); |
| 101 | |
| 102 | // Create a new token. |
| 103 | tokens->resize(tokens->size() + 1); |
| 104 | Token* out = &(tokens->back()); |
| 105 | out->type = Token::Type::kLiteralString; |
| 106 | out->begin = data_->GetCurrentPosition(); |
| 107 | |
| 108 | // Move forward until we find EOF or the closing '. |
| 109 | while (!data_->CurrentCharIsEOF()) { |
| 110 | if (data_->CurrentCharIs('\'')) { |
| 111 | // The closing ' was found. |
| 112 | out->end = data_->GetCurrentPosition(); |
| 113 | out->value.assign(out->begin, out->end); |
| 114 | // Skip the closing '. |
| 115 | data_->MoveToNext(); |
| 116 | // Success! |
| 117 | return true; |
| 118 | } |
| 119 | data_->MoveToNext(); |
| 120 | } |
| 121 | |
| 122 | // There is no closing '. |
| 123 | out->end = data_->GetCurrentPosition(); |
| 124 | message_ = "Unexpected EOF when parsing '...' (literal string)"; |
| 125 | return false; |
| 126 | } |
| 127 | |
| 128 | // Parses the following (see grammar.h for details): |
Piotr Pawliczek | 7bb6e51 | 2020-07-13 10:59:36 -0700 | [diff] [blame] | 129 | // ExecutedString = "`" , { ByteCommon | "'" | '"' | ("\",ByteAny) } , "`" ; |
Piotr Pawliczek | cdd921f | 2020-05-06 17:35:58 -0700 | [diff] [blame] | 130 | // The current position must be one the opening `. It moves cursor to the first |
| 131 | // character after the closing `. The resultant token is added to |tokens|. |
| 132 | // |tokens| must not be nullptr. Returns false in case of an error. |
| 133 | bool Scanner::ParseExecutedString(std::vector<Token>* tokens) { |
| 134 | DCHECK(tokens != nullptr); |
| 135 | DCHECK(data_->CurrentCharIs('`')); |
| 136 | |
| 137 | // Skip the opening `. |
| 138 | data_->MoveToNext(); |
| 139 | |
| 140 | // Create a new token. |
| 141 | tokens->resize(tokens->size() + 1); |
| 142 | Token* out = &(tokens->back()); |
| 143 | out->type = Token::Type::kExecutedString; |
| 144 | out->begin = data_->GetCurrentPosition(); |
| 145 | |
| 146 | // Move forward until we find EOF or the closing `. |
| 147 | while (!data_->CurrentCharIsEOF()) { |
| 148 | if (data_->CurrentCharIs('`')) { |
| 149 | // The closing ` was found. |
| 150 | out->end = data_->GetCurrentPosition(); |
| 151 | // Skip the closing `. |
| 152 | data_->MoveToNext(); |
| 153 | // Success! |
| 154 | return true; |
| 155 | } |
| 156 | // The escape character (\) works in ExecutedString for ByteAny. |
| 157 | if (data_->CurrentCharIs('\\')) { |
| 158 | data_->MoveToNext(); |
| 159 | if (data_->CurrentCharIsEOF()) |
| 160 | break; |
| 161 | } |
| 162 | // Save the current character and move to the next element. |
| 163 | out->value.push_back(data_->GetCurrentCharacter()); |
| 164 | data_->MoveToNext(); |
| 165 | } |
| 166 | |
| 167 | // There is no closing `. |
| 168 | out->end = data_->GetCurrentPosition(); |
| 169 | message_ = "Unexpected EOF when parsing `...` (executed string)"; |
| 170 | return false; |
| 171 | } |
| 172 | |
| 173 | // Parses the following (see grammar.h for details): |
Piotr Pawliczek | 7bb6e51 | 2020-07-13 10:59:36 -0700 | [diff] [blame] | 174 | // InterpretedString = '"' , { ByteCommon | "'" | "\" | ("\",'"') | ("\","`") |
| 175 | // | ("\","\") | ExecutedString } , '"' ; |
Piotr Pawliczek | cdd921f | 2020-05-06 17:35:58 -0700 | [diff] [blame] | 176 | // The current position must be one the opening ". It moves cursor to the first |
| 177 | // character after the closing ". If the string contains one or more |
| 178 | // ExecutedString, it is split into a sequence of consecutive tokens of types |
| 179 | // InterpretedString and ExecutedString. The resultant tokens are added to |
| 180 | // |tokens|. |tokens| must not be nullptr. Returns false in case of an error. |
| 181 | bool Scanner::ParseInterpretedString(std::vector<Token>* tokens) { |
| 182 | DCHECK(tokens != nullptr); |
| 183 | DCHECK(data_->CurrentCharIs('"')); |
| 184 | |
| 185 | // Skip the opening ". |
| 186 | data_->MoveToNext(); |
| 187 | |
| 188 | // Create a sequence of alternating InterpretedString and ExecutedString |
| 189 | // tokens. |
| 190 | while (true) { |
| 191 | // Create a new InterpretedString token. |
| 192 | tokens->resize(tokens->size() + 1); |
| 193 | Token* out = &(tokens->back()); |
| 194 | out->type = Token::Type::kInterpretedString; |
| 195 | out->begin = data_->GetCurrentPosition(); |
| 196 | |
| 197 | // Move forward until we find EOF, the closing " or the opening `. |
| 198 | while (true) { |
| 199 | if (data_->CurrentCharIs('"')) { |
| 200 | // The closing " was found. |
| 201 | out->end = data_->GetCurrentPosition(); |
| 202 | data_->MoveToNext(); |
| 203 | return true; |
| 204 | } |
| 205 | if (data_->CurrentCharIs('`')) { |
| 206 | // The opening ` was found. We finish the current token and |
| 207 | // add a new ExecutedString token. |
| 208 | out->end = data_->GetCurrentPosition(); |
| 209 | if (!ParseExecutedString(tokens)) |
| 210 | return false; |
| 211 | // We break the internal loop to create a new InterpretedString |
| 212 | // token. |
| 213 | break; |
| 214 | } |
| 215 | if (data_->CurrentCharIs('\\')) { |
| 216 | // It may be an escape character for " or `. |
| 217 | data_->MoveToNext(); |
| 218 | if (data_->CurrentCharIsOneOf("\"`\\")) { |
| 219 | // The next character is " or `. Just skip \ and go ahead. |
| 220 | } else { |
| 221 | // It was not an escape character. We have to add a skipped \. |
| 222 | out->value.push_back('\\'); |
| 223 | } |
| 224 | } |
| 225 | if (data_->CurrentCharIsEOF()) { |
| 226 | // There is no closing ". |
| 227 | out->end = data_->GetCurrentPosition(); |
| 228 | message_ = "Unexpected EOF when parsing \"...\" (interpreted string)"; |
| 229 | return false; |
| 230 | } |
| 231 | // Save the current character and move to the next element. |
| 232 | out->value.push_back(data_->GetCurrentCharacter()); |
| 233 | data_->MoveToNext(); |
| 234 | } |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | // Parses the following (see grammar.h for details): |
Piotr Pawliczek | 7bb6e51 | 2020-07-13 10:59:36 -0700 | [diff] [blame] | 239 | // NativeString = { ByteNative | ("\",ByteAny) }- ; |
Piotr Pawliczek | cdd921f | 2020-05-06 17:35:58 -0700 | [diff] [blame] | 240 | // The current position must be one the first character of NativeString. It |
| 241 | // moves cursor to the first character after the end of the string. The |
| 242 | // resultant token is added to |tokens|. |tokens| must not be nullptr. Returns |
| 243 | // false in case of an error. |
| 244 | bool Scanner::ParseNativeString(std::vector<Token>* tokens) { |
| 245 | DCHECK(tokens != nullptr); |
| 246 | DCHECK(data_->CurrentCharIsByteNative() || data_->CurrentCharIs('\\')); |
| 247 | |
| 248 | // Create a new token. |
| 249 | tokens->resize(tokens->size() + 1); |
| 250 | Token* out = &(tokens->back()); |
| 251 | out->type = Token::Type::kNativeString; |
| 252 | out->begin = data_->GetCurrentPosition(); |
| 253 | |
| 254 | // Move forward until we find EOF or the end of the string. |
| 255 | while (!data_->CurrentCharIsEOF()) { |
| 256 | if (data_->CurrentCharIs('\\')) { |
| 257 | // This is an escape character. |
| 258 | data_->MoveToNext(); |
| 259 | if (data_->CurrentCharIsEOF()) { |
| 260 | // It is an error: EOF after the escape character. |
| 261 | out->end = data_->GetCurrentPosition(); |
| 262 | message_ = "Unexpected EOF after escape character (\\)"; |
| 263 | return false; |
| 264 | } |
| 265 | // Add the escaped character to the string. |
| 266 | out->value.push_back(data_->GetCurrentCharacter()); |
| 267 | // Go to the next character. |
| 268 | data_->MoveToNext(); |
| 269 | continue; |
| 270 | } |
| 271 | |
| 272 | // If the current character is not a ByteNative, we found the end of the |
| 273 | // string. |
| 274 | if (!data_->CurrentCharIsByteNative()) |
| 275 | break; |
| 276 | |
| 277 | // Save the current character and move to the next element. |
| 278 | out->value.push_back(data_->GetCurrentCharacter()); |
| 279 | data_->MoveToNext(); |
| 280 | } |
| 281 | |
| 282 | // We are at EOF or at the first character not being part of the string. |
| 283 | out->end = data_->GetCurrentPosition(); |
| 284 | return true; |
| 285 | } |
| 286 | |
| 287 | bool Scanner::ParseWholeInput(std::vector<Token>* tokens) { |
| 288 | DCHECK(tokens != nullptr); |
| 289 | |
| 290 | while (!data_->CurrentCharIsEOF()) { |
| 291 | // Check for different types of string. |
| 292 | if (data_->CurrentCharIs('\'')) { |
| 293 | if (!ParseLiteralString(tokens)) |
| 294 | return false; |
| 295 | continue; |
| 296 | } |
| 297 | if (data_->CurrentCharIs('"')) { |
| 298 | if (!ParseInterpretedString(tokens)) |
| 299 | return false; |
| 300 | continue; |
| 301 | } |
| 302 | if (data_->CurrentCharIs('`')) { |
| 303 | if (!ParseExecutedString(tokens)) |
| 304 | return false; |
| 305 | continue; |
| 306 | } |
| 307 | if (data_->CurrentCharIsByteNative() || data_->CurrentCharIs('\\')) { |
| 308 | if (!ParseNativeString(tokens)) |
| 309 | return false; |
| 310 | continue; |
| 311 | } |
| 312 | |
| 313 | // Create a new token. |
| 314 | tokens->resize(tokens->size() + 1); |
| 315 | Token& token = tokens->back(); |
| 316 | |
| 317 | if (data_->CurrentCharIsOneOf(" \t")) { |
| 318 | // It is a Space token. |
Piotr Pawliczek | 7bb6e51 | 2020-07-13 10:59:36 -0700 | [diff] [blame] | 319 | // Space = { " " | Tabulator }- ; |
Piotr Pawliczek | cdd921f | 2020-05-06 17:35:58 -0700 | [diff] [blame] | 320 | token.type = Token::Type::kSpace; |
| 321 | token.begin = data_->GetCurrentPosition(); |
| 322 | // Move forward until we find the first character not being part of |
| 323 | // the Space token. It stops also at EOF. |
| 324 | while (data_->CurrentCharIsOneOf(" \t")) |
| 325 | data_->MoveToNext(); |
| 326 | token.end = data_->GetCurrentPosition(); |
| 327 | continue; |
| 328 | } |
| 329 | |
| 330 | // Add a single character as a token. |
| 331 | token.type = Token::Type::kByte; |
| 332 | token.begin = data_->GetCurrentPosition(); |
| 333 | data_->MoveToNext(); |
| 334 | token.end = data_->GetCurrentPosition(); |
| 335 | token.value.assign(token.begin, token.end); |
| 336 | } |
| 337 | |
| 338 | // Add a special EOF token at the end. |
| 339 | tokens->resize(tokens->size() + 1); |
| 340 | tokens->back().type = Token::Type::kEOF; |
| 341 | tokens->back().begin = tokens->back().end = data_->GetCurrentPosition(); |
| 342 | return true; |
| 343 | } |
| 344 | |
| 345 | std::string::const_iterator Scanner::GetPosition() const { |
| 346 | return data_->GetCurrentPosition(); |
| 347 | } |
| 348 | |
| 349 | } // namespace foomatic_shell |