Blame - node_modules/css-tree/lib/tokenizer/index.js - chromium.googlesource.com/devtools/devtools-frontend - Gitiles

blob: dd7ada27bae9f997c23611b69b8e01e392aafefe [file] [log] [blame]

Tim van der Lippe	706ec96	2021-06-04 13:24:42 +0100	[diff] [blame]	1	var TokenStream = require('../common/TokenStream');
				2	var adoptBuffer = require('../common/adopt-buffer');
				3
				4	var constants = require('./const');
				5	var TYPE = constants.TYPE;
				6
				7	var charCodeDefinitions = require('./char-code-definitions');
				8	var isNewline = charCodeDefinitions.isNewline;
				9	var isName = charCodeDefinitions.isName;
				10	var isValidEscape = charCodeDefinitions.isValidEscape;
				11	var isNumberStart = charCodeDefinitions.isNumberStart;
				12	var isIdentifierStart = charCodeDefinitions.isIdentifierStart;
				13	var charCodeCategory = charCodeDefinitions.charCodeCategory;
				14	var isBOM = charCodeDefinitions.isBOM;
				15
				16	var utils = require('./utils');
				17	var cmpStr = utils.cmpStr;
				18	var getNewlineLength = utils.getNewlineLength;
				19	var findWhiteSpaceEnd = utils.findWhiteSpaceEnd;
				20	var consumeEscaped = utils.consumeEscaped;
				21	var consumeName = utils.consumeName;
				22	var consumeNumber = utils.consumeNumber;
				23	var consumeBadUrlRemnants = utils.consumeBadUrlRemnants;
				24
				// Layout of one packed `offsetAndType` entry written by tokenize():
				// `(type << TYPE_SHIFT) | offset`. The low 24 bits carry the offset
				// (extracted with OFFSET_MASK), the bits above them carry the token type.
				const OFFSET_MASK = 0x00FFFFFF;
				const TYPE_SHIFT = 24;
				27
				/**
				 * Split CSS source text into tokens, following the "consume a token" family
				 * of algorithms in CSS Syntax Module Level 3 (§ 4.3).
				 *
				 * Tokens are not materialised as objects; two parallel buffers are filled instead:
				 *
				 * - `offsetAndType[i]` holds `(type << TYPE_SHIFT) | endOffset` for token `i`,
				 *   where `endOffset` is the source offset just past the token. Only the bits
				 *   below TYPE_SHIFT are available for the offset.
				 * - `balance[i]` describes bracket pairing:
				 *     - an opening token (`(`, function token, `[`, `{`) that gets closed holds
				 *       the index of its closing token;
				 *     - the closing token holds the index of its opener;
				 *     - a token inside a closed pair whose entry was still unset holds the
				 *       index of that closing token;
				 *     - every other token (including unclosed openers) holds `source.length`.
				 *
				 * One extra EOF entry is written after the last token.
				 *
				 * @param {*} source CSS text. Coerced with `String()`; any falsy value becomes `''`.
				 * @param {TokenStream} [stream] Stream to refill. A new `TokenStream` is created
				 *     when omitted. Its current `offsetAndType` / `balance` buffers are handed to
				 *     `adoptBuffer()` together with the required size.
				 * @returns {TokenStream} The stream, after `source`, `firstCharOffset`,
				 *     `offsetAndType`, `tokenCount` and `balance` were assigned and `reset()` and
				 *     `next()` were called on it.
				 */
				function tokenize(source, stream) {
				    // Code unit at `index`, or 0 when `index` is past the end of the source.
				    function getCharCode(index) {
				        return index < sourceLength ? source.charCodeAt(index) : 0;
				    }

				    // § 4.3.3. Consume a numeric token
				    function consumeNumericToken() {
				        // Consume a number and let number be the result.
				        offset = consumeNumber(source, offset);

				        // If the next 3 input code points would start an identifier, then:
				        if (isIdentifierStart(getCharCode(offset), getCharCode(offset + 1), getCharCode(offset + 2))) {
				            // Create a <dimension-token> with the same value and type flag as number,
				            // consume a name for its unit and return the <dimension-token>.
				            type = TYPE.Dimension;
				            offset = consumeName(source, offset);
				            return;
				        }

				        // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it.
				        if (getCharCode(offset) === 0x0025) {
				            // Create a <percentage-token> with the same value as number, and return it.
				            type = TYPE.Percentage;
				            offset++;
				            return;
				        }

				        // Otherwise, create a <number-token> with the same value and type flag as number, and return it.
				        type = TYPE.Number;
				    }

				    // § 4.3.4. Consume an ident-like token
				    function consumeIdentLikeToken() {
				        const nameStartOffset = offset;

				        // Consume a name, and let string be the result.
				        offset = consumeName(source, offset);

				        // If string's value is an ASCII case-insensitive match for "url",
				        // and the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
				        if (cmpStr(source, nameStartOffset, offset, 'url') && getCharCode(offset) === 0x0028) {
				            // While the next two input code points are whitespace, consume the next input code point.
				            offset = findWhiteSpaceEnd(source, offset + 1);

				            // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('),
				            // or whitespace followed by U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE ('),
				            // then create a <function-token> with its value set to string and return it.
				            if (getCharCode(offset) === 0x0022 ||
				                getCharCode(offset) === 0x0027) {
				                type = TYPE.Function;
				                // Rewind to just past "url(" so that any whitespace skipped above
				                // is tokenized again by the main loop.
				                offset = nameStartOffset + 4;
				                return;
				            }

				            // Otherwise, consume a url token, and return it.
				            consumeUrlToken();
				            return;
				        }

				        // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
				        // Create a <function-token> with its value set to string and return it.
				        if (getCharCode(offset) === 0x0028) {
				            type = TYPE.Function;
				            offset++;
				            return;
				        }

				        // Otherwise, create an <ident-token> with its value set to string and return it.
				        type = TYPE.Ident;
				    }

				    // § 4.3.5. Consume a string token
				    function consumeStringToken(endingCodePoint) {
				        // This algorithm may be called with an ending code point, which denotes the code point
				        // that ends the string. If an ending code point is not specified,
				        // the current input code point is used.
				        if (!endingCodePoint) {
				            endingCodePoint = getCharCode(offset++);
				        }

				        // Initially create a <string-token> with its value set to the empty string.
				        type = TYPE.String;

				        // Repeatedly consume the next input code point from the stream:
				        for (; offset < sourceLength; offset++) {
				            const code = source.charCodeAt(offset);

				            switch (charCodeCategory(code)) {
				                // ending code point
				                case endingCodePoint:
				                    // Return the <string-token>.
				                    offset++;
				                    return;

				                // EOF
				                case charCodeCategory.Eof:
				                    // This is a parse error. Return the <string-token>.
				                    return;

				                // newline
				                case charCodeCategory.WhiteSpace:
				                    if (isNewline(code)) {
				                        // This is a parse error: create a <bad-string-token> and return it.
				                        // NOTE: the offset is advanced by getNewlineLength(), so the token
				                        // ends after the newline rather than before it.
				                        offset += getNewlineLength(source, offset, code);
				                        type = TYPE.BadString;
				                        return;
				                    }
				                    break;

				                // U+005C REVERSE SOLIDUS (\)
				                case 0x005C: {
				                    // If the next input code point is EOF, do nothing.
				                    if (offset === sourceLength - 1) {
				                        break;
				                    }

				                    const nextCode = getCharCode(offset + 1);

				                    // Otherwise, if the next input code point is a newline, consume it.
				                    if (isNewline(nextCode)) {
				                        offset += getNewlineLength(source, offset + 1, nextCode);
				                    } else if (isValidEscape(code, nextCode)) {
				                        // Otherwise, (the stream starts with a valid escape) consume
				                        // an escaped code point. The `- 1` compensates for the loop's
				                        // own `offset++`.
				                        offset = consumeEscaped(source, offset) - 1;
				                    }
				                    break;
				                }

				                // anything else
				                // Append the current input code point to the <string-token>'s value.
				            }
				        }
				    }

				    // § 4.3.6. Consume a url token
				    // Note: This algorithm assumes that the initial "url(" has already been consumed.
				    // This algorithm also assumes that it's being called to consume an "unquoted" value, like url(foo).
				    // A quoted value, like url("foo"), is parsed as a <function-token>. Consume an ident-like token
				    // automatically handles this distinction; this algorithm shouldn't be called directly otherwise.
				    function consumeUrlToken() {
				        // Initially create a <url-token> with its value set to the empty string.
				        type = TYPE.Url;

				        // Consume as much whitespace as possible.
				        offset = findWhiteSpaceEnd(source, offset);

				        // Repeatedly consume the next input code point from the stream:
				        for (; offset < sourceLength; offset++) {
				            const code = source.charCodeAt(offset);

				            switch (charCodeCategory(code)) {
				                // U+0029 RIGHT PARENTHESIS ())
				                case 0x0029:
				                    // Return the <url-token>.
				                    offset++;
				                    return;

				                // EOF
				                case charCodeCategory.Eof:
				                    // This is a parse error. Return the <url-token>.
				                    return;

				                // whitespace
				                case charCodeCategory.WhiteSpace:
				                    // Consume as much whitespace as possible.
				                    offset = findWhiteSpaceEnd(source, offset);

				                    // If the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF,
				                    // consume it and return the <url-token>
				                    // (if EOF was encountered, this is a parse error);
				                    if (getCharCode(offset) === 0x0029 || offset >= sourceLength) {
				                        if (offset < sourceLength) {
				                            offset++;
				                        }
				                        return;
				                    }

				                    // otherwise, consume the remnants of a bad url, create a <bad-url-token>,
				                    // and return it.
				                    offset = consumeBadUrlRemnants(source, offset);
				                    type = TYPE.BadUrl;
				                    return;

				                // U+0022 QUOTATION MARK (")
				                // U+0027 APOSTROPHE (')
				                // U+0028 LEFT PARENTHESIS (()
				                // non-printable code point
				                case 0x0022:
				                case 0x0027:
				                case 0x0028:
				                case charCodeCategory.NonPrintable:
				                    // This is a parse error. Consume the remnants of a bad url,
				                    // create a <bad-url-token>, and return it.
				                    offset = consumeBadUrlRemnants(source, offset);
				                    type = TYPE.BadUrl;
				                    return;

				                // U+005C REVERSE SOLIDUS (\)
				                case 0x005C:
				                    // If the stream starts with a valid escape, consume an escaped code point.
				                    // The `- 1` compensates for the loop's own `offset++`.
				                    if (isValidEscape(code, getCharCode(offset + 1))) {
				                        offset = consumeEscaped(source, offset) - 1;
				                        break;
				                    }

				                    // Otherwise, this is a parse error. Consume the remnants of a bad url,
				                    // create a <bad-url-token>, and return it.
				                    offset = consumeBadUrlRemnants(source, offset);
				                    type = TYPE.BadUrl;
				                    return;

				                // anything else
				                // Append the current input code point to the <url-token>'s value.
				            }
				        }
				    }

				    if (!stream) {
				        stream = new TokenStream();
				    }

				    // ensure source is a string
				    source = String(source || '');

				    const sourceLength = source.length;
				    const offsetAndType = adoptBuffer(stream.offsetAndType, sourceLength + 1); // +1 because of eof-token
				    const balance = adoptBuffer(stream.balance, sourceLength + 1);
				    // isBOM() result doubles as the offset of the first character to tokenize.
				    const start = isBOM(getCharCode(0));
				    let tokenCount = 0;
				    let offset = start;
				    // Type of the token being consumed; written by the nested consume*() helpers.
				    let type = 0;
				    // Token type that would close the innermost open bracket (0 when none is open).
				    let balanceCloseType = 0;
				    // `(closeType << TYPE_SHIFT) | openerIndex` of the innermost open bracket.
				    let balanceStart = 0;
				    let balancePrev = 0;

				    // https://drafts.csswg.org/css-syntax-3/#consume-token
				    // § 4.3.1. Consume a token
				    while (offset < sourceLength) {
				        const code = source.charCodeAt(offset);

				        type = 0;
				        // `sourceLength` marks "no balance information yet" for this token.
				        balance[tokenCount] = sourceLength;

				        switch (charCodeCategory(code)) {
				            // whitespace
				            case charCodeCategory.WhiteSpace:
				                // Consume as much whitespace as possible. Return a <whitespace-token>.
				                type = TYPE.WhiteSpace;
				                offset = findWhiteSpaceEnd(source, offset + 1);
				                break;

				            // U+0022 QUOTATION MARK (")
				            case 0x0022:
				                // Consume a string token and return it.
				                consumeStringToken();
				                break;

				            // U+0023 NUMBER SIGN (#)
				            case 0x0023:
				                // If the next input code point is a name code point or the next two input code points are a valid escape, then:
				                if (isName(getCharCode(offset + 1)) || isValidEscape(getCharCode(offset + 1), getCharCode(offset + 2))) {
				                    // Create a <hash-token>.
				                    type = TYPE.Hash;

				                    // TODO: set the <hash-token>'s type flag to "id" when the next
				                    // 3 input code points would start an identifier.

				                    // Consume a name, and set the <hash-token>'s value to the returned string.
				                    offset = consumeName(source, offset + 1);

				                    // Return the <hash-token>.
				                } else {
				                    // Otherwise, return a <delim-token> with its value set to the current input code point.
				                    type = TYPE.Delim;
				                    offset++;
				                }

				                break;

				            // U+0027 APOSTROPHE (')
				            case 0x0027:
				                // Consume a string token and return it.
				                consumeStringToken();
				                break;

				            // U+0028 LEFT PARENTHESIS (()
				            case 0x0028:
				                // Return a <(-token>.
				                type = TYPE.LeftParenthesis;
				                offset++;
				                break;

				            // U+0029 RIGHT PARENTHESIS ())
				            case 0x0029:
				                // Return a <)-token>.
				                type = TYPE.RightParenthesis;
				                offset++;
				                break;

				            // U+002B PLUS SIGN (+)
				            case 0x002B:
				                // If the input stream starts with a number, ...
				                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
				                    // ... reconsume the current input code point, consume a numeric token, and return it.
				                    consumeNumericToken();
				                } else {
				                    // Otherwise, return a <delim-token> with its value set to the current input code point.
				                    type = TYPE.Delim;
				                    offset++;
				                }
				                break;

				            // U+002C COMMA (,)
				            case 0x002C:
				                // Return a <comma-token>.
				                type = TYPE.Comma;
				                offset++;
				                break;

				            // U+002D HYPHEN-MINUS (-)
				            case 0x002D:
				                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
				                    // If the input stream starts with a number, reconsume the current input code point,
				                    // consume a numeric token, and return it.
				                    consumeNumericToken();
				                } else if (getCharCode(offset + 1) === 0x002D &&
				                           getCharCode(offset + 2) === 0x003E) {
				                    // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS
				                    // U+003E GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
				                    type = TYPE.CDC;
				                    offset = offset + 3;
				                } else if (isIdentifierStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
				                    // Otherwise, if the input stream starts with an identifier, reconsume the
				                    // current input code point, consume an ident-like token, and return it.
				                    consumeIdentLikeToken();
				                } else {
				                    // Otherwise, return a <delim-token> with its value set to the current input code point.
				                    type = TYPE.Delim;
				                    offset++;
				                }
				                break;

				            // U+002E FULL STOP (.)
				            case 0x002E:
				                // If the input stream starts with a number, ...
				                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
				                    // ... reconsume the current input code point, consume a numeric token, and return it.
				                    consumeNumericToken();
				                } else {
				                    // Otherwise, return a <delim-token> with its value set to the current input code point.
				                    type = TYPE.Delim;
				                    offset++;
				                }

				                break;

				            // U+002F SOLIDUS (/)
				            case 0x002F:
				                // If the next input code point is U+002A ASTERISK (*), i.e. the input starts with "/*", ...
				                if (getCharCode(offset + 1) === 0x002A) {
				                    // ... consume everything up to and including the first U+002A ASTERISK (*)
				                    // followed by a U+002F SOLIDUS (/), or up to an EOF code point.
				                    type = TYPE.Comment;
				                    offset = source.indexOf('*/', offset + 2) + 2;
				                    // indexOf() yields -1 for an unterminated comment, hence -1 + 2 === 1.
				                    if (offset === 1) {
				                        offset = source.length;
				                    }
				                } else {
				                    type = TYPE.Delim;
				                    offset++;
				                }
				                break;

				            // U+003A COLON (:)
				            case 0x003A:
				                // Return a <colon-token>.
				                type = TYPE.Colon;
				                offset++;
				                break;

				            // U+003B SEMICOLON (;)
				            case 0x003B:
				                // Return a <semicolon-token>.
				                type = TYPE.Semicolon;
				                offset++;
				                break;

				            // U+003C LESS-THAN SIGN (<)
				            case 0x003C:
				                // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), ...
				                if (getCharCode(offset + 1) === 0x0021 &&
				                    getCharCode(offset + 2) === 0x002D &&
				                    getCharCode(offset + 3) === 0x002D) {
				                    // ... consume them and return a <CDO-token>.
				                    type = TYPE.CDO;
				                    offset = offset + 4;
				                } else {
				                    // Otherwise, return a <delim-token> with its value set to the current input code point.
				                    type = TYPE.Delim;
				                    offset++;
				                }

				                break;

				            // U+0040 COMMERCIAL AT (@)
				            case 0x0040:
				                // If the next 3 input code points would start an identifier, ...
				                if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {
				                    // ... consume a name, create an <at-keyword-token> with its value set to the returned value, and return it.
				                    type = TYPE.AtKeyword;
				                    offset = consumeName(source, offset + 1);
				                } else {
				                    // Otherwise, return a <delim-token> with its value set to the current input code point.
				                    type = TYPE.Delim;
				                    offset++;
				                }

				                break;

				            // U+005B LEFT SQUARE BRACKET ([)
				            case 0x005B:
				                // Return a <[-token>.
				                type = TYPE.LeftSquareBracket;
				                offset++;
				                break;

				            // U+005C REVERSE SOLIDUS (\)
				            case 0x005C:
				                // If the input stream starts with a valid escape, ...
				                if (isValidEscape(code, getCharCode(offset + 1))) {
				                    // ... reconsume the current input code point, consume an ident-like token, and return it.
				                    consumeIdentLikeToken();
				                } else {
				                    // Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point.
				                    type = TYPE.Delim;
				                    offset++;
				                }
				                break;

				            // U+005D RIGHT SQUARE BRACKET (])
				            case 0x005D:
				                // Return a <]-token>.
				                type = TYPE.RightSquareBracket;
				                offset++;
				                break;

				            // U+007B LEFT CURLY BRACKET ({)
				            case 0x007B:
				                // Return a <{-token>.
				                type = TYPE.LeftCurlyBracket;
				                offset++;
				                break;

				            // U+007D RIGHT CURLY BRACKET (})
				            case 0x007D:
				                // Return a <}-token>.
				                type = TYPE.RightCurlyBracket;
				                offset++;
				                break;

				            // digit
				            case charCodeCategory.Digit:
				                // Reconsume the current input code point, consume a numeric token, and return it.
				                consumeNumericToken();
				                break;

				            // name-start code point
				            case charCodeCategory.NameStart:
				                // Reconsume the current input code point, consume an ident-like token, and return it.
				                consumeIdentLikeToken();
				                break;

				            // EOF
				            case charCodeCategory.Eof:
				                // Return an <EOF-token>.
				                break;

				            // anything else
				            default:
				                // Return a <delim-token> with its value set to the current input code point.
				                type = TYPE.Delim;
				                offset++;
				        }

				        // Bracket balancing. While a bracket is open, its own `balance` slot
				        // temporarily stores the enclosing `balanceStart`, forming a stack.
				        switch (type) {
				            case balanceCloseType:
				                // Pop the innermost opener and link opener <-> closer.
				                balancePrev = balanceStart & OFFSET_MASK;
				                balanceStart = balance[balancePrev];
				                balanceCloseType = balanceStart >> TYPE_SHIFT;
				                balance[tokenCount] = balancePrev;
				                balance[balancePrev++] = tokenCount;
				                // Tokens in between that have no balance info yet point at the closer.
				                for (; balancePrev < tokenCount; balancePrev++) {
				                    if (balance[balancePrev] === sourceLength) {
				                        balance[balancePrev] = tokenCount;
				                    }
				                }
				                break;

				            case TYPE.LeftParenthesis:
				            case TYPE.Function:
				                balance[tokenCount] = balanceStart;
				                balanceCloseType = TYPE.RightParenthesis;
				                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
				                break;

				            case TYPE.LeftSquareBracket:
				                balance[tokenCount] = balanceStart;
				                balanceCloseType = TYPE.RightSquareBracket;
				                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
				                break;

				            case TYPE.LeftCurlyBracket:
				                balance[tokenCount] = balanceStart;
				                balanceCloseType = TYPE.RightCurlyBracket;
				                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
				                break;
				        }

				        offsetAndType[tokenCount++] = (type << TYPE_SHIFT) | offset;
				    }

				    // finalize buffers
				    offsetAndType[tokenCount] = (TYPE.EOF << TYPE_SHIFT) | offset; // <EOF-token>
				    balance[tokenCount] = sourceLength;
				    balance[sourceLength] = sourceLength; // prevents false positive balance match with any token
				    // Openers that were never closed still hold stack links; reset them to "unbalanced".
				    while (balanceStart !== 0) {
				        balancePrev = balanceStart & OFFSET_MASK;
				        balanceStart = balance[balancePrev];
				        balance[balancePrev] = sourceLength;
				    }

				    // update stream
				    stream.source = source;
				    stream.firstCharOffset = start;
				    stream.offsetAndType = offsetAndType;
				    stream.tokenCount = tokenCount;
				    stream.balance = balance;
				    stream.reset();
				    stream.next();

				    return stream;
				}
				577
				// Expose the tokenizer constants and the static helpers from
				// char-code-definitions and utils as properties of `tokenize`.
				// Sources are copied in this order, so on a key clash the later source wins.
				[constants, charCodeDefinitions, utils].forEach(function(exported) {
				    Object.keys(exported).forEach(function(key) {
				        tokenize[key] = exported[key];
				    });
				});

				module.exports = tokenize;