"""Better tokenizing for coverage.py."""

import codecs, keyword, re, sys, token, tokenize
from coverage.backward import StringIO # pylint: disable=W0622

def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line[-2:] == "\\\n":
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #       HEY THERE
                #       """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


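# A minimal usage sketch (illustrative, not part of the original module):
# phys_tokens() wraps the stream from tokenize.generate_tokens() and yields an
# extra synthetic token (type 99999, text "\\\n") for each backslash line
# continuation, so the physical source lines can be reconstructed exactly::
#
#     src = "x = 1 + \\\n    2\n"
#     toks = tokenize.generate_tokens(StringIO(src).readline)
#     for tok in phys_tokens(toks):
#         print(tok)    # one of these is the synthetic 99999 token

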
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line

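# A minimal usage sketch (illustrative, not part of the original module):
# source_token_lines() yields one list per source line, each a list of
# (token class, token text) pairs. For example::
#
#     for line in source_token_lines("if x:\n    return 1\n"):
#         print(line)
#     # [('key', 'if'), ('ws', ' '), ('nam', 'x'), ('op', ':')]
#     # [('ws', '    '), ('key', 'return'), ('ws', ' '), ('num', '1')]
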
def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Returns a string, the name of the encoding.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
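    # The cookie appears in lines like "# -*- coding: utf-8 -*-" or
    # "# coding=utf-8"; the group captures the declared encoding name.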

    # Do this so the detect_encode code we copied will work.
    readline = iter(source.splitlines(True)).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc
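    # For example (illustrative): _get_normal_name("UTF_8") and
    # _get_normal_name("utf-8-sig") both return "utf-8", while "Latin-1"
    # and "ISO_8859_1" normalize to "iso-8859-1".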

    # From detect_encode():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in pep-0263. If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned. The
    # default varied with version.

    if sys.version_info <= (2, 4):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
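

if __name__ == '__main__':
    # Minimal, illustrative demo (not part of the original module). It assumes
    # Python 2, matching the assert in source_encoding() above.
    DEMO_SOURCE = "# -*- coding: utf-8 -*-\nx = 1\n"
    print(source_encoding(DEMO_SOURCE))     # prints: utf-8
    for demo_line in source_token_lines(DEMO_SOURCE):
        print(demo_line)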