"""Better tokenizing for coverage.py."""

import codecs, keyword, re, sys, token, tokenize
from coverage.backward import StringIO # pylint: disable=W0622

def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line[-2:] == "\\\n":
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #       HEY THERE
                #       """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


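# A minimal usage sketch (illustrative, not part of the original module):
# phys_tokens() wraps the stream from tokenize.generate_tokens() and yields an
# extra synthetic token (type 99999, text "\\\n") for each backslash line
# continuation, so the physical source lines can be reconstructed exactly::
#
#     src = "x = 1 + \\\n    2\n"
#     toks = tokenize.generate_tokens(StringIO(src).readline)
#     for tok in phys_tokens(toks):
#         print(tok)    # one of these is the synthetic 99999 token

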
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line

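# A minimal usage sketch (illustrative, not part of the original module):
# source_token_lines() yields one list per source line, each a list of
# (token class, token text) pairs. For example::
#
#     for line in source_token_lines("if x:\n    return 1\n"):
#         print(line)
#     # [('key', 'if'), ('ws', ' '), ('nam', 'x'), ('op', ':')]
#     # [('ws', '    '), ('key', 'return'), ('ws', ' '), ('num', '1')]
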
def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Returns a string, the name of the encoding.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
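    # The cookie appears in lines like "# -*- coding: utf-8 -*-" or
    # "# coding=utf-8"; the group captures the declared encoding name.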

    # Do this so the detect_encode code we copied will work.
    readline = iter(source.splitlines(True)).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc
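    # For example (illustrative): _get_normal_name("UTF_8") and
    # _get_normal_name("utf-8-sig") both return "utf-8", while "Latin-1"
    # and "ISO_8859_1" normalize to "iso-8859-1".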

    # From detect_encode():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in pep-0263. If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned. The
    # default varied with version.

    if sys.version_info <= (2, 4):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
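

if __name__ == '__main__':
    # Minimal, illustrative demo (not part of the original module). It assumes
    # Python 2, matching the assert in source_encoding() above.
    DEMO_SOURCE = "# -*- coding: utf-8 -*-\nx = 1\n"
    print(source_encoding(DEMO_SOURCE))     # prints: utf-8
    for demo_line in source_token_lines(DEMO_SOURCE):
        print(demo_line)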