maruel@chromium.org | 35625c7 | 2011-03-23 17:34:02 +0000 | [diff] [blame] | 1 | # Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 2 | # Use of this source code is governed by a BSD-style license that can be |
| 3 | # found in the LICENSE file. |
| 4 | |
| 5 | """Collection of functions and classes to fix various encoding problems on |
| 6 | multiple platforms with python. |
| 7 | """ |
| 8 | |
Raul Tambre | 80ee78e | 2019-05-06 22:41:05 +0000 | [diff] [blame] | 9 | from __future__ import print_function |
| 10 | |
maruel@chromium.org | 35625c7 | 2011-03-23 17:34:02 +0000 | [diff] [blame] | 11 | import codecs |
| 12 | import locale |
| 13 | import os |
| 14 | import sys |
| 15 | |
| 16 | |
# Prevents initializing multiple times: fix_win_sys_argv() returns early when
# this flag is set and sets it once sys.argv has been processed.
_SYS_ARGV_PROCESSED = False
| 19 | |
| 20 | |
def complain(message):
  """Reports a message on the original stderr.

  If any exception occurs in this file, we'll probably try to print it on
  stderr, which makes for frustrating debugging if stderr is directed to our
  wrapper. So be paranoid about reporting errors to sys.__stderr__, so that
  the user has a higher chance to see them.

  Args:
    message: What to report. A str is printed verbatim; any other object is
        printed as its repr().
  """
  # A conditional expression is used instead of `cond and message or repr()`
  # so that an empty string stays empty rather than being printed as "''".
  text = message if isinstance(message, str) else repr(message)
  print(text, file=sys.__stderr__)
maruel@chromium.org | 35625c7 | 2011-03-23 17:34:02 +0000 | [diff] [blame] | 30 | |
| 31 | |
def fix_default_encoding():
  """Forces utf8 solidly on all platforms.

  By default python execution environment is lazy and defaults to ascii
  encoding.

  http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

  Returns:
    False if the default encoding is already utf-8 (nothing is touched),
    True after the default encoding and the locale categories were updated.
  """
  if sys.getdefaultencoding() == 'utf-8':
    return False

  # Regenerate setdefaultencoding.
  reload(sys)
  # Module 'sys' has no 'setdefaultencoding' member
  # pylint: disable=no-member
  sys.setdefaultencoding('utf-8')

  category_names = [name for name in dir(locale) if name.startswith('LC_')]
  for category_name in category_names:
    category = getattr(locale, category_name)

    # Start from the user's default setting; skip categories that reject it.
    try:
      locale.setlocale(category, '')
    except locale.Error:
      continue

    try:
      language, _ = locale.getdefaultlocale()
    except (TypeError, ValueError):
      continue
    if not language:
      continue

    try:
      locale.setlocale(category, (language, 'UTF-8'))
    except locale.Error:
      # The locale could not be set directly; record the wish in the
      # environment instead.
      os.environ[category_name] = language + '.UTF-8'

  try:
    locale.setlocale(locale.LC_ALL, '')
  except locale.Error:
    pass
  return True
| 70 | |
| 71 | |
| 72 | ############################### |
| 73 | # Windows specific |
| 74 | |
| 75 | |
def fix_win_sys_argv(encoding):
  """Converts sys.argv to 'encoding' encoded string.

  utf-8 is recommended.

  Works around <http://bugs.python.org/issue2128>.

  Args:
    encoding: Name of the codec used to encode each argument on Python 2.

  Returns:
    False if sys.argv was already processed by a previous call, True
    otherwise.
  """
  global _SYS_ARGV_PROCESSED
  if _SYS_ARGV_PROCESSED:
    return False

  if sys.version_info.major >= 3:
    # Python 3 already exposes sys.argv as Unicode strings; replacing it with
    # encoded bytes would break callers, so there is nothing to fix.
    _SYS_ARGV_PROCESSED = True
    return True

  # These types are available on linux but not Mac.
  # pylint: disable=no-name-in-module,F0401
  from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import LPCWSTR, LPWSTR

  # <http://msdn.microsoft.com/en-us/library/ms683156.aspx>
  GetCommandLineW = WINFUNCTYPE(LPWSTR)(('GetCommandLineW', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/bb776391.aspx>
  CommandLineToArgvW = WINFUNCTYPE(POINTER(LPWSTR), LPCWSTR, POINTER(c_int))(
      ('CommandLineToArgvW', windll.shell32))

  argc = c_int(0)
  argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc))
  argv = [
      argv_unicode[i].encode(encoding, 'replace') for i in range(argc.value)
  ]

  if not hasattr(sys, 'frozen'):
    # If this is an executable produced by py2exe or bbfreeze, then it
    # will have been invoked directly. Otherwise, unicode_argv[0] is the
    # Python interpreter, so skip that.
    argv = argv[1:]

    # Also skip option arguments to the Python interpreter. Every element of
    # argv is an encoded byte string, so only byte literals are used below.
    while argv:
      arg = argv[0]
      if not arg.startswith(b'-') or arg == b'-':
        break
      argv = argv[1:]
      if arg == b'-m':
        # sys.argv[0] should really be the absolute path of the
        # module source, but never mind.
        break
      if arg == b'-c':
        # The element following -c is the command itself; sys.argv[0] is
        # '-c' in that case.
        argv[0] = b'-c'
        break
  sys.argv = argv
  _SYS_ARGV_PROCESSED = True
  return True
| 126 | |
| 127 | |
def fix_win_codec():
  """Works around <http://bugs.python.org/issue6058>.

  Makes the codec name 'cp65001' resolve to the utf-8 codec when Python does
  not know it.

  Returns:
    False if 'cp65001' could already be looked up, True if a codec search
    function had to be registered for it.
  """
  # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
  try:
    codecs.lookup('cp65001')
  except LookupError:
    def _search_cp65001(name):
      # Codec search functions return None for names they do not handle.
      return codecs.lookup('utf-8') if name == 'cp65001' else None

    codecs.register(_search_cp65001)
    return True
  return False
| 138 | |
| 139 | |
class WinUnicodeOutputBase(object):
  """Common file-like behavior for the sys.stdout / sys.stderr replacements
  used on Windows.

  Subclasses must implement flush() and write().

  Setting encoding to utf-8 is recommended.
  """
  def __init__(self, fileno, name, encoding):
    # File-object attributes exposed to callers.
    self.name = name
    self.encoding = encoding
    self.mode = 'w'
    self.closed = False
    self.softspace = False

    # Corresponding file handle, returned by fileno().
    self._fileno = fileno

  @staticmethod
  def isatty():
    """Always reports that the stream is not a tty."""
    return False

  def close(self):
    """Marks the stream as closed."""
    # Don't really close the handle, that would only cause problems.
    self.closed = True

  def fileno(self):
    """Returns the file number given at construction."""
    return self._fileno

  def flush(self):
    """Must be implemented by subclasses."""
    raise NotImplementedError()

  def write(self, text):
    """Must be implemented by subclasses."""
    raise NotImplementedError()

  def writelines(self, lines):
    """Writes every item of |lines| through write().

    Any exception is reported via complain() and then re-raised.
    """
    try:
      for item in lines:
        self.write(item)
    except Exception as e:
      complain('%s.writelines: %r' % (self.name, e))
      raise
| 180 | |
| 181 | |
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
  """Output adapter to a Windows Console.

  Understands how to use the win32 console API.
  """
  def __init__(self, console_handle, fileno, stream_name, encoding):
    """Prepares the WriteConsoleW binding.

    Args:
      console_handle: Handle passed to WriteConsoleW.
      fileno: File number reported by fileno().
      stream_name: Name of the replaced stream, used to build self.name.
      encoding: Codec used to decode byte strings given to write().
    """
    super(WinUnicodeConsoleOutput, self).__init__(
        fileno, '<Unicode console %s>' % stream_name, encoding)
    # Handle to use for WriteConsoleW
    self._console_handle = console_handle

    # Loads the necessary function.
    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
    from ctypes.wintypes import LPVOID  # pylint: disable=no-name-in-module

    self._DWORD = DWORD
    self._byref = byref

    # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
    self._WriteConsoleW = WINFUNCTYPE(
        BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)(
            ('WriteConsoleW', windll.kernel32))
    self._GetLastError = GetLastError

  def flush(self):
    # No need to flush the console since it's immediate.
    pass

  def write(self, text):
    """Writes |text| to the console with WriteConsoleW.

    Byte strings are decoded with self.encoding (undecodable bytes are
    replaced) before being written.

    Raises:
      IOError: WriteConsoleW failed or wrote nothing. Like any other
          exception raised here, it is reported via complain() first.
    """
    try:
      if sys.version_info.major == 2:
        if not isinstance(text, unicode):
          # Convert to unicode.
          text = str(text).decode(self.encoding, 'replace')
      elif isinstance(text, bytes):
        # WriteConsoleW takes a wide string; byte strings need to be decoded
        # before being passed to it.
        text = text.decode(self.encoding, 'replace')
      # NOTE(review): WriteConsoleW presumably counts UTF-16 code units while
      # len() and slicing below count Python characters; these may disagree
      # for non-BMP text on Python 3 -- confirm.
      remaining = len(text)
      while remaining > 0:
        n = self._DWORD(0)
        # There is a shorter-than-documented limitation on the length of the
        # string passed to WriteConsoleW. See
        # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
        retval = self._WriteConsoleW(
            self._console_handle, text,
            min(remaining, 10000),
            self._byref(n), None)
        if retval == 0 or n.value == 0:
          raise IOError(
              'WriteConsoleW returned %r, n.value = %r, last error = %r' % (
                  retval, n.value, self._GetLastError()))
        remaining -= n.value
        if not remaining:
          break
        # Drop what was written and retry with the rest.
        text = text[int(n.value):]
    except Exception as e:
      complain('%s.write: %r' % (self.name, e))
      raise
| 239 | |
| 240 | |
class WinUnicodeOutput(WinUnicodeOutputBase):
  """Output adaptor to a file output on Windows.

  If the standard FileWrite function is used, it will be encoded in the current
  code page. WriteConsoleW() permits writing any character.

  This adaptor wraps an existing stream and converts the text with
  self.encoding, replacing what cannot be converted, before handing it over.
  """
  def __init__(self, stream, fileno, encoding):
    """Wraps |stream| and flushes it immediately.

    Args:
      stream: Underlying stream; its name is used to build self.name.
      fileno: File number reported by fileno().
      encoding: Codec used to convert the text given to write().
    """
    super(WinUnicodeOutput, self).__init__(
        fileno, '<Unicode redirected %s>' % stream.name, encoding)
    # Output stream
    self._stream = stream

    # Flush right now.
    self.flush()

  def flush(self):
    """Flushes the wrapped stream; failures are reported and re-raised."""
    try:
      self._stream.flush()
    except Exception as e:
      complain('%s.flush: %r from %r' % (self.name, e, self._stream))
      raise

  def write(self, text):
    """Writes |text| to the wrapped stream.

    On Python 2 unicode text is encoded with self.encoding; on Python 3 byte
    strings are decoded with it. Failures are reported via complain() and
    re-raised.
    """
    try:
      if sys.version_info.major == 2:
        if isinstance(text, unicode):
          # Replace characters that cannot be printed instead of failing.
          text = text.encode(self.encoding, 'replace')
      elif isinstance(text, bytes):
        # Python 3 text streams only accept str: decode byte strings,
        # replacing what cannot be decoded instead of failing.
        text = text.decode(self.encoding, 'replace')
      self._stream.write(text)
    except Exception as e:
      complain('%s.write: %r' % (self.name, e))
      raise
| 272 | |
| 273 | |
def win_handle_is_a_console(handle):
  """Returns True if a Windows file handle is a handle to a console.

  Args:
    handle: Value returned by GetStdHandle(): INVALID_HANDLE_VALUE, None
        (NULL) or a valid handle.

  Returns:
    True if the handle refers to a character device for which GetConsoleMode
    succeeds, False otherwise (including for invalid or NULL handles).
  """
  from ctypes import c_void_p

  # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
  # HANDLE is pointer-sized, so INVALID_HANDLE_VALUE is (void*)-1 and not the
  # 32-bit DWORD(-1) on 64-bit Windows. The 32-bit spelling is still rejected
  # for callers that computed it that way.
  invalid_handle_values = (c_void_p(-1).value, 0xFFFFFFFF)
  if handle is None or handle in invalid_handle_values:
    return False

  # These types are available on linux but not Mac.
  # pylint: disable=no-name-in-module,F0401
  from ctypes import byref, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import BOOL, DWORD, HANDLE

  FILE_TYPE_CHAR = 0x0002
  FILE_TYPE_REMOTE = 0x8000

  # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
  GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
      ('GetConsoleMode', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
  # The argument is a HANDLE, not a DWORD, so it is not truncated.
  GetFileType = WINFUNCTYPE(DWORD, HANDLE)(('GetFileType', windll.kernel32))

  return bool(
      (GetFileType(handle) & ~FILE_TYPE_REMOTE) == FILE_TYPE_CHAR and
      GetConsoleMode(handle, byref(DWORD())))
| 297 | |
| 298 | |
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
  """Returns a unicode-compatible stream.

  This function will return a direct-Console writing object only if:
  - the file number is the expected console file number
  - the handle is the expected file handle
  - the 'real' handle is in fact a handle to a console.

  In every other case an auto-encoding wrapper around |stream| is returned.
  """
  fileno_getter = getattr(stream, 'fileno', lambda: None)
  current_fileno = fileno_getter()

  if current_fileno != excepted_fileno:
    # It's something else. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, current_fileno, encoding)

  # These types are available on linux but not Mac.
  # pylint: disable=no-name-in-module,F0401
  from ctypes import windll, WINFUNCTYPE
  from ctypes.wintypes import DWORD, HANDLE

  # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
  GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(('GetStdHandle', windll.kernel32))

  console_handle = GetStdHandle(DWORD(output_handle))
  if not win_handle_is_a_console(console_handle):
    # Not a console after all. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, current_fileno, encoding)

  # It's a console.
  return WinUnicodeConsoleOutput(
      console_handle, current_fileno, stream.name, encoding)
| 325 | |
| 326 | |
def fix_win_console(encoding):
  """Makes Unicode console output work independently of the current code page.

  This also fixes <http://bugs.python.org/issue1602>.
  Credit to Michael Kaplan
  <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
  TZOmegaTZIOY
  <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

  Returns:
    False if sys.stdout or sys.stderr is already one of our wrappers, True
    otherwise (even if wrapping failed; the failure is reported through
    complain()).
  """
  already_wrapped = any(
      isinstance(stream, WinUnicodeOutputBase)
      for stream in (sys.stdout, sys.stderr))
  if already_wrapped:
    return False

  # SetConsoleCP and SetConsoleOutputCP could be used to change the code page
  # but it's not really useful since the code here is using WriteConsoleW().
  # Also, changing the code page is 'permanent' to the console and needs to be
  # reverted manually.
  # In practice one needs to set the console font to a TTF font to be able to
  # see all the characters but it failed for me in practice. In any case, it
  # won't throw any exception when printing, which is the important part.
  try:
    # -11 and -12 are defined in stdio.h
    sys.stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
    sys.stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
    # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation is
    # "It doesn't appear to be possible to read Unicode characters in UTF-8
    # mode" and this appears to be a limitation of cmd.exe.
  except Exception as e:
    complain('exception %r while fixing up sys.stdout and sys.stderr' % e)
  return True
| 357 | |
| 358 | |
def fix_encoding():
  """Fixes various encoding problems on all platforms.

  Should be called at the very beginning of the process.

  Returns:
    True only if every fix that was attempted reported True.
  """
  on_windows = sys.platform == 'win32'
  outcomes = []

  if on_windows:
    outcomes.append(fix_win_codec())

  outcomes.append(fix_default_encoding())

  if on_windows:
    # Read the encoding only after the default encoding was fixed.
    encoding = sys.getdefaultencoding()
    outcomes.append(fix_win_sys_argv(encoding))
    outcomes.append(fix_win_console(encoding))

  # Every fix is run before the results are combined.
  return all(outcomes)