blob: fb4214c3c8885db0df01cc73e66c6280a088ddb0 [file] [log] [blame]
maruel@chromium.org35625c72011-03-23 17:34:02 +00001# Copyright (c) 2011 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Collection of functions and classes to fix various encoding problems on
6multiple platforms with python.
7"""
8
Raul Tambre80ee78e2019-05-06 22:41:05 +00009from __future__ import print_function
10
maruel@chromium.org35625c72011-03-23 17:34:02 +000011import codecs
12import locale
13import os
14import sys
15
16
# Prevents initializing multiple times: fix_win_sys_argv() sets this to True
# once it has run and returns early on any later call.
_SYS_ARGV_PROCESSED = False
19
20
def complain(message):
  """Prints a diagnostic message on the original stderr.

  If any exception occurs in this file, we'll probably try to print it on
  stderr, which makes for frustrating debugging if stderr is directed to our
  wrapper. So be paranoid about reporting errors to sys.__stderr__, so that the
  user has a higher chance to see them.

  Args:
    message: What to report. A str is printed verbatim; any other object is
      printed through repr().
  """
  # A conditional expression is used rather than `a and b or c` so that an
  # empty string is printed as-is instead of as its repr ("''").
  text = message if isinstance(message, str) else repr(message)
  print(text, file=sys.__stderr__)
maruel@chromium.org35625c72011-03-23 17:34:02 +000030
31
def fix_default_encoding():
  """Makes utf-8 the default encoding and moves the locale over to UTF-8.

  By default python execution environment is lazy and defaults to ascii
  encoding.

  http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

  Returns:
    False when the default encoding already is utf-8 (nothing is changed),
    True once the encoding and locale categories have been processed.
  """
  if sys.getdefaultencoding() == 'utf-8':
    return False

  # Reloading sys regenerates setdefaultencoding.
  reload(sys)
  # Module 'sys' has no 'setdefaultencoding' member
  # pylint: disable=no-member
  sys.setdefaultencoding('utf-8')

  # Walk over every LC_* category exposed by the locale module.
  category_names = [name for name in dir(locale) if name.startswith('LC_')]
  for category_name in category_names:
    category = getattr(locale, category_name)
    try:
      locale.setlocale(category, '')
    except locale.Error:
      continue
    try:
      language, _ = locale.getdefaultlocale()
    except (TypeError, ValueError):
      continue
    if not language:
      continue
    try:
      locale.setlocale(category, (language, 'UTF-8'))
    except locale.Error:
      # The locale could not be switched directly; record the wish in the
      # environment instead.
      os.environ[category_name] = language + '.UTF-8'

  try:
    locale.setlocale(locale.LC_ALL, '')
  except locale.Error:
    pass
  return True
70
71
72###############################
73# Windows specific
74
75
def fix_win_sys_argv(encoding):
  """Converts sys.argv to 'encoding' encoded string.

  utf-8 is recommended.

  Works around <http://bugs.python.org/issue2128>.

  On Python 3 sys.argv is left untouched: the issue being worked around is a
  Python 2 one, and replacing the arguments with encoded byte strings would
  hand bytes to code that expects str.

  Args:
    encoding: Name of the encoding used to encode each command line argument.
      Characters that cannot be encoded are replaced.

  Returns:
    False if sys.argv was already processed by an earlier call, True otherwise.
  """
  global _SYS_ARGV_PROCESSED
  if _SYS_ARGV_PROCESSED:
    return False

  if sys.version_info.major == 3:
    # Nothing to convert; only remember that the work is done.
    _SYS_ARGV_PROCESSED = True
    return True

  # These types are available on linux but not Mac.
  # pylint: disable=no-name-in-module,F0401
  from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import LPCWSTR, LPWSTR

  # <http://msdn.microsoft.com/en-us/library/ms683156.aspx>
  GetCommandLineW = WINFUNCTYPE(LPWSTR)(('GetCommandLineW', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/bb776391.aspx>
  CommandLineToArgvW = WINFUNCTYPE(POINTER(LPWSTR), LPCWSTR, POINTER(c_int))(
      ('CommandLineToArgvW', windll.shell32))

  argc = c_int(0)
  argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc))
  # From here on every argument is an encoded byte string, so all comparisons
  # below use byte literals.
  argv = [
      argv_unicode[i].encode(encoding, 'replace') for i in range(argc.value)
  ]

  if not hasattr(sys, 'frozen'):
    # If this is an executable produced by py2exe or bbfreeze, then it
    # will have been invoked directly. Otherwise, unicode_argv[0] is the
    # Python interpreter, so skip that.
    argv = argv[1:]

    # Also skip option arguments to the Python interpreter.
    while argv:
      arg = argv[0]
      if not arg.startswith(b'-') or arg == b'-':
        break
      argv = argv[1:]
      if arg == b'-m':
        # sys.argv[0] should really be the absolute path of the
        # module source, but never mind.
        break
      if arg == b'-c':
        # The command string that followed -c is replaced by '-c' itself.
        argv[0] = b'-c'
        break
  sys.argv = argv
  _SYS_ARGV_PROCESSED = True
  return True
126
127
def fix_win_codec():
  """Works around <http://bugs.python.org/issue6058>.

  Makes the codec name 'cp65001' resolve to the utf-8 codec when it cannot be
  looked up.

  Returns:
    True if a codec search function had to be registered, False if 'cp65001'
    could already be looked up.
  """
  # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
  def search_cp65001(name):
    # Codec search function: only answers for 'cp65001'.
    if name == 'cp65001':
      return codecs.lookup('utf-8')
    return None

  try:
    codecs.lookup('cp65001')
  except LookupError:
    codecs.register(search_cp65001)
    return True
  return False
138
139
class WinUnicodeOutputBase(object):
  """File-like scaffolding shared by the sys.stdout / sys.stderr adapters used
  on Windows.

  flush() and write() raise NotImplementedError here and are meant to be
  overridden. Setting encoding to utf-8 is recommended.
  """
  def __init__(self, fileno, name, encoding):
    # Attributes that mimic a file object opened for writing.
    self.name = name
    self.encoding = encoding
    self.mode = 'w'
    self.softspace = False
    self.closed = False

    # Corresponding file number, reported by fileno().
    self._fileno = fileno

  @staticmethod
  def isatty():
    return False

  def close(self):
    # Only the flag is updated; really closing the handle would only cause
    # problems.
    self.closed = True

  def fileno(self):
    return self._fileno

  def flush(self):
    raise NotImplementedError()

  def write(self, text):
    raise NotImplementedError()

  def writelines(self, lines):
    # Every item goes through write(); a failure is reported through
    # complain() and then re-raised.
    try:
      for item in lines:
        self.write(item)
    except Exception as error:
      complain('%s.writelines: %r' % (self.name, error))
      raise
180
181
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
  """Output adapter that writes directly to a Windows Console.

  Text is handed to the win32 console API (WriteConsoleW).
  """
  def __init__(self, console_handle, fileno, stream_name, encoding):
    display_name = '<Unicode console %s>' % stream_name
    super(WinUnicodeConsoleOutput, self).__init__(
        fileno, display_name, encoding)
    # Handle to use for WriteConsoleW
    self._console_handle = console_handle

    # Loads the necessary function.
    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
    from ctypes.wintypes import LPVOID  # pylint: disable=no-name-in-module

    # Kept on the instance so write() does not have to import them again.
    self._byref = byref
    self._DWORD = DWORD
    self._GetLastError = GetLastError

    # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
    prototype = WINFUNCTYPE(
        BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)
    self._WriteConsoleW = prototype(('WriteConsoleW', windll.kernel32))

  def flush(self):
    # No need to flush the console since it's immediate.
    pass

  def write(self, text):
    # NOTE(review): lengths below are Python string lengths; whether they
    # always match the character count WriteConsoleW works with (e.g. for
    # characters outside the BMP) should be confirmed.
    try:
      py_major = sys.version_info.major
      if py_major == 2 and not isinstance(text, unicode):
        # Convert to unicode.
        text = str(text).decode(self.encoding, 'replace')
      elif py_major == 3 and isinstance(text, bytes):
        # Bytestrings need to be decoded to a string before being passed to
        # Windows.
        text = text.decode(self.encoding, 'replace')

      pending = len(text)
      while pending > 0:
        written = self._DWORD(0)
        # There is a shorter-than-documented limitation on the length of the
        # string passed to WriteConsoleW. See
        # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
        chunk_size = min(pending, 10000)
        status = self._WriteConsoleW(
            self._console_handle, text, chunk_size, self._byref(written), None)
        if status == 0 or written.value == 0:
          raise IOError(
              'WriteConsoleW returned %r, n.value = %r, last error = %r' % (
                  status, written.value, self._GetLastError()))
        pending -= written.value
        if not pending:
          break
        # Drop what was written and go around again with the rest.
        text = text[int(written.value):]
    except Exception as error:
      complain('%s.write: %r' % (self.name, error))
      raise
243
244
class WinUnicodeOutput(WinUnicodeOutputBase):
  """Output adaptor to a file output on Windows.

  If the standard FileWrite function is used, it will be encoded in the current
  code page. WriteConsoleW() permits writing any character.

  Args:
    stream: The stream being wrapped; it must expose name, flush() and write().
    fileno: File number reported by fileno().
    encoding: Encoding used to convert text that does not have the type the
      wrapped stream is given (see write()).
  """
  def __init__(self, stream, fileno, encoding):
    super(WinUnicodeOutput, self).__init__(
        fileno, '<Unicode redirected %s>' % stream.name, encoding)
    # Output stream
    self._stream = stream

    # Flush right now.
    self.flush()

  def flush(self):
    """Flushes the wrapped stream; failures are reported and re-raised."""
    try:
      self._stream.flush()
    except Exception as e:
      complain('%s.flush: %r from %r' % (self.name, e, self._stream))
      raise

  def write(self, text):
    """Writes text to the wrapped stream, converting it first when needed.

    On Python 2 unicode text is encoded. On Python 3 byte strings are decoded,
    the same way WinUnicodeConsoleOutput.write() does, so that both adapters
    accept the same input. Failures are reported and re-raised.
    """
    try:
      if sys.version_info.major == 2 and isinstance(text, unicode):
        # Replace characters that cannot be printed instead of failing.
        text = text.encode(self.encoding, 'replace')
      elif sys.version_info.major == 3 and isinstance(text, bytes):
        # Assumes the wrapped stream is a text stream (as sys.stdout and
        # sys.stderr are), which would reject bytes.
        text = text.decode(self.encoding, 'replace')
      self._stream.write(text)
    except Exception as e:
      complain('%s.write: %r' % (self.name, e))
      raise
276
277
def win_handle_is_a_console(handle):
  """Returns True if a Windows file handle is a handle to a console.

  Args:
    handle: Handle value as returned by GetStdHandle: None, an invalid-handle
      marker, or a valid handle.

  Returns:
    A false value if the handle is None, invalid, or not a character device
    with a console mode; a true value (the GetConsoleMode result) otherwise.
  """
  # These types are available on linux but not Mac.
  # pylint: disable=no-name-in-module,F0401
  from ctypes import byref, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import BOOL, DWORD, HANDLE

  FILE_TYPE_CHAR = 0x0002
  FILE_TYPE_REMOTE = 0x8000
  # A HANDLE is pointer-sized, so (HANDLE)-1 differs from (DWORD)-1 when
  # pointers are wider than 32 bits. The DWORD form is still recognized for
  # callers that obtained the handle through a DWORD-typed prototype.
  INVALID_HANDLE_VALUES = (HANDLE(-1).value, DWORD(-1).value)

  # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
  GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
      ('GetConsoleMode', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
  # The argument is declared as HANDLE so it is not truncated to 32 bits.
  GetFileType = WINFUNCTYPE(DWORD, HANDLE)(('GetFileType', windll.kernel32))

  # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
  if handle is None or handle in INVALID_HANDLE_VALUES:
    return False
  return (
      (GetFileType(handle) & ~FILE_TYPE_REMOTE) == FILE_TYPE_CHAR and
      GetConsoleMode(handle, byref(DWORD())))
301
302
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
  """Wraps a stream into a unicode-compatible stream.

  A direct-Console writing object is returned only if:
    - the stream's file number is the expected console file number
      (excepted_fileno), and
    - the standard handle looked up through output_handle is in fact a handle
      to a console.
  In every other case an auto-encoding wrapper around the stream is returned.
  """
  old_fileno = getattr(stream, 'fileno', lambda: None)()
  if old_fileno != excepted_fileno:
    # Not the expected file number. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, old_fileno, encoding)

  # These types are available on linux but not Mac.
  # pylint: disable=no-name-in-module,F0401
  from ctypes import windll, WINFUNCTYPE
  from ctypes.wintypes import DWORD, HANDLE

  # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
  GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(('GetStdHandle', windll.kernel32))

  console_handle = GetStdHandle(DWORD(output_handle))
  if not win_handle_is_a_console(console_handle):
    # It's something else. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, old_fileno, encoding)

  # It's a console.
  return WinUnicodeConsoleOutput(
      console_handle, old_fileno, stream.name, encoding)
329
330
def fix_win_console(encoding):
  """Makes Unicode console output work independently of the current code page.

  This also fixes <http://bugs.python.org/issue1602>.
  Credit to Michael Kaplan
  <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
  TZOmegaTZIOY
  <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

  Returns:
    False if sys.stdout or sys.stderr is already one of the adapters from this
    file, True otherwise (including when the replacement itself failed and was
    only reported).
  """
  already_wrapped = any(
      isinstance(stream, WinUnicodeOutputBase)
      for stream in (sys.stdout, sys.stderr))
  if already_wrapped:
    return False

  try:
    # SetConsoleCP and SetConsoleOutputCP could be used to change the code page
    # but it's not really useful since the code here is using WriteConsoleW().
    # Also, changing the code page is 'permanent' to the console and needs to be
    # reverted manually.
    # In practice one needs to set the console font to a TTF font to be able to
    # see all the characters but it failed for me in practice. In any case, it
    # won't throw any exception when printing, which is the important part.
    # -11 and -12 are the handle ids given to GetStdHandle for stdout and
    # stderr respectively.
    wrapped_stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
    sys.stdout = wrapped_stdout
    wrapped_stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
    sys.stderr = wrapped_stderr
    # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation is
    # "It doesn't appear to be possible to read Unicode characters in UTF-8
    # mode" and this appears to be a limitation of cmd.exe.
  except Exception as error:
    complain('exception %r while fixing up sys.stdout and sys.stderr' % error)
  return True
361
362
def fix_encoding():
  """Applies every encoding fix from this file that fits the platform.

  Should be called at the very beginning of the process.

  Returns:
    True only if every fix that was run returned True.
  """
  on_windows = sys.platform == 'win32'

  # Every step is always run; the results are only combined at the end.
  outcomes = []
  if on_windows:
    outcomes.append(fix_win_codec())

  outcomes.append(fix_default_encoding())

  if on_windows:
    default_encoding = sys.getdefaultencoding()
    outcomes.append(fix_win_sys_argv(default_encoding))
    outcomes.append(fix_win_console(default_encoding))
  return all(outcomes)