blob: e22878c4050d26c1efb6c65462abc7b29d3be52c [file] [log] [blame]
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Collection of functions and classes to fix various encoding problems on
multiple platforms with Python.
"""
8
Raul Tambre80ee78e2019-05-06 22:41:05 +00009from __future__ import print_function
10
maruel@chromium.org35625c72011-03-23 17:34:02 +000011import codecs
12import locale
13import os
14import sys
15
16
# Guard flag: set to True once fix_win_sys_argv() has processed sys.argv, so
# that later calls return immediately instead of processing it again.
_SYS_ARGV_PROCESSED = False
19
20
def complain(message):
    """Prints a diagnostic message on the interpreter's original stderr.

    If any exception occurs in this file, we'll probably try to print it on
    stderr, which makes for frustrating debugging if stderr is directed to
    our wrapper. So be paranoid about catching errors and report them to
    sys.__stderr__, so that the user has a higher chance to see them.

    Args:
      message: what to report. A str is printed verbatim (including the empty
          string); any other object is printed through repr().
    """
    # A conditional expression is used instead of `cond and a or b`, which
    # would wrongly fall through to repr() for an empty (falsy) string.
    text = message if isinstance(message, str) else repr(message)
    print(text, file=sys.__stderr__)
maruel@chromium.org35625c72011-03-23 17:34:02 +000030
31
def fix_default_encoding():
    """Forces utf-8 as the default encoding and locale charset.

    By default the Python execution environment is lazy and defaults to the
    ascii encoding. See
    http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

    Returns:
      False if the default encoding already is 'utf-8' (nothing is changed),
      True once the default encoding and the locale categories were updated.
    """
    if sys.getdefaultencoding() == 'utf-8':
        return False

    # Reloading sys regenerates setdefaultencoding.
    reload(sys)
    # Module 'sys' has no 'setdefaultencoding' member
    # pylint: disable=no-member
    sys.setdefaultencoding('utf-8')

    # Every LC_* name exported by the locale module is processed in turn.
    category_names = [name for name in dir(locale) if name[0:3] == 'LC_']
    for category_name in category_names:
        category = getattr(locale, category_name)
        try:
            locale.setlocale(category, '')
        except locale.Error:
            continue
        try:
            language, _ = locale.getdefaultlocale()
        except (TypeError, ValueError):
            continue
        if not language:
            continue
        try:
            locale.setlocale(category, (language, 'UTF-8'))
        except locale.Error:
            # The locale could not be switched directly; record the wanted
            # value in the environment variable of the same name instead.
            os.environ[category_name] = language + '.UTF-8'

    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        # Best effort only.
        pass
    return True
70
71
72###############################
73# Windows specific
74
75
def fix_win_sys_argv(encoding):
    """Converts sys.argv to 'encoding' encoded string.

    utf-8 is recommended.

    Works around <http://bugs.python.org/issue2128>. On Python 3 sys.argv
    already contains properly decoded str objects, so it is left untouched
    and the call only records that processing happened.

    Args:
      encoding: name of the codec used to encode each argument on Python 2.

    Returns:
      False if sys.argv was already processed by an earlier call, True
      otherwise.
    """
    global _SYS_ARGV_PROCESSED
    if _SYS_ARGV_PROCESSED:
        return False

    if sys.version_info.major >= 3:
        # Python 3 builds sys.argv from the wide-character command line, so
        # there is nothing to repair. Replacing it with encoded bytes would
        # break every consumer that expects str.
        _SYS_ARGV_PROCESSED = True
        return True

    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import LPCWSTR, LPWSTR

    # <http://msdn.microsoft.com/en-us/library/ms683156.aspx>
    GetCommandLineW = WINFUNCTYPE(LPWSTR)(('GetCommandLineW', windll.kernel32))
    # <http://msdn.microsoft.com/en-us/library/bb776391.aspx>
    CommandLineToArgvW = WINFUNCTYPE(POINTER(LPWSTR), LPCWSTR, POINTER(c_int))(
        ('CommandLineToArgvW', windll.shell32))

    argc = c_int(0)
    argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc))
    argv = [
        argv_unicode[i].encode(encoding, 'replace') for i in range(argc.value)
    ]

    if not hasattr(sys, 'frozen'):
        # If this is an executable produced by py2exe or bbfreeze, then it
        # will have been invoked directly. Otherwise, argv[0] is the Python
        # interpreter, so skip that.
        argv = argv[1:]

        # Also skip option arguments to the Python interpreter. Every entry
        # of argv is an encoded byte string, so all comparisons below use
        # byte literals as well.
        while argv:
            arg = argv[0]
            if not arg.startswith(b'-') or arg == b'-':
                break
            argv = argv[1:]
            if arg == b'-m':
                # sys.argv[0] should really be the absolute path of the
                # module source, but never mind.
                break
            if arg == b'-c':
                # The command string that followed -c is replaced by '-c'.
                argv[0] = b'-c'
                break
    sys.argv = argv
    _SYS_ARGV_PROCESSED = True
    return True
126
127
def fix_win_codec():
    """Works around <http://bugs.python.org/issue6058>.

    Makes the codec name 'cp65001' resolvable by mapping it to the utf-8
    codec when the running interpreter does not know it. See
    <http://msdn.microsoft.com/en-us/library/dd317756.aspx> for the code page
    identifiers.

    Returns:
      False if 'cp65001' could already be looked up, True if a codec search
      function had to be registered.
    """
    try:
        codecs.lookup('cp65001')
    except LookupError:
        pass
    else:
        return False

    def _search_cp65001(name):
        # Codec search functions return None for names they do not handle.
        return codecs.lookup('utf-8') if name == 'cp65001' else None

    codecs.register(_search_cp65001)
    return True
138
139
class WinUnicodeOutputBase(object):
    """Common part of the sys.stdout / sys.stderr replacements for Windows.

    Subclasses must provide flush() and write(). Setting encoding to utf-8 is
    recommended.
    """

    def __init__(self, fileno, name, encoding):
        self.name = name
        self.encoding = encoding
        # Value handed back by fileno().
        self._fileno = fileno

        # Remaining attributes of the file-like interface.
        self.mode = 'w'
        self.softspace = False
        self.closed = False

    @staticmethod
    def isatty():
        """Always reports that the stream is not a tty."""
        return False

    def close(self):
        """Marks the stream as closed without touching the real handle."""
        # Really closing the handle would only cause problems.
        self.closed = True

    def fileno(self):
        """Returns the file number given at construction."""
        return self._fileno

    def flush(self):
        """To be implemented by subclasses."""
        raise NotImplementedError()

    def write(self, text):
        """To be implemented by subclasses."""
        raise NotImplementedError()

    def writelines(self, lines):
        """Writes each item of 'lines' through write().

        Any exception is reported through complain() and then re-raised.
        """
        try:
            for entry in lines:
                self.write(entry)
        except Exception as error:
            complain('%s.writelines: %r' % (self.name, error))
            raise
180
181
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
    """Output adapter that writes directly to a Windows Console.

    Text is sent through the win32 console API (WriteConsoleW).
    """

    def __init__(self, console_handle, fileno, stream_name, encoding):
        display_name = '<Unicode console %s>' % stream_name
        super(WinUnicodeConsoleOutput, self).__init__(
            fileno, display_name, encoding)
        # Handle passed to WriteConsoleW.
        self._console_handle = console_handle

        # Load the pieces of ctypes that write() needs.
        # These types are available on linux but not Mac.
        # pylint: disable=no-name-in-module,F0401
        from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
        from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
        from ctypes.wintypes import LPVOID  # pylint: disable=no-name-in-module

        self._DWORD = DWORD
        self._byref = byref

        # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
        prototype = WINFUNCTYPE(
            BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)
        self._WriteConsoleW = prototype(('WriteConsoleW', windll.kernel32))
        self._GetLastError = GetLastError

    def flush(self):
        """Does nothing: writes to the console are immediate."""
        pass

    def write(self, text):
        """Writes 'text' to the console, in several chunks if needed.

        On Python 2, anything that is not unicode is first decoded with
        self.encoding. Any exception is reported through complain() and then
        re-raised.
        """
        try:
            if sys.version_info.major == 2 and not isinstance(text, unicode):
                # Convert to unicode.
                text = str(text).decode(self.encoding, 'replace')
            pending = len(text)
            while pending > 0:
                written = self._DWORD(0)
                # There is a shorter-than-documented limitation on the length
                # of the string passed to WriteConsoleW. See
                # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
                chunk_length = min(pending, 10000)
                retval = self._WriteConsoleW(
                    self._console_handle, text, chunk_length,
                    self._byref(written), None)
                if retval == 0 or written.value == 0:
                    raise IOError(
                        'WriteConsoleW returned %r, n.value = %r, last error = %r' % (
                            retval, written.value, self._GetLastError()))
                pending -= written.value
                if pending:
                    # Drop what was just written and keep going.
                    text = text[int(written.value):]
        except Exception as error:
            complain('%s.write: %r' % (self.name, error))
            raise
239
240
class WinUnicodeOutput(WinUnicodeOutputBase):
    """Output adapter for a stream that is not a console on Windows.

    Wraps an existing stream. On Python 2, unicode text is encoded with the
    configured encoding, replacing characters that cannot be encoded, before
    being handed to the wrapped stream.
    """

    def __init__(self, stream, fileno, encoding):
        display_name = '<Unicode redirected %s>' % stream.name
        super(WinUnicodeOutput, self).__init__(fileno, display_name, encoding)
        # Stream that receives the output.
        self._stream = stream

        # Flush the wrapped stream right away.
        self.flush()

    def flush(self):
        """Flushes the wrapped stream.

        Any exception is reported through complain() and then re-raised.
        """
        try:
            self._stream.flush()
        except Exception as error:
            complain('%s.flush: %r from %r' % (self.name, error, self._stream))
            raise

    def write(self, text):
        """Writes 'text' to the wrapped stream.

        Any exception is reported through complain() and then re-raised.
        """
        try:
            must_encode = (
                sys.version_info.major == 2 and isinstance(text, unicode))
            if must_encode:
                # Replace characters that cannot be printed instead of
                # failing.
                text = text.encode(self.encoding, 'replace')
            self._stream.write(text)
        except Exception as error:
            complain('%s.write: %r' % (self.name, error))
            raise
272
273
def win_handle_is_a_console(handle):
    """Returns True if a Windows file handle is a handle to a console.

    Args:
      handle: a value as returned by GetStdHandle(): INVALID_HANDLE_VALUE,
          None (NULL) or a valid handle.

    Returns:
      True when the handle is a character device for which GetConsoleMode()
      succeeds, False otherwise.
    """
    from ctypes import c_void_p

    # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
    # INVALID_HANDLE_VALUE is (HANDLE)-1 and therefore pointer sized. The
    # 32-bit pattern is kept as well so that handles obtained through a
    # DWORD-typed prototype are still rejected as before.
    invalid_handle_values = (c_void_p(-1).value, 0xFFFFFFFF)
    if handle is None or handle in invalid_handle_values:
        return False

    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import byref, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE

    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000

    # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
    GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
        ('GetConsoleMode', windll.kernel32))
    # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
    # The argument is declared as HANDLE (not DWORD) so that the handle is
    # passed at pointer width, matching the GetConsoleMode prototype above.
    GetFileType = WINFUNCTYPE(DWORD, HANDLE)(('GetFileType', windll.kernel32))

    if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
        return False
    # GetConsoleMode only succeeds on a console handle.
    return bool(GetConsoleMode(handle, byref(DWORD())))
297
298
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
    """Returns a unicode-compatible replacement for 'stream'.

    A direct-Console writing object is returned only if:
    - the file number of the stream is the expected console file number, and
    - the 'real' handle obtained from GetStdHandle(output_handle) is in fact
      a handle to a console.
    In every other case an auto-encoding wrapper around 'stream' is returned.
    """
    get_fileno = getattr(stream, 'fileno', lambda: None)
    stream_fileno = get_fileno()

    if stream_fileno != excepted_fileno:
        # Not the expected standard stream. Create an auto-encoding stream.
        return WinUnicodeOutput(stream, stream_fileno, encoding)

    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import windll, WINFUNCTYPE
    from ctypes.wintypes import DWORD, HANDLE

    # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
    prototype = WINFUNCTYPE(HANDLE, DWORD)
    GetStdHandle = prototype(('GetStdHandle', windll.kernel32))

    console_handle = GetStdHandle(DWORD(output_handle))
    if not win_handle_is_a_console(console_handle):
        # It's something else. Create an auto-encoding stream.
        return WinUnicodeOutput(stream, stream_fileno, encoding)

    # It's a console.
    return WinUnicodeConsoleOutput(
        console_handle, stream_fileno, stream.name, encoding)
325
326
def fix_win_console(encoding):
    """Makes Unicode console output work independently of the current code page.

    This also fixes <http://bugs.python.org/issue1602>.
    Credit to Michael Kaplan
    <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
    TZOmegaTZIOY
    <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

    Returns:
      False if sys.stdout or sys.stderr is already one of the wrappers from
      this module, True otherwise (even when wrapping failed; the failure is
      then reported through complain()).
    """
    already_wrapped = any(
        isinstance(candidate, WinUnicodeOutputBase)
        for candidate in (sys.stdout, sys.stderr))
    if already_wrapped:
        return False

    try:
        # SetConsoleCP and SetConsoleOutputCP could be used to change the code
        # page but it's not really useful since the code here is using
        # WriteConsoleW(). Also, changing the code page is 'permanent' to the
        # console and needs to be reverted manually.
        # In practice one needs to set the console font to a TTF font to be
        # able to see all the characters but it failed for me in practice. In
        # any case, it won't throw any exception when printing, which is the
        # important part.
        # -11 and -12 are the STD_OUTPUT_HANDLE and STD_ERROR_HANDLE values
        # that GetStdHandle() accepts.
        sys.stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
        sys.stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
        # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation
        # is "It doesn't appear to be possible to read Unicode characters in
        # UTF-8 mode" and this appears to be a limitation of cmd.exe.
    except Exception as error:
        complain(
            'exception %r while fixing up sys.stdout and sys.stderr' % error)
    return True
357
358
def fix_encoding():
    """Fixes various encoding problems on all platforms.

    Should be called at the very beginning of the process.

    Returns:
      The results of all the individual fixes that were run, combined
      with '&'.
    """
    is_windows = sys.platform == 'win32'

    success = True
    if is_windows:
        success = success & fix_win_codec()

    success = success & fix_default_encoding()

    if is_windows:
        # The Windows-only fixes use whatever default encoding is now in
        # effect.
        default_encoding = sys.getdefaultencoding()
        success = success & fix_win_sys_argv(default_encoding)
        success = success & fix_win_console(default_encoding)
    return success
374 return ret