blob: e97ea05203cf59ec8a4f88098ba99b4020b4b370 [file] [log] [blame]
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Collection of functions and classes to fix various encoding problems on
multiple platforms with python.
"""
8
Raul Tambre80ee78e2019-05-06 22:41:05 +00009from __future__ import print_function
10
maruel@chromium.org35625c72011-03-23 17:34:02 +000011import codecs
12import locale
13import os
14import sys
15
16
def complain(message):
    """Reports a problem on the interpreter's original stderr.

    If any exception occurs in this file, we'll probably try to print it
    on stderr, which makes for frustrating debugging if stderr is directed
    to our wrapper. So be paranoid about catching errors and reporting them
    to sys.__stderr__, so that the user has a higher chance to see them.

    Args:
        message: A str is printed verbatim (an empty string prints an empty
            line); any other object is printed as its repr().
    """
    # A conditional expression is used instead of `x and a or b`, which falls
    # through to repr() whenever the string is empty (falsy).
    text = message if isinstance(message, str) else repr(message)
    print(text, file=sys.__stderr__)
maruel@chromium.org35625c72011-03-23 17:34:02 +000026
27
def fix_default_encoding():
    """Forces utf8 solidly on all platforms.

    By default python execution environment is lazy and defaults to ascii
    encoding.

    http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

    Returns:
        False if sys.getdefaultencoding() already reports 'utf-8', in which
        case nothing is touched. True once the default encoding has been
        switched and the locale categories have been adjusted.
    """
    if sys.getdefaultencoding() == 'utf-8':
        return False

    # Regenerate setdefaultencoding.
    # NOTE: `reload` is a builtin on Python 2 only; this path is expected to
    # be unreachable on Python 3, where the check above already returned.
    reload(sys)
    # Module 'sys' has no 'setdefaultencoding' member
    # pylint: disable=no-member
    sys.setdefaultencoding('utf-8')
    for attr in dir(locale):
        if not attr.startswith('LC_'):
            continue
        aref = getattr(locale, attr)
        # First reset the category to the user's default; categories that
        # cannot be set at all are skipped.
        try:
            locale.setlocale(aref, '')
        except locale.Error:
            continue
        try:
            lang, _ = locale.getdefaultlocale()
        except (TypeError, ValueError):
            continue
        if lang:
            # Prefer switching the category in-process; when that fails, fall
            # back to exporting the setting through the environment.
            try:
                locale.setlocale(aref, (lang, 'UTF-8'))
            except locale.Error:
                os.environ[attr] = lang + '.UTF-8'
    # Best effort: re-apply the environment's settings to every category.
    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        pass
    return True
66
67
###############################
# Windows specific
70
def fix_win_codec():
    """Works around <http://bugs.python.org/issue6058>.

    Makes the Windows code page name 'cp65001' resolve to the utf-8 codec.

    Returns:
        False if 'cp65001' can already be looked up (nothing is registered),
        True if a codec search function mapping it to utf-8 was registered.
    """

    def _search_cp65001(name):
        # Codec search functions return None for names they do not handle.
        return codecs.lookup('utf-8') if name == 'cp65001' else None

    # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
    try:
        codecs.lookup('cp65001')
    except LookupError:
        codecs.register(_search_cp65001)
        return True
    else:
        return False
81
82
class WinUnicodeOutputBase(object):
    """Base class to adapt sys.stdout or sys.stderr to behave correctly on
    Windows.

    Setting encoding to utf-8 is recommended.

    Subclasses must implement flush() and write(); both raise
    NotImplementedError here.
    """

    def __init__(self, fileno, name, encoding):
        """Stores the file-like attributes exposed by the adapter.

        Args:
            fileno: Value returned by fileno().
            name: Display name of the stream; also used in error reports.
            encoding: Encoding name exposed as the `encoding` attribute.
        """
        # Corresponding file handle.
        self._fileno = fileno
        self.encoding = encoding
        self.name = name

        self.closed = False
        self.softspace = False
        self.mode = 'w'

    @staticmethod
    def isatty():
        """Always reports False."""
        return False

    def close(self):
        """Marks the adapter as closed without closing anything."""
        # Don't really close the handle, that would only cause problems.
        self.closed = True

    def fileno(self):
        """Returns the file number given to the constructor."""
        return self._fileno

    def flush(self):
        """Flushes pending output; must be provided by subclasses."""
        raise NotImplementedError()

    def write(self, text):
        """Writes text; must be provided by subclasses."""
        raise NotImplementedError()

    def writelines(self, lines):
        """Writes each item of `lines` through write().

        Any exception is reported with complain() and then re-raised.
        """
        try:
            for line in lines:
                self.write(line)
        except Exception as e:
            complain('%s.writelines: %r' % (self.name, e))
            raise
123
124
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
    """Output adapter to a Windows Console.

    Understands how to use the win32 console API.
    """

    # There is a shorter-than-documented limitation on the length of the
    # string passed to WriteConsoleW. See
    # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
    _MAX_CHARS_PER_WRITE = 10000

    def __init__(self, console_handle, fileno, stream_name, encoding):
        """Binds the adapter to a console handle.

        Args:
            console_handle: Handle passed to WriteConsoleW.
            fileno: Value returned by fileno().
            stream_name: Name of the wrapped stream, used in the display name.
            encoding: Encoding used to decode bytes passed to write().
        """
        super(WinUnicodeConsoleOutput, self).__init__(
            fileno, '<Unicode console %s>' % stream_name, encoding)
        # Handle to use for WriteConsoleW
        self._console_handle = console_handle

        # Loads the necessary function.
        # These types are available on linux but not Mac.
        # pylint: disable=no-name-in-module,F0401
        from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
        from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
        from ctypes.wintypes import LPVOID  # pylint: disable=no-name-in-module

        self._DWORD = DWORD
        self._byref = byref

        # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
        self._WriteConsoleW = WINFUNCTYPE(
            BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)(
                ('WriteConsoleW', windll.kernel32))
        self._GetLastError = GetLastError

    @staticmethod
    def _to_utf16_units(text):
        """Returns text with one code point per UTF-16 code unit.

        WriteConsoleW counts UTF-16 code units, while len() of a Python 3
        string counts code points. Expanding every character above U+FFFF
        into its surrogate pair makes len() and slicing agree with the
        counts the API takes and reports.
        """
        if not text or max(text) <= u'\uffff':
            # Fast path: nothing outside the Basic Multilingual Plane.
            return text
        units = []
        for char in text:
            code = ord(char)
            if code > 0xFFFF:
                code -= 0x10000
                units.append(chr(0xD800 + (code >> 10)))
                units.append(chr(0xDC00 + (code & 0x3FF)))
            else:
                units.append(char)
        return u''.join(units)

    def flush(self):
        """Does nothing."""
        # No need to flush the console since it's immediate.
        pass

    def write(self, text):
        """Writes text to the console with WriteConsoleW.

        Args:
            text: A string, or bytes which are decoded with self.encoding
                using the 'replace' error handler.

        Raises:
            IOError: WriteConsoleW failed or wrote nothing. Every exception
                is reported through complain() before being re-raised.
        """
        try:
            if isinstance(text, bytes):
                # Bytestrings need to be decoded to a string before being
                # passed to Windows.
                text = text.decode(self.encoding, 'replace')
            # From here on len(text) is the number of UTF-16 code units, the
            # unit used by both count arguments of WriteConsoleW.
            text = self._to_utf16_units(text)
            remaining = len(text)
            while remaining > 0:
                n = self._DWORD(0)
                count = min(remaining, self._MAX_CHARS_PER_WRITE)
                # Do not end a chunk between the two halves of a surrogate
                # pair when more text follows.
                if (count < remaining
                        and u'\ud800' <= text[count - 1] <= u'\udbff'):
                    count -= 1
                retval = self._WriteConsoleW(
                    self._console_handle, text, count, self._byref(n), None)
                if retval == 0 or n.value == 0:
                    raise IOError(
                        'WriteConsoleW returned %r, n.value = %r, '
                        'last error = %r' %
                        (retval, n.value, self._GetLastError()))
                remaining -= n.value
                if not remaining:
                    break
                # Partial write: continue after what the console accepted.
                text = text[int(n.value):]
        except Exception as e:
            complain('%s.write: %r' % (self.name, e))
            raise
183
184
class WinUnicodeOutput(WinUnicodeOutputBase):
    """Output adaptor to a file output on Windows.

    If the standard FileWrite function is used, it will be encoded in the
    current code page. WriteConsoleW() permits writing any character.
    """

    def __init__(self, stream, fileno, encoding):
        """Wraps `stream` and flushes it immediately.

        Args:
            stream: Underlying stream; its `name` is used in the display
                name and its flush()/write() receive all output.
            fileno: Value returned by fileno().
            encoding: Encoding used to decode bytes passed to write().
        """
        super(WinUnicodeOutput, self).__init__(
            fileno, '<Unicode redirected %s>' % stream.name, encoding)
        # Output stream
        self._stream = stream

        # Flush right now.
        self.flush()

    def flush(self):
        """Flushes the wrapped stream; failures are reported, then raised."""
        try:
            self._stream.flush()
        except Exception as e:
            complain('%s.flush: %r from %r' % (self.name, e, self._stream))
            raise

    def write(self, text):
        """Writes text to the wrapped stream with '\\n' line endings.

        Args:
            text: A string, or bytes which are decoded with self.encoding
                using the 'replace' error handler.

        Any exception is reported through complain() and then re-raised.
        """
        try:
            if isinstance(text, bytes):
                # Replace characters that cannot be printed instead of
                # failing.
                text = text.decode(self.encoding, 'replace')
            # When redirecting to a file or process any \n characters will be
            # replaced with \r\n. If the text to be printed already has \r\n
            # line endings then \r\r\n line endings will be generated, leading
            # to double-spacing of some output. Normalizing line endings to
            # \n avoids this problem.
            text = text.replace('\r\n', '\n')
            self._stream.write(text)
        except Exception as e:
            complain('%s.write: %r' % (self.name, e))
            raise
221
222
def win_handle_is_a_console(handle):
    """Returns True if a Windows file handle is a handle to a console.

    Args:
        handle: Value as returned by GetStdHandle: an integer handle, None
            (NULL) or INVALID_HANDLE_VALUE.

    Returns:
        True when GetFileType reports a character device (ignoring the
        FILE_TYPE_REMOTE flag) and GetConsoleMode succeeds on the handle,
        False otherwise.
    """
    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import byref, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE

    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    # INVALID_HANDLE_VALUE is (HANDLE)-1, so its integer value depends on the
    # pointer width. The 32-bit spelling is still accepted for callers that
    # obtained the handle through a DWORD-typed prototype.
    invalid_handles = (HANDLE(-1).value, DWORD(-1).value)

    # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
    GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
        ('GetConsoleMode', windll.kernel32))
    # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
    # The parameter is a HANDLE; declaring it as DWORD would truncate it on
    # 64-bit Windows.
    GetFileType = WINFUNCTYPE(DWORD, HANDLE)(
        ('GetFileType', windll.kernel32))

    # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
    if handle is None or handle in invalid_handles:
        return False
    if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
        return False
    return bool(GetConsoleMode(handle, byref(DWORD())))
246
247
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
    """Returns a unicode-compatible stream.

    This function will return a direct-Console writing object only if:
    - the file number is the expected console file number
    - the handle the expected file handle
    - the 'real' handle is in fact a handle to a console.

    Args:
        stream: Stream to adapt, e.g. sys.stdout.
        excepted_fileno: File number the stream is expected to have. (The
            spelling is part of the signature and is kept for callers.)
        output_handle: Identifier passed to GetStdHandle to obtain the real
            handle.
        encoding: Encoding given to the returned adapter.

    Returns:
        A WinUnicodeConsoleOutput when all conditions above hold, otherwise a
        WinUnicodeOutput wrapping `stream`.
    """
    # A stream without a fileno() method is treated as having fileno None.
    old_fileno = getattr(stream, 'fileno', lambda: None)()
    if old_fileno == excepted_fileno:
        # These types are available on linux but not Mac.
        # pylint: disable=no-name-in-module,F0401
        from ctypes import windll, WINFUNCTYPE
        from ctypes.wintypes import DWORD, HANDLE

        # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
        GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(
            ('GetStdHandle', windll.kernel32))

        real_output_handle = GetStdHandle(DWORD(output_handle))
        if win_handle_is_a_console(real_output_handle):
            # It's a console.
            return WinUnicodeConsoleOutput(
                real_output_handle, old_fileno, stream.name, encoding)

    # It's something else. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, old_fileno, encoding)
274
275
def fix_win_console(encoding):
    """Makes Unicode console output work independently of the current code
    page.

    This also fixes <http://bugs.python.org/issue1602>.
    Credit to Michael Kaplan
    <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
    TZOmegaTZIOY
    <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

    Args:
        encoding: Encoding given to the replacement streams.

    Returns:
        False if sys.stdout or sys.stderr is already a WinUnicodeOutputBase,
        in which case nothing is changed. True otherwise, including when
        wrapping raised an exception that was only reported through
        complain().
    """
    already_wrapped = any(
        isinstance(stream, WinUnicodeOutputBase)
        for stream in (sys.stdout, sys.stderr))
    if already_wrapped:
        return False

    try:
        # SetConsoleCP and SetConsoleOutputCP could be used to change the code
        # page but it's not really useful since the code here is using
        # WriteConsoleW(). Also, changing the code page is 'permanent' to the
        # console and needs to be reverted manually.
        # In practice one needs to set the console font to a TTF font to be
        # able to see all the characters but it failed for me in practice. In
        # any case, it won't throw any exception when printing, which is the
        # important part.
        # -11 and -12 are defined in stdio.h
        sys.stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
        sys.stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
        # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the
        # limitation is "It doesn't appear to be possible to read Unicode
        # characters in UTF-8 mode" and this appears to be a limitation of
        # cmd.exe.
    except Exception as e:
        complain('exception %r while fixing up sys.stdout and sys.stderr' % e)
    return True
306
307
def fix_encoding():
    """Fixes various encoding problems on all platforms.

    Should be called at the very beginning of the process.

    Runs fix_default_encoding() everywhere and, when sys.platform is 'win32',
    also fix_win_codec() before it and fix_win_console() after it.

    Returns:
        True only if every fixer that ran returned True.
    """
    ret = True
    if sys.platform == 'win32':
        ret &= fix_win_codec()

    ret &= fix_default_encoding()

    if sys.platform == 'win32':
        # The console adapters decode bytes with the default encoding.
        encoding = sys.getdefaultencoding()
        ret &= fix_win_console(encoding)
    return ret