blob: 21efb938cfbe3ac7bd8c09f38d13c92b2ffa62ac [file] [log] [blame]
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Collection of functions and classes to fix various encoding problems on
multiple platforms with python.
"""
7
Raul Tambre80ee78e2019-05-06 22:41:05 +00008from __future__ import print_function
9
maruel@chromium.org35625c72011-03-23 17:34:02 +000010import codecs
11import locale
12import os
13import sys
14
15
def complain(message):
    """Reports a diagnostic message on the interpreter's original stderr.

    If any exception occurs in this file, we'll probably try to print it
    on stderr, which makes for frustrating debugging if stderr is directed
    to our wrapper. So be paranoid about catching errors and reporting them
    to sys.__stderr__, so that the user has a higher chance to see them.

    Args:
        message: What to report. A str is printed as-is; any other object is
            printed as its repr().
    """
    # A conditional expression is used instead of the `x and y or z` idiom:
    # the latter falls through to repr() for an empty string and would print
    # '' (two quote characters) instead of an empty line.
    text = message if isinstance(message, str) else repr(message)
    print(text, file=sys.__stderr__)
maruel@chromium.org35625c72011-03-23 17:34:02 +000024
25
def fix_default_encoding():
    """Forces utf8 solidly on all platforms.

    By default python execution environment is lazy and defaults to ascii
    encoding.

    http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

    Returns:
        False if the default encoding already is utf-8, in which case nothing
        is touched; True after the default encoding and the locale categories
        have been updated.
    """
    if sys.getdefaultencoding() == 'utf-8':
        return False

    # Regenerate setdefaultencoding.
    # NOTE(review): the builtin reload() and sys.setdefaultencoding() look
    # like Python 2-only APIs; presumably Python 3 always takes the early
    # return above and never reaches this point -- confirm before relying on
    # the code below under Python 3.
    reload(sys)  # pylint: disable=undefined-variable
    # Module 'sys' has no 'setdefaultencoding' member
    # pylint: disable=no-member
    sys.setdefaultencoding('utf-8')

    # Walk every locale category (LC_ALL, LC_CTYPE, ...) exposed by the
    # locale module and try to switch it to a UTF-8 flavor of the user's
    # language.
    for attr in dir(locale):
        if attr[0:3] != 'LC_':
            continue
        aref = getattr(locale, attr)
        try:
            # Start from the user's default setting for this category.
            locale.setlocale(aref, '')
        except locale.Error:
            continue
        try:
            lang, _ = locale.getdefaultlocale()
        except (TypeError, ValueError):
            continue
        if lang:
            try:
                locale.setlocale(aref, (lang, 'UTF-8'))
            except locale.Error:
                # The locale could not be set directly; record the wish in
                # the environment so that the LC_ALL reset below (and child
                # processes) can pick it up.
                os.environ[attr] = lang + '.UTF-8'
    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        # Best effort: keep whatever locale is currently in effect.
        pass
    return True
maruel@chromium.org35625c72011-03-23 17:34:02 +000064
65
###############################
# Windows specific
68
Mike Frysinger124bb8e2023-09-06 05:48:55 +000069
def fix_win_codec():
    """Works around <http://bugs.python.org/issue6058>.

    Makes sure the codec name 'cp65001' can be looked up, by aliasing it to
    the utf-8 codec when the interpreter does not already know it.

    Returns:
        True if the alias had to be registered, False if 'cp65001' was already
        resolvable.
    """
    # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
    try:
        codecs.lookup('cp65001')
    except LookupError:
        pass
    else:
        return False

    def _search_cp65001(name):
        # A codec search function returns a CodecInfo for the names it
        # handles and None for every other name.
        return codecs.lookup('utf-8') if name == 'cp65001' else None

    codecs.register(_search_cp65001)
    return True
maruel@chromium.org35625c72011-03-23 17:34:02 +000080
81
class WinUnicodeOutputBase(object):
    """Base class to adapt sys.stdout or sys.stderr to behave correctly on
    Windows.

    Setting encoding to utf-8 is recommended.

    Subclasses have to implement flush() and write().
    """
    def __init__(self, fileno, name, encoding):
        """Stores the stream identity.

        Args:
            fileno: Value returned by fileno().
            name: Display name of the stream, used in error reports.
            encoding: Encoding exposed through the `encoding` attribute.
        """
        # Corresponding file number.
        self._fileno = fileno
        self.encoding = encoding
        self.name = name

        # File-object-like attributes.
        self.closed = False
        self.softspace = False
        self.mode = 'w'

    @staticmethod
    def isatty():
        """Always reports that the stream is not a tty."""
        return False

    def close(self):
        """Marks the stream as closed without releasing anything."""
        # Don't really close the handle, that would only cause problems.
        self.closed = True

    def fileno(self):
        """Returns the file number given at construction."""
        return self._fileno

    def flush(self):
        """Flushes pending output; must be implemented by subclasses."""
        raise NotImplementedError()

    def write(self, text):
        """Writes |text|; must be implemented by subclasses."""
        raise NotImplementedError()

    def writelines(self, lines):
        """Writes every item of |lines| through write().

        Any exception is reported with complain() and then re-raised.
        """
        try:
            for line in lines:
                self.write(line)
        except Exception as e:
            complain('%s.writelines: %r' % (self.name, e))
            raise
maruel@chromium.org35625c72011-03-23 17:34:02 +0000122
123
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
    """Output adapter to a Windows Console.

    Understands how to use the win32 console API.
    """
    def __init__(self, console_handle, fileno, stream_name, encoding):
        """Binds the adapter to a console handle.

        Args:
            console_handle: Handle passed to WriteConsoleW().
            fileno: Value returned by fileno().
            stream_name: Name of the wrapped stream, embedded in `name`.
            encoding: Encoding used to decode bytes given to write().
        """
        super(WinUnicodeConsoleOutput,
              self).__init__(fileno, '<Unicode console %s>' % stream_name,
                             encoding)
        # Handle to use for WriteConsoleW
        self._console_handle = console_handle

        # Loads the necessary function.
        # These types are available on linux but not Mac.
        # pylint: disable=no-name-in-module,F0401
        from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
        from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
        from ctypes.wintypes import LPVOID  # pylint: disable=no-name-in-module

        # Kept on the instance so write() does not have to import again.
        self._DWORD = DWORD
        self._byref = byref

        # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
        self._WriteConsoleW = WINFUNCTYPE(BOOL, HANDLE, LPWSTR, DWORD,
                                          POINTER(DWORD),
                                          LPVOID)(('WriteConsoleW',
                                                   windll.kernel32))
        self._GetLastError = GetLastError

    def flush(self):
        # No need to flush the console since it's immediate.
        pass

    def write(self, text):
        """Writes |text| to the console in chunks of at most 10000 characters.

        Bytes are decoded with self.encoding first, undecodable sequences
        being replaced.

        Raises:
            IOError: WriteConsoleW() failed or did not write anything. Any
                exception is reported through complain() before propagating.
        """
        try:
            if isinstance(text, bytes):
                # Bytestrings need to be decoded to a string before being
                # passed to Windows.
                text = text.decode(self.encoding, 'replace')
            # NOTE(review): lengths here are counted in Python characters
            # while WriteConsoleW presumably counts UTF-16 code units;
            # characters outside the BMP may not be accounted for correctly
            # -- confirm on Windows.
            remaining = len(text)
            while remaining > 0:
                n = self._DWORD(0)
                # There is a shorter-than-documented limitation on the length
                # of the string passed to WriteConsoleW. See
                # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
                retval = self._WriteConsoleW(self._console_handle, text,
                                             min(remaining, 10000),
                                             self._byref(n), None)
                if retval == 0 or n.value == 0:
                    raise IOError('WriteConsoleW returned %r, n.value = %r, '
                                  'last error = %r' %
                                  (retval, n.value, self._GetLastError()))
                remaining -= n.value
                if not remaining:
                    break
                # Drop what was written and loop on the rest.
                text = text[int(n.value):]
        except Exception as e:
            complain('%s.write: %r' % (self.name, e))
            raise
maruel@chromium.org35625c72011-03-23 17:34:02 +0000183
184
class WinUnicodeOutput(WinUnicodeOutputBase):
    """Output adaptor to a file output on Windows.

    If the standard FileWrite function is used, it will be encoded in the
    current code page. WriteConsoleW() permits writing any character.
    """
    def __init__(self, stream, fileno, encoding):
        """Wraps |stream| and flushes it immediately.

        Args:
            stream: Underlying stream receiving the text; its `name` is
                embedded in this object's `name`.
            fileno: Value returned by fileno().
            encoding: Encoding used to decode bytes given to write().
        """
        super(WinUnicodeOutput,
              self).__init__(fileno, '<Unicode redirected %s>' % stream.name,
                             encoding)
        # Output stream
        self._stream = stream

        # Flush right now.
        self.flush()

    def flush(self):
        """Flushes the wrapped stream, reporting then re-raising failures."""
        try:
            self._stream.flush()
        except Exception as e:
            complain('%s.flush: %r from %r' % (self.name, e, self._stream))
            raise

    def write(self, text):
        """Writes |text| to the wrapped stream with \\n line endings.

        Bytes are decoded with self.encoding first. Any exception is reported
        through complain() and then re-raised.
        """
        try:
            if isinstance(text, bytes):
                # Replace characters that cannot be printed instead of
                # failing.
                text = text.decode(self.encoding, 'replace')
            # When redirecting to a file or process any \n characters will be
            # replaced with \r\n. If the text to be printed already has \r\n
            # line endings then \r\r\n line endings will be generated, leading
            # to double-spacing of some output. Normalizing line endings to \n
            # avoids this problem.
            text = text.replace('\r\n', '\n')
            self._stream.write(text)
        except Exception as e:
            complain('%s.write: %r' % (self.name, e))
            raise
maruel@chromium.org35625c72011-03-23 17:34:02 +0000223
224
def win_handle_is_a_console(handle):
    """Returns True if a Windows file handle is a handle to a console.

    Args:
        handle: Value as returned by GetStdHandle(): INVALID_HANDLE_VALUE,
            NULL (None once converted by ctypes) or a valid handle.

    Returns:
        True if the handle is a character device on which GetConsoleMode()
        succeeds, False otherwise.
    """
    # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
    # NULL needs no win32 call at all, so it is rejected before anything
    # platform specific is loaded.
    if handle is None:
        return False

    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import byref, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE

    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    # INVALID_HANDLE_VALUE is (HANDLE)-1, i.e. pointer-sized: computing it
    # with DWORD only matches on 32-bit processes. The 32-bit value is still
    # accepted for callers holding a handle truncated to a DWORD.
    INVALID_HANDLE_VALUES = (HANDLE(-1).value, DWORD(-1).value)
    if handle in INVALID_HANDLE_VALUES:
        return False

    # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
    GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
        ('GetConsoleMode', windll.kernel32))
    # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
    # The argument is a HANDLE, not a DWORD, so it is not truncated on 64-bit.
    GetFileType = WINFUNCTYPE(DWORD, HANDLE)(('GetFileType', windll.kernel32))

    # The FILE_TYPE_REMOTE bit is ignored when comparing the file type.
    if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
        return False
    # GetConsoleMode() only succeeds on console handles.
    return bool(GetConsoleMode(handle, byref(DWORD())))
maruel@chromium.org35625c72011-03-23 17:34:02 +0000247
248
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
    """Returns a unicode-compatible stream.

    This function will return a direct-Console writing object only if:
    - the file number is the expected console file number
    - the handle is the expected file handle
    - the 'real' handle is in fact a handle to a console.

    Args:
        stream: Stream to wrap, e.g. sys.stdout.
        excepted_fileno: File number |stream| is expected to report (the
            parameter name is kept as-is for compatibility).
        output_handle: Identifier given to GetStdHandle() to get the matching
            standard handle.
        encoding: Encoding handed over to the returned wrapper.

    Returns:
        A WinUnicodeConsoleOutput when all the conditions above hold, a
        WinUnicodeOutput wrapping |stream| otherwise.
    """
    # A stream without a fileno() method reports None, which never matches.
    old_fileno = getattr(stream, 'fileno', lambda: None)()
    if old_fileno == excepted_fileno:
        # These types are available on linux but not Mac.
        # pylint: disable=no-name-in-module,F0401
        from ctypes import windll, WINFUNCTYPE
        from ctypes.wintypes import DWORD, HANDLE

        # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
        GetStdHandle = WINFUNCTYPE(HANDLE,
                                   DWORD)(('GetStdHandle', windll.kernel32))

        real_output_handle = GetStdHandle(DWORD(output_handle))
        if win_handle_is_a_console(real_output_handle):
            # It's a console.
            return WinUnicodeConsoleOutput(real_output_handle, old_fileno,
                                           stream.name, encoding)

    # It's something else. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, old_fileno, encoding)
maruel@chromium.org35625c72011-03-23 17:34:02 +0000276
277
def fix_win_console(encoding):
    """Makes Unicode console output work independently of the current code
    page.

    This also fixes <http://bugs.python.org/issue1602>.
    Credit to Michael Kaplan
    <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
    TZOmegaTZIOY
    <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

    Args:
        encoding: Encoding given to the stream wrappers.

    Returns:
        False if sys.stdout or sys.stderr is already wrapped, in which case
        nothing is done; True otherwise, including when wrapping failed and
        the failure was only reported through complain().
    """
    if (isinstance(sys.stdout, WinUnicodeOutputBase)
            or isinstance(sys.stderr, WinUnicodeOutputBase)):
        return False

    try:
        # SetConsoleCP and SetConsoleOutputCP could be used to change the code
        # page but it's not really useful since the code here is using
        # WriteConsoleW(). Also, changing the code page is 'permanent' to the
        # console and needs to be reverted manually. In practice one needs to
        # set the console font to a TTF font to be able to see all the
        # characters but it failed for me in practice. In any case, it won't
        # throw any exception when printing, which is the important part. -11
        # and -12 are defined in stdio.h
        sys.stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
        sys.stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
        # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation
        # is "It doesn't appear to be possible to read Unicode characters in
        # UTF-8 mode" and this appears to be a limitation of cmd.exe.
    except Exception as e:
        # Deliberately best effort: the process keeps whatever streams are in
        # place at this point.
        complain('exception %r while fixing up sys.stdout and sys.stderr' % e)
    return True
maruel@chromium.org35625c72011-03-23 17:34:02 +0000308
309
def fix_encoding():
    """Fixes various encoding problems on all platforms.

    Should be called at the very beginning of the process.

    Returns:
        True only if every fix that applies to the current platform reported
        True, False as soon as one of them reported False.
    """
    ret = True
    if sys.platform == 'win32':
        ret &= fix_win_codec()

    ret &= fix_default_encoding()

    if sys.platform == 'win32':
        # Read the default encoding only now, after fix_default_encoding()
        # had a chance to change it.
        encoding = sys.getdefaultencoding()
        ret &= fix_win_console(encoding)
    return ret