# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Collection of functions and classes to fix various encoding problems on
multiple platforms with python.
"""
8
from __future__ import print_function

import codecs
import locale
import os
import sys
15
16
# Prevents initializing multiple times: fix_win_sys_argv() sets this to True
# after its first run and returns False on every later call.
_SYS_ARGV_PROCESSED = False
19
20
def complain(message):
    """Reports a problem on the interpreter's original stderr.

    If any exception occurs in this file, we'll probably try to print it
    on stderr, which makes for frustrating debugging if stderr is directed
    to our wrapper. So be paranoid about catching errors and reporting them
    to sys.__stderr__, so that the user has a higher chance to see them.

    Args:
      message: what to report. A str is printed as is; any other object is
        printed through its repr().
    """
    # A conditional expression is used instead of `x and y or z` so that an
    # empty string is printed as an empty line rather than as "''".
    text = message if isinstance(message, str) else repr(message)
    print(text, file=sys.__stderr__)

31
def fix_default_encoding():
    """Forces utf8 solidly on all platforms.

    By default python execution environment is lazy and defaults to ascii
    encoding.

    http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

    Returns:
      False when the default encoding already is utf-8 (nothing is changed),
      True after the default encoding and the locale categories were updated.
    """
    if sys.getdefaultencoding() == 'utf-8':
        return False

    # Regenerate setdefaultencoding.
    reload(sys)  # pylint: disable=undefined-variable
    # Module 'sys' has no 'setdefaultencoding' member
    # pylint: disable=no-member
    sys.setdefaultencoding('utf-8')

    for name in dir(locale):
        if not name.startswith('LC_'):
            continue
        category = getattr(locale, name)

        # Categories that cannot be set from the environment are skipped.
        try:
            locale.setlocale(category, '')
        except locale.Error:
            continue

        try:
            language, _ = locale.getdefaultlocale()
        except (TypeError, ValueError):
            continue
        if not language:
            continue

        # Prefer switching the category to UTF-8 directly; when the locale
        # module refuses, record the wish in the environment instead.
        try:
            locale.setlocale(category, (language, 'UTF-8'))
        except locale.Error:
            os.environ[name] = language + '.UTF-8'

    # Best effort: re-apply the environment to every category at once.
    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        pass
    return True
70
71
###############################
# Windows specific
74
75
def fix_win_sys_argv(encoding):
    """Converts sys.argv to 'encoding' encoded string.

    utf-8 is recommended.

    Works around <http://bugs.python.org/issue2128>.

    On Python 3 sys.argv is left untouched; the call is only recorded so that
    later calls return False.

    Args:
      encoding: name of the codec used to encode every argument; characters
        that cannot be encoded are replaced.

    Returns:
      False if sys.argv was already processed by an earlier call, True
      otherwise.
    """
    global _SYS_ARGV_PROCESSED
    if _SYS_ARGV_PROCESSED:
        return False

    if sys.version_info.major == 3:
        _SYS_ARGV_PROCESSED = True
        return True

    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import LPCWSTR, LPWSTR

    # <http://msdn.microsoft.com/en-us/library/ms683156.aspx>
    GetCommandLineW = WINFUNCTYPE(LPWSTR)(
        ('GetCommandLineW', windll.kernel32))
    # <http://msdn.microsoft.com/en-us/library/bb776391.aspx>
    CommandLineToArgvW = WINFUNCTYPE(
        POINTER(LPWSTR), LPCWSTR, POINTER(c_int))(
            ('CommandLineToArgvW', windll.shell32))

    argc = c_int(0)
    argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc))
    argv = [
        argv_unicode[i].encode(encoding, 'replace')
        for i in range(argc.value)
    ]

    if not hasattr(sys, 'frozen'):
        # If this is an executable produced by py2exe or bbfreeze, then it
        # will have been invoked directly. Otherwise, unicode_argv[0] is the
        # Python interpreter, so skip that.
        argv = argv[1:]

        # Also skip option arguments to the Python interpreter.
        # Every literal below is a byte string, like the encoded arguments:
        # comparing them with unicode literals would mix types and could
        # leave a unicode object inside the otherwise encoded sys.argv.
        while argv:
            arg = argv[0]
            if not arg.startswith(b'-') or arg == b'-':
                break
            argv = argv[1:]
            if arg == b'-m':
                # sys.argv[0] should really be the absolute path of the
                # module source, but never mind.
                break
            if arg == b'-c':
                # The command text is replaced by the '-c' marker.
                argv[0] = b'-c'
                break
    sys.argv = argv
    _SYS_ARGV_PROCESSED = True
    return True
130
131
def fix_win_codec():
    """Works around <http://bugs.python.org/issue6058>.

    Makes the 'cp65001' codec name resolve to the utf-8 codec when the codec
    registry does not know it.

    Returns:
      False if 'cp65001' could already be looked up, True if a search
      function had to be registered for it.
    """
    # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
    def _find_cp65001(name):
        # Search function for the codec registry: only answers for cp65001.
        return codecs.lookup('utf-8') if name == 'cp65001' else None

    try:
        codecs.lookup('cp65001')
    except LookupError:
        codecs.register(_find_cp65001)
        return True
    return False
142
143
class WinUnicodeOutputBase(object):
    """Base class to adapt sys.stdout or sys.stderr to behave correctly on
    Windows.

    Subclasses must provide flush() and write(). Setting encoding to utf-8 is
    recommended.
    """

    def __init__(self, fileno, name, encoding):
        # File-like attributes exposed to callers.
        self.name = name
        self.encoding = encoding
        self.mode = 'w'
        self.softspace = False
        self.closed = False

        # Corresponding file handle, reported by fileno().
        self._fileno = fileno

    @staticmethod
    def isatty():
        """Always reports that the stream is not a tty."""
        return False

    def close(self):
        """Marks the stream as closed."""
        # Don't really close the handle, that would only cause problems.
        self.closed = True

    def fileno(self):
        """Returns the file number given at construction."""
        return self._fileno

    def flush(self):
        """Must be implemented by subclasses."""
        raise NotImplementedError()

    def write(self, text):
        """Must be implemented by subclasses."""
        raise NotImplementedError()

    def writelines(self, lines):
        """Writes every item of |lines| through write().

        Any exception is reported through complain() and then re-raised.
        """
        try:
            for item in lines:
                self.write(item)
        except Exception as err:
            complain('%s.writelines: %r' % (self.name, err))
            raise
184
185
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
    """Output adapter to a Windows Console.

    Understands how to use the win32 console API: text is sent to the console
    with WriteConsoleW() instead of going through the file layer.
    """

    def __init__(self, console_handle, fileno, stream_name, encoding):
        display_name = '<Unicode console %s>' % stream_name
        super(WinUnicodeConsoleOutput, self).__init__(
            fileno, display_name, encoding)
        # Handle to use for WriteConsoleW
        self._console_handle = console_handle

        # Loads the necessary function.
        # These types are available on linux but not Mac.
        # pylint: disable=no-name-in-module,F0401
        from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
        from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
        from ctypes.wintypes import LPVOID  # pylint: disable=no-name-in-module

        # Kept on the instance so that write() does not import again.
        self._DWORD = DWORD
        self._byref = byref

        # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
        prototype = WINFUNCTYPE(
            BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)
        self._WriteConsoleW = prototype(('WriteConsoleW', windll.kernel32))
        self._GetLastError = GetLastError

    def flush(self):
        # No need to flush the console since it's immediate.
        pass

    def write(self, text):
        """Writes |text| to the console, in chunks of at most 10000.

        Byte strings are first decoded with self.encoding, undecodable bytes
        being replaced. Any exception is reported through complain() and
        re-raised.
        """
        try:
            major = sys.version_info.major
            # pylint: disable=undefined-variable
            if major == 2 and not isinstance(text, unicode):
                # Convert to unicode.
                text = str(text).decode(self.encoding, 'replace')
            elif major == 3 and isinstance(text, bytes):
                # Bytestrings need to be decoded to a string before being
                # passed to Windows.
                text = text.decode(self.encoding, 'replace')

            # NOTE(review): len() counts code points on Python 3 whereas
            # WriteConsoleW presumably counts UTF-16 units, so text holding
            # characters outside the BMP may be cut short -- confirm.
            pending = len(text)
            while pending > 0:
                written = self._DWORD(0)
                # There is a shorter-than-documented limitation on the length
                # of the string passed to WriteConsoleW. See
                # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
                status = self._WriteConsoleW(
                    self._console_handle,
                    text,
                    min(pending, 10000),
                    self._byref(written),
                    None)
                if status == 0 or written.value == 0:
                    raise IOError(
                        'WriteConsoleW returned %r, n.value = %r, '
                        'last error = %r' % (
                            status, written.value, self._GetLastError()))
                pending -= written.value
                if not pending:
                    break
                # Drop what the console accepted and retry with the rest.
                text = text[int(written.value):]
        except Exception as err:
            complain('%s.write: %r' % (self.name, err))
            raise
247
248
class WinUnicodeOutput(WinUnicodeOutputBase):
    """Output adaptor to a file output on Windows.

    If the standard FileWrite function is used, it will be encoded in the
    current code page. WriteConsoleW() permits writing any character.

    This adaptor wraps an existing stream and converts the text to the type
    that stream expects before forwarding it.
    """

    def __init__(self, stream, fileno, encoding):
        display_name = '<Unicode redirected %s>' % stream.name
        super(WinUnicodeOutput, self).__init__(fileno, display_name, encoding)
        # Output stream
        self._stream = stream

        # Flush right now.
        self.flush()

    def flush(self):
        """Flushes the wrapped stream; failures are reported and re-raised."""
        try:
            self._stream.flush()
        except Exception as err:
            complain('%s.flush: %r from %r' % (self.name, err, self._stream))
            raise

    def write(self, text):
        """Forwards |text| to the wrapped stream.

        On Python 2 unicode text is encoded with self.encoding; on Python 3
        bytes are decoded with it. In both directions characters that cannot
        be converted are replaced instead of failing.
        """
        try:
            major = sys.version_info.major
            # pylint: disable=undefined-variable
            if major == 2 and isinstance(text, unicode):
                # Replace characters that cannot be printed instead of
                # failing.
                text = text.encode(self.encoding, 'replace')
            elif major == 3 and isinstance(text, bytes):
                # Replace characters that cannot be printed instead of
                # failing.
                text = text.decode(self.encoding, 'replace')
            self._stream.write(text)
        except Exception as err:
            complain('%s.write: %r' % (self.name, err))
            raise
283
284
def win_handle_is_a_console(handle):
    """Returns True if a Windows file handle is a handle to a console.

    Args:
      handle: a handle value as returned by GetStdHandle, i.e. an integer or
        None for a NULL handle.

    Returns:
      True if the handle is of character type and GetConsoleMode() succeeds
      on it, False otherwise (including for NULL and invalid handles).
    """
    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import byref, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE

    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    # A HANDLE is pointer sized, so the sentinel must be derived from HANDLE
    # to match what a HANDLE-returning function yields on 64-bit Windows. The
    # 32-bit value is still accepted for callers relying on the old sentinel.
    INVALID_HANDLE_VALUES = (HANDLE(-1).value, DWORD(-1).value)

    # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
    GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
        ('GetConsoleMode', windll.kernel32))
    # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
    # The parameter is declared as HANDLE so that it is not truncated to 32
    # bits.
    GetFileType = WINFUNCTYPE(DWORD, HANDLE)(
        ('GetFileType', windll.kernel32))

    # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
    if handle is None or handle in INVALID_HANDLE_VALUES:
        return False
    # The remote flag is ignored when checking for a character device.
    if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
        return False
    # GetConsoleMode() only succeeds on console handles; its BOOL result is
    # normalised so that the function really returns True or False.
    return bool(GetConsoleMode(handle, byref(DWORD())))
308
309
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
    """Returns a unicode-compatible stream.

    This function will return a direct-Console writing object only if:
    - the file number is the expected console file number
    - the handle is the expected file handle
    - the 'real' handle is in fact a handle to a console.

    In every other case an auto-encoding wrapper around |stream| is returned.

    Args:
      stream: the stream to adapt, e.g. sys.stdout.
      excepted_fileno: the file number |stream| is expected to have.
      output_handle: the standard handle identifier given to GetStdHandle.
      encoding: encoding used by the returned stream.
    """
    # Streams without a fileno() method are treated as having no file number.
    get_fileno = getattr(stream, 'fileno', lambda: None)
    current_fileno = get_fileno()

    if current_fileno != excepted_fileno:
        # It's something else. Create an auto-encoding stream.
        return WinUnicodeOutput(stream, current_fileno, encoding)

    # These types are available on linux but not Mac.
    # pylint: disable=no-name-in-module,F0401
    from ctypes import windll, WINFUNCTYPE
    from ctypes.wintypes import DWORD, HANDLE

    # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
    GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(
        ('GetStdHandle', windll.kernel32))

    console_handle = GetStdHandle(DWORD(output_handle))
    if win_handle_is_a_console(console_handle):
        # It's a console.
        return WinUnicodeConsoleOutput(
            console_handle, current_fileno, stream.name, encoding)

    # The standard handle is not a console (e.g. it is redirected).
    return WinUnicodeOutput(stream, current_fileno, encoding)
336
337
def fix_win_console(encoding):
    """Makes Unicode console output work independently of the current code
    page.

    This also fixes <http://bugs.python.org/issue1602>.
    Credit to Michael Kaplan
    <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
    TZOmegaTZIOY
    <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

    Args:
      encoding: encoding given to the replacement streams.

    Returns:
      False if sys.stdout or sys.stderr is already one of this module's
      adapters, True otherwise (even when the replacement failed and was only
      reported through complain()).
    """
    already_wrapped = any(
        isinstance(current, WinUnicodeOutputBase)
        for current in (sys.stdout, sys.stderr))
    if already_wrapped:
        return False

    try:
        # SetConsoleCP and SetConsoleOutputCP could be used to change the code
        # page but it's not really useful since the code here is using
        # WriteConsoleW(). Also, changing the code page is 'permanent' to the
        # console and needs to be reverted manually.
        # In practice one needs to set the console font to a TTF font to be
        # able to see all the characters but it failed for me in practice. In
        # any case, it won't throw any exception when printing, which is the
        # important part.
        # -11 and -12 are defined in stdio.h
        sys.stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
        sys.stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
        # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the
        # limitation is "It doesn't appear to be possible to read Unicode
        # characters in UTF-8 mode" and this appears to be a limitation of
        # cmd.exe.
    except Exception as err:
        complain(
            'exception %r while fixing up sys.stdout and sys.stderr' % err)
    return True
368
369
def fix_encoding():
    """Fixes various encoding problems on all platforms.

    Should be called at the very beginning of the process.

    Returns:
      True only if every fix that was attempted returned True.
    """
    on_windows = sys.platform == 'win32'

    # Every fix is always run; the outcomes are only combined at the end.
    outcomes = []
    if on_windows:
        outcomes.append(fix_win_codec())

    outcomes.append(fix_default_encoding())

    if on_windows:
        # Read after fix_default_encoding() so its change is picked up.
        encoding = sys.getdefaultencoding()
        outcomes.append(fix_win_sys_argv(encoding))
        outcomes.append(fix_win_console(encoding))
    return all(outcomes)