blob: 87e54a579054aa1a9f5dee678b82b33ecc1d2853 [file] [log] [blame]
maruel@chromium.org35625c72011-03-23 17:34:02 +00001# Copyright (c) 2011 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Collection of functions and classes to fix various encoding problems on
6multiple platforms with python.
7"""
8
9import codecs
10import locale
11import os
12import sys
13
14
# Prevents initializing multiple times: set to True once fix_win_sys_argv()
# has rewritten sys.argv, so that later calls return early.
_SYS_ARGV_PROCESSED = False
17
18
def complain(message):
  """Writes a diagnostic message to the interpreter's original stderr.

  If any exception occurs in this file, we'll probably try to print it on
  stderr, which makes for frustrating debugging if stderr is directed to our
  wrapper. So be paranoid and report to sys.__stderr__ instead, so that the
  user has a higher chance to see it.

  Args:
    message: what to report. A str is written as-is; any other object is
        written as its repr().
  """
  # A conditional expression, unlike the 'cond and a or b' idiom, keeps an
  # empty string as an empty line instead of turning it into "''".
  text = message if isinstance(message, str) else repr(message)
  stream = sys.__stderr__
  if stream is None:
    # Same fallback the 'print >> None' statement applies.
    stream = sys.stdout
  stream.write(text + '\n')
27
28
def fix_default_encoding():
  """Forces utf8 solidly on all platforms.

  By default python execution environment is lazy and defaults to ascii
  encoding. This switches the default encoding to utf-8 and then tries to
  move every LC_* locale category to the UTF-8 variant of its current
  language.

  http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

  Returns:
    False if the default encoding already was 'utf-8' (nothing is touched),
    True otherwise.
  """
  if sys.getdefaultencoding() == 'utf-8':
    return False

  # Reloading sys regenerates setdefaultencoding.
  reload(sys)
  # Module 'sys' has no 'setdefaultencoding' member
  # pylint: disable=E1101
  sys.setdefaultencoding('utf-8')

  category_names = [name for name in dir(locale) if name.startswith('LC_')]
  for name in category_names:
    category = getattr(locale, name)
    try:
      # Start from whatever the user's environment asks for.
      locale.setlocale(category, '')
    except locale.Error:
      continue
    try:
      language = locale.getlocale(category)[0]
    except (TypeError, ValueError):
      continue
    if not language:
      continue
    try:
      locale.setlocale(category, (language, 'UTF-8'))
    except locale.Error:
      # The category could not be switched directly; record the wish in the
      # environment so the LC_ALL reset below gets a chance to pick it up.
      os.environ[name] = language + '.UTF-8'

  try:
    locale.setlocale(locale.LC_ALL, '')
  except locale.Error:
    pass
  return True
67
68
69###############################
70# Windows specific
71
72
def fix_win_sys_argv(encoding):
  """Converts sys.argv to 'encoding' encoded string.

  The command line is fetched again through the wide-character Windows API
  and each argument is encoded with 'encoding' (characters that cannot be
  encoded are replaced). utf-8 is recommended.

  Works around <http://bugs.python.org/issue2128>.

  Returns:
    False if sys.argv was already processed by an earlier call, True once it
    has been replaced.
  """
  global _SYS_ARGV_PROCESSED
  if _SYS_ARGV_PROCESSED:
    return False

  from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import LPCWSTR, LPWSTR

  # <http://msdn.microsoft.com/en-us/library/ms683156.aspx>
  get_command_line = WINFUNCTYPE(LPWSTR)(('GetCommandLineW', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/bb776391.aspx>
  split_prototype = WINFUNCTYPE(POINTER(LPWSTR), LPCWSTR, POINTER(c_int))
  command_line_to_argv = split_prototype(
      ('CommandLineToArgvW', windll.shell32))

  count = c_int(0)
  wide_args = command_line_to_argv(get_command_line(), byref(count))
  args = []
  for index in xrange(count.value):
    args.append(wide_args[index].encode(encoding, 'replace'))

  if not hasattr(sys, 'frozen'):
    # If this is an executable produced by py2exe or bbfreeze, then it
    # will have been invoked directly. Otherwise, the first argument is the
    # Python interpreter, so skip that.
    args = args[1:]

    # Also skip option arguments to the Python interpreter.
    while args:
      head = args[0]
      if not head.startswith(u'-') or head == u'-':
        break
      args = args[1:]
      if head == u'-m':
        # sys.argv[0] should really be the absolute path of the
        # module source, but never mind.
        break
      if head == u'-c':
        # The slot following -c is overwritten with '-c' itself.
        args[0] = u'-c'
        break

  sys.argv = args
  _SYS_ARGV_PROCESSED = True
  return True
121
122
def fix_win_codec():
  """Makes the 'cp65001' codec name resolve to the utf-8 codec.

  Works around <http://bugs.python.org/issue6058>.

  Returns:
    False if 'cp65001' could already be looked up, True if a codec search
    function had to be registered for it.
  """
  def search_cp65001(name):
    # Codec search function: answers only for 'cp65001'.
    if name == 'cp65001':
      return codecs.lookup('utf-8') or None
    return None

  # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
  try:
    codecs.lookup('cp65001')
  except LookupError:
    codecs.register(search_cp65001)
    return True
  return False
133
134
class WinUnicodeOutputBase(object):
  """Base class to adapt sys.stdout or sys.stderr to behave correctly on
  Windows.

  It carries the usual file object attributes; subclasses have to implement
  flush() and write(). Setting encoding to utf-8 is recommended.
  """
  def __init__(self, fileno, name, encoding):
    self.name = name
    self.encoding = encoding
    # Corresponding file number, as reported by fileno().
    self._fileno = fileno

    self.mode = 'w'
    self.softspace = False
    self.closed = False

  @staticmethod
  def isatty():
    return False

  def close(self):
    # Only flag the object as closed; really closing the handle would only
    # cause problems.
    self.closed = True

  def fileno(self):
    return self._fileno

  def flush(self):
    raise NotImplementedError()

  def write(self, text):
    raise NotImplementedError()

  def writelines(self, lines):
    # Writes each item through write(); any failure is reported with
    # complain() and then re-raised.
    try:
      for item in lines:
        self.write(item)
    except Exception as e:
      complain('%s.writelines: %r' % (self.name, e))
      raise
175
176
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
  """Output adapter to a Windows Console.

  Understands how to use the win32 console API: text is handed to
  WriteConsoleW() as unicode.
  """
  def __init__(self, console_handle, fileno, stream_name, encoding):
    display_name = '<Unicode console %s>' % stream_name
    super(WinUnicodeConsoleOutput, self).__init__(
        fileno, display_name, encoding)
    # Handle to use for WriteConsoleW
    self._console_handle = console_handle

    # Loads the necessary function.
    from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
    from ctypes.wintypes import LPVOID  # pylint: disable=E0611

    # Kept on the instance so write() does not need to import them again.
    self._byref = byref
    self._DWORD = DWORD
    self._GetLastError = GetLastError

    # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
    write_console_prototype = WINFUNCTYPE(
        BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)
    self._WriteConsoleW = write_console_prototype(
        ('WriteConsoleW', windll.kernel32))

  def flush(self):
    # No need to flush the console since it's immediate.
    pass

  def write(self, text):
    # Writes 'text' to the console in chunks of at most 10000 characters.
    # Raises IOError when WriteConsoleW fails or writes nothing; every
    # failure is reported through complain() before being re-raised.
    try:
      if not isinstance(text, unicode):
        # Convert to unicode.
        text = str(text).decode(self.encoding, 'replace')
      while text:
        written = self._DWORD(0)
        # There is a shorter-than-documented limitation on the length of the
        # string passed to WriteConsoleW. See
        # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
        status = self._WriteConsoleW(
            self._console_handle, text,
            min(len(text), 10000),
            self._byref(written), None)
        if status == 0 or written.value == 0:
          raise IOError(
              'WriteConsoleW returned %r, n.value = %r, last error = %r' % (
                  status, written.value, self._GetLastError()))
        # Drop what the console accepted and go on with the rest.
        text = text[written.value:]
    except Exception as e:
      complain('%s.write: %r' % (self.name, e))
      raise
232
233
class WinUnicodeOutput(WinUnicodeOutputBase):
  """Output adaptor to a file output on Windows.

  Wraps an existing stream: unicode text is encoded with the configured
  encoding before being written, and characters that cannot be encoded are
  replaced instead of raising.
  """
  def __init__(self, stream, fileno, encoding):
    display_name = '<Unicode redirected %s>' % stream.name
    super(WinUnicodeOutput, self).__init__(fileno, display_name, encoding)
    # Output stream
    self._stream = stream

    # Flush right now.
    self.flush()

  def flush(self):
    # Flushes the wrapped stream; failures are reported through complain()
    # and re-raised.
    try:
      self._stream.flush()
    except Exception as e:
      complain('%s.flush: %r from %r' % (self.name, e, self._stream))
      raise

  def write(self, text):
    # Anything that is not unicode is passed to the wrapped stream untouched.
    try:
      if isinstance(text, unicode):
        # Replace characters that cannot be printed instead of failing.
        payload = text.encode(self.encoding, 'replace')
      else:
        payload = text
      self._stream.write(payload)
    except Exception as e:
      complain('%s.write: %r' % (self.name, e))
      raise
265
266
def win_handle_is_a_console(handle):
  """Returns True if a Windows file handle is a handle to a console.

  Args:
    handle: value as returned by GetStdHandle(): a valid handle,
        INVALID_HANDLE_VALUE or None (NULL).

  Returns:
    True when GetFileType() reports a character device (the FILE_TYPE_REMOTE
    bit is ignored) and GetConsoleMode() succeeds on the handle, False
    otherwise.
  """
  from ctypes import byref, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import BOOL, DWORD, HANDLE

  FILE_TYPE_CHAR = 0x0002
  FILE_TYPE_REMOTE = 0x8000
  # HANDLE is pointer-sized, so INVALID_HANDLE_VALUE, i.e. (HANDLE)-1, does
  # not fit a DWORD on 64-bit Windows. The 32-bit spelling is still accepted
  # for callers that fetched the handle through a DWORD-typed prototype.
  INVALID_HANDLE_VALUES = (HANDLE(-1).value, DWORD(-1).value)

  # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
  GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
      ('GetConsoleMode', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
  # The argument is declared as HANDLE so that it is not truncated to 32 bits.
  GetFileType = WINFUNCTYPE(DWORD, HANDLE)(('GetFileType', windll.kernel32))

  # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
  if handle is None or handle in INVALID_HANDLE_VALUES:
    return False
  if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
    return False
  # GetConsoleMode only succeeds on console handles; turn its BOOL into a
  # real boolean.
  return bool(GetConsoleMode(handle, byref(DWORD())))
288
289
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
  """Returns a unicode-compatible stream.

  A direct-Console writing object (WinUnicodeConsoleOutput) is returned only
  if:
  - the stream's file number is 'excepted_fileno', the expected console file
    number
  - the handle GetStdHandle() returns for 'output_handle' is in fact a handle
    to a console.

  In every other case 'stream' is wrapped in an auto-encoding
  WinUnicodeOutput.
  """
  get_fileno = getattr(stream, 'fileno', lambda: None)
  current_fileno = get_fileno()
  if current_fileno != excepted_fileno:
    # Not the expected stream. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, current_fileno, encoding)

  from ctypes import windll, WINFUNCTYPE
  from ctypes.wintypes import DWORD, HANDLE

  # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
  GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(('GetStdHandle', windll.kernel32))

  console_handle = GetStdHandle(DWORD(output_handle))
  if win_handle_is_a_console(console_handle):
    # It's a console.
    return WinUnicodeConsoleOutput(
        console_handle, current_fileno, stream.name, encoding)

  # It's something else. Create an auto-encoding stream.
  return WinUnicodeOutput(stream, current_fileno, encoding)
314
315
def fix_win_console(encoding):
  """Makes Unicode console output work independently of the current code page.

  sys.stdout and sys.stderr are replaced by the streams returned by
  win_get_unicode_stream().

  This also fixes <http://bugs.python.org/issue1602>.
  Credit to Michael Kaplan
  <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
  TZOmegaTZIOY
  <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

  Returns:
    False if sys.stdout or sys.stderr is already one of the adapters defined
    here, True otherwise. True is returned even when the replacement failed;
    the failure is only reported through complain().
  """
  already_wrapped = any(
      isinstance(current, WinUnicodeOutputBase)
      for current in (sys.stdout, sys.stderr))
  if already_wrapped:
    return False

  try:
    # SetConsoleCP and SetConsoleOutputCP could be used to change the code page
    # but it's not really useful since the code here is using WriteConsoleW().
    # Also, changing the code page is 'permanent' to the console and needs to be
    # reverted manually.
    # In practice one needs to set the console font to a TTF font to be able to
    # see all the characters but it failed for me in practice. In any case, it
    # won't throw any exception when printing, which is the important part.
    # -11 and -12 are the identifiers GetStdHandle() is asked for stdout and
    # stderr respectively.
    for attr, fileno, std_handle in (('stdout', 1, -11), ('stderr', 2, -12)):
      replacement = win_get_unicode_stream(
          getattr(sys, attr), fileno, std_handle, encoding)
      setattr(sys, attr, replacement)
    # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation is
    # "It doesn't appear to be possible to read Unicode characters in UTF-8
    # mode" and this appears to be a limitation of cmd.exe.
  except Exception as e:
    complain('exception %r while fixing up sys.stdout and sys.stderr' % e)
  return True
346
347
def fix_encoding():
  """Fixes various encoding problems on all platforms.

  Should be called at the very beginning of the process. The Windows-only
  fixes run when sys.platform is 'win32'; the default encoding fix always
  runs.

  Returns:
    True only if every fix that ran reported True.
  """
  on_windows = sys.platform == 'win32'
  # Every fix is run unconditionally; the outcomes are only combined at the
  # end.
  outcomes = []
  if on_windows:
    outcomes.append(fix_win_codec())

  outcomes.append(fix_default_encoding())

  if on_windows:
    encoding = sys.getdefaultencoding()
    outcomes.append(fix_win_sys_argv(encoding))
    outcomes.append(fix_win_console(encoding))
  return all(outcomes)
363 return ret