blob: ab418695b20d50eca5b6f2874ad988d734087a4d [file] [log] [blame]
maruel@chromium.org35625c72011-03-23 17:34:02 +00001# Copyright (c) 2011 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Collection of functions and classes to fix various encoding problems on
6multiple platforms with python.
7"""
8
9import codecs
10import locale
11import os
12import sys
13
14
# Prevents initializing multiple times: fix_win_sys_argv() sets this to True
# once it has replaced sys.argv, and returns early on any later call.
_SYS_ARGV_PROCESSED = False
17
18
def complain(message):
    """Reports a message on the interpreter's original stderr.

    If any exception occurs in this file, we'll probably try to print it on
    stderr, which makes for frustrating debugging if stderr is directed to our
    wrapper. So be paranoid about catching errors and reporting them to
    sys.__stderr__, so that the user has a higher chance to see them.

    This is called from exception handlers right before they re-raise, so it
    must never raise itself: that would hide the error being reported.

    Args:
      message: what to report. A str is written as is (including the empty
        string); anything else is written as its repr().
    """
    if not isinstance(message, str):
        message = repr(message)

    stream = sys.__stderr__
    if stream is None:
        # The interpreter may have been started without a usable stderr (for
        # instance a GUI process with no console); there is nowhere to report.
        return
    try:
        stream.write(message + '\n')
    except (IOError, OSError, ValueError):
        # Best effort only: a closed or broken stderr must not mask the
        # exception the caller is about to re-raise.
        pass
27
28
def fix_default_encoding():
    """Forces utf8 solidly on all platforms.

    By default python execution environment is lazy and defaults to ascii
    encoding.

    http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

    Returns:
      False if the default encoding was already utf-8 (nothing is touched),
      True once the default encoding and the locale categories were updated.
    """
    if sys.getdefaultencoding() == 'utf-8':
        return False

    # Reloading the module regenerates sys.setdefaultencoding.
    reload(sys)
    # Module 'sys' has no 'setdefaultencoding' member
    # pylint: disable=E1101
    sys.setdefaultencoding('utf-8')

    category_names = [name for name in dir(locale) if name.startswith('LC_')]
    for category_name in category_names:
        category = getattr(locale, category_name)

        # Start from the user's default setting for this category.
        try:
            locale.setlocale(category, '')
        except locale.Error:
            continue

        try:
            language = locale.getlocale(category)[0]
        except (TypeError, ValueError):
            continue
        if not language:
            continue

        # Keep the language but switch its encoding to UTF-8; when the locale
        # is not available, record the wish in the environment instead.
        try:
            locale.setlocale(category, (language, 'UTF-8'))
        except locale.Error:
            os.environ[category_name] = language + '.UTF-8'

    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        pass
    return True
67
68
69###############################
70# Windows specific
71
72
def fix_win_sys_argv(encoding):
    """Converts sys.argv to 'encoding' encoded string.

    utf-8 is recommended.

    Works around <http://bugs.python.org/issue2128>.

    The command line is read back from the Win32 API as Unicode, the
    interpreter executable and its own options are dropped, and what is left
    is encoded and stored in sys.argv.

    Args:
      encoding: codec used to encode every argument; characters it cannot
        represent are replaced instead of raising.

    Returns:
      True if sys.argv was replaced. False if it was already replaced by an
      earlier call, or if the command line could not be parsed (sys.argv is
      then left untouched).
    """
    global _SYS_ARGV_PROCESSED
    if _SYS_ARGV_PROCESSED:
        return False

    from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import HANDLE, LPCWSTR, LPWSTR

    # <http://msdn.microsoft.com/en-us/library/ms683156.aspx>
    GetCommandLineW = WINFUNCTYPE(LPWSTR)(('GetCommandLineW', windll.kernel32))
    # <http://msdn.microsoft.com/en-us/library/bb776391.aspx>
    CommandLineToArgvW = WINFUNCTYPE(
        POINTER(LPWSTR), LPCWSTR, POINTER(c_int))(
            ('CommandLineToArgvW', windll.shell32))
    # Releases the array allocated by CommandLineToArgvW.
    LocalFree = WINFUNCTYPE(HANDLE, HANDLE)(('LocalFree', windll.kernel32))

    argc = c_int(0)
    argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc))
    if not argv_unicode:
        # A NULL result means the call failed; keep the sys.argv we have
        # rather than replacing it with an empty list.
        complain('CommandLineToArgvW failed, sys.argv is left unchanged')
        return False
    try:
        # Indexing copies each argument into a Python unicode object, so the
        # native array can be released right after.
        argv = [argv_unicode[i] for i in range(argc.value)]
    finally:
        LocalFree(argv_unicode)

    if not hasattr(sys, 'frozen'):
        # If this is an executable produced by py2exe or bbfreeze, then it
        # will have been invoked directly. Otherwise, argv[0] is the Python
        # interpreter, so skip that.
        argv = argv[1:]

        # Also skip option arguments to the Python interpreter. This is done
        # on the unicode values: comparing encoded bytes with unicode literals
        # would trigger an implicit decode that can fail on non-ASCII input.
        while argv:
            arg = argv[0]
            if not arg.startswith(u'-') or arg == u'-':
                break
            argv = argv[1:]
            if arg == u'-m':
                # sys.argv[0] should really be the absolute path of the
                # module source, but never mind.
                break
            if arg == u'-c':
                # The command text is replaced by '-c'; the slice assignment
                # also copes with nothing following the option.
                argv[:1] = [u'-c']
                break
            if arg in (u'-Q', u'-W'):
                # These options take their value as a separate argument,
                # which is not the script name either.
                argv = argv[1:]

    sys.argv = [arg.encode(encoding, 'replace') for arg in argv]
    _SYS_ARGV_PROCESSED = True
    return True
121
122
def fix_win_codec():
    """Works around <http://bugs.python.org/issue6058>.

    Makes the 'cp65001' codec name resolve to the utf-8 codec when the
    interpreter does not know it.

    Returns:
      False if 'cp65001' could already be looked up, True if a codec search
      function had to be registered.
    """
    # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
    def _cp65001_search(codec_name):
        # Search functions return None for names they do not handle.
        if codec_name == 'cp65001':
            return codecs.lookup('utf-8')
        return None

    try:
        codecs.lookup('cp65001')
    except LookupError:
        codecs.register(_cp65001_search)
        return True
    return False
133
134
class WinUnicodeOutputBase(object):
    """Common file-like surface for the sys.stdout / sys.stderr replacements
    used on Windows.

    Subclasses provide flush() and write(). Setting encoding to utf-8 is
    recommended.
    """
    def __init__(self, fileno, name, encoding):
        # File number reported by fileno().
        self._fileno = fileno
        self.name = name
        self.encoding = encoding

        # Attributes expected from a file object opened for writing.
        self.mode = 'w'
        self.softspace = False
        self.closed = False

    @staticmethod
    def isatty():
        return False

    def close(self):
        # Only record the request; really closing the handle would only cause
        # problems.
        self.closed = True

    def fileno(self):
        return self._fileno

    def flush(self):
        raise NotImplementedError()

    def write(self, text):
        raise NotImplementedError()

    def writelines(self, lines):
        """Writes every item of 'lines' through write().

        Any failure is reported with complain() and then re-raised.
        """
        try:
            for item in lines:
                self.write(item)
        except Exception as e:
            complain('%s.writelines: %r' % (self.name, e))
            raise
175
176
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
    """Output adapter that writes straight to a Windows Console.

    Text is handed to the win32 console API (WriteConsoleW) as Unicode.
    """
    def __init__(self, console_handle, fileno, stream_name, encoding):
        super(WinUnicodeConsoleOutput, self).__init__(
            fileno, '<Unicode console %s>' % stream_name, encoding)
        # Handle to use for WriteConsoleW
        self._console_handle = console_handle

        # The ctypes helpers are imported here and kept on the instance for
        # write().
        from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
        from ctypes.wintypes import BOOL, DWORD, HANDLE, LPVOID, LPWSTR

        self._DWORD = DWORD
        self._byref = byref
        self._GetLastError = GetLastError

        # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
        prototype = WINFUNCTYPE(
            BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)
        self._WriteConsoleW = prototype(('WriteConsoleW', windll.kernel32))

    def flush(self):
        # No need to flush the console since it's immediate.
        pass

    def write(self, text):
        """Writes 'text' to the console, in several calls if needed.

        Anything that is not unicode is converted with str() and decoded
        with self.encoding. Failures are reported with complain() and then
        re-raised.
        """
        try:
            if not isinstance(text, unicode):
                # Convert to unicode.
                text = str(text).decode(self.encoding, 'replace')
            pending = len(text)
            while pending > 0:
                written = self._DWORD(0)
                # There is a shorter-than-documented limitation on the length
                # of the string passed to WriteConsoleW. See
                # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
                status = self._WriteConsoleW(
                    self._console_handle,
                    text,
                    min(pending, 10000),
                    self._byref(written),
                    None)
                if status == 0 or written.value == 0:
                    raise IOError(
                        'WriteConsoleW returned %r, n.value = %r, last error = %r' % (
                            status, written.value, self._GetLastError()))
                pending -= written.value
                if pending:
                    # Drop what the console accepted and go around again.
                    text = text[written.value:]
        except Exception as e:
            complain('%s.write: %r' % (self.name, e))
            raise
231
232
class WinUnicodeOutput(WinUnicodeOutputBase):
    """Output adapter wrapping an existing stream on Windows.

    Unicode text is encoded with self.encoding before it reaches the wrapped
    stream, so that it does not end up encoded in the current code page.
    """
    def __init__(self, stream, fileno, encoding):
        display_name = '<Unicode redirected %s>' % stream.name
        super(WinUnicodeOutput, self).__init__(fileno, display_name, encoding)
        # Output stream
        self._stream = stream

        # Flush right now.
        self.flush()

    def flush(self):
        """Flushes the wrapped stream; failures are reported then re-raised."""
        try:
            self._stream.flush()
        except Exception as e:
            complain('%s.flush: %r from %r' % (self.name, e, self._stream))
            raise

    def write(self, text):
        """Writes 'text' to the wrapped stream, encoding unicode first.

        Failures are reported with complain() and then re-raised.
        """
        try:
            # Replace characters that cannot be printed instead of failing.
            payload = (
                text.encode(self.encoding, 'replace')
                if isinstance(text, unicode) else text)
            self._stream.write(payload)
        except Exception as e:
            complain('%s.write: %r' % (self.name, e))
            raise
264
265
def win_handle_is_a_console(handle):
    """Returns True if a Windows file handle is a handle to a console.

    Args:
      handle: handle value to test, as an integer or None. None and the
        invalid-handle marker are accepted and reported as "not a console".

    Returns:
      True when the handle is a character device for which GetConsoleMode
      succeeds, False otherwise.
    """
    from ctypes import byref, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE

    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    # Handles are pointer-sized, so the invalid marker is computed with
    # HANDLE. The 32-bit spelling is kept as well for callers that obtained
    # the value through a DWORD.
    INVALID_HANDLE_VALUES = (HANDLE(-1).value, DWORD(-1).value)

    # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
    GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
        ('GetConsoleMode', windll.kernel32))
    # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
    # The argument is declared as HANDLE so that it is not truncated to 32
    # bits.
    GetFileType = WINFUNCTYPE(DWORD, HANDLE)(('GetFileType', windll.kernel32))

    # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
    if handle is None or handle in INVALID_HANDLE_VALUES:
        return False
    if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
        return False
    # GetConsoleMode only succeeds on console handles; the mode it stores is
    # not needed.
    return bool(GetConsoleMode(handle, byref(DWORD())))
287
288
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
    """Returns a unicode-compatible stream.

    This function will return a direct-Console writing object only if:
    - the file number of 'stream' is the expected console file number
    - the standard handle 'output_handle' is in fact a handle to a console.

    In every other case 'stream' is wrapped in an auto-encoding stream.
    """
    fileno_getter = getattr(stream, 'fileno', lambda: None)
    old_fileno = fileno_getter()
    if old_fileno != excepted_fileno:
        # Not the expected descriptor. Create an auto-encoding stream.
        return WinUnicodeOutput(stream, old_fileno, encoding)

    from ctypes import windll, WINFUNCTYPE
    from ctypes.wintypes import DWORD, HANDLE

    # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
    GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(
        ('GetStdHandle', windll.kernel32))

    real_output_handle = GetStdHandle(DWORD(output_handle))
    if not win_handle_is_a_console(real_output_handle):
        # It's something else. Create an auto-encoding stream.
        return WinUnicodeOutput(stream, old_fileno, encoding)

    # It's a console.
    return WinUnicodeConsoleOutput(
        real_output_handle, old_fileno, stream.name, encoding)
313
314
def fix_win_console(encoding):
    """Makes Unicode console output work independently of the current code page.

    This also fixes <http://bugs.python.org/issue1602>.
    Credit to Michael Kaplan
    <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
    TZOmegaTZIOY
    <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

    Returns:
      False if sys.stdout or sys.stderr is already one of the wrappers from
      this file, True otherwise (including when the replacement failed and
      was only reported).
    """
    for current in (sys.stdout, sys.stderr):
        if isinstance(current, WinUnicodeOutputBase):
            return False

    try:
        # SetConsoleCP and SetConsoleOutputCP could be used to change the code
        # page but it's not really useful since the code here is using
        # WriteConsoleW(). Also, changing the code page is 'permanent' to the
        # console and needs to be reverted manually.
        # In practice one needs to set the console font to a TTF font to be
        # able to see all the characters but it failed for me in practice. In
        # any case, it won't throw any exception when printing, which is the
        # important part.
        # -11 and -12 are defined in stdio.h
        sys.stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
        sys.stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
        # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation
        # is "It doesn't appear to be possible to read Unicode characters in
        # UTF-8 mode" and this appears to be a limitation of cmd.exe.
    except Exception as e:
        complain('exception %r while fixing up sys.stdout and sys.stderr' % e)
    return True
345
346
def fix_encoding():
    """Fixes various encoding problems on all platforms.

    Should be called at the very beginning of the process.

    Returns:
      True only if every fix that was attempted reported True.
    """
    on_windows = sys.platform == 'win32'

    # Every fix is attempted, whatever the earlier ones reported.
    outcomes = []
    if on_windows:
        outcomes.append(fix_win_codec())

    outcomes.append(fix_default_encoding())

    if on_windows:
        encoding = sys.getdefaultencoding()
        outcomes.append(fix_win_sys_argv(encoding))
        outcomes.append(fix_win_console(encoding))
    return all(outcomes)
362 return ret