blob: f556e822fb2f21a6a8deb221c28ff1661e83bdf1 [file] [log] [blame]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001#!/usr/bin/env python
2# coding=utf-8
3# Copyright (c) 2012 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Traces an executable and its child processes and extract the files accessed
8by them.
9
10The implementation uses OS-specific API. The native Kernel logger and the ETL
11interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
12The OS-specific implementation is hidden in an 'API' interface.
13
14The results are embedded in a Results instance. The tracing is done in two
15phases, the first is to do the actual trace and generate an
16implementation-specific log file. Then the log file is parsed to extract the
17information, including the individual child processes and the files accessed
18from the log.
19"""
20
21import codecs
22import csv
23import getpass
24import glob
25import json
26import logging
27import optparse
28import os
29import re
maruel@chromium.org9b2112a2013-04-05 18:01:02 +000030import stat
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000031import subprocess
32import sys
33import tempfile
34import threading
35import time
maruel@chromium.org9b2112a2013-04-05 18:01:02 +000036import unicodedata
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000037import weakref
38
39## OS-specific imports
40
41if sys.platform == 'win32':
42 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
43 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
44 from ctypes.wintypes import GetLastError # pylint: disable=E0611
45elif sys.platform == 'darwin':
46 import Carbon.File # pylint: disable=F0401
47 import MacOS # pylint: disable=F0401
48
maruel@chromium.org46e61cc2013-03-25 19:55:34 +000049import run_isolated
50
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000051
52BASE_DIR = os.path.dirname(os.path.abspath(__file__))
53ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
54
55
56class TracingFailure(Exception):
57 """An exception occured during tracing."""
58 def __init__(self, description, pid, line_number, line, *args):
59 super(TracingFailure, self).__init__(
60 description, pid, line_number, line, *args)
61 self.description = description
62 self.pid = pid
63 self.line_number = line_number
64 self.line = line
65 self.extra = args
66
67 def __str__(self):
68 out = self.description
69 if self.pid:
70 out += '\npid: %d' % self.pid
71 if self.line_number:
72 out += '\nline: %d' % self.line_number
73 if self.line:
74 out += '\n%s' % self.line
75 if self.extra:
76 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
77 return out
78
79
80## OS-specific functions
81
82if sys.platform == 'win32':
83 def QueryDosDevice(drive_letter):
84 """Returns the Windows 'native' path for a DOS drive letter."""
85 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
maruel@chromium.org306e0e72012-11-02 18:22:03 +000086 assert isinstance(drive_letter, unicode)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000087 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
88 chars = 1024
maruel@chromium.org306e0e72012-11-02 18:22:03 +000089 drive_letter = drive_letter
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000090 p = create_unicode_buffer(chars)
91 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
92 err = GetLastError()
93 if err:
94 # pylint: disable=E0602
maruel@chromium.orgf37c2472012-11-02 20:06:13 +000095 msg = u'QueryDosDevice(%s): %s (%d)' % (
96 drive_letter, FormatError(err), err)
97 raise WindowsError(err, msg.encode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000098 return p.value
99
100
101 def GetShortPathName(long_path):
102 """Returns the Windows short path equivalent for a 'long' path."""
maruel@chromium.org68bb1482013-02-15 22:36:24 +0000103 assert isinstance(long_path, unicode), repr(long_path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000104 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
105 # not enforced.
106 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
107 long_path = '\\\\?\\' + long_path
108 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
109 if chars:
110 p = create_unicode_buffer(chars)
111 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
112 return p.value
113
114 err = GetLastError()
115 if err:
116 # pylint: disable=E0602
maruel@chromium.orgf37c2472012-11-02 20:06:13 +0000117 msg = u'GetShortPathName(%s): %s (%d)' % (
118 long_path, FormatError(err), err)
119 raise WindowsError(err, msg.encode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000120
121
122 def GetLongPathName(short_path):
123 """Returns the Windows long path equivalent for a 'short' path."""
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000124 assert isinstance(short_path, unicode)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000125 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
126 # not enforced.
127 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
128 short_path = '\\\\?\\' + short_path
129 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
130 if chars:
131 p = create_unicode_buffer(chars)
132 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
133 return p.value
134
135 err = GetLastError()
136 if err:
137 # pylint: disable=E0602
maruel@chromium.orgf37c2472012-11-02 20:06:13 +0000138 msg = u'GetLongPathName(%s): %s (%d)' % (
139 short_path, FormatError(err), err)
140 raise WindowsError(err, msg.encode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000141
142
143 def get_current_encoding():
144 """Returns the 'ANSI' code page associated to the process."""
145 return 'cp%d' % int(windll.kernel32.GetACP())
146
147
148 class DosDriveMap(object):
149 """Maps \Device\HarddiskVolumeN to N: on Windows."""
150 # Keep one global cache.
151 _MAPPING = {}
152
153 def __init__(self):
154 """Lazy loads the cache."""
155 if not self._MAPPING:
156 # This is related to UNC resolver on windows. Ignore that.
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000157 self._MAPPING[u'\\Device\\Mup'] = None
158 self._MAPPING[u'\\SystemRoot'] = os.environ[u'SystemRoot']
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000159
160 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
161 try:
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000162 letter = u'%s:' % letter
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000163 mapped = QueryDosDevice(letter)
164 if mapped in self._MAPPING:
165 logging.warn(
166 ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
167 '. Drive letters are a user-mode concept and the kernel '
168 'traces only have NT path, so all accesses will be '
169 'associated with the first drive letter, independent of the '
170 'actual letter used by the code') % (
171 self._MAPPING[mapped], letter))
172 else:
173 self._MAPPING[mapped] = letter
174 except WindowsError: # pylint: disable=E0602
175 pass
176
177 def to_win32(self, path):
178 """Converts a native NT path to Win32/DOS compatible path."""
179 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
180 if not match:
181 raise ValueError(
182 'Can\'t convert %s into a Win32 compatible path' % path,
183 path)
184 if not match.group(1) in self._MAPPING:
185 # Unmapped partitions may be accessed by windows for the
186 # fun of it while the test is running. Discard these.
187 return None
188 drive = self._MAPPING[match.group(1)]
189 if not drive or not match.group(2):
190 return drive
191 return drive + match.group(2)
192
193
194 def isabs(path):
195 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
196 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
197
198
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000199 def find_item_native_case(root, item):
200 """Gets the native path case of a single item based at root_path."""
201 if item == '..':
202 return item
203
204 root = get_native_path_case(root)
205 return os.path.basename(get_native_path_case(os.path.join(root, item)))
206
207
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000208 def get_native_path_case(p):
209 """Returns the native path case for an existing file.
210
211 On Windows, removes any leading '\\?\'.
212 """
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000213 assert isinstance(p, unicode), repr(p)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000214 if not isabs(p):
215 raise ValueError(
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +0000216 'get_native_path_case(%r): Require an absolute path' % p, p)
217
maruel@chromium.org037758d2012-12-10 17:59:46 +0000218 # Make sure it is normalized to os.path.sep. Do not do it here to keep the
219 # function fast
220 assert '/' not in p, p
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +0000221 suffix = ''
222 count = p.count(':')
223 if count > 1:
224 # This means it has an alternate-data stream. There could be 3 ':', since
225 # it could be the $DATA datastream of an ADS. Split the whole ADS suffix
226 # off and add it back afterward. There is no way to know the native path
227 # case of an alternate data stream.
228 items = p.split(':')
229 p = ':'.join(items[0:2])
230 suffix = ''.join(':' + i for i in items[2:])
231
maruel@chromium.org42b091d2013-04-02 22:27:02 +0000232 # TODO(maruel): Use os.path.normpath?
233 if p.endswith('.\\'):
234 p = p[:-2]
235
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000236 # Windows used to have an option to turn on case sensitivity on non Win32
237 # subsystem but that's out of scope here and isn't supported anymore.
238 # Go figure why GetShortPathName() is needed.
239 try:
240 out = GetLongPathName(GetShortPathName(p))
241 except OSError, e:
242 if e.args[0] in (2, 3, 5):
243 # The path does not exist. Try to recurse and reconstruct the path.
244 base = os.path.dirname(p)
245 rest = os.path.basename(p)
246 return os.path.join(get_native_path_case(base), rest)
247 raise
248 if out.startswith('\\\\?\\'):
249 out = out[4:]
250 # Always upper case the first letter since GetLongPathName() will return the
251 # drive letter in the case it was given.
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +0000252 return out[0].upper() + out[1:] + suffix
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000253
254
255 def CommandLineToArgvW(command_line):
256 """Splits a commandline into argv using CommandLineToArgvW()."""
257 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
258 size = c_int()
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000259 assert isinstance(command_line, unicode)
260 ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000261 try:
262 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
263 finally:
264 windll.kernel32.LocalFree(ptr)
265
266
267elif sys.platform == 'darwin':
268
269
270 # On non-windows, keep the stdlib behavior.
271 isabs = os.path.isabs
272
273
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000274 def _native_case(p):
275 """Gets the native path case. Warning: this function resolves symlinks."""
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000276 try:
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000277 rel_ref, _ = Carbon.File.FSPathMakeRef(p.encode('utf-8'))
278 # The OSX underlying code uses NFD but python strings are in NFC. This
279 # will cause issues with os.listdir() for example. Since the dtrace log
280 # *is* in NFC, normalize it here.
csharp@chromium.org5a2c3732013-03-07 14:44:56 +0000281 out = unicodedata.normalize(
282 'NFC', rel_ref.FSRefMakePath().decode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000283 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
284 return out + os.path.sep
285 return out
286 except MacOS.Error, e:
287 if e.args[0] in (-43, -120):
288 # The path does not exist. Try to recurse and reconstruct the path.
289 # -43 means file not found.
290 # -120 means directory not found.
291 base = os.path.dirname(p)
292 rest = os.path.basename(p)
csharp@chromium.org5a2c3732013-03-07 14:44:56 +0000293 return os.path.join(_native_case(base), rest)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000294 raise OSError(
295 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
296
297
298 def _split_at_symlink_native(base_path, rest):
299 """Returns the native path for a symlink."""
300 base, symlink, rest = split_at_symlink(base_path, rest)
301 if symlink:
302 if not base_path:
303 base_path = base
304 else:
305 base_path = safe_join(base_path, base)
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000306 symlink = find_item_native_case(base_path, symlink)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000307 return base, symlink, rest
308
309
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000310 def find_item_native_case(root_path, item):
311 """Gets the native path case of a single item based at root_path.
312
313 There is no API to get the native path case of symlinks on OSX. So it
314 needs to be done the slow way.
315 """
316 if item == '..':
317 return item
318
319 item = item.lower()
320 for element in os.listdir(root_path):
321 if element.lower() == item:
322 return element
323
324
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000325 def get_native_path_case(path):
326 """Returns the native path case for an existing file.
327
328 Technically, it's only HFS+ on OSX that is case preserving and
329 insensitive. It's the default setting on HFS+ but can be changed.
330 """
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000331 assert isinstance(path, unicode), repr(path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000332 if not isabs(path):
333 raise ValueError(
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000334 'get_native_path_case(%r): Require an absolute path' % path, path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000335 if path.startswith('/dev'):
336 # /dev is not visible from Carbon, causing an exception.
337 return path
338
339 # Starts assuming there is no symlink along the path.
340 resolved = _native_case(path)
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000341 if path.lower() in (resolved.lower(), resolved.lower() + './'):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000342 # This code path is incredibly faster.
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000343 logging.debug('get_native_path_case(%s) = %s' % (path, resolved))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000344 return resolved
345
346 # There was a symlink, process it.
347 base, symlink, rest = _split_at_symlink_native(None, path)
348 assert symlink, (path, base, symlink, rest, resolved)
349 prev = base
350 base = safe_join(_native_case(base), symlink)
351 assert len(base) > len(prev)
352 while rest:
353 prev = base
354 relbase, symlink, rest = _split_at_symlink_native(base, rest)
355 base = safe_join(base, relbase)
356 assert len(base) > len(prev), (prev, base, symlink)
357 if symlink:
358 base = safe_join(base, symlink)
359 assert len(base) > len(prev), (prev, base, symlink)
360 # Make sure no symlink was resolved.
361 assert base.lower() == path.lower(), (base, path)
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000362 logging.debug('get_native_path_case(%s) = %s' % (path, base))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000363 return base
364
365
366else: # OSes other than Windows and OSX.
367
368
369 # On non-windows, keep the stdlib behavior.
370 isabs = os.path.isabs
371
372
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000373 def find_item_native_case(root, item):
374 """Gets the native path case of a single item based at root_path."""
375 if item == '..':
376 return item
377
378 root = get_native_path_case(root)
379 return os.path.basename(get_native_path_case(os.path.join(root, item)))
380
381
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000382 def get_native_path_case(path):
383 """Returns the native path case for an existing file.
384
385 On OSes other than OSX and Windows, assume the file system is
386 case-sensitive.
387
388 TODO(maruel): This is not strictly true. Implement if necessary.
389 """
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000390 assert isinstance(path, unicode), repr(path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000391 if not isabs(path):
392 raise ValueError(
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000393 'get_native_path_case(%r): Require an absolute path' % path, path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000394 # Give up on cygwin, as GetLongPathName() can't be called.
395 # Linux traces tends to not be normalized so use this occasion to normalize
396 # it. This function implementation already normalizes the path on the other
397 # OS so this needs to be done here to be coherent between OSes.
398 out = os.path.normpath(path)
399 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
400 return out + os.path.sep
401 return out
402
403
404if sys.platform != 'win32': # All non-Windows OSes.
405
406
407 def safe_join(*args):
408 """Joins path elements like os.path.join() but doesn't abort on absolute
409 path.
410
411 os.path.join('foo', '/bar') == '/bar'
412 but safe_join('foo', '/bar') == 'foo/bar'.
413 """
414 out = ''
415 for element in args:
416 if element.startswith(os.path.sep):
417 if out.endswith(os.path.sep):
418 out += element[1:]
419 else:
420 out += element
421 else:
422 if out.endswith(os.path.sep):
423 out += element
424 else:
425 out += os.path.sep + element
426 return out
427
428
429 def split_at_symlink(base_dir, relfile):
430 """Scans each component of relfile and cut the string at the symlink if
431 there is any.
432
433 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
434 not symlink was found.
435 """
436 if base_dir:
437 assert relfile
438 assert os.path.isabs(base_dir)
439 index = 0
440 else:
441 assert os.path.isabs(relfile)
442 index = 1
443
444 def at_root(rest):
445 if base_dir:
446 return safe_join(base_dir, rest)
447 return rest
448
449 while True:
450 try:
451 index = relfile.index(os.path.sep, index)
452 except ValueError:
453 index = len(relfile)
454 full = at_root(relfile[:index])
455 if os.path.islink(full):
456 # A symlink!
457 base = os.path.dirname(relfile[:index])
458 symlink = os.path.basename(relfile[:index])
459 rest = relfile[index:]
460 logging.debug(
461 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
462 (base_dir, relfile, base, symlink, rest))
463 return base, symlink, rest
464 if index == len(relfile):
465 break
466 index += 1
467 return relfile, None, None
468
469
470def fix_python_path(cmd):
471 """Returns the fixed command line to call the right python executable."""
472 out = cmd[:]
473 if out[0] == 'python':
474 out[0] = sys.executable
475 elif out[0].endswith('.py'):
476 out.insert(0, sys.executable)
477 return out
478
479
480def create_thunk():
481 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
482 os.write(
483 handle,
484 (
485 'import subprocess\n'
486 'import sys\n'
487 'sys.exit(subprocess.call(sys.argv[2:]))\n'
488 ))
489 os.close(handle)
490 return name
491
492
493def strace_process_quoted_arguments(text):
494 """Extracts quoted arguments on a string and return the arguments as a list.
495
496 Implemented as an automaton. Supports incomplete strings in the form
497 '"foo"...'.
498
499 Example:
500 With text = '"foo", "bar"', the function will return ['foo', 'bar']
501
502 TODO(maruel): Implement escaping.
503 """
504 # All the possible states of the DFA.
505 ( NEED_QUOTE, # Begining of a new arguments.
506 INSIDE_STRING, # Inside an argument.
507 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
508 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
509 # a serie of 3 dots or a comma.
510 NEED_SPACE, # Right after a comma
511 NEED_DOT_2, # Found a dot, need a second one.
512 NEED_DOT_3, # Found second dot, need a third one.
513 NEED_COMMA, # Found third dot, need a comma.
514 ) = range(8)
515
516 state = NEED_QUOTE
517 out = []
518 for index, char in enumerate(text):
519 if char == '"':
520 if state == NEED_QUOTE:
521 state = INSIDE_STRING
522 # A new argument was found.
523 out.append('')
524 elif state == INSIDE_STRING:
525 # The argument is now closed.
526 state = NEED_COMMA_OR_DOT
527 elif state == ESCAPED:
528 out[-1] += char
529 state = INSIDE_STRING
530 else:
531 raise ValueError(
532 'Can\'t process char at column %d for: %r' % (index, text),
533 index,
534 text)
535 elif char == ',':
536 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
537 state = NEED_SPACE
538 elif state == INSIDE_STRING:
539 out[-1] += char
540 elif state == ESCAPED:
541 out[-1] += char
542 state = INSIDE_STRING
543 else:
544 raise ValueError(
545 'Can\'t process char at column %d for: %r' % (index, text),
546 index,
547 text)
548 elif char == ' ':
549 if state == NEED_SPACE:
550 state = NEED_QUOTE
551 elif state == INSIDE_STRING:
552 out[-1] += char
553 elif state == ESCAPED:
554 out[-1] += char
555 state = INSIDE_STRING
556 else:
557 raise ValueError(
558 'Can\'t process char at column %d for: %r' % (index, text),
559 index,
560 text)
561 elif char == '.':
562 if state == NEED_COMMA_OR_DOT:
563 # The string is incomplete, this mean the strace -s flag should be
564 # increased.
565 state = NEED_DOT_2
566 elif state == NEED_DOT_2:
567 state = NEED_DOT_3
568 elif state == NEED_DOT_3:
569 state = NEED_COMMA
570 elif state == INSIDE_STRING:
571 out[-1] += char
572 elif state == ESCAPED:
573 out[-1] += char
574 state = INSIDE_STRING
575 else:
576 raise ValueError(
577 'Can\'t process char at column %d for: %r' % (index, text),
578 index,
579 text)
580 elif char == '\\':
581 if state == ESCAPED:
582 out[-1] += char
583 state = INSIDE_STRING
584 elif state == INSIDE_STRING:
585 state = ESCAPED
586 else:
587 raise ValueError(
588 'Can\'t process char at column %d for: %r' % (index, text),
589 index,
590 text)
591 else:
592 if state == INSIDE_STRING:
593 out[-1] += char
594 else:
595 raise ValueError(
596 'Can\'t process char at column %d for: %r' % (index, text),
597 index,
598 text)
599 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
600 raise ValueError(
601 'String is incorrectly terminated: %r' % text,
602 text)
603 return out
604
605
606def read_json(filepath):
607 with open(filepath, 'r') as f:
608 return json.load(f)
609
610
611def write_json(filepath_or_handle, data, dense):
612 """Writes data into filepath or file handle encoded as json.
613
614 If dense is True, the json is packed. Otherwise, it is human readable.
615 """
616 if hasattr(filepath_or_handle, 'write'):
617 if dense:
618 filepath_or_handle.write(
619 json.dumps(data, sort_keys=True, separators=(',',':')))
620 else:
621 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
622 else:
623 with open(filepath_or_handle, 'wb') as f:
624 if dense:
625 json.dump(data, f, sort_keys=True, separators=(',',':'))
626 else:
627 json.dump(data, f, sort_keys=True, indent=2)
628
629
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000630def assert_is_renderable(pseudo_string):
631 """Asserts the input is a valid object to be processed by render()."""
632 assert (
633 isinstance(pseudo_string, (None.__class__, unicode)) or
634 hasattr(pseudo_string, 'render')), repr(pseudo_string)
635
636
637def render(pseudo_string):
638 """Converts the pseudo-string to an unicode string."""
639 assert_is_renderable(pseudo_string)
640 if isinstance(pseudo_string, (None.__class__, unicode)):
641 return pseudo_string
642 return pseudo_string.render()
643
644
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000645class Results(object):
646 """Results of a trace session."""
647
648 class _TouchedObject(object):
649 """Something, a file or a directory, that was accessed."""
650 def __init__(self, root, path, tainted, size, nb_files):
651 logging.debug(
652 '%s(%s, %s, %s, %s, %s)' %
653 (self.__class__.__name__, root, path, tainted, size, nb_files))
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000654 assert_is_renderable(root)
655 assert_is_renderable(path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000656 self.root = root
657 self.path = path
658 self.tainted = tainted
659 self.nb_files = nb_files
660 # Can be used as a cache or a default value, depending on context.
661 self._size = size
662 # These are cache only.
663 self._real_path = None
664
665 # Check internal consistency.
666 assert path, path
667 assert tainted or bool(root) != bool(isabs(path)), (root, path)
668 assert tainted or (
669 not os.path.exists(self.full_path) or
670 (self.full_path == get_native_path_case(self.full_path))), (
671 tainted, self.full_path, get_native_path_case(self.full_path))
672
673 @property
674 def existent(self):
675 return self.size != -1
676
677 @property
678 def full_path(self):
679 if self.root:
680 return os.path.join(self.root, self.path)
681 return self.path
682
683 @property
684 def real_path(self):
685 """Returns the path with symlinks resolved."""
686 if not self._real_path:
687 self._real_path = os.path.realpath(self.full_path)
688 return self._real_path
689
690 @property
691 def size(self):
692 """File's size. -1 is not existent."""
693 if self._size is None and not self.tainted:
694 try:
695 self._size = os.stat(self.full_path).st_size
696 except OSError:
697 self._size = -1
698 return self._size
699
700 def flatten(self):
701 """Returns a dict representing this object.
702
703 A 'size' of 0 means the file was only touched and not read.
704 """
705 return {
706 'path': self.path,
707 'size': self.size,
708 }
709
710 def replace_variables(self, variables):
711 """Replaces the root of this File with one of the variables if it matches.
712
713 If a variable replacement occurs, the cloned object becomes tainted.
714 """
715 for variable, root_path in variables.iteritems():
716 if self.path.startswith(root_path):
717 return self._clone(
718 self.root, variable + self.path[len(root_path):], True)
719 # No need to clone, returns ourself.
720 return self
721
722 def strip_root(self, root):
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000723 """Returns a clone of itself with 'root' stripped off.
724
725 Note that the file is kept if it is either accessible from a symlinked
726 path that was used to access the file or through the real path.
727 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000728 # Check internal consistency.
729 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
730 if not self.full_path.startswith(root):
731 # Now try to resolve the symlinks to see if it can be reached this way.
732 # Only try *after* trying without resolving symlink.
733 if not self.real_path.startswith(root):
734 return None
735 path = self.real_path
736 else:
737 path = self.full_path
738 return self._clone(root, path[len(root):], self.tainted)
739
740 def _clone(self, new_root, new_path, tainted):
741 raise NotImplementedError(self.__class__.__name__)
742
743 class File(_TouchedObject):
744 """A file that was accessed. May not be present anymore.
745
746 If tainted is true, it means it is not a real path anymore as a variable
747 replacement occured.
748
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000749 If only_touched is True, this means the file was probed for existence, and
750 it is existent, but was never _opened_. If only_touched is True, the file
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000751 must have existed.
752 """
753 def __init__(self, root, path, tainted, size):
754 super(Results.File, self).__init__(root, path, tainted, size, 1)
755
756 def _clone(self, new_root, new_path, tainted):
757 """Clones itself keeping meta-data."""
758 # Keep the self.size and self._real_path caches for performance reason. It
759 # is also important when the file becomes tainted (with a variable instead
760 # of the real path) since self.path is not an on-disk path anymore so
761 # out._size cannot be updated.
762 out = self.__class__(new_root, new_path, tainted, self.size)
763 out._real_path = self._real_path
764 return out
765
766 class Directory(_TouchedObject):
767 """A directory of files. Must exist."""
768 def __init__(self, root, path, tainted, size, nb_files):
769 """path='.' is a valid value and must be handled appropriately."""
770 assert not path.endswith(os.path.sep), path
771 super(Results.Directory, self).__init__(
772 root, path + os.path.sep, tainted, size, nb_files)
773 # For a Directory instance, self.size is not a cache, it's an actual value
774 # that is never modified and represents the total size of the files
775 # contained in this directory. It is possible that the directory is empty
776 # so that size == 0; this happens if there's only an invalid symlink in
777 # it.
778
779 def flatten(self):
780 out = super(Results.Directory, self).flatten()
781 out['nb_files'] = self.nb_files
782 return out
783
784 def _clone(self, new_root, new_path, tainted):
785 """Clones itself keeping meta-data."""
786 out = self.__class__(
787 new_root,
788 new_path.rstrip(os.path.sep),
789 tainted,
790 self.size,
791 self.nb_files)
792 out._real_path = self._real_path
793 return out
794
795 class Process(object):
796 """A process that was traced.
797
798 Contains references to the files accessed by this process and its children.
799 """
800 def __init__(self, pid, files, executable, command, initial_cwd, children):
801 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
802 self.pid = pid
803 self.files = sorted(files, key=lambda x: x.path)
804 self.children = children
805 self.executable = executable
806 self.command = command
807 self.initial_cwd = initial_cwd
808
809 # Check internal consistency.
810 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
811 f.path for f in self.files)
812 assert isinstance(self.children, list)
813 assert isinstance(self.files, list)
814
815 @property
816 def all(self):
817 for child in self.children:
818 for i in child.all:
819 yield i
820 yield self
821
822 def flatten(self):
823 return {
824 'children': [c.flatten() for c in self.children],
825 'command': self.command,
826 'executable': self.executable,
827 'files': [f.flatten() for f in self.files],
828 'initial_cwd': self.initial_cwd,
829 'pid': self.pid,
830 }
831
832 def strip_root(self, root):
833 assert isabs(root) and root.endswith(os.path.sep), root
834 # Loads the files after since they are constructed as objects.
835 out = self.__class__(
836 self.pid,
837 filter(None, (f.strip_root(root) for f in self.files)),
838 self.executable,
839 self.command,
840 self.initial_cwd,
841 [c.strip_root(root) for c in self.children])
842 logging.debug(
843 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
844 return out
845
846 def __init__(self, process):
847 self.process = process
848 # Cache.
849 self._files = None
850
851 def flatten(self):
852 return {
853 'root': self.process.flatten(),
854 }
855
856 @property
857 def files(self):
858 if self._files is None:
859 self._files = sorted(
860 sum((p.files for p in self.process.all), []),
861 key=lambda x: x.path)
862 return self._files
863
864 @property
865 def existent(self):
866 return [f for f in self.files if f.existent]
867
868 @property
869 def non_existent(self):
870 return [f for f in self.files if not f.existent]
871
872 def strip_root(self, root):
873 """Returns a clone with all the files outside the directory |root| removed
874 and converts all the path to be relative paths.
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000875
876 It keeps files accessible through the |root| directory or that have been
877 accessed through any symlink which points to the same directory.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000878 """
879 # Resolve any symlink
880 root = os.path.realpath(root)
881 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
882 logging.debug('strip_root(%s)' % root)
883 return Results(self.process.strip_root(root))
884
885
886class ApiBase(object):
887 """OS-agnostic API to trace a process and its children."""
888 class Context(object):
889 """Processes one log line at a time and keeps the list of traced processes.
890
891 The parsing is complicated by the fact that logs are traced out of order for
892 strace but in-order for dtrace and logman. In addition, on Windows it is
893 very frequent that processids are reused so a flat list cannot be used. But
894 at the same time, it is impossible to faithfully construct a graph when the
895 logs are processed out of order. So both a tree and a flat mapping are used,
896 the tree is the real process tree, while the flat mapping stores the last
897 valid process for the corresponding processid. For the strace case, the
898 tree's head is guessed at the last moment.
899 """
900 class Process(object):
901 """Keeps context for one traced child process.
902
903 Logs all the files this process touched. Ignores directories.
904 """
905 def __init__(self, blacklist, pid, initial_cwd):
906 # Check internal consistency.
907 assert isinstance(pid, int), repr(pid)
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000908 assert_is_renderable(initial_cwd)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000909 self.pid = pid
910 # children are Process instances.
911 self.children = []
912 self.initial_cwd = initial_cwd
913 self.cwd = None
914 self.files = set()
915 self.only_touched = set()
916 self.executable = None
917 self.command = None
918 self._blacklist = blacklist
919
920 def to_results_process(self):
921 """Resolves file case sensitivity and or late-bound strings."""
922 # When resolving files, it's normal to get dupe because a file could be
923 # opened multiple times with different case. Resolve the deduplication
924 # here.
925 def fix_path(x):
926 """Returns the native file path case.
927
928 Converts late-bound strings.
929 """
930 if not x:
931 # Do not convert None instance to 'None'.
932 return x
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000933 x = render(x)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000934 if os.path.isabs(x):
935 # If the path is not absolute, which tends to happen occasionally on
936 # Windows, it is not possible to get the native path case so ignore
937 # that trace. It mostly happens for 'executable' value.
938 x = get_native_path_case(x)
939 return x
940
941 def fix_and_blacklist_path(x):
942 x = fix_path(x)
943 if not x:
944 return
945 # The blacklist needs to be reapplied, since path casing could
946 # influence blacklisting.
947 if self._blacklist(x):
948 return
949 return x
950
951 # Filters out directories. Some may have passed through.
952 files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
953 only_touched = set(
954 f for f in map(fix_and_blacklist_path, self.only_touched) if f)
955 only_touched -= files
956
957 files = [
958 Results.File(None, f, False, None) for f in files
959 if not os.path.isdir(f)
960 ]
961 # Using 0 as size means the file's content is ignored since the file was
962 # never opened for I/O.
963 files.extend(
964 Results.File(None, f, False, 0) for f in only_touched
965 if not os.path.isdir(f)
966 )
967 return Results.Process(
968 self.pid,
969 files,
970 fix_path(self.executable),
971 self.command,
972 fix_path(self.initial_cwd),
973 [c.to_results_process() for c in self.children])
974
975 def add_file(self, filepath, touch_only):
976 """Adds a file if it passes the blacklist."""
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000977 if self._blacklist(render(filepath)):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000978 return
979 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000980 # Note that filepath and not render(filepath) is added. It is
981 # because filepath could be something else than a string, like a
982 # RelativePath instance for dtrace logs.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000983 if touch_only:
984 self.only_touched.add(filepath)
985 else:
986 self.files.add(filepath)
987
988 def __init__(self, blacklist):
989 self.blacklist = blacklist
990 # Initial process.
991 self.root_process = None
992 # dict to accelerate process lookup, to not have to lookup the whole graph
993 # each time.
994 self._process_lookup = {}
995
996 class Tracer(object):
997 """During it's lifetime, the tracing subsystem is enabled."""
998 def __init__(self, logname):
999 self._logname = logname
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001000 self._lock = threading.RLock()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001001 self._traces = []
1002 self._initialized = True
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001003 self._script = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001004
1005 def trace(self, cmd, cwd, tracename, output):
1006 """Runs the OS-specific trace program on an executable.
1007
1008 Arguments:
1009 - cmd: The command (a list) to run.
1010 - cwd: Current directory to start the child process in.
1011 - tracename: Name of the trace in the logname file.
1012 - output: If False, redirects output to PIPEs.
1013
1014 Returns a tuple (resultcode, output) and updates the internal trace
1015 entries.
1016 """
1017 # The implementation adds an item to self._traces.
1018 raise NotImplementedError(self.__class__.__name__)
1019
1020 def close(self, _timeout=None):
1021 """Saves the meta-data in the logname file.
1022
1023 For kernel-based tracing, stops the tracing subsystem.
1024
1025 Must not be used manually when using 'with' construct.
1026 """
1027 with self._lock:
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001028 if not self._initialized:
1029 raise TracingFailure(
1030 'Called %s.close() on an unitialized object' %
1031 self.__class__.__name__,
1032 None, None, None)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001033 try:
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001034 if self._script:
1035 os.remove(self._script)
1036 self._script = None
1037 write_json(self._logname, self._gen_logdata(), False)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001038 finally:
1039 self._initialized = False
1040
1041 def post_process_log(self):
1042 """Post-processes the log so it becomes faster to load afterward.
1043
1044 Must not be used manually when using 'with' construct.
1045 """
1046 assert not self._initialized, 'Must stop tracing first.'
1047
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001048 def _gen_logdata(self):
1049 """Returns the data to be saved in the trace file."""
1050 return {
1051 'traces': self._traces,
1052 }
1053
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001054 def __enter__(self):
1055 """Enables 'with' statement."""
1056 return self
1057
1058 def __exit__(self, exc_type, exc_value, traceback):
1059 """Enables 'with' statement."""
1060 self.close()
1061 # If an exception was thrown, do not process logs.
1062 if not exc_type:
1063 self.post_process_log()
1064
1065 def get_tracer(self, logname):
1066 """Returns an ApiBase.Tracer instance.
1067
1068 Initializes the tracing subsystem, which is a requirement for kernel-based
1069 tracers. Only one tracer instance should be live at a time!
1070
1071 logname is the filepath to the json file that will contain the meta-data
1072 about the logs.
1073 """
1074 return self.Tracer(logname)
1075
1076 @staticmethod
1077 def clean_trace(logname):
1078 """Deletes an old log."""
1079 raise NotImplementedError()
1080
1081 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001082 def parse_log(cls, logname, blacklist, trace_name):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001083 """Processes trace logs and returns the files opened and the files that do
1084 not exist.
1085
1086 It does not track directories.
1087
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001088 Arguments:
1089 - logname: must be an absolute path.
1090 - blacklist: must be a lambda.
1091 - trace_name: optional trace to read, defaults to reading all traces.
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001092
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001093 Most of the time, files that do not exist are temporary test files that
1094 should be put in /tmp instead. See http://crbug.com/116251.
1095
1096 Returns a list of dict with keys:
1097 - results: A Results instance.
1098 - trace: The corresponding tracename parameter provided to
1099 get_tracer().trace().
1100 - output: Output gathered during execution, if get_tracer().trace(...,
1101 output=False) was used.
1102 """
1103 raise NotImplementedError(cls.__class__.__name__)
1104
1105
1106class Strace(ApiBase):
1107 """strace implies linux."""
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001108 @staticmethod
1109 def load_filename(filename):
1110 """Parses a filename in a log."""
1111 assert isinstance(filename, str)
1112 out = ''
1113 i = 0
1114 while i < len(filename):
1115 c = filename[i]
1116 if c == '\\':
1117 out += chr(int(filename[i+1:i+4], 8))
1118 i += 4
1119 else:
1120 out += c
1121 i += 1
1122 # TODO(maruel): That's not necessarily true that the current code page is
1123 # utf-8.
1124 return out.decode('utf-8')
1125
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001126 class Context(ApiBase.Context):
1127 """Processes a strace log line and keeps the list of existent and non
1128 existent files accessed.
1129
1130 Ignores directories.
1131
1132 Uses late-binding to processes the cwd of each process. The problem is that
1133 strace generates one log file per process it traced but doesn't give any
1134 information about which process was started when and by who. So we don't
1135 even know which process is the initial one. So process the logs out of
1136 order and use late binding with RelativePath to be able to deduce the
1137 initial directory of each process once all the logs are parsed.
1138 """
1139 class Process(ApiBase.Context.Process):
1140 """Represents the state of a process.
1141
1142 Contains all the information retrieved from the pid-specific log.
1143 """
1144 # Function names are using ([a-z_0-9]+)
1145 # This is the most common format. function(args) = result
1146 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$')
1147 # An interrupted function call, only grab the minimal header.
1148 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
1149 # A resumed function call.
1150 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1151 # A process received a signal.
1152 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1153 # A process didn't handle a signal. Ignore any junk appearing before,
1154 # because the process was forcibly killed so it won't open any new file.
1155 RE_KILLED = re.compile(
1156 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1157 # The process has exited.
1158 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1159 # A call was canceled. Ignore any prefix.
1160 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1161 # Happens when strace fails to even get the function name.
1162 UNNAMED_FUNCTION = '????'
1163
1164 # Corner-case in python, a class member function decorator must not be
1165 # @staticmethod.
1166 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1167 """Automatically convert the str 'args' into a list of processed
1168 arguments.
1169
1170 Arguments:
1171 - regexp is used to parse args.
1172 - expect_zero: one of True, False or None.
1173 - True: will check for result.startswith('0') first and will ignore
1174 the trace line completely otherwise. This is important because for
1175 many functions, the regexp will not process if the call failed.
1176 - False: will check for not result.startswith(('?', '-1')) for the
1177 same reason than with True.
1178 - None: ignore result.
1179 """
1180 def meta_hook(function):
1181 assert function.__name__.startswith('handle_')
1182 def hook(self, args, result):
1183 if expect_zero is True and not result.startswith('0'):
1184 return
1185 if expect_zero is False and result.startswith(('?', '-1')):
1186 return
1187 match = re.match(regexp, args)
1188 if not match:
1189 raise TracingFailure(
1190 'Failed to parse %s(%s) = %s' %
1191 (function.__name__[len('handle_'):], args, result),
1192 None, None, None)
1193 return function(self, match.groups(), result)
1194 return hook
1195 return meta_hook
1196
1197 class RelativePath(object):
1198 """A late-bound relative path."""
1199 def __init__(self, parent, value):
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001200 assert_is_renderable(parent)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001201 self.parent = parent
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001202 assert isinstance(value, (None.__class__, str)), repr(value)
1203 self.value = Strace.load_filename(value) if value else value
1204 if self.value:
1205 assert '\\' not in self.value, value
1206 assert '\\' not in self.value, (repr(value), repr(self.value))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001207
1208 def render(self):
1209 """Returns the current directory this instance is representing.
1210
1211 This function is used to return the late-bound value.
1212 """
1213 if self.value and self.value.startswith(u'/'):
1214 # An absolute path.
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001215 # TODO(maruel): This is wrong, we can't assert it is utf-8.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001216 return self.value
1217 parent = self.parent.render() if self.parent else u'<None>'
1218 if self.value:
1219 return os.path.normpath(os.path.join(parent, self.value))
1220 return parent
1221
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001222 def __init__(self, root, pid):
1223 """Keeps enough information to be able to guess the original process
1224 root.
1225
1226 strace doesn't store which process was the initial process. So more
1227 information needs to be kept so the graph can be reconstructed from the
1228 flat map.
1229 """
1230 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1231 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1232 assert isinstance(root, ApiBase.Context)
1233 self._root = weakref.ref(root)
1234 # The dict key is the function name of the pending call, like 'open'
1235 # or 'execve'.
1236 self._pending_calls = {}
1237 self._line_number = 0
1238 # Current directory when the process started.
1239 self.initial_cwd = self.RelativePath(self._root(), None)
1240 self.parentid = None
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001241 self._done = False
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001242
1243 def get_cwd(self):
1244 """Returns the best known value of cwd."""
1245 return self.cwd or self.initial_cwd
1246
1247 def render(self):
1248 """Returns the string value of the RelativePath() object.
1249
1250 Used by RelativePath. Returns the initial directory and not the
1251 current one since the current directory 'cwd' validity is time-limited.
1252
1253 The validity is only guaranteed once all the logs are processed.
1254 """
1255 return self.initial_cwd.render()
1256
1257 def on_line(self, line):
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001258 assert isinstance(line, str)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001259 self._line_number += 1
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001260 if self._done:
1261 raise TracingFailure(
maruel@chromium.orgbcfc9e12013-03-19 18:47:17 +00001262 'Found a trace for a terminated process or corrupted log',
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001263 None, None, None)
1264
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001265 if self.RE_SIGNAL.match(line):
1266 # Ignore signals.
1267 return
1268
1269 try:
1270 match = self.RE_KILLED.match(line)
1271 if match:
1272 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1273 self.handle_exit_group(match.group(1), None)
1274 return
1275
1276 match = self.RE_PROCESS_EXITED.match(line)
1277 if match:
1278 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1279 self.handle_exit_group(match.group(1), None)
1280 return
1281
1282 match = self.RE_UNFINISHED.match(line)
1283 if match:
1284 if match.group(1) in self._pending_calls:
1285 raise TracingFailure(
1286 'Found two unfinished calls for the same function',
1287 None, None, None,
1288 self._pending_calls)
1289 self._pending_calls[match.group(1)] = (
1290 match.group(1) + match.group(2))
1291 return
1292
1293 match = self.RE_UNAVAILABLE.match(line)
1294 if match:
1295 # This usually means a process was killed and a pending call was
1296 # canceled.
1297 # TODO(maruel): Look up the last exit_group() trace just above and
1298 # make sure any self._pending_calls[anything] is properly flushed.
1299 return
1300
1301 match = self.RE_RESUMED.match(line)
1302 if match:
1303 if match.group(1) not in self._pending_calls:
1304 raise TracingFailure(
1305 'Found a resumed call that was not logged as unfinished',
1306 None, None, None,
1307 self._pending_calls)
1308 pending = self._pending_calls.pop(match.group(1))
1309 # Reconstruct the line.
1310 line = pending + match.group(2)
1311
1312 match = self.RE_HEADER.match(line)
1313 if not match:
maruel@chromium.orgbcfc9e12013-03-19 18:47:17 +00001314 # The line is corrupted. It happens occasionally when a process is
1315 # killed forcibly with activity going on. Assume the process died.
1316 # No other line can be processed afterward.
1317 self._done = True
1318 return
1319
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001320 if match.group(1) == self.UNNAMED_FUNCTION:
1321 return
1322
1323 # It's a valid line, handle it.
1324 handler = getattr(self, 'handle_%s' % match.group(1), None)
1325 if not handler:
1326 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1327 return handler(match.group(2), match.group(3))
1328 except TracingFailure, e:
1329 # Hack in the values since the handler could be a static function.
1330 e.pid = self.pid
1331 e.line = line
1332 e.line_number = self._line_number
1333 # Re-raise the modified exception.
1334 raise
1335 except (KeyError, NotImplementedError, ValueError), e:
1336 raise TracingFailure(
1337 'Trace generated a %s exception: %s' % (
1338 e.__class__.__name__, str(e)),
1339 self.pid,
1340 self._line_number,
1341 line,
1342 e)
1343
1344 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1345 def handle_access(self, args, _result):
1346 self._handle_file(args[0], True)
1347
1348 @parse_args(r'^\"(.+?)\"$', True)
1349 def handle_chdir(self, args, _result):
1350 """Updates cwd."""
1351 self.cwd = self.RelativePath(self, args[0])
1352 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1353
1354 def handle_clone(self, _args, result):
1355 """Transfers cwd."""
1356 if result.startswith(('?', '-1')):
1357 # The call failed.
1358 return
1359 # Update the other process right away.
1360 childpid = int(result)
1361 child = self._root().get_or_set_proc(childpid)
1362 if child.parentid is not None or childpid in self.children:
1363 raise TracingFailure(
1364 'Found internal inconsitency in process lifetime detection '
1365 'during a clone() call',
1366 None, None, None)
1367
1368 # Copy the cwd object.
1369 child.initial_cwd = self.get_cwd()
1370 child.parentid = self.pid
1371 # It is necessary because the logs are processed out of order.
1372 self.children.append(child)
1373
1374 def handle_close(self, _args, _result):
1375 pass
1376
1377 def handle_chmod(self, _args, _result):
1378 pass
1379
1380 def handle_creat(self, _args, _result):
1381 # Ignore files created, since they didn't need to exist.
1382 pass
1383
1384 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1385 def handle_execve(self, args, _result):
1386 # Even if in practice execve() doesn't returns when it succeeds, strace
1387 # still prints '0' as the result.
1388 filepath = args[0]
1389 self._handle_file(filepath, False)
1390 self.executable = self.RelativePath(self.get_cwd(), filepath)
1391 self.command = strace_process_quoted_arguments(args[1])
1392
1393 def handle_exit_group(self, _args, _result):
1394 """Removes cwd."""
1395 self.cwd = None
1396
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001397 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True)
1398 def handle_faccessat(self, args, _results):
1399 if args[0] == 'AT_FDCWD':
1400 self._handle_file(args[1], True)
1401 else:
1402 raise Exception('Relative faccess not implemented.')
1403
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001404 def handle_fork(self, args, result):
1405 self._handle_unknown('fork', args, result)
1406
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001407 def handle_futex(self, _args, _result):
1408 pass
1409
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001410 def handle_getcwd(self, _args, _result):
1411 pass
1412
1413 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1414 def handle_link(self, args, _result):
1415 self._handle_file(args[0], False)
1416 self._handle_file(args[1], False)
1417
1418 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1419 def handle_lstat(self, args, _result):
1420 self._handle_file(args[0], True)
1421
1422 def handle_mkdir(self, _args, _result):
1423 pass
1424
1425 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1426 def handle_open(self, args, _result):
1427 if 'O_DIRECTORY' in args[1]:
1428 return
1429 self._handle_file(args[0], False)
1430
1431 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1432 def handle_openat(self, args, _result):
1433 if 'O_DIRECTORY' in args[2]:
1434 return
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001435 if args[0] == 'AT_FDCWD':
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001436 self._handle_file(args[1], False)
1437 else:
1438 # TODO(maruel): Implement relative open if necessary instead of the
1439 # AT_FDCWD flag, let's hope not since this means tracking all active
1440 # directory handles.
1441 raise Exception('Relative open via openat not implemented.')
1442
1443 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1444 def handle_readlink(self, args, _result):
1445 self._handle_file(args[0], False)
1446
1447 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1448 def handle_rename(self, args, _result):
1449 self._handle_file(args[0], False)
1450 self._handle_file(args[1], False)
1451
1452 def handle_rmdir(self, _args, _result):
1453 pass
1454
1455 def handle_setxattr(self, _args, _result):
1456 pass
1457
1458 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1459 def handle_stat(self, args, _result):
1460 self._handle_file(args[0], True)
1461
1462 def handle_symlink(self, _args, _result):
1463 pass
1464
csharp@chromium.orgfa672922012-11-02 17:43:31 +00001465 @parse_args(r'^\"(.+?)\", \d+', True)
1466 def handle_truncate(self, args, _result):
1467 self._handle_file(args[0], False)
1468
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001469 def handle_unlink(self, _args, _result):
1470 # In theory, the file had to be created anyway.
1471 pass
1472
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001473 def handle_unlinkat(self, _args, _result):
1474 # In theory, the file had to be created anyway.
1475 pass
1476
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001477 def handle_statfs(self, _args, _result):
1478 pass
1479
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001480 def handle_utimensat(self, _args, _result):
1481 pass
1482
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001483 def handle_vfork(self, args, result):
1484 self._handle_unknown('vfork', args, result)
1485
1486 @staticmethod
1487 def _handle_unknown(function, args, result):
1488 raise TracingFailure(
1489 'Unexpected/unimplemented trace %s(%s)= %s' %
1490 (function, args, result),
1491 None, None, None)
1492
1493 def _handle_file(self, filepath, touch_only):
1494 filepath = self.RelativePath(self.get_cwd(), filepath)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001495 #assert not touch_only, render(filepath)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001496 self.add_file(filepath, touch_only)
1497
1498 def __init__(self, blacklist, initial_cwd):
1499 super(Strace.Context, self).__init__(blacklist)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001500 assert_is_renderable(initial_cwd)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001501 self.initial_cwd = initial_cwd
1502
1503 def render(self):
1504 """Returns the string value of the initial cwd of the root process.
1505
1506 Used by RelativePath.
1507 """
1508 return self.initial_cwd
1509
1510 def on_line(self, pid, line):
1511 """Transfers control into the Process.on_line() function."""
1512 self.get_or_set_proc(pid).on_line(line.strip())
1513
1514 def to_results(self):
1515 """Finds back the root process and verify consistency."""
1516 # TODO(maruel): Absolutely unecessary, fix me.
1517 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1518 if len(root) != 1:
1519 raise TracingFailure(
1520 'Found internal inconsitency in process lifetime detection '
1521 'while finding the root process',
1522 None,
1523 None,
1524 None,
1525 sorted(p.pid for p in root))
1526 self.root_process = root[0]
1527 process = self.root_process.to_results_process()
1528 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1529 raise TracingFailure(
1530 'Found internal inconsitency in process lifetime detection '
1531 'while looking for len(tree) == len(list)',
1532 None,
1533 None,
1534 None,
1535 sorted(self._process_lookup),
1536 sorted(p.pid for p in process.all))
1537 return Results(process)
1538
1539 def get_or_set_proc(self, pid):
1540 """Returns the Context.Process instance for this pid or creates a new one.
1541 """
1542 if not pid or not isinstance(pid, int):
1543 raise TracingFailure(
1544 'Unpexpected value for pid: %r' % pid,
1545 pid,
1546 None,
1547 None,
1548 pid)
1549 if pid not in self._process_lookup:
1550 self._process_lookup[pid] = self.Process(self, pid)
1551 return self._process_lookup[pid]
1552
1553 @classmethod
1554 def traces(cls):
1555 """Returns the list of all handled traces to pass this as an argument to
1556 strace.
1557 """
1558 prefix = 'handle_'
1559 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1560
1561 class Tracer(ApiBase.Tracer):
1562 MAX_LEN = 256
1563
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00001564 def __init__(self, logname, use_sudo):
1565 super(Strace.Tracer, self).__init__(logname)
1566 self.use_sudo = use_sudo
1567
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001568 def trace(self, cmd, cwd, tracename, output):
1569 """Runs strace on an executable."""
1570 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1571 assert os.path.isabs(cmd[0]), cmd[0]
1572 assert os.path.isabs(cwd), cwd
1573 assert os.path.normpath(cwd) == cwd, cwd
1574 with self._lock:
1575 if not self._initialized:
1576 raise TracingFailure(
1577 'Called Tracer.trace() on an unitialized object',
1578 None, None, None, tracename)
1579 assert tracename not in (i['trace'] for i in self._traces)
1580 stdout = stderr = None
1581 if output:
1582 stdout = subprocess.PIPE
1583 stderr = subprocess.STDOUT
1584 # Ensure all file related APIs are hooked.
1585 traces = ','.join(Strace.Context.traces() + ['file'])
1586 trace_cmd = [
1587 'strace',
1588 '-ff',
1589 '-s', '%d' % self.MAX_LEN,
1590 '-e', 'trace=%s' % traces,
1591 '-o', self._logname + '.' + tracename,
1592 ]
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00001593 if self.use_sudo is True:
1594 trace_cmd.insert(0, 'sudo')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001595 child = subprocess.Popen(
1596 trace_cmd + cmd,
1597 cwd=cwd,
1598 stdin=subprocess.PIPE,
1599 stdout=stdout,
1600 stderr=stderr)
1601 out = child.communicate()[0]
1602 # TODO(maruel): Walk the logs and figure out the root process would
1603 # simplify parsing the logs a *lot*.
1604 with self._lock:
1605 assert tracename not in (i['trace'] for i in self._traces)
1606 self._traces.append(
1607 {
1608 'cmd': cmd,
1609 'cwd': cwd,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00001610 'output': out,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001611 # The pid of strace process, not very useful.
1612 'pid': child.pid,
1613 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001614 })
1615 return child.returncode, out
1616
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00001617 def __init__(self, use_sudo=None):
1618 super(Strace, self).__init__()
1619 self.use_sudo = use_sudo
1620
1621 def get_tracer(self, logname):
1622 return self.Tracer(logname, self.use_sudo)
1623
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001624 @staticmethod
1625 def clean_trace(logname):
1626 if os.path.isfile(logname):
1627 os.remove(logname)
1628 # Also delete any pid specific file from previous traces.
1629 for i in glob.iglob(logname + '.*'):
1630 if i.rsplit('.', 1)[1].isdigit():
1631 os.remove(i)
1632
1633 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001634 def parse_log(cls, logname, blacklist, trace_name):
1635 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001636 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001637 data = read_json(logname)
1638 out = []
1639 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001640 if trace_name and item['trace'] != trace_name:
1641 continue
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001642 result = {
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001643 'output': item['output'],
maruel@chromium.org964ddde2012-10-29 18:13:46 +00001644 'trace': item['trace'],
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001645 }
1646 try:
1647 context = cls.Context(blacklist, item['cwd'])
1648 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1649 pid = pidfile.rsplit('.', 1)[1]
1650 if pid.isdigit():
1651 pid = int(pid)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001652 for line in open(pidfile, 'rb'):
1653 context.on_line(pid, line)
1654 result['results'] = context.to_results()
maruel@chromium.org424f5692012-10-15 15:42:38 +00001655 except TracingFailure:
1656 result['exception'] = sys.exc_info()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001657 out.append(result)
1658 return out
1659
1660
1661class Dtrace(ApiBase):
1662 """Uses DTrace framework through dtrace. Requires root access.
1663
1664 Implies Mac OSX.
1665
1666 dtruss can't be used because it has compatibility issues with python.
1667
1668 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1669 get the absolute path of the 'cwd' dtrace variable from the probe.
1670
1671 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1672 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1673 manually.
1674
1675 errno is not printed in the log since this implementation currently only cares
1676 about files that were successfully opened.
1677 """
1678 class Context(ApiBase.Context):
1679 # Format: index pid function(args)
1680 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1681
1682 # Arguments parsing.
1683 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1684 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1685 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1686 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1687 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1688 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1689
1690 O_DIRECTORY = 0x100000
1691
1692 class Process(ApiBase.Context.Process):
1693 def __init__(self, *args):
1694 super(Dtrace.Context.Process, self).__init__(*args)
1695 self.cwd = self.initial_cwd
1696
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001697 def __init__(self, blacklist, thunk_pid, initial_cwd):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001698 logging.info(
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001699 '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001700 super(Dtrace.Context, self).__init__(blacklist)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001701 assert isinstance(initial_cwd, unicode), initial_cwd
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001702 # Process ID of the temporary script created by create_thunk().
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001703 self._thunk_pid = thunk_pid
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001704 self._initial_cwd = initial_cwd
1705 self._line_number = 0
1706
1707 def on_line(self, line):
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001708 assert isinstance(line, unicode), line
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001709 self._line_number += 1
1710 match = self.RE_HEADER.match(line)
1711 if not match:
1712 raise TracingFailure(
1713 'Found malformed line: %s' % line,
1714 None,
1715 self._line_number,
1716 line)
1717 fn = getattr(
1718 self,
1719 'handle_%s' % match.group(2).replace('-', '_'),
1720 self._handle_ignored)
1721 # It is guaranteed to succeed because of the regexp. Or at least I thought
1722 # it would.
1723 pid = int(match.group(1))
1724 try:
1725 return fn(pid, match.group(3))
1726 except TracingFailure, e:
1727 # Hack in the values since the handler could be a static function.
1728 e.pid = pid
1729 e.line = line
1730 e.line_number = self._line_number
1731 # Re-raise the modified exception.
1732 raise
1733 except (KeyError, NotImplementedError, ValueError), e:
1734 raise TracingFailure(
1735 'Trace generated a %s exception: %s' % (
1736 e.__class__.__name__, str(e)),
1737 pid,
1738 self._line_number,
1739 line,
1740 e)
1741
1742 def to_results(self):
1743 process = self.root_process.to_results_process()
1744 # Internal concistency check.
1745 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1746 raise TracingFailure(
1747 'Found internal inconsitency in process lifetime detection '
1748 'while looking for len(tree) == len(list)',
1749 None,
1750 None,
1751 None,
1752 sorted(self._process_lookup),
1753 sorted(p.pid for p in process.all))
1754 return Results(process)
1755
1756 def handle_dtrace_BEGIN(self, _pid, args):
1757 if not self.RE_DTRACE_BEGIN.match(args):
1758 raise TracingFailure(
1759 'Found internal inconsitency in dtrace_BEGIN log line',
1760 None, None, None)
1761
1762 def handle_proc_start(self, pid, args):
1763 """Transfers cwd.
1764
1765 The dtrace script already takes care of only tracing the processes that
1766 are child of the traced processes so there is no need to verify the
1767 process hierarchy.
1768 """
1769 if pid in self._process_lookup:
1770 raise TracingFailure(
1771 'Found internal inconsitency in proc_start: %d started two times' %
1772 pid,
1773 None, None, None)
1774 match = self.RE_PROC_START.match(args)
1775 if not match:
1776 raise TracingFailure(
1777 'Failed to parse arguments: %s' % args,
1778 None, None, None)
1779 ppid = int(match.group(1))
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001780 if ppid == self._thunk_pid and not self.root_process:
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001781 proc = self.root_process = self.Process(
1782 self.blacklist, pid, self._initial_cwd)
1783 elif ppid in self._process_lookup:
1784 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1785 self._process_lookup[ppid].children.append(proc)
1786 else:
1787 # Another process tree, ignore.
1788 return
1789 self._process_lookup[pid] = proc
1790 logging.debug(
1791 'New child: %s -> %d cwd:%s' %
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001792 (ppid, pid, render(proc.initial_cwd)))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001793
1794 def handle_proc_exit(self, pid, _args):
1795 """Removes cwd."""
1796 if pid in self._process_lookup:
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001797 # self._thunk_pid is not traced itself and other traces run neither.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001798 self._process_lookup[pid].cwd = None
1799
1800 def handle_execve(self, pid, args):
1801 """Sets the process' executable.
1802
1803 TODO(maruel): Read command line arguments. See
1804 https://discussions.apple.com/thread/1980539 for an example.
1805 https://gist.github.com/1242279
1806
1807 Will have to put the answer at http://stackoverflow.com/questions/7556249.
1808 :)
1809 """
1810 if not pid in self._process_lookup:
1811 # Another process tree, ignore.
1812 return
1813 match = self.RE_EXECVE.match(args)
1814 if not match:
1815 raise TracingFailure(
1816 'Failed to parse arguments: %r' % args,
1817 None, None, None)
1818 proc = self._process_lookup[pid]
1819 proc.executable = match.group(1)
1820 proc.command = self.process_escaped_arguments(match.group(3))
1821 if int(match.group(2)) != len(proc.command):
1822 raise TracingFailure(
1823 'Failed to parse execve() arguments: %s' % args,
1824 None, None, None)
1825
1826 def handle_chdir(self, pid, args):
1827 """Updates cwd."""
1828 if pid not in self._process_lookup:
1829 # Another process tree, ignore.
1830 return
1831 cwd = self.RE_CHDIR.match(args).group(1)
1832 if not cwd.startswith('/'):
1833 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1834 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1835 else:
1836 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1837 cwd2 = cwd
1838 self._process_lookup[pid].cwd = cwd2
1839
1840 def handle_open_nocancel(self, pid, args):
1841 """Redirects to handle_open()."""
1842 return self.handle_open(pid, args)
1843
1844 def handle_open(self, pid, args):
1845 if pid not in self._process_lookup:
1846 # Another process tree, ignore.
1847 return
1848 match = self.RE_OPEN.match(args)
1849 if not match:
1850 raise TracingFailure(
1851 'Failed to parse arguments: %s' % args,
1852 None, None, None)
1853 flag = int(match.group(2), 16)
1854 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1855 # Ignore directories.
1856 return
1857 self._handle_file(pid, match.group(1))
1858
1859 def handle_rename(self, pid, args):
1860 if pid not in self._process_lookup:
1861 # Another process tree, ignore.
1862 return
1863 match = self.RE_RENAME.match(args)
1864 if not match:
1865 raise TracingFailure(
1866 'Failed to parse arguments: %s' % args,
1867 None, None, None)
1868 self._handle_file(pid, match.group(1))
1869 self._handle_file(pid, match.group(2))
1870
1871 def _handle_file(self, pid, filepath):
1872 if not filepath.startswith('/'):
1873 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1874 # We can get '..' in the path.
1875 filepath = os.path.normpath(filepath)
1876 # Sadly, still need to filter out directories here;
1877 # saw open_nocancel(".", 0, 0) = 0 lines.
1878 if os.path.isdir(filepath):
1879 return
1880 self._process_lookup[pid].add_file(filepath, False)
1881
1882 def handle_ftruncate(self, pid, args):
1883 """Just used as a signal to kill dtrace, ignoring."""
1884 pass
1885
1886 @staticmethod
1887 def _handle_ignored(pid, args):
1888 """Is called for all the event traces that are not handled."""
1889 raise NotImplementedError('Please implement me')
1890
1891 @staticmethod
1892 def process_escaped_arguments(text):
1893 """Extracts escaped arguments on a string and return the arguments as a
1894 list.
1895
1896 Implemented as an automaton.
1897
1898 Example:
1899 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
1900 function will return ['python2.7', '-c', 'print("hi")]
1901 """
1902 if not text.endswith('\\0'):
1903 raise ValueError('String is not null terminated: %r' % text, text)
1904 text = text[:-2]
1905
1906 def unescape(x):
1907 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
1908 out = []
1909 escaped = False
1910 for i in x:
1911 if i == '\\' and not escaped:
1912 escaped = True
1913 continue
1914 escaped = False
1915 out.append(i)
1916 return ''.join(out)
1917
1918 return [unescape(i) for i in text.split('\\001')]
1919
1920 class Tracer(ApiBase.Tracer):
1921 # pylint: disable=C0301
1922 #
1923 # To understand the following code, you'll want to take a look at:
1924 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1925 # https://wikis.oracle.com/display/DTrace/Variables
1926 # http://docs.oracle.com/cd/E19205-01/820-4221/
1927 #
1928 # 0. Dump all the valid probes into a text file. It is important, you
1929 # want to redirect into a file and you don't want to constantly 'sudo'.
1930 # $ sudo dtrace -l > probes.txt
1931 #
1932 # 1. Count the number of probes:
1933 # $ wc -l probes.txt
1934 # 81823 # On OSX 10.7, including 1 header line.
1935 #
1936 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1937 # likes and skipping the header with NR>1:
1938 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1939 # dtrace
1940 # fbt
1941 # io
1942 # ip
1943 # lockstat
1944 # mach_trap
1945 # proc
1946 # profile
1947 # sched
1948 # syscall
1949 # tcp
1950 # vminfo
1951 #
1952 # 3. List of valid probes:
1953 # $ grep syscall probes.txt | less
1954 # or use dtrace directly:
1955 # $ sudo dtrace -l -P syscall | less
1956 #
1957 # trackedpid is an associative array where its value can be 0, 1 or 2.
1958 # 0 is for untracked processes and is the default value for items not
1959 # in the associative array.
1960 # 1 is for tracked processes.
1961 # 2 is for the script created by create_thunk() only. It is not tracked
1962 # itself but all its decendants are.
1963 #
1964 # The script will kill itself only once waiting_to_die == 1 and
1965 # current_processes == 0, so that both getlogin() was called and that
1966 # all traced processes exited.
1967 #
1968 # TODO(maruel): Use cacheable predicates. See
1969 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1970 D_CODE = """
1971 dtrace:::BEGIN {
1972 waiting_to_die = 0;
1973 current_processes = 0;
1974 logindex = 0;
1975 printf("%d %d %s_%s(\\"%s\\")\\n",
1976 logindex, PID, probeprov, probename, SCRIPT);
1977 logindex++;
1978 }
1979
1980 proc:::start /trackedpid[ppid]/ {
1981 trackedpid[pid] = 1;
1982 current_processes += 1;
1983 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1984 logindex, pid, probeprov, probename,
1985 ppid,
1986 execname,
1987 current_processes);
1988 logindex++;
1989 }
1990 /* Should use SCRIPT but there is no access to this variable at that
1991 * point. */
1992 proc:::start /ppid == PID && execname == "Python"/ {
1993 trackedpid[pid] = 2;
1994 current_processes += 1;
1995 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1996 logindex, pid, probeprov, probename,
1997 ppid,
1998 execname,
1999 current_processes);
2000 logindex++;
2001 }
2002 proc:::exit /trackedpid[pid] &&
2003 current_processes == 1 &&
2004 waiting_to_die == 1/ {
2005 trackedpid[pid] = 0;
2006 current_processes -= 1;
2007 printf("%d %d %s_%s(%d)\\n",
2008 logindex, pid, probeprov, probename,
2009 current_processes);
2010 logindex++;
2011 exit(0);
2012 }
2013 proc:::exit /trackedpid[pid]/ {
2014 trackedpid[pid] = 0;
2015 current_processes -= 1;
2016 printf("%d %d %s_%s(%d)\\n",
2017 logindex, pid, probeprov, probename,
2018 current_processes);
2019 logindex++;
2020 }
2021
2022 /* Use an arcane function to detect when we need to die */
2023 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
2024 waiting_to_die = 1;
2025 printf("%d %d %s()\\n", logindex, pid, probefunc);
2026 logindex++;
2027 }
2028 syscall::ftruncate:entry /
2029 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
2030 exit(0);
2031 }
2032
2033 syscall::open*:entry /trackedpid[pid] == 1/ {
2034 self->open_arg0 = arg0;
2035 self->open_arg1 = arg1;
2036 self->open_arg2 = arg2;
2037 }
2038 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
2039 this->open_arg0 = copyinstr(self->open_arg0);
2040 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
2041 logindex, pid, probefunc,
2042 this->open_arg0,
2043 self->open_arg1,
2044 self->open_arg2);
2045 logindex++;
2046 this->open_arg0 = 0;
2047 }
2048 syscall::open*:return /trackedpid[pid] == 1/ {
2049 self->open_arg0 = 0;
2050 self->open_arg1 = 0;
2051 self->open_arg2 = 0;
2052 }
2053
2054 syscall::rename:entry /trackedpid[pid] == 1/ {
2055 self->rename_arg0 = arg0;
2056 self->rename_arg1 = arg1;
2057 }
2058 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
2059 this->rename_arg0 = copyinstr(self->rename_arg0);
2060 this->rename_arg1 = copyinstr(self->rename_arg1);
2061 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
2062 logindex, pid, probefunc,
2063 this->rename_arg0,
2064 this->rename_arg1);
2065 logindex++;
2066 this->rename_arg0 = 0;
2067 this->rename_arg1 = 0;
2068 }
2069 syscall::rename:return /trackedpid[pid] == 1/ {
2070 self->rename_arg0 = 0;
2071 self->rename_arg1 = 0;
2072 }
2073
2074 /* Track chdir, it's painful because it is only receiving relative path.
2075 */
2076 syscall::chdir:entry /trackedpid[pid] == 1/ {
2077 self->chdir_arg0 = arg0;
2078 }
2079 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
2080 this->chdir_arg0 = copyinstr(self->chdir_arg0);
2081 printf("%d %d %s(\\"%s\\")\\n",
2082 logindex, pid, probefunc,
2083 this->chdir_arg0);
2084 logindex++;
2085 this->chdir_arg0 = 0;
2086 }
2087 syscall::chdir:return /trackedpid[pid] == 1/ {
2088 self->chdir_arg0 = 0;
2089 }
2090 """
2091
2092 # execve-specific code, tends to throw a lot of exceptions.
2093 D_CODE_EXECVE = """
2094 /* Finally what we care about! */
2095 syscall::exec*:entry /trackedpid[pid]/ {
2096 self->exec_arg0 = copyinstr(arg0);
2097 /* Incrementally probe for a NULL in the argv parameter of execve() to
2098 * figure out argc. */
2099 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
2100 * found. */
2101 self->exec_argc = 0;
2102 /* Probe for argc==1 */
2103 this->exec_argv = (user_addr_t*)copyin(
2104 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2105 self->exec_argc = this->exec_argv[self->exec_argc] ?
2106 (self->exec_argc + 1) : self->exec_argc;
2107
2108 /* Probe for argc==2 */
2109 this->exec_argv = (user_addr_t*)copyin(
2110 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2111 self->exec_argc = this->exec_argv[self->exec_argc] ?
2112 (self->exec_argc + 1) : self->exec_argc;
2113
2114 /* Probe for argc==3 */
2115 this->exec_argv = (user_addr_t*)copyin(
2116 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2117 self->exec_argc = this->exec_argv[self->exec_argc] ?
2118 (self->exec_argc + 1) : self->exec_argc;
2119
2120 /* Probe for argc==4 */
2121 this->exec_argv = (user_addr_t*)copyin(
2122 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2123 self->exec_argc = this->exec_argv[self->exec_argc] ?
2124 (self->exec_argc + 1) : self->exec_argc;
2125
2126 /* Copy the inputs strings since there is no guarantee they'll be
2127 * present after the call completed. */
2128 self->exec_argv0 = (self->exec_argc > 0) ?
2129 copyinstr(this->exec_argv[0]) : "";
2130 self->exec_argv1 = (self->exec_argc > 1) ?
2131 copyinstr(this->exec_argv[1]) : "";
2132 self->exec_argv2 = (self->exec_argc > 2) ?
2133 copyinstr(this->exec_argv[2]) : "";
2134 self->exec_argv3 = (self->exec_argc > 3) ?
2135 copyinstr(this->exec_argv[3]) : "";
2136 this->exec_argv = 0;
2137 }
2138 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2139 /* We need to join strings here, as using multiple printf() would
2140 * cause tearing when multiple threads/processes are traced.
2141 * Since it is impossible to escape a string and join it to another one,
2142 * like sprintf("%s%S", previous, more), use hackery.
2143 * Each of the elements are split with a \\1. \\0 cannot be used because
2144 * it is simply ignored. This will conflict with any program putting a
2145 * \\1 in their execve() string but this should be "rare enough" */
2146 this->args = "";
2147 /* Process exec_argv[0] */
2148 this->args = strjoin(
2149 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2150
2151 /* Process exec_argv[1] */
2152 this->args = strjoin(
2153 this->args, (self->exec_argc > 1) ? "\\1" : "");
2154 this->args = strjoin(
2155 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2156
2157 /* Process exec_argv[2] */
2158 this->args = strjoin(
2159 this->args, (self->exec_argc > 2) ? "\\1" : "");
2160 this->args = strjoin(
2161 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2162
2163 /* Process exec_argv[3] */
2164 this->args = strjoin(
2165 this->args, (self->exec_argc > 3) ? "\\1" : "");
2166 this->args = strjoin(
2167 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2168
2169 /* Prints self->exec_argc to permits verifying the internal
2170 * consistency since this code is quite fishy. */
2171 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2172 logindex, pid, probefunc,
2173 self->exec_arg0,
2174 self->exec_argc,
2175 this->args);
2176 logindex++;
2177 this->args = 0;
2178 }
2179 syscall::exec*:return /trackedpid[pid]/ {
2180 self->exec_arg0 = 0;
2181 self->exec_argc = 0;
2182 self->exec_argv0 = 0;
2183 self->exec_argv1 = 0;
2184 self->exec_argv2 = 0;
2185 self->exec_argv3 = 0;
2186 }
2187 """
2188
2189 # Code currently not used.
2190 D_EXTRANEOUS = """
2191 /* This is a good learning experience, since it traces a lot of things
2192 * related to the process and child processes.
2193 * Warning: it generates a gigantic log. For example, tracing
2194 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2195 * several minutes to execute.
2196 */
2197 /*
2198 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2199 printf("%d %d %s_%s() = %d\\n",
2200 logindex, pid, probeprov, probefunc, errno);
2201 logindex++;
2202 }
2203 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2204 printf("%d %d %s_%s() = %d\\n",
2205 logindex, pid, probeprov, probefunc, errno);
2206 logindex++;
2207 }
2208 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2209 printf("%d %d %s_%s() = %d\\n",
2210 logindex, pid, probeprov, probefunc, errno);
2211 logindex++;
2212 }
2213 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2214 printf("%d %d %s_%s() = %d\\n",
2215 logindex, pid, probeprov, probefunc, errno);
2216 logindex++;
2217 }
2218 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2219 printf("%d %d %s_%s() = %d\\n",
2220 logindex, pid, probeprov, probefunc, errno);
2221 logindex++;
2222 }
2223 */
2224 /* TODO(maruel): *stat* functions and friends
2225 syscall::access:return,
2226 syscall::chdir:return,
2227 syscall::chflags:return,
2228 syscall::chown:return,
2229 syscall::chroot:return,
2230 syscall::getattrlist:return,
2231 syscall::getxattr:return,
2232 syscall::lchown:return,
2233 syscall::lstat64:return,
2234 syscall::lstat:return,
2235 syscall::mkdir:return,
2236 syscall::pathconf:return,
2237 syscall::readlink:return,
2238 syscall::removexattr:return,
2239 syscall::setxattr:return,
2240 syscall::stat64:return,
2241 syscall::stat:return,
2242 syscall::truncate:return,
2243 syscall::unlink:return,
2244 syscall::utimes:return,
2245 */
2246 """
2247
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00002248 def __init__(self, logname, use_sudo):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002249 """Starts the log collection with dtrace.
2250
2251 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2252 this needs to wait for dtrace to be "warmed up".
2253 """
2254 super(Dtrace.Tracer, self).__init__(logname)
2255 self._script = create_thunk()
2256 # This unique dummy temp file is used to signal the dtrace script that it
2257 # should stop as soon as all the child processes are done. A bit hackish
2258 # but works fine enough.
2259 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2260 prefix='trace_signal_file')
2261
maruel@chromium.org9b2112a2013-04-05 18:01:02 +00002262 dtrace_path = '/usr/sbin/dtrace'
2263 if not os.path.isfile(dtrace_path):
2264 dtrace_path = 'dtrace'
2265 elif use_sudo is None and (os.stat(dtrace_path).st_mode & stat.S_ISUID):
2266 # No need to sudo. For those following at home, don't do that.
2267 use_sudo = False
2268
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002269 # Note: do not use the -p flag. It's useless if the initial process quits
2270 # too fast, resulting in missing traces from the grand-children. The D
2271 # code manages the dtrace lifetime itself.
2272 trace_cmd = [
maruel@chromium.org9b2112a2013-04-05 18:01:02 +00002273 dtrace_path,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002274 # Use a larger buffer if getting 'out of scratch space' errors.
2275 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2276 '-b', '10m',
2277 '-x', 'dynvarsize=10m',
2278 #'-x', 'dtrace_global_maxsize=1m',
2279 '-x', 'evaltime=exec',
2280 '-o', '/dev/stderr',
2281 '-q',
2282 '-n', self._get_dtrace_code(),
2283 ]
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00002284 if use_sudo is not False:
2285 trace_cmd.insert(0, 'sudo')
2286
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002287 with open(self._logname + '.log', 'wb') as logfile:
2288 self._dtrace = subprocess.Popen(
2289 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2290 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2291
2292 # Reads until one line is printed, which signifies dtrace is up and ready.
2293 with open(self._logname + '.log', 'rb') as logfile:
2294 while 'dtrace_BEGIN' not in logfile.readline():
2295 if self._dtrace.poll() is not None:
2296 # Do a busy wait. :/
2297 break
2298 logging.debug('dtrace started')
2299
2300 def _get_dtrace_code(self):
2301 """Setups the D code to implement child process tracking.
2302
2303 Injects the cookie in the script so it knows when to stop.
2304
2305 The script will detect any instance of the script created with
2306 create_thunk() and will start tracing it.
2307 """
maruel@chromium.orgbe75f282012-11-02 19:34:37 +00002308 out = (
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002309 'inline int PID = %d;\n'
2310 'inline string SCRIPT = "%s";\n'
2311 'inline int FILE_ID = %d;\n'
2312 '\n'
2313 '%s') % (
2314 os.getpid(),
2315 self._script,
2316 self._dummy_file_id,
maruel@chromium.orgbe75f282012-11-02 19:34:37 +00002317 self.D_CODE)
2318 if os.environ.get('TRACE_INPUTS_DTRACE_ENABLE_EXECVE') == '1':
2319 # Do not enable by default since it tends to spew dtrace: error lines
2320 # because the execve() parameters are not in valid memory at the time of
2321 # logging.
2322 # TODO(maruel): Find a way to make this reliable since it's useful but
2323 # only works in limited/trivial uses cases for now.
2324 out += self.D_CODE_EXECVE
2325 return out
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002326
2327 def trace(self, cmd, cwd, tracename, output):
2328 """Runs dtrace on an executable.
2329
2330 This dtruss is broken when it starts the process itself or when tracing
2331 child processes, this code starts a wrapper process
2332 generated with create_thunk() which starts the executable to trace.
2333 """
2334 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2335 assert os.path.isabs(cmd[0]), cmd[0]
2336 assert os.path.isabs(cwd), cwd
2337 assert os.path.normpath(cwd) == cwd, cwd
2338 with self._lock:
2339 if not self._initialized:
2340 raise TracingFailure(
2341 'Called Tracer.trace() on an unitialized object',
2342 None, None, None, tracename)
2343 assert tracename not in (i['trace'] for i in self._traces)
2344
2345 # Starts the script wrapper to start the child process. This signals the
2346 # dtrace script that this process is to be traced.
2347 stdout = stderr = None
2348 if output:
2349 stdout = subprocess.PIPE
2350 stderr = subprocess.STDOUT
2351 child_cmd = [
2352 sys.executable,
2353 self._script,
2354 tracename,
2355 ]
2356 # Call a dummy function so that dtrace knows I'm about to launch a process
2357 # that needs to be traced.
2358 # Yummy.
2359 child = subprocess.Popen(
2360 child_cmd + fix_python_path(cmd),
2361 stdin=subprocess.PIPE,
2362 stdout=stdout,
2363 stderr=stderr,
2364 cwd=cwd)
2365 logging.debug('Started child pid: %d' % child.pid)
2366
2367 out = child.communicate()[0]
2368 # This doesn't mean tracing is done, one of the grand-child process may
2369 # still be alive. It will be tracked with the dtrace script.
2370
2371 with self._lock:
2372 assert tracename not in (i['trace'] for i in self._traces)
2373 self._traces.append(
2374 {
2375 'cmd': cmd,
2376 'cwd': cwd,
2377 # The pid of strace process, not very useful.
2378 'pid': child.pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002379 'output': out,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002380 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002381 })
2382 return child.returncode, out
2383
2384 def close(self, timeout=None):
2385 """Terminates dtrace."""
2386 logging.debug('close(%s)' % timeout)
2387 try:
2388 try:
2389 super(Dtrace.Tracer, self).close(timeout)
2390 # Signal dtrace that it should stop now.
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00002391 # ftruncate doesn't exist on Windows.
2392 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002393 if timeout:
2394 start = time.time()
2395 # Use polling. :/
2396 while (self._dtrace.poll() is None and
2397 (time.time() - start) < timeout):
2398 time.sleep(0.1)
2399 self._dtrace.kill()
2400 self._dtrace.wait()
2401 finally:
2402 # Make sure to kill it in any case.
2403 if self._dtrace.poll() is None:
2404 try:
2405 self._dtrace.kill()
2406 self._dtrace.wait()
2407 except OSError:
2408 pass
2409
2410 if self._dtrace.returncode != 0:
2411 # Warn about any dtrace failure but basically ignore it.
2412 print 'dtrace failure: %s' % self._dtrace.returncode
2413 finally:
2414 os.close(self._dummy_file_id)
2415 os.remove(self._dummy_file_name)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002416
2417 def post_process_log(self):
2418 """Sorts the log back in order when each call occured.
2419
2420 dtrace doesn't save the buffer in strict order since it keeps one buffer
2421 per CPU.
2422 """
2423 super(Dtrace.Tracer, self).post_process_log()
2424 logname = self._logname + '.log'
2425 with open(logname, 'rb') as logfile:
2426 lines = [l for l in logfile if l.strip()]
2427 errors = [l for l in lines if l.startswith('dtrace:')]
2428 if errors:
2429 raise TracingFailure(
2430 'Found errors in the trace: %s' % '\n'.join(errors),
2431 None, None, None, logname)
2432 try:
2433 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2434 except ValueError:
2435 raise TracingFailure(
2436 'Found errors in the trace: %s' % '\n'.join(
2437 l for l in lines if l.split(' ', 1)[0].isdigit()),
2438 None, None, None, logname)
2439 with open(logname, 'wb') as logfile:
2440 logfile.write(''.join(lines))
2441
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00002442 def __init__(self, use_sudo=None):
2443 super(Dtrace, self).__init__()
2444 self.use_sudo = use_sudo
2445
2446 def get_tracer(self, logname):
2447 return self.Tracer(logname, self.use_sudo)
2448
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002449 @staticmethod
2450 def clean_trace(logname):
2451 for ext in ('', '.log'):
2452 if os.path.isfile(logname + ext):
2453 os.remove(logname + ext)
2454
2455 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002456 def parse_log(cls, logname, blacklist, trace_name):
2457 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00002458 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002459
2460 def blacklist_more(filepath):
2461 # All the HFS metadata is in the form /.vol/...
2462 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2463
2464 data = read_json(logname)
2465 out = []
2466 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002467 if trace_name and item['trace'] != trace_name:
2468 continue
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002469 result = {
2470 'output': item['output'],
2471 'trace': item['trace'],
2472 }
2473 try:
2474 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002475 # It's fine to assume the file as UTF-8: OSX enforces the file names to
2476 # be valid UTF-8 and we control the log output.
2477 for line in codecs.open(logname + '.log', 'rb', encoding='utf-8'):
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002478 context.on_line(line)
2479 result['results'] = context.to_results()
2480 except TracingFailure:
2481 result['exception'] = sys.exc_info()
2482 out.append(result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002483 return out
2484
2485
2486class LogmanTrace(ApiBase):
2487 """Uses the native Windows ETW based tracing functionality to trace a child
2488 process.
2489
2490 Caveat: this implementations doesn't track cwd or initial_cwd. It is because
2491 the Windows Kernel doesn't have a concept of 'current working directory' at
2492 all. A Win32 process has a map of current directories, one per drive letter
2493 and it is managed by the user mode kernel32.dll. In kernel, a file is always
2494 opened relative to another file_object or as an absolute path. All the current
2495 working directory logic is done in user mode.
2496 """
2497 class Context(ApiBase.Context):
2498 """Processes a ETW log line and keeps the list of existent and non
2499 existent files accessed.
2500
2501 Ignores directories.
2502 """
2503 # These indexes are for the stripped version in json.
2504 EVENT_NAME = 0
2505 TYPE = 1
2506 PID = 2
2507 TID = 3
2508 PROCESSOR_ID = 4
2509 TIMESTAMP = 5
2510 USER_DATA = 6
2511
2512 class Process(ApiBase.Context.Process):
2513 def __init__(self, *args):
2514 super(LogmanTrace.Context.Process, self).__init__(*args)
2515 # Handle file objects that succeeded.
2516 self.file_objects = {}
2517
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002518 def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd):
2519 logging.info(
2520 '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name,
2521 thunk_cmd)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002522 super(LogmanTrace.Context, self).__init__(blacklist)
2523 self._drive_map = DosDriveMap()
2524 # Threads mapping to the corresponding process id.
2525 self._threads_active = {}
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002526 # Process ID of the tracer, e.g. the temporary script created by
2527 # create_thunk(). This is tricky because the process id may have been
2528 # reused.
2529 self._thunk_pid = thunk_pid
2530 self._thunk_cmd = thunk_cmd
2531 self._trace_name = trace_name
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002532 self._line_number = 0
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002533 self._thunk_process = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002534
2535 def on_line(self, line):
2536 """Processes a json Event line."""
2537 self._line_number += 1
2538 try:
2539 # By Opcode
2540 handler = getattr(
2541 self,
2542 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2543 None)
2544 if not handler:
2545 raise TracingFailure(
2546 'Unexpected event %s_%s' % (
2547 line[self.EVENT_NAME], line[self.TYPE]),
2548 None, None, None)
2549 handler(line)
2550 except TracingFailure, e:
2551 # Hack in the values since the handler could be a static function.
2552 e.pid = line[self.PID]
2553 e.line = line
2554 e.line_number = self._line_number
2555 # Re-raise the modified exception.
2556 raise
2557 except (KeyError, NotImplementedError, ValueError), e:
2558 raise TracingFailure(
2559 'Trace generated a %s exception: %s' % (
2560 e.__class__.__name__, str(e)),
2561 line[self.PID],
2562 self._line_number,
2563 line,
2564 e)
2565
2566 def to_results(self):
2567 if not self.root_process:
2568 raise TracingFailure(
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002569 'Failed to detect the initial process %d' % self._thunk_pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002570 None, None, None)
2571 process = self.root_process.to_results_process()
2572 return Results(process)
2573
2574 def _thread_to_process(self, tid):
2575 """Finds the process from the thread id."""
2576 tid = int(tid, 16)
2577 pid = self._threads_active.get(tid)
2578 if not pid or not self._process_lookup.get(pid):
2579 return
2580 return self._process_lookup[pid]
2581
2582 @classmethod
2583 def handle_EventTrace_Header(cls, line):
2584 """Verifies no event was dropped, e.g. no buffer overrun occured."""
2585 BUFFER_SIZE = cls.USER_DATA
2586 #VERSION = cls.USER_DATA + 1
2587 #PROVIDER_VERSION = cls.USER_DATA + 2
2588 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2589 #END_TIME = cls.USER_DATA + 4
2590 #TIMER_RESOLUTION = cls.USER_DATA + 5
2591 #MAX_FILE_SIZE = cls.USER_DATA + 6
2592 #LOG_FILE_MODE = cls.USER_DATA + 7
2593 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2594 #START_BUFFERS = cls.USER_DATA + 9
2595 #POINTER_SIZE = cls.USER_DATA + 10
2596 EVENTS_LOST = cls.USER_DATA + 11
2597 #CPU_SPEED = cls.USER_DATA + 12
2598 #LOGGER_NAME = cls.USER_DATA + 13
2599 #LOG_FILE_NAME = cls.USER_DATA + 14
2600 #BOOT_TIME = cls.USER_DATA + 15
2601 #PERF_FREQ = cls.USER_DATA + 16
2602 #START_TIME = cls.USER_DATA + 17
2603 #RESERVED_FLAGS = cls.USER_DATA + 18
2604 #BUFFERS_LOST = cls.USER_DATA + 19
2605 #SESSION_NAME_STRING = cls.USER_DATA + 20
2606 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2607 if line[EVENTS_LOST] != '0':
2608 raise TracingFailure(
2609 ( '%s events were lost during trace, please increase the buffer '
2610 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2611 None, None, None)
2612
2613 def handle_FileIo_Cleanup(self, line):
2614 """General wisdom: if a file is closed, it's because it was opened.
2615
2616 Note that FileIo_Close is not used since if a file was opened properly but
2617 not closed before the process exits, only Cleanup will be logged.
2618 """
2619 #IRP = self.USER_DATA
2620 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2621 FILE_OBJECT = self.USER_DATA + 2
2622 #FILE_KEY = self.USER_DATA + 3
2623 proc = self._thread_to_process(line[TTID])
2624 if not proc:
2625 # Not a process we care about.
2626 return
2627 file_object = line[FILE_OBJECT]
2628 if file_object in proc.file_objects:
2629 proc.add_file(proc.file_objects.pop(file_object), False)
2630
2631 def handle_FileIo_Create(self, line):
2632 """Handles a file open.
2633
2634 All FileIo events are described at
2635 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2636 for some value of 'description'.
2637
2638 " (..) process and thread id values of the IO events (..) are not valid "
2639 http://msdn.microsoft.com/magazine/ee358703.aspx
2640
2641 The FileIo.Create event doesn't return if the CreateFile() call
2642 succeeded, so keep track of the file_object and check that it is
2643 eventually closed with FileIo_Cleanup.
2644 """
2645 #IRP = self.USER_DATA
2646 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2647 FILE_OBJECT = self.USER_DATA + 2
2648 #CREATE_OPTIONS = self.USER_DATA + 3
2649 #FILE_ATTRIBUTES = self.USER_DATA + 4
2650 #self.USER_DATA + SHARE_ACCESS = 5
2651 OPEN_PATH = self.USER_DATA + 6
2652
2653 proc = self._thread_to_process(line[TTID])
2654 if not proc:
2655 # Not a process we care about.
2656 return
2657
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002658 raw_path = line[OPEN_PATH]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002659 # Ignore directories and bare drive right away.
2660 if raw_path.endswith(os.path.sep):
2661 return
2662 filepath = self._drive_map.to_win32(raw_path)
2663 # Ignore bare drive right away. Some may still fall through with format
2664 # like '\\?\X:'
2665 if len(filepath) == 2:
2666 return
2667 file_object = line[FILE_OBJECT]
2668 if os.path.isdir(filepath):
2669 # There is no O_DIRECTORY equivalent on Windows. The closed is
2670 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2671 # simply discard directories are they are found.
2672 return
2673 # Override any stale file object
2674 proc.file_objects[file_object] = filepath
2675
2676 def handle_FileIo_Rename(self, line):
2677 # TODO(maruel): Handle?
2678 pass
2679
2680 def handle_Process_End(self, line):
2681 pid = line[self.PID]
2682 if self._process_lookup.get(pid):
2683 logging.info('Terminated: %d' % pid)
2684 self._process_lookup[pid] = None
2685 else:
2686 logging.debug('Terminated: %d' % pid)
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002687 if self._thunk_process and self._thunk_process.pid == pid:
2688 self._thunk_process = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002689
2690 def handle_Process_Start(self, line):
2691 """Handles a new child process started by PID."""
2692 #UNIQUE_PROCESS_KEY = self.USER_DATA
2693 PROCESS_ID = self.USER_DATA + 1
2694 #PARENT_PID = self.USER_DATA + 2
2695 #SESSION_ID = self.USER_DATA + 3
2696 #EXIT_STATUS = self.USER_DATA + 4
2697 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2698 #USER_SID = self.USER_DATA + 6
2699 IMAGE_FILE_NAME = self.USER_DATA + 7
2700 COMMAND_LINE = self.USER_DATA + 8
2701
2702 ppid = line[self.PID]
2703 pid = int(line[PROCESS_ID], 16)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002704 command_line = CommandLineToArgvW(line[COMMAND_LINE])
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002705 logging.debug(
2706 'New process %d->%d (%s) %s' %
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002707 (ppid, pid, line[IMAGE_FILE_NAME], command_line))
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002708
2709 if pid == self._thunk_pid:
2710 # Need to ignore processes we don't know about because the log is
2711 # system-wide. self._thunk_pid shall start only one process.
2712 # This is tricky though because Windows *loves* to reuse process id and
2713 # it happens often that the process ID of the thunk script created by
2714 # create_thunk() is reused. So just detecting the pid here is not
2715 # sufficient, we must confirm the command line.
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002716 if command_line[:len(self._thunk_cmd)] != self._thunk_cmd:
2717 logging.info(
2718 'Ignoring duplicate pid %d for %s: %s while searching for %s',
2719 pid, self._trace_name, command_line, self._thunk_cmd)
2720 return
2721
2722 # TODO(maruel): The check is quite weak. Add the thunk path.
2723 if self._thunk_process:
2724 raise TracingFailure(
2725 ( 'Parent process is _thunk_pid(%d) but thunk_process(%d) is '
2726 'already set') % (self._thunk_pid, self._thunk_process.pid),
2727 None, None, None)
2728 proc = self.Process(self.blacklist, pid, None)
2729 self._thunk_process = proc
2730 return
2731 elif ppid == self._thunk_pid and self._thunk_process:
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002732 proc = self.Process(self.blacklist, pid, None)
2733 self.root_process = proc
2734 ppid = None
2735 elif self._process_lookup.get(ppid):
2736 proc = self.Process(self.blacklist, pid, None)
2737 self._process_lookup[ppid].children.append(proc)
2738 else:
2739 # Ignore
2740 return
2741 self._process_lookup[pid] = proc
2742
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002743 proc.command = command_line
2744 proc.executable = line[IMAGE_FILE_NAME]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002745 # proc.command[0] may be the absolute path of 'executable' but it may be
2746 # anything else too. If it happens that command[0] ends with executable,
2747 # use it, otherwise defaults to the base name.
2748 cmd0 = proc.command[0].lower()
2749 if not cmd0.endswith('.exe'):
2750 # TODO(maruel): That's not strictly true either.
2751 cmd0 += '.exe'
2752 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2753 # Fix the path.
2754 cmd0 = cmd0.replace('/', os.path.sep)
2755 cmd0 = os.path.normpath(cmd0)
2756 proc.executable = get_native_path_case(cmd0)
2757 logging.info(
2758 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
2759
2760 def handle_Thread_End(self, line):
2761 """Has the same parameters as Thread_Start."""
2762 tid = int(line[self.TID], 16)
2763 self._threads_active.pop(tid, None)
2764
2765 def handle_Thread_Start(self, line):
2766 """Handles a new thread created.
2767
2768 Do not use self.PID here since a process' initial thread is created by
2769 the parent process.
2770 """
2771 PROCESS_ID = self.USER_DATA
2772 TTHREAD_ID = self.USER_DATA + 1
2773 #STACK_BASE = self.USER_DATA + 2
2774 #STACK_LIMIT = self.USER_DATA + 3
2775 #USER_STACK_BASE = self.USER_DATA + 4
2776 #USER_STACK_LIMIT = self.USER_DATA + 5
2777 #AFFINITY = self.USER_DATA + 6
2778 #WIN32_START_ADDR = self.USER_DATA + 7
2779 #TEB_BASE = self.USER_DATA + 8
2780 #SUB_PROCESS_TAG = self.USER_DATA + 9
2781 #BASE_PRIORITY = self.USER_DATA + 10
2782 #PAGE_PRIORITY = self.USER_DATA + 11
2783 #IO_PRIORITY = self.USER_DATA + 12
2784 #THREAD_FLAGS = self.USER_DATA + 13
2785 # Do not use self.PID here since a process' initial thread is created by
2786 # the parent process.
2787 pid = int(line[PROCESS_ID], 16)
2788 tid = int(line[TTHREAD_ID], 16)
2789 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2790 self._threads_active[tid] = pid
2791
2792 @classmethod
2793 def supported_events(cls):
2794 """Returns all the procesed events."""
2795 out = []
2796 for member in dir(cls):
2797 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2798 if match:
2799 out.append(match.groups())
2800 return out
2801
2802 class Tracer(ApiBase.Tracer):
2803 # The basic headers.
2804 EXPECTED_HEADER = [
2805 u'Event Name',
2806 u'Type',
2807 u'Event ID',
2808 u'Version',
2809 u'Channel',
2810 u'Level', # 5
2811 u'Opcode',
2812 u'Task',
2813 u'Keyword',
2814 u'PID',
2815 u'TID', # 10
2816 u'Processor Number',
2817 u'Instance ID',
2818 u'Parent Instance ID',
2819 u'Activity ID',
2820 u'Related Activity ID', # 15
2821 u'Clock-Time',
2822 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2823 u'User(ms)', # pretty much useless.
2824 u'User Data', # Extra arguments that are event-specific.
2825 ]
2826 # Only the useful headers common to all entries are listed there. Any column
2827 # at 19 or higher is dependent on the specific event.
2828 EVENT_NAME = 0
2829 TYPE = 1
2830 PID = 9
2831 TID = 10
2832 PROCESSOR_ID = 11
2833 TIMESTAMP = 16
2834 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2835 USER_DATA = 19
2836
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002837 class CsvReader(object):
2838 """CSV reader that reads files generated by tracerpt.exe.
2839
2840 csv.reader() fails to read them properly, it mangles file names quoted
2841 with "" with a comma in it.
2842 """
2843 # 0. Had a ',' or one of the following ' ' after a comma, next should
2844 # be ' ', '"' or string or ',' for an empty field.
2845 ( HAD_DELIMITER,
2846 # 1. Processing an unquoted field up to ','.
2847 IN_STR,
2848 # 2. Processing a new field starting with '"'.
2849 STARTING_STR_QUOTED,
2850 # 3. Second quote in a row at the start of a field. It could be either
2851 # '""foo""' or '""'. Who the hell thought it was a great idea to use
2852 # the same character for delimiting and escaping?
2853 STARTING_SECOND_QUOTE,
2854 # 4. A quote inside a quoted string where the previous character was
2855 # not a quote, so the string is not empty. Can be either: end of a
2856 # quoted string (a delimiter) or a quote escape. The next char must be
2857 # either '"' or ','.
2858 HAD_QUOTE_IN_QUOTED,
2859 # 5. Second quote inside a quoted string.
2860 HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED,
2861 # 6. Processing a field that started with '"'.
2862 IN_STR_QUOTED) = range(7)
2863
2864 def __init__(self, f):
2865 self.f = f
2866
2867 def __iter__(self):
2868 return self
2869
2870 def next(self):
2871 """Splits the line in fields."""
2872 line = self.f.readline()
2873 if not line:
2874 raise StopIteration()
2875 line = line.strip()
2876 fields = []
2877 state = self.HAD_DELIMITER
2878 for i, c in enumerate(line):
2879 if state == self.HAD_DELIMITER:
2880 if c == ',':
2881 # Empty field.
2882 fields.append('')
2883 elif c == ' ':
2884 # Ignore initial whitespaces
2885 pass
2886 elif c == '"':
2887 state = self.STARTING_STR_QUOTED
2888 fields.append('')
2889 else:
2890 # Start of a new field.
2891 state = self.IN_STR
2892 fields.append(c)
2893
2894 elif state == self.IN_STR:
2895 # Do not accept quote inside unquoted field.
2896 assert c != '"', (i, c, line, fields)
2897 if c == ',':
2898 fields[-1] = fields[-1].strip()
2899 state = self.HAD_DELIMITER
2900 else:
2901 fields[-1] = fields[-1] + c
2902
2903 elif state == self.STARTING_STR_QUOTED:
2904 if c == '"':
2905 # Do not store the character yet.
2906 state = self.STARTING_SECOND_QUOTE
2907 else:
2908 state = self.IN_STR_QUOTED
2909 fields[-1] = fields[-1] + c
2910
2911 elif state == self.STARTING_SECOND_QUOTE:
2912 if c == ',':
2913 # It was an empty field. '""' == ''.
2914 state = self.HAD_DELIMITER
2915 else:
2916 fields[-1] = fields[-1] + '"' + c
2917 state = self.IN_STR_QUOTED
2918
2919 elif state == self.HAD_QUOTE_IN_QUOTED:
2920 if c == ',':
2921 # End of the string.
2922 state = self.HAD_DELIMITER
2923 elif c == '"':
2924 state = self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED
2925 else:
2926 # The previous double-quote was just an unescaped quote.
2927 fields[-1] = fields[-1] + '"' + c
2928 state = self.IN_STR_QUOTED
2929
2930 elif state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
2931 if c == ',':
2932 # End of the string.
2933 state = self.HAD_DELIMITER
2934 fields[-1] = fields[-1] + '"'
2935 else:
maruel@chromium.org261d1352013-02-25 19:23:20 +00002936 # That's just how the logger rolls. Revert back to appending the
2937 # char and "guess" it was a quote in a double-quoted string.
2938 state = self.IN_STR_QUOTED
2939 fields[-1] = fields[-1] + '"' + c
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002940
2941 elif state == self.IN_STR_QUOTED:
2942 if c == '"':
2943 # Could be a delimiter or an escape.
2944 state = self.HAD_QUOTE_IN_QUOTED
2945 else:
2946 fields[-1] = fields[-1] + c
2947
2948 if state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
2949 fields[-1] = fields[-1] + '"'
2950 else:
2951 assert state in (
2952 # Terminated with a normal field.
2953 self.IN_STR,
2954 # Terminated with an empty field.
2955 self.STARTING_SECOND_QUOTE,
2956 # Terminated with a normal quoted field.
2957 self.HAD_QUOTE_IN_QUOTED), (
2958 line, state, fields)
2959 return fields
2960
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002961 def __init__(self, logname):
2962 """Starts the log collection.
2963
2964 Requires administrative access. logman.exe is synchronous so no need for a
2965 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
2966 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
2967
2968 One can get the list of potentially interesting providers with:
2969 "logman query providers | findstr /i file"
2970 """
2971 super(LogmanTrace.Tracer, self).__init__(logname)
2972 self._script = create_thunk()
2973 cmd_start = [
2974 'logman.exe',
2975 'start',
2976 'NT Kernel Logger',
2977 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2978 # splitio,fileiocompletion,syscall,file,cswitch,img
2979 '(process,fileio,thread)',
2980 '-o', self._logname + '.etl',
2981 '-ets', # Send directly to kernel
2982 # Values extracted out of thin air.
2983 # Event Trace Session buffer size in kb.
2984 '-bs', '10240',
2985 # Number of Event Trace Session buffers.
2986 '-nb', '16', '256',
2987 ]
2988 logging.debug('Running: %s' % cmd_start)
2989 try:
2990 subprocess.check_call(
2991 cmd_start,
2992 stdin=subprocess.PIPE,
2993 stdout=subprocess.PIPE,
2994 stderr=subprocess.STDOUT)
2995 except subprocess.CalledProcessError, e:
2996 if e.returncode == -2147024891:
2997 print >> sys.stderr, 'Please restart with an elevated admin prompt'
2998 elif e.returncode == -2144337737:
2999 print >> sys.stderr, (
3000 'A kernel trace was already running, stop it and try again')
3001 raise
3002
3003 def trace(self, cmd, cwd, tracename, output):
3004 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
3005 assert os.path.isabs(cmd[0]), cmd[0]
3006 assert os.path.isabs(cwd), cwd
3007 assert os.path.normpath(cwd) == cwd, cwd
3008 with self._lock:
3009 if not self._initialized:
3010 raise TracingFailure(
3011 'Called Tracer.trace() on an unitialized object',
3012 None, None, None, tracename)
3013 assert tracename not in (i['trace'] for i in self._traces)
3014
3015 # Use "logman -?" for help.
3016
3017 stdout = stderr = None
3018 if output:
3019 stdout = subprocess.PIPE
3020 stderr = subprocess.STDOUT
3021
3022 # Run the child process.
3023 logging.debug('Running: %s' % cmd)
3024 # Use the temporary script generated with create_thunk() so we have a
3025 # clear pid owner. Since trace_inputs.py can be used as a library and
3026 # could trace multiple processes simultaneously, it makes it more complex
3027 # if the executable to be traced is executed directly here. It also solves
3028 # issues related to logman.exe that needs to be executed to control the
3029 # kernel trace.
3030 child_cmd = [
3031 sys.executable,
3032 self._script,
3033 tracename,
3034 ]
3035 child = subprocess.Popen(
3036 child_cmd + fix_python_path(cmd),
3037 cwd=cwd,
3038 stdin=subprocess.PIPE,
3039 stdout=stdout,
3040 stderr=stderr)
3041 logging.debug('Started child pid: %d' % child.pid)
3042 out = child.communicate()[0]
3043 # This doesn't mean all the grand-children are done. Sadly, we don't have
3044 # a good way to determine that.
3045
3046 with self._lock:
3047 assert tracename not in (i['trace'] for i in self._traces)
3048 self._traces.append({
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00003049 'cmd': cmd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003050 'cwd': cwd,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00003051 'output': out,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003052 'pid': child.pid,
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00003053 # Used to figure out the real process when process ids are reused.
3054 'thunk_cmd': child_cmd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003055 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003056 })
3057
3058 return child.returncode, out
3059
3060 def close(self, _timeout=None):
3061 """Stops the kernel log collection and converts the traces to text
3062 representation.
3063 """
3064 with self._lock:
maruel@chromium.org808e3f22013-05-16 16:42:54 +00003065 try:
3066 super(LogmanTrace.Tracer, self).close()
3067 finally:
3068 cmd_stop = [
3069 'logman.exe',
3070 'stop',
3071 'NT Kernel Logger',
3072 '-ets', # Sends the command directly to the kernel.
3073 ]
3074 logging.debug('Running: %s' % cmd_stop)
3075 subprocess.check_call(
3076 cmd_stop,
3077 stdin=subprocess.PIPE,
3078 stdout=subprocess.PIPE,
3079 stderr=subprocess.STDOUT)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003080
3081 def post_process_log(self):
3082 """Converts the .etl file into .csv then into .json."""
3083 super(LogmanTrace.Tracer, self).post_process_log()
3084 logformat = 'csv'
3085 self._convert_log(logformat)
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003086 self._trim_log(logformat)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003087
maruel@chromium.org808e3f22013-05-16 16:42:54 +00003088 def _gen_logdata(self):
3089 return {
3090 'format': 'csv',
3091 'traces': self._traces,
3092 }
3093
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003094 def _trim_log(self, logformat):
3095 """Reduces the amount of data in original log by generating a 'reduced'
3096 log.
3097 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003098 if logformat == 'csv_utf16':
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003099 file_handle = codecs.open(
3100 self._logname + '.' + logformat, 'r', encoding='utf-16')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003101
3102 elif logformat == 'csv':
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003103 assert sys.getfilesystemencoding() == 'mbcs'
3104 file_handle = codecs.open(
3105 self._logname + '.' + logformat, 'r',
3106 encoding=get_current_encoding())
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003107
3108 supported_events = LogmanTrace.Context.supported_events()
3109
3110 def trim(generator):
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003111 """Loads items from the generator and returns the interesting data.
3112
3113 It filters out any uninteresting line and reduce the amount of data in
3114 the trace.
3115 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003116 for index, line in enumerate(generator):
3117 if not index:
3118 if line != self.EXPECTED_HEADER:
3119 raise TracingFailure(
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003120 'Found malformed header: %s' % line,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003121 None, None, None)
3122 continue
3123 # As you can see, the CSV is full of useful non-redundant information:
3124 if (line[2] != '0' or # Event ID
3125 line[3] not in ('2', '3') or # Version
3126 line[4] != '0' or # Channel
3127 line[5] != '0' or # Level
3128 line[7] != '0' or # Task
3129 line[8] != '0x0000000000000000' or # Keyword
3130 line[12] != '' or # Instance ID
3131 line[13] != '' or # Parent Instance ID
3132 line[14] != self.NULL_GUID or # Activity ID
3133 line[15] != ''): # Related Activity ID
3134 raise TracingFailure(
3135 'Found unexpected values in line: %s' % ' '.join(line),
3136 None, None, None)
3137
3138 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
3139 continue
3140
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003141 yield [
3142 line[self.EVENT_NAME],
3143 line[self.TYPE],
3144 line[self.PID],
3145 line[self.TID],
3146 line[self.PROCESSOR_ID],
3147 line[self.TIMESTAMP],
3148 ] + line[self.USER_DATA:]
3149
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003150 # must not convert the trim() call into a list, since it will use too much
3151 # memory for large trace. use a csv file as a workaround since the json
3152 # parser requires a complete in-memory file.
3153 with open('%s.preprocessed' % self._logname, 'wb') as f:
3154 # $ and * can't be used in file name on windows, reducing the likelihood
3155 # of having to escape a string.
3156 out = csv.writer(
3157 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003158 for line in trim(self.CsvReader(file_handle)):
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003159 out.writerow([s.encode('utf-8') for s in line])
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003160
3161 def _convert_log(self, logformat):
3162 """Converts the ETL trace to text representation.
3163
3164 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
3165 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
3166
3167 Arguments:
3168 - logformat: Text format to be generated, csv, csv_utf16 or xml.
3169
3170 Use "tracerpt -?" for help.
3171 """
3172 LOCALE_INVARIANT = 0x7F
3173 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
3174 cmd_convert = [
3175 'tracerpt.exe',
3176 '-l', self._logname + '.etl',
3177 '-o', self._logname + '.' + logformat,
3178 '-gmt', # Use UTC
3179 '-y', # No prompt
3180 # Use -of XML to get the header of each items after column 19, e.g. all
3181 # the actual headers of 'User Data'.
3182 ]
3183
3184 if logformat == 'csv':
3185 # tracerpt localizes the 'Type' column, for major brainfuck
3186 # entertainment. I can't imagine any sane reason to do that.
3187 cmd_convert.extend(['-of', 'CSV'])
3188 elif logformat == 'csv_utf16':
3189 # This causes it to use UTF-16, which doubles the log size but ensures
3190 # the log is readable for non-ASCII characters.
3191 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
3192 elif logformat == 'xml':
3193 cmd_convert.extend(['-of', 'XML'])
3194 else:
3195 raise ValueError('Unexpected log format \'%s\'' % logformat)
3196 logging.debug('Running: %s' % cmd_convert)
3197 # This can takes tens of minutes for large logs.
3198 # Redirects all output to stderr.
3199 subprocess.check_call(
3200 cmd_convert,
3201 stdin=subprocess.PIPE,
3202 stdout=sys.stderr,
3203 stderr=sys.stderr)
3204
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003205 def __init__(self, use_sudo=False): # pylint: disable=W0613
3206 super(LogmanTrace, self).__init__()
3207 # Ignore use_sudo. It's irrelevant on Windows but kept to simplify the API.
3208
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003209 @staticmethod
3210 def clean_trace(logname):
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003211 for ext in ('', '.csv', '.etl', '.json', '.xml', '.preprocessed'):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003212 if os.path.isfile(logname + ext):
3213 os.remove(logname + ext)
3214
3215 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003216 def parse_log(cls, logname, blacklist, trace_name):
3217 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00003218 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003219
3220 def blacklist_more(filepath):
3221 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
3222 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
3223
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003224 # Create a list of (Context, result_dict) tuples. This is necessary because
3225 # the csv file may be larger than the amount of available memory.
3226 contexes = [
3227 (
3228 cls.Context(
3229 blacklist_more, item['pid'], item['trace'], item['thunk_cmd']),
3230 {
3231 'output': item['output'],
3232 'trace': item['trace'],
3233 },
3234 )
3235 for item in read_json(logname)['traces']
3236 if not trace_name or item['trace'] == trace_name
3237 ]
3238
3239 # The log may be too large to fit in memory and it is not efficient to read
3240 # it multiple times, so multiplex the contexes instead, which is slightly
3241 # more awkward.
3242 with open('%s.preprocessed' % logname, 'rb') as f:
3243 lines = csv.reader(
3244 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
3245 for encoded in lines:
3246 line = [s.decode('utf-8') for s in encoded]
3247 # Convert the PID in-place from hex.
3248 line[cls.Context.PID] = int(line[cls.Context.PID], 16)
3249 for context in contexes:
3250 if 'exception' in context[1]:
3251 continue
3252 try:
3253 context[0].on_line(line)
3254 except TracingFailure:
3255 context[1]['exception'] = sys.exc_info()
3256
3257 for context in contexes:
3258 if 'exception' in context[1]:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003259 continue
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003260 context[1]['results'] = context[0].to_results()
3261
3262 return [context[1] for context in contexes]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003263
3264
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003265def get_api(**kwargs):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003266 """Returns the correct implementation for the current OS."""
3267 if sys.platform == 'cygwin':
3268 raise NotImplementedError(
3269 'Not implemented for cygwin, start the script from Win32 python')
3270 flavors = {
3271 'win32': LogmanTrace,
3272 'darwin': Dtrace,
3273 'sunos5': Dtrace,
3274 'freebsd7': Dtrace,
3275 'freebsd8': Dtrace,
3276 }
3277 # Defaults to strace.
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003278 return flavors.get(sys.platform, Strace)(**kwargs)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003279
3280
3281def extract_directories(root_dir, files, blacklist):
3282 """Detects if all the files in a directory are in |files| and if so, replace
3283 the individual files by a Results.Directory instance.
3284
3285 Takes a list of Results.File instances and returns a shorter list of
3286 Results.File and Results.Directory instances.
3287
3288 Arguments:
3289 - root_dir: Optional base directory that shouldn't be search further.
3290 - files: list of Results.File instances.
3291 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
3292 """
3293 logging.info(
3294 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
3295 assert not (root_dir or '').endswith(os.path.sep), root_dir
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003296 # It is important for root_dir to not be a symlinked path, make sure to call
3297 # os.path.realpath() as needed.
3298 assert not root_dir or (
3299 os.path.realpath(get_native_path_case(root_dir)) == root_dir)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003300 assert not any(isinstance(f, Results.Directory) for f in files)
3301 # Remove non existent files.
3302 files = [f for f in files if f.existent]
3303 if not files:
3304 return files
3305 # All files must share the same root, which can be None.
3306 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
3307
3308 # Creates a {directory: {filename: File}} mapping, up to root.
3309 buckets = {}
3310 if root_dir:
3311 buckets[root_dir] = {}
3312 for fileobj in files:
3313 path = fileobj.full_path
3314 directory = os.path.dirname(path)
3315 assert directory
3316 # Do not use os.path.basename() so trailing os.path.sep is kept.
3317 basename = path[len(directory)+1:]
3318 files_in_directory = buckets.setdefault(directory, {})
3319 files_in_directory[basename] = fileobj
3320 # Add all the directories recursively up to root.
3321 while True:
3322 old_d = directory
3323 directory = os.path.dirname(directory)
3324 if directory + os.path.sep == root_dir or directory == old_d:
3325 break
3326 buckets.setdefault(directory, {})
3327
3328 root_prefix = len(root_dir) + 1 if root_dir else 0
3329 for directory in sorted(buckets, reverse=True):
maruel@chromium.orgb1678e62013-03-19 15:55:17 +00003330 if not os.path.isdir(directory):
3331 logging.debug(
3332 '%s was a directory but doesn\'t exist anymore; ignoring', directory)
3333 continue
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003334 actual = set(f for f in os.listdir(directory) if not blacklist(f))
3335 expected = set(buckets[directory])
3336 if not (actual - expected):
3337 parent = os.path.dirname(directory)
3338 buckets[parent][os.path.basename(directory)] = Results.Directory(
3339 root_dir,
3340 directory[root_prefix:],
3341 False,
3342 sum(f.size for f in buckets[directory].itervalues()),
3343 sum(f.nb_files for f in buckets[directory].itervalues()))
3344 # Remove the whole bucket.
3345 del buckets[directory]
3346
3347 # Reverse the mapping with what remains. The original instances are returned,
3348 # so the cached meta data is kept.
3349 files = sum((x.values() for x in buckets.itervalues()), [])
3350 return sorted(files, key=lambda x: x.path)
3351
3352
3353def trace(logfile, cmd, cwd, api, output):
3354 """Traces an executable. Returns (returncode, output) from api.
3355
3356 Arguments:
3357 - logfile: file to write to.
3358 - cmd: command to run.
3359 - cwd: current directory to start the process in.
3360 - api: a tracing api instance.
3361 - output: if True, returns output, otherwise prints it at the console.
3362 """
3363 cmd = fix_python_path(cmd)
3364 api.clean_trace(logfile)
3365 with api.get_tracer(logfile) as tracer:
3366 return tracer.trace(cmd, cwd, 'default', output)
3367
3368
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003369def CMDclean(args):
3370 """Cleans up traces."""
3371 parser = OptionParserTraceInputs(command='clean')
3372 options, args = parser.parse_args(args)
3373 api = get_api()
3374 api.clean_trace(options.log)
3375 return 0
3376
3377
3378def CMDtrace(args):
3379 """Traces an executable."""
3380 parser = OptionParserTraceInputs(command='trace')
3381 parser.allow_interspersed_args = False
3382 parser.add_option(
3383 '-q', '--quiet', action='store_true',
3384 help='Redirects traced executable output to /dev/null')
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003385 parser.add_option(
3386 '-s', '--sudo', action='store_true',
3387 help='Use sudo when shelling out the tracer tool (ignored on Windows)')
3388 parser.add_option(
3389 '-n', '--no-sudo', action='store_false',
3390 help='Don\'t use sudo')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003391 options, args = parser.parse_args(args)
3392
3393 if not args:
3394 parser.error('Please provide a command to run')
3395
3396 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3397 args[0] = os.path.abspath(args[0])
3398
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003399 # options.sudo default value is None, which is to do whatever tracer defaults
3400 # do.
3401 api = get_api(use_sudo=options.sudo)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003402 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3403
3404
3405def CMDread(args):
3406 """Reads the logs and prints the result."""
3407 parser = OptionParserTraceInputs(command='read')
3408 parser.add_option(
3409 '-V', '--variable',
3410 nargs=2,
3411 action='append',
3412 dest='variables',
3413 metavar='VAR_NAME directory',
3414 default=[],
3415 help=('Variables to replace relative directories against. Example: '
3416 '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
3417 'home dir with $HOME') % getpass.getuser())
3418 parser.add_option(
3419 '--root-dir',
3420 help='Root directory to base everything off it. Anything outside of this '
3421 'this directory will not be reported')
3422 parser.add_option(
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003423 '--trace-name',
3424 help='Only reads one of the trace. Defaults to reading all traces')
3425 parser.add_option(
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003426 '-j', '--json', action='store_true',
3427 help='Outputs raw result data as json')
3428 parser.add_option(
3429 '-b', '--blacklist', action='append', default=[],
3430 help='List of regexp to use as blacklist filter')
3431 options, args = parser.parse_args(args)
3432
3433 if options.root_dir:
maruel@chromium.org7dc17b32013-04-02 22:27:05 +00003434 options.root_dir = get_native_path_case(
3435 unicode(os.path.abspath(options.root_dir)))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003436
3437 variables = dict(options.variables)
3438 api = get_api()
3439 def blacklist(f):
3440 return any(re.match(b, f) for b in options.blacklist)
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003441 data = api.parse_log(options.log, blacklist, options.trace_name)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003442 # Process each trace.
3443 output_as_json = []
3444 for item in data:
3445 if 'exception' in item:
maruel@chromium.org424f5692012-10-15 15:42:38 +00003446 # Do not abort the other traces.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003447 print >> sys.stderr, (
maruel@chromium.org424f5692012-10-15 15:42:38 +00003448 'Trace %s: Got an exception: %s' % (
3449 item['trace'], item['exception'][1]))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003450 continue
3451 results = item['results']
3452 if options.root_dir:
3453 results = results.strip_root(options.root_dir)
3454
3455 if options.json:
3456 output_as_json.append(results.flatten())
3457 else:
3458 simplified = extract_directories(
3459 options.root_dir, results.files, blacklist)
3460 simplified = [f.replace_variables(variables) for f in simplified]
3461 if len(data) > 1:
3462 print('Trace: %s' % item['trace'])
3463 print('Total: %d' % len(results.files))
3464 print('Non existent: %d' % len(results.non_existent))
3465 for f in results.non_existent:
3466 print(' %s' % f.path)
3467 print(
3468 'Interesting: %d reduced to %d' % (
3469 len(results.existent), len(simplified)))
3470 for f in simplified:
3471 print(' %s' % f.path)
3472
3473 if options.json:
3474 write_json(sys.stdout, output_as_json, False)
3475 return 0
3476
3477
3478class OptionParserWithLogging(optparse.OptionParser):
3479 """Adds --verbose option."""
3480 def __init__(self, verbose=0, **kwargs):
3481 optparse.OptionParser.__init__(self, **kwargs)
3482 self.add_option(
3483 '-v', '--verbose',
3484 action='count',
3485 default=verbose,
3486 help='Use multiple times to increase verbosity')
3487
3488 def parse_args(self, *args, **kwargs):
3489 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3490 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3491 logging.basicConfig(
3492 level=levels[min(len(levels)-1, options.verbose)],
3493 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3494 return options, args
3495
3496
3497class OptionParserWithNiceDescription(OptionParserWithLogging):
3498 """Generates the description with the command's docstring."""
3499 def __init__(self, **kwargs):
3500 """Sets 'description' and 'usage' if not already specified."""
3501 command = kwargs.pop('command', 'help')
3502 kwargs.setdefault(
3503 'description',
3504 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3505 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3506 OptionParserWithLogging.__init__(self, **kwargs)
3507
3508
3509class OptionParserTraceInputs(OptionParserWithNiceDescription):
3510 """Adds automatic --log handling."""
3511 def __init__(self, **kwargs):
3512 OptionParserWithNiceDescription.__init__(self, **kwargs)
3513 self.add_option(
3514 '-l', '--log', help='Log file to generate or read, required')
3515
3516 def parse_args(self, *args, **kwargs):
3517 """Makes sure the paths make sense.
3518
3519 On Windows, / and \ are often mixed together in a path.
3520 """
3521 options, args = OptionParserWithNiceDescription.parse_args(
3522 self, *args, **kwargs)
3523 if not options.log:
3524 self.error('Must supply a log file with -l')
3525 options.log = os.path.abspath(options.log)
3526 return options, args
3527
3528
3529def extract_documentation():
3530 """Returns a dict {command: description} for each of documented command."""
3531 commands = (
3532 fn[3:]
3533 for fn in dir(sys.modules['__main__'])
3534 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3535 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3536
3537
3538def CMDhelp(args):
3539 """Prints list of commands or help for a specific command."""
3540 doc = extract_documentation()
3541 # Calculates the optimal offset.
3542 offset = max(len(cmd) for cmd in doc)
3543 format_str = ' %-' + str(offset + 2) + 's %s'
3544 # Generate a one-liner documentation of each commands.
3545 commands_description = '\n'.join(
3546 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3547
3548 parser = OptionParserWithNiceDescription(
3549 usage='%prog <command> [options]',
3550 description='Commands are:\n%s\n' % commands_description)
3551 parser.format_description = lambda _: parser.description
3552
3553 # Strip out any -h or --help argument.
3554 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3555 if len(args) == 1:
3556 if not get_command_handler(args[0]):
3557 parser.error('Unknown command %s' % args[0])
3558 # The command was "%prog help command", replaces ourself with
3559 # "%prog command --help" so help is correctly printed out.
3560 return main(args + ['--help'])
3561 elif args:
3562 parser.error('Unknown argument "%s"' % ' '.join(args))
3563 parser.print_help()
3564 return 0
3565
3566
3567def get_command_handler(name):
3568 """Returns the command handler or CMDhelp if it doesn't exist."""
3569 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3570
3571
3572def main_impl(argv):
3573 command = get_command_handler(argv[0] if argv else 'help')
3574 if not command:
3575 return CMDhelp(argv)
3576 return command(argv[1:])
3577
3578def main(argv):
maruel@chromium.org46e61cc2013-03-25 19:55:34 +00003579 run_isolated.disable_buffering()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003580 try:
3581 main_impl(argv)
3582 except TracingFailure, e:
3583 sys.stderr.write('\nError: ')
3584 sys.stderr.write(str(e))
3585 sys.stderr.write('\n')
3586 return 1
3587
3588
3589if __name__ == '__main__':
3590 sys.exit(main(sys.argv[1:]))