blob: b1430b6a30a65af8450cce3989fa044dccee3001 [file] [log] [blame]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001#!/usr/bin/env python
2# coding=utf-8
3# Copyright (c) 2012 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Traces an executable and its child processes and extract the files accessed
8by them.
9
10The implementation uses OS-specific API. The native Kernel logger and the ETL
11interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
12The OS-specific implementation is hidden in an 'API' interface.
13
14The results are embedded in a Results instance. The tracing is done in two
15phases, the first is to do the actual trace and generate an
16implementation-specific log file. Then the log file is parsed to extract the
17information, including the individual child processes and the files accessed
18from the log.
19"""
20
21import codecs
22import csv
23import getpass
24import glob
25import json
26import logging
27import optparse
28import os
29import re
maruel@chromium.org9b2112a2013-04-05 18:01:02 +000030import stat
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000031import subprocess
32import sys
33import tempfile
34import threading
35import time
maruel@chromium.org9b2112a2013-04-05 18:01:02 +000036import unicodedata
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000037import weakref
38
39## OS-specific imports
40
41if sys.platform == 'win32':
42 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
43 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
44 from ctypes.wintypes import GetLastError # pylint: disable=E0611
45elif sys.platform == 'darwin':
46 import Carbon.File # pylint: disable=F0401
47 import MacOS # pylint: disable=F0401
48
maruel@chromium.org46e61cc2013-03-25 19:55:34 +000049import run_isolated
50
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000051
52BASE_DIR = os.path.dirname(os.path.abspath(__file__))
53ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
54
55
56class TracingFailure(Exception):
57 """An exception occured during tracing."""
58 def __init__(self, description, pid, line_number, line, *args):
59 super(TracingFailure, self).__init__(
60 description, pid, line_number, line, *args)
61 self.description = description
62 self.pid = pid
63 self.line_number = line_number
64 self.line = line
65 self.extra = args
66
67 def __str__(self):
68 out = self.description
69 if self.pid:
70 out += '\npid: %d' % self.pid
71 if self.line_number:
72 out += '\nline: %d' % self.line_number
73 if self.line:
74 out += '\n%s' % self.line
75 if self.extra:
76 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
77 return out
78
79
80## OS-specific functions
81
82if sys.platform == 'win32':
83 def QueryDosDevice(drive_letter):
84 """Returns the Windows 'native' path for a DOS drive letter."""
85 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
maruel@chromium.org306e0e72012-11-02 18:22:03 +000086 assert isinstance(drive_letter, unicode)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000087 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
88 chars = 1024
maruel@chromium.org306e0e72012-11-02 18:22:03 +000089 drive_letter = drive_letter
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000090 p = create_unicode_buffer(chars)
91 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
92 err = GetLastError()
93 if err:
94 # pylint: disable=E0602
maruel@chromium.orgf37c2472012-11-02 20:06:13 +000095 msg = u'QueryDosDevice(%s): %s (%d)' % (
96 drive_letter, FormatError(err), err)
97 raise WindowsError(err, msg.encode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000098 return p.value
99
100
101 def GetShortPathName(long_path):
102 """Returns the Windows short path equivalent for a 'long' path."""
maruel@chromium.org68bb1482013-02-15 22:36:24 +0000103 assert isinstance(long_path, unicode), repr(long_path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000104 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
105 # not enforced.
106 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
107 long_path = '\\\\?\\' + long_path
108 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
109 if chars:
110 p = create_unicode_buffer(chars)
111 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
112 return p.value
113
114 err = GetLastError()
115 if err:
116 # pylint: disable=E0602
maruel@chromium.orgf37c2472012-11-02 20:06:13 +0000117 msg = u'GetShortPathName(%s): %s (%d)' % (
118 long_path, FormatError(err), err)
119 raise WindowsError(err, msg.encode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000120
121
122 def GetLongPathName(short_path):
123 """Returns the Windows long path equivalent for a 'short' path."""
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000124 assert isinstance(short_path, unicode)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000125 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
126 # not enforced.
127 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
128 short_path = '\\\\?\\' + short_path
129 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
130 if chars:
131 p = create_unicode_buffer(chars)
132 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
133 return p.value
134
135 err = GetLastError()
136 if err:
137 # pylint: disable=E0602
maruel@chromium.orgf37c2472012-11-02 20:06:13 +0000138 msg = u'GetLongPathName(%s): %s (%d)' % (
139 short_path, FormatError(err), err)
140 raise WindowsError(err, msg.encode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000141
142
143 def get_current_encoding():
144 """Returns the 'ANSI' code page associated to the process."""
145 return 'cp%d' % int(windll.kernel32.GetACP())
146
147
148 class DosDriveMap(object):
149 """Maps \Device\HarddiskVolumeN to N: on Windows."""
150 # Keep one global cache.
151 _MAPPING = {}
152
153 def __init__(self):
154 """Lazy loads the cache."""
155 if not self._MAPPING:
156 # This is related to UNC resolver on windows. Ignore that.
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000157 self._MAPPING[u'\\Device\\Mup'] = None
158 self._MAPPING[u'\\SystemRoot'] = os.environ[u'SystemRoot']
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000159
160 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
161 try:
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000162 letter = u'%s:' % letter
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000163 mapped = QueryDosDevice(letter)
164 if mapped in self._MAPPING:
165 logging.warn(
166 ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
167 '. Drive letters are a user-mode concept and the kernel '
168 'traces only have NT path, so all accesses will be '
169 'associated with the first drive letter, independent of the '
170 'actual letter used by the code') % (
171 self._MAPPING[mapped], letter))
172 else:
173 self._MAPPING[mapped] = letter
174 except WindowsError: # pylint: disable=E0602
175 pass
176
177 def to_win32(self, path):
178 """Converts a native NT path to Win32/DOS compatible path."""
179 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
180 if not match:
181 raise ValueError(
182 'Can\'t convert %s into a Win32 compatible path' % path,
183 path)
184 if not match.group(1) in self._MAPPING:
185 # Unmapped partitions may be accessed by windows for the
186 # fun of it while the test is running. Discard these.
187 return None
188 drive = self._MAPPING[match.group(1)]
189 if not drive or not match.group(2):
190 return drive
191 return drive + match.group(2)
192
193
194 def isabs(path):
195 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
196 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
197
198
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000199 def find_item_native_case(root, item):
200 """Gets the native path case of a single item based at root_path."""
201 if item == '..':
202 return item
203
204 root = get_native_path_case(root)
205 return os.path.basename(get_native_path_case(os.path.join(root, item)))
206
207
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000208 def get_native_path_case(p):
209 """Returns the native path case for an existing file.
210
211 On Windows, removes any leading '\\?\'.
212 """
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000213 assert isinstance(p, unicode), repr(p)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000214 if not isabs(p):
215 raise ValueError(
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +0000216 'get_native_path_case(%r): Require an absolute path' % p, p)
217
maruel@chromium.org037758d2012-12-10 17:59:46 +0000218 # Make sure it is normalized to os.path.sep. Do not do it here to keep the
219 # function fast
220 assert '/' not in p, p
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +0000221 suffix = ''
222 count = p.count(':')
223 if count > 1:
224 # This means it has an alternate-data stream. There could be 3 ':', since
225 # it could be the $DATA datastream of an ADS. Split the whole ADS suffix
226 # off and add it back afterward. There is no way to know the native path
227 # case of an alternate data stream.
228 items = p.split(':')
229 p = ':'.join(items[0:2])
230 suffix = ''.join(':' + i for i in items[2:])
231
maruel@chromium.org42b091d2013-04-02 22:27:02 +0000232 # TODO(maruel): Use os.path.normpath?
233 if p.endswith('.\\'):
234 p = p[:-2]
235
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000236 # Windows used to have an option to turn on case sensitivity on non Win32
237 # subsystem but that's out of scope here and isn't supported anymore.
238 # Go figure why GetShortPathName() is needed.
239 try:
240 out = GetLongPathName(GetShortPathName(p))
241 except OSError, e:
242 if e.args[0] in (2, 3, 5):
243 # The path does not exist. Try to recurse and reconstruct the path.
244 base = os.path.dirname(p)
245 rest = os.path.basename(p)
246 return os.path.join(get_native_path_case(base), rest)
247 raise
248 if out.startswith('\\\\?\\'):
249 out = out[4:]
250 # Always upper case the first letter since GetLongPathName() will return the
251 # drive letter in the case it was given.
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +0000252 return out[0].upper() + out[1:] + suffix
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000253
254
255 def CommandLineToArgvW(command_line):
256 """Splits a commandline into argv using CommandLineToArgvW()."""
257 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
258 size = c_int()
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000259 assert isinstance(command_line, unicode)
260 ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000261 try:
262 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
263 finally:
264 windll.kernel32.LocalFree(ptr)
265
266
267elif sys.platform == 'darwin':
268
269
270 # On non-windows, keep the stdlib behavior.
271 isabs = os.path.isabs
272
273
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000274 def _native_case(p):
275 """Gets the native path case. Warning: this function resolves symlinks."""
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000276 try:
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000277 rel_ref, _ = Carbon.File.FSPathMakeRef(p.encode('utf-8'))
278 # The OSX underlying code uses NFD but python strings are in NFC. This
279 # will cause issues with os.listdir() for example. Since the dtrace log
280 # *is* in NFC, normalize it here.
csharp@chromium.org5a2c3732013-03-07 14:44:56 +0000281 out = unicodedata.normalize(
282 'NFC', rel_ref.FSRefMakePath().decode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000283 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
284 return out + os.path.sep
285 return out
286 except MacOS.Error, e:
287 if e.args[0] in (-43, -120):
288 # The path does not exist. Try to recurse and reconstruct the path.
289 # -43 means file not found.
290 # -120 means directory not found.
291 base = os.path.dirname(p)
292 rest = os.path.basename(p)
csharp@chromium.org5a2c3732013-03-07 14:44:56 +0000293 return os.path.join(_native_case(base), rest)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000294 raise OSError(
295 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
296
297
298 def _split_at_symlink_native(base_path, rest):
299 """Returns the native path for a symlink."""
300 base, symlink, rest = split_at_symlink(base_path, rest)
301 if symlink:
302 if not base_path:
303 base_path = base
304 else:
305 base_path = safe_join(base_path, base)
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000306 symlink = find_item_native_case(base_path, symlink)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000307 return base, symlink, rest
308
309
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000310 def find_item_native_case(root_path, item):
311 """Gets the native path case of a single item based at root_path.
312
313 There is no API to get the native path case of symlinks on OSX. So it
314 needs to be done the slow way.
315 """
316 if item == '..':
317 return item
318
319 item = item.lower()
320 for element in os.listdir(root_path):
321 if element.lower() == item:
322 return element
323
324
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000325 def get_native_path_case(path):
326 """Returns the native path case for an existing file.
327
328 Technically, it's only HFS+ on OSX that is case preserving and
329 insensitive. It's the default setting on HFS+ but can be changed.
330 """
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000331 assert isinstance(path, unicode), repr(path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000332 if not isabs(path):
333 raise ValueError(
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000334 'get_native_path_case(%r): Require an absolute path' % path, path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000335 if path.startswith('/dev'):
336 # /dev is not visible from Carbon, causing an exception.
337 return path
338
339 # Starts assuming there is no symlink along the path.
340 resolved = _native_case(path)
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000341 if path.lower() in (resolved.lower(), resolved.lower() + './'):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000342 # This code path is incredibly faster.
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000343 logging.debug('get_native_path_case(%s) = %s' % (path, resolved))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000344 return resolved
345
346 # There was a symlink, process it.
347 base, symlink, rest = _split_at_symlink_native(None, path)
348 assert symlink, (path, base, symlink, rest, resolved)
349 prev = base
350 base = safe_join(_native_case(base), symlink)
351 assert len(base) > len(prev)
352 while rest:
353 prev = base
354 relbase, symlink, rest = _split_at_symlink_native(base, rest)
355 base = safe_join(base, relbase)
356 assert len(base) > len(prev), (prev, base, symlink)
357 if symlink:
358 base = safe_join(base, symlink)
359 assert len(base) > len(prev), (prev, base, symlink)
360 # Make sure no symlink was resolved.
361 assert base.lower() == path.lower(), (base, path)
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000362 logging.debug('get_native_path_case(%s) = %s' % (path, base))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000363 return base
364
365
366else: # OSes other than Windows and OSX.
367
368
369 # On non-windows, keep the stdlib behavior.
370 isabs = os.path.isabs
371
372
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000373 def find_item_native_case(root, item):
374 """Gets the native path case of a single item based at root_path."""
375 if item == '..':
376 return item
377
378 root = get_native_path_case(root)
379 return os.path.basename(get_native_path_case(os.path.join(root, item)))
380
381
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000382 def get_native_path_case(path):
383 """Returns the native path case for an existing file.
384
385 On OSes other than OSX and Windows, assume the file system is
386 case-sensitive.
387
388 TODO(maruel): This is not strictly true. Implement if necessary.
389 """
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000390 assert isinstance(path, unicode), repr(path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000391 if not isabs(path):
392 raise ValueError(
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000393 'get_native_path_case(%r): Require an absolute path' % path, path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000394 # Give up on cygwin, as GetLongPathName() can't be called.
395 # Linux traces tends to not be normalized so use this occasion to normalize
396 # it. This function implementation already normalizes the path on the other
397 # OS so this needs to be done here to be coherent between OSes.
398 out = os.path.normpath(path)
399 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
400 return out + os.path.sep
401 return out
402
403
404if sys.platform != 'win32': # All non-Windows OSes.
405
406
407 def safe_join(*args):
408 """Joins path elements like os.path.join() but doesn't abort on absolute
409 path.
410
411 os.path.join('foo', '/bar') == '/bar'
412 but safe_join('foo', '/bar') == 'foo/bar'.
413 """
414 out = ''
415 for element in args:
416 if element.startswith(os.path.sep):
417 if out.endswith(os.path.sep):
418 out += element[1:]
419 else:
420 out += element
421 else:
422 if out.endswith(os.path.sep):
423 out += element
424 else:
425 out += os.path.sep + element
426 return out
427
428
429 def split_at_symlink(base_dir, relfile):
430 """Scans each component of relfile and cut the string at the symlink if
431 there is any.
432
433 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
434 not symlink was found.
435 """
436 if base_dir:
437 assert relfile
438 assert os.path.isabs(base_dir)
439 index = 0
440 else:
441 assert os.path.isabs(relfile)
442 index = 1
443
444 def at_root(rest):
445 if base_dir:
446 return safe_join(base_dir, rest)
447 return rest
448
449 while True:
450 try:
451 index = relfile.index(os.path.sep, index)
452 except ValueError:
453 index = len(relfile)
454 full = at_root(relfile[:index])
455 if os.path.islink(full):
456 # A symlink!
457 base = os.path.dirname(relfile[:index])
458 symlink = os.path.basename(relfile[:index])
459 rest = relfile[index:]
460 logging.debug(
461 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
462 (base_dir, relfile, base, symlink, rest))
463 return base, symlink, rest
464 if index == len(relfile):
465 break
466 index += 1
467 return relfile, None, None
468
469
470def fix_python_path(cmd):
471 """Returns the fixed command line to call the right python executable."""
472 out = cmd[:]
473 if out[0] == 'python':
474 out[0] = sys.executable
475 elif out[0].endswith('.py'):
476 out.insert(0, sys.executable)
477 return out
478
479
480def create_thunk():
481 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
482 os.write(
483 handle,
484 (
485 'import subprocess\n'
486 'import sys\n'
487 'sys.exit(subprocess.call(sys.argv[2:]))\n'
488 ))
489 os.close(handle)
490 return name
491
492
493def strace_process_quoted_arguments(text):
494 """Extracts quoted arguments on a string and return the arguments as a list.
495
496 Implemented as an automaton. Supports incomplete strings in the form
497 '"foo"...'.
498
499 Example:
500 With text = '"foo", "bar"', the function will return ['foo', 'bar']
501
502 TODO(maruel): Implement escaping.
503 """
504 # All the possible states of the DFA.
505 ( NEED_QUOTE, # Begining of a new arguments.
506 INSIDE_STRING, # Inside an argument.
507 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
508 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
509 # a serie of 3 dots or a comma.
510 NEED_SPACE, # Right after a comma
511 NEED_DOT_2, # Found a dot, need a second one.
512 NEED_DOT_3, # Found second dot, need a third one.
513 NEED_COMMA, # Found third dot, need a comma.
514 ) = range(8)
515
516 state = NEED_QUOTE
517 out = []
518 for index, char in enumerate(text):
519 if char == '"':
520 if state == NEED_QUOTE:
521 state = INSIDE_STRING
522 # A new argument was found.
523 out.append('')
524 elif state == INSIDE_STRING:
525 # The argument is now closed.
526 state = NEED_COMMA_OR_DOT
527 elif state == ESCAPED:
528 out[-1] += char
529 state = INSIDE_STRING
530 else:
531 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000532 'Can\'t process char \'%s\' at column %d for: %r' % (
533 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000534 index,
535 text)
536 elif char == ',':
537 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
538 state = NEED_SPACE
539 elif state == INSIDE_STRING:
540 out[-1] += char
541 elif state == ESCAPED:
542 out[-1] += char
543 state = INSIDE_STRING
544 else:
545 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000546 'Can\'t process char \'%s\' at column %d for: %r' % (
547 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000548 index,
549 text)
550 elif char == ' ':
551 if state == NEED_SPACE:
552 state = NEED_QUOTE
553 elif state == INSIDE_STRING:
554 out[-1] += char
555 elif state == ESCAPED:
556 out[-1] += char
557 state = INSIDE_STRING
558 else:
559 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000560 'Can\'t process char \'%s\' at column %d for: %r' % (
561 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000562 index,
563 text)
564 elif char == '.':
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000565 if state in (NEED_QUOTE, NEED_COMMA_OR_DOT):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000566 # The string is incomplete, this mean the strace -s flag should be
567 # increased.
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000568 # For NEED_QUOTE, the input string would look like '"foo", ...'.
569 # For NEED_COMMA_OR_DOT, the input string would look like '"foo"...'
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000570 state = NEED_DOT_2
571 elif state == NEED_DOT_2:
572 state = NEED_DOT_3
573 elif state == NEED_DOT_3:
574 state = NEED_COMMA
575 elif state == INSIDE_STRING:
576 out[-1] += char
577 elif state == ESCAPED:
578 out[-1] += char
579 state = INSIDE_STRING
580 else:
581 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000582 'Can\'t process char \'%s\' at column %d for: %r' % (
583 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000584 index,
585 text)
586 elif char == '\\':
587 if state == ESCAPED:
588 out[-1] += char
589 state = INSIDE_STRING
590 elif state == INSIDE_STRING:
591 state = ESCAPED
592 else:
593 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000594 'Can\'t process char \'%s\' at column %d for: %r' % (
595 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000596 index,
597 text)
598 else:
599 if state == INSIDE_STRING:
600 out[-1] += char
601 else:
602 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000603 'Can\'t process char \'%s\' at column %d for: %r' % (
604 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000605 index,
606 text)
607 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
608 raise ValueError(
609 'String is incorrectly terminated: %r' % text,
610 text)
611 return out
612
613
614def read_json(filepath):
615 with open(filepath, 'r') as f:
616 return json.load(f)
617
618
619def write_json(filepath_or_handle, data, dense):
620 """Writes data into filepath or file handle encoded as json.
621
622 If dense is True, the json is packed. Otherwise, it is human readable.
623 """
624 if hasattr(filepath_or_handle, 'write'):
625 if dense:
626 filepath_or_handle.write(
627 json.dumps(data, sort_keys=True, separators=(',',':')))
628 else:
629 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
630 else:
631 with open(filepath_or_handle, 'wb') as f:
632 if dense:
633 json.dump(data, f, sort_keys=True, separators=(',',':'))
634 else:
635 json.dump(data, f, sort_keys=True, indent=2)
636
637
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000638def assert_is_renderable(pseudo_string):
639 """Asserts the input is a valid object to be processed by render()."""
640 assert (
641 isinstance(pseudo_string, (None.__class__, unicode)) or
642 hasattr(pseudo_string, 'render')), repr(pseudo_string)
643
644
645def render(pseudo_string):
646 """Converts the pseudo-string to an unicode string."""
647 assert_is_renderable(pseudo_string)
648 if isinstance(pseudo_string, (None.__class__, unicode)):
649 return pseudo_string
650 return pseudo_string.render()
651
652
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000653class Results(object):
654 """Results of a trace session."""
655
656 class _TouchedObject(object):
657 """Something, a file or a directory, that was accessed."""
658 def __init__(self, root, path, tainted, size, nb_files):
659 logging.debug(
660 '%s(%s, %s, %s, %s, %s)' %
661 (self.__class__.__name__, root, path, tainted, size, nb_files))
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000662 assert_is_renderable(root)
663 assert_is_renderable(path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000664 self.root = root
665 self.path = path
666 self.tainted = tainted
667 self.nb_files = nb_files
668 # Can be used as a cache or a default value, depending on context.
669 self._size = size
670 # These are cache only.
671 self._real_path = None
672
673 # Check internal consistency.
674 assert path, path
675 assert tainted or bool(root) != bool(isabs(path)), (root, path)
676 assert tainted or (
677 not os.path.exists(self.full_path) or
678 (self.full_path == get_native_path_case(self.full_path))), (
679 tainted, self.full_path, get_native_path_case(self.full_path))
680
681 @property
682 def existent(self):
683 return self.size != -1
684
685 @property
686 def full_path(self):
687 if self.root:
688 return os.path.join(self.root, self.path)
689 return self.path
690
691 @property
692 def real_path(self):
693 """Returns the path with symlinks resolved."""
694 if not self._real_path:
695 self._real_path = os.path.realpath(self.full_path)
696 return self._real_path
697
698 @property
699 def size(self):
700 """File's size. -1 is not existent."""
701 if self._size is None and not self.tainted:
702 try:
703 self._size = os.stat(self.full_path).st_size
704 except OSError:
705 self._size = -1
706 return self._size
707
708 def flatten(self):
709 """Returns a dict representing this object.
710
711 A 'size' of 0 means the file was only touched and not read.
712 """
713 return {
714 'path': self.path,
715 'size': self.size,
716 }
717
718 def replace_variables(self, variables):
719 """Replaces the root of this File with one of the variables if it matches.
720
721 If a variable replacement occurs, the cloned object becomes tainted.
722 """
723 for variable, root_path in variables.iteritems():
724 if self.path.startswith(root_path):
725 return self._clone(
726 self.root, variable + self.path[len(root_path):], True)
727 # No need to clone, returns ourself.
728 return self
729
730 def strip_root(self, root):
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000731 """Returns a clone of itself with 'root' stripped off.
732
733 Note that the file is kept if it is either accessible from a symlinked
734 path that was used to access the file or through the real path.
735 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000736 # Check internal consistency.
737 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
738 if not self.full_path.startswith(root):
739 # Now try to resolve the symlinks to see if it can be reached this way.
740 # Only try *after* trying without resolving symlink.
741 if not self.real_path.startswith(root):
742 return None
743 path = self.real_path
744 else:
745 path = self.full_path
746 return self._clone(root, path[len(root):], self.tainted)
747
748 def _clone(self, new_root, new_path, tainted):
749 raise NotImplementedError(self.__class__.__name__)
750
751 class File(_TouchedObject):
752 """A file that was accessed. May not be present anymore.
753
754 If tainted is true, it means it is not a real path anymore as a variable
755 replacement occured.
756
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000757 If only_touched is True, this means the file was probed for existence, and
758 it is existent, but was never _opened_. If only_touched is True, the file
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000759 must have existed.
760 """
761 def __init__(self, root, path, tainted, size):
762 super(Results.File, self).__init__(root, path, tainted, size, 1)
763
764 def _clone(self, new_root, new_path, tainted):
765 """Clones itself keeping meta-data."""
766 # Keep the self.size and self._real_path caches for performance reason. It
767 # is also important when the file becomes tainted (with a variable instead
768 # of the real path) since self.path is not an on-disk path anymore so
769 # out._size cannot be updated.
770 out = self.__class__(new_root, new_path, tainted, self.size)
771 out._real_path = self._real_path
772 return out
773
774 class Directory(_TouchedObject):
775 """A directory of files. Must exist."""
776 def __init__(self, root, path, tainted, size, nb_files):
777 """path='.' is a valid value and must be handled appropriately."""
778 assert not path.endswith(os.path.sep), path
779 super(Results.Directory, self).__init__(
780 root, path + os.path.sep, tainted, size, nb_files)
781 # For a Directory instance, self.size is not a cache, it's an actual value
782 # that is never modified and represents the total size of the files
783 # contained in this directory. It is possible that the directory is empty
784 # so that size == 0; this happens if there's only an invalid symlink in
785 # it.
786
787 def flatten(self):
788 out = super(Results.Directory, self).flatten()
789 out['nb_files'] = self.nb_files
790 return out
791
792 def _clone(self, new_root, new_path, tainted):
793 """Clones itself keeping meta-data."""
794 out = self.__class__(
795 new_root,
796 new_path.rstrip(os.path.sep),
797 tainted,
798 self.size,
799 self.nb_files)
800 out._real_path = self._real_path
801 return out
802
803 class Process(object):
804 """A process that was traced.
805
806 Contains references to the files accessed by this process and its children.
807 """
808 def __init__(self, pid, files, executable, command, initial_cwd, children):
809 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
810 self.pid = pid
811 self.files = sorted(files, key=lambda x: x.path)
812 self.children = children
813 self.executable = executable
814 self.command = command
815 self.initial_cwd = initial_cwd
816
817 # Check internal consistency.
818 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
819 f.path for f in self.files)
820 assert isinstance(self.children, list)
821 assert isinstance(self.files, list)
822
823 @property
824 def all(self):
825 for child in self.children:
826 for i in child.all:
827 yield i
828 yield self
829
830 def flatten(self):
831 return {
832 'children': [c.flatten() for c in self.children],
833 'command': self.command,
834 'executable': self.executable,
835 'files': [f.flatten() for f in self.files],
836 'initial_cwd': self.initial_cwd,
837 'pid': self.pid,
838 }
839
840 def strip_root(self, root):
841 assert isabs(root) and root.endswith(os.path.sep), root
842 # Loads the files after since they are constructed as objects.
843 out = self.__class__(
844 self.pid,
845 filter(None, (f.strip_root(root) for f in self.files)),
846 self.executable,
847 self.command,
848 self.initial_cwd,
849 [c.strip_root(root) for c in self.children])
850 logging.debug(
851 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
852 return out
853
854 def __init__(self, process):
855 self.process = process
856 # Cache.
857 self._files = None
858
859 def flatten(self):
860 return {
861 'root': self.process.flatten(),
862 }
863
864 @property
865 def files(self):
866 if self._files is None:
867 self._files = sorted(
868 sum((p.files for p in self.process.all), []),
869 key=lambda x: x.path)
870 return self._files
871
872 @property
873 def existent(self):
874 return [f for f in self.files if f.existent]
875
876 @property
877 def non_existent(self):
878 return [f for f in self.files if not f.existent]
879
880 def strip_root(self, root):
881 """Returns a clone with all the files outside the directory |root| removed
882 and converts all the path to be relative paths.
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000883
884 It keeps files accessible through the |root| directory or that have been
885 accessed through any symlink which points to the same directory.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000886 """
887 # Resolve any symlink
888 root = os.path.realpath(root)
889 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
890 logging.debug('strip_root(%s)' % root)
891 return Results(self.process.strip_root(root))
892
893
894class ApiBase(object):
895 """OS-agnostic API to trace a process and its children."""
896 class Context(object):
897 """Processes one log line at a time and keeps the list of traced processes.
898
899 The parsing is complicated by the fact that logs are traced out of order for
900 strace but in-order for dtrace and logman. In addition, on Windows it is
901 very frequent that processids are reused so a flat list cannot be used. But
902 at the same time, it is impossible to faithfully construct a graph when the
903 logs are processed out of order. So both a tree and a flat mapping are used,
904 the tree is the real process tree, while the flat mapping stores the last
905 valid process for the corresponding processid. For the strace case, the
906 tree's head is guessed at the last moment.
907 """
908 class Process(object):
909 """Keeps context for one traced child process.
910
911 Logs all the files this process touched. Ignores directories.
912 """
913 def __init__(self, blacklist, pid, initial_cwd):
914 # Check internal consistency.
915 assert isinstance(pid, int), repr(pid)
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000916 assert_is_renderable(initial_cwd)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000917 self.pid = pid
918 # children are Process instances.
919 self.children = []
920 self.initial_cwd = initial_cwd
921 self.cwd = None
922 self.files = set()
923 self.only_touched = set()
924 self.executable = None
925 self.command = None
926 self._blacklist = blacklist
927
928 def to_results_process(self):
929 """Resolves file case sensitivity and or late-bound strings."""
930 # When resolving files, it's normal to get dupe because a file could be
931 # opened multiple times with different case. Resolve the deduplication
932 # here.
933 def fix_path(x):
934 """Returns the native file path case.
935
936 Converts late-bound strings.
937 """
938 if not x:
939 # Do not convert None instance to 'None'.
940 return x
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000941 x = render(x)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000942 if os.path.isabs(x):
943 # If the path is not absolute, which tends to happen occasionally on
944 # Windows, it is not possible to get the native path case so ignore
945 # that trace. It mostly happens for 'executable' value.
946 x = get_native_path_case(x)
947 return x
948
949 def fix_and_blacklist_path(x):
950 x = fix_path(x)
951 if not x:
952 return
953 # The blacklist needs to be reapplied, since path casing could
954 # influence blacklisting.
955 if self._blacklist(x):
956 return
957 return x
958
959 # Filters out directories. Some may have passed through.
960 files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
961 only_touched = set(
962 f for f in map(fix_and_blacklist_path, self.only_touched) if f)
963 only_touched -= files
964
965 files = [
966 Results.File(None, f, False, None) for f in files
967 if not os.path.isdir(f)
968 ]
969 # Using 0 as size means the file's content is ignored since the file was
970 # never opened for I/O.
971 files.extend(
972 Results.File(None, f, False, 0) for f in only_touched
973 if not os.path.isdir(f)
974 )
975 return Results.Process(
976 self.pid,
977 files,
978 fix_path(self.executable),
979 self.command,
980 fix_path(self.initial_cwd),
981 [c.to_results_process() for c in self.children])
982
983 def add_file(self, filepath, touch_only):
984 """Adds a file if it passes the blacklist."""
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000985 if self._blacklist(render(filepath)):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000986 return
987 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000988 # Note that filepath and not render(filepath) is added. It is
989 # because filepath could be something else than a string, like a
990 # RelativePath instance for dtrace logs.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000991 if touch_only:
992 self.only_touched.add(filepath)
993 else:
994 self.files.add(filepath)
995
996 def __init__(self, blacklist):
997 self.blacklist = blacklist
998 # Initial process.
999 self.root_process = None
1000 # dict to accelerate process lookup, to not have to lookup the whole graph
1001 # each time.
1002 self._process_lookup = {}
1003
1004 class Tracer(object):
1005 """During it's lifetime, the tracing subsystem is enabled."""
1006 def __init__(self, logname):
1007 self._logname = logname
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001008 self._lock = threading.RLock()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001009 self._traces = []
1010 self._initialized = True
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001011 self._script = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001012
1013 def trace(self, cmd, cwd, tracename, output):
1014 """Runs the OS-specific trace program on an executable.
1015
1016 Arguments:
1017 - cmd: The command (a list) to run.
1018 - cwd: Current directory to start the child process in.
1019 - tracename: Name of the trace in the logname file.
1020 - output: If False, redirects output to PIPEs.
1021
1022 Returns a tuple (resultcode, output) and updates the internal trace
1023 entries.
1024 """
1025 # The implementation adds an item to self._traces.
1026 raise NotImplementedError(self.__class__.__name__)
1027
1028 def close(self, _timeout=None):
1029 """Saves the meta-data in the logname file.
1030
1031 For kernel-based tracing, stops the tracing subsystem.
1032
1033 Must not be used manually when using 'with' construct.
1034 """
1035 with self._lock:
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001036 if not self._initialized:
1037 raise TracingFailure(
1038 'Called %s.close() on an unitialized object' %
1039 self.__class__.__name__,
1040 None, None, None)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001041 try:
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001042 if self._script:
1043 os.remove(self._script)
1044 self._script = None
1045 write_json(self._logname, self._gen_logdata(), False)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001046 finally:
1047 self._initialized = False
1048
1049 def post_process_log(self):
1050 """Post-processes the log so it becomes faster to load afterward.
1051
1052 Must not be used manually when using 'with' construct.
1053 """
1054 assert not self._initialized, 'Must stop tracing first.'
1055
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001056 def _gen_logdata(self):
1057 """Returns the data to be saved in the trace file."""
1058 return {
1059 'traces': self._traces,
1060 }
1061
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001062 def __enter__(self):
1063 """Enables 'with' statement."""
1064 return self
1065
1066 def __exit__(self, exc_type, exc_value, traceback):
1067 """Enables 'with' statement."""
1068 self.close()
1069 # If an exception was thrown, do not process logs.
1070 if not exc_type:
1071 self.post_process_log()
1072
1073 def get_tracer(self, logname):
1074 """Returns an ApiBase.Tracer instance.
1075
1076 Initializes the tracing subsystem, which is a requirement for kernel-based
1077 tracers. Only one tracer instance should be live at a time!
1078
1079 logname is the filepath to the json file that will contain the meta-data
1080 about the logs.
1081 """
1082 return self.Tracer(logname)
1083
1084 @staticmethod
1085 def clean_trace(logname):
1086 """Deletes an old log."""
1087 raise NotImplementedError()
1088
1089 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001090 def parse_log(cls, logname, blacklist, trace_name):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001091 """Processes trace logs and returns the files opened and the files that do
1092 not exist.
1093
1094 It does not track directories.
1095
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001096 Arguments:
1097 - logname: must be an absolute path.
1098 - blacklist: must be a lambda.
1099 - trace_name: optional trace to read, defaults to reading all traces.
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001100
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001101 Most of the time, files that do not exist are temporary test files that
1102 should be put in /tmp instead. See http://crbug.com/116251.
1103
1104 Returns a list of dict with keys:
1105 - results: A Results instance.
1106 - trace: The corresponding tracename parameter provided to
1107 get_tracer().trace().
1108 - output: Output gathered during execution, if get_tracer().trace(...,
1109 output=False) was used.
1110 """
1111 raise NotImplementedError(cls.__class__.__name__)
1112
1113
1114class Strace(ApiBase):
1115 """strace implies linux."""
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001116 @staticmethod
1117 def load_filename(filename):
1118 """Parses a filename in a log."""
1119 assert isinstance(filename, str)
1120 out = ''
1121 i = 0
1122 while i < len(filename):
1123 c = filename[i]
1124 if c == '\\':
1125 out += chr(int(filename[i+1:i+4], 8))
1126 i += 4
1127 else:
1128 out += c
1129 i += 1
1130 # TODO(maruel): That's not necessarily true that the current code page is
1131 # utf-8.
1132 return out.decode('utf-8')
1133
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001134 class Context(ApiBase.Context):
1135 """Processes a strace log line and keeps the list of existent and non
1136 existent files accessed.
1137
1138 Ignores directories.
1139
1140 Uses late-binding to processes the cwd of each process. The problem is that
1141 strace generates one log file per process it traced but doesn't give any
1142 information about which process was started when and by who. So we don't
1143 even know which process is the initial one. So process the logs out of
1144 order and use late binding with RelativePath to be able to deduce the
1145 initial directory of each process once all the logs are parsed.
1146 """
1147 class Process(ApiBase.Context.Process):
1148 """Represents the state of a process.
1149
1150 Contains all the information retrieved from the pid-specific log.
1151 """
1152 # Function names are using ([a-z_0-9]+)
1153 # This is the most common format. function(args) = result
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001154 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.*?)\)\s+= (.+)$')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001155 # An interrupted function call, only grab the minimal header.
1156 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
1157 # A resumed function call.
1158 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1159 # A process received a signal.
1160 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1161 # A process didn't handle a signal. Ignore any junk appearing before,
1162 # because the process was forcibly killed so it won't open any new file.
1163 RE_KILLED = re.compile(
1164 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1165 # The process has exited.
1166 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1167 # A call was canceled. Ignore any prefix.
1168 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1169 # Happens when strace fails to even get the function name.
1170 UNNAMED_FUNCTION = '????'
1171
1172 # Corner-case in python, a class member function decorator must not be
1173 # @staticmethod.
1174 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1175 """Automatically convert the str 'args' into a list of processed
1176 arguments.
1177
1178 Arguments:
1179 - regexp is used to parse args.
1180 - expect_zero: one of True, False or None.
1181 - True: will check for result.startswith('0') first and will ignore
1182 the trace line completely otherwise. This is important because for
1183 many functions, the regexp will not process if the call failed.
1184 - False: will check for not result.startswith(('?', '-1')) for the
1185 same reason than with True.
1186 - None: ignore result.
1187 """
1188 def meta_hook(function):
1189 assert function.__name__.startswith('handle_')
1190 def hook(self, args, result):
1191 if expect_zero is True and not result.startswith('0'):
1192 return
1193 if expect_zero is False and result.startswith(('?', '-1')):
1194 return
1195 match = re.match(regexp, args)
1196 if not match:
1197 raise TracingFailure(
1198 'Failed to parse %s(%s) = %s' %
1199 (function.__name__[len('handle_'):], args, result),
1200 None, None, None)
1201 return function(self, match.groups(), result)
1202 return hook
1203 return meta_hook
1204
1205 class RelativePath(object):
1206 """A late-bound relative path."""
1207 def __init__(self, parent, value):
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001208 assert_is_renderable(parent)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001209 self.parent = parent
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001210 assert isinstance(value, (None.__class__, str)), repr(value)
1211 self.value = Strace.load_filename(value) if value else value
1212 if self.value:
1213 assert '\\' not in self.value, value
1214 assert '\\' not in self.value, (repr(value), repr(self.value))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001215
1216 def render(self):
1217 """Returns the current directory this instance is representing.
1218
1219 This function is used to return the late-bound value.
1220 """
1221 if self.value and self.value.startswith(u'/'):
1222 # An absolute path.
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001223 # TODO(maruel): This is wrong, we can't assert it is utf-8.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001224 return self.value
1225 parent = self.parent.render() if self.parent else u'<None>'
1226 if self.value:
1227 return os.path.normpath(os.path.join(parent, self.value))
1228 return parent
1229
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001230 def __init__(self, root, pid):
1231 """Keeps enough information to be able to guess the original process
1232 root.
1233
1234 strace doesn't store which process was the initial process. So more
1235 information needs to be kept so the graph can be reconstructed from the
1236 flat map.
1237 """
1238 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1239 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1240 assert isinstance(root, ApiBase.Context)
1241 self._root = weakref.ref(root)
1242 # The dict key is the function name of the pending call, like 'open'
1243 # or 'execve'.
1244 self._pending_calls = {}
1245 self._line_number = 0
1246 # Current directory when the process started.
1247 self.initial_cwd = self.RelativePath(self._root(), None)
1248 self.parentid = None
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001249 self._done = False
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001250
1251 def get_cwd(self):
1252 """Returns the best known value of cwd."""
1253 return self.cwd or self.initial_cwd
1254
1255 def render(self):
1256 """Returns the string value of the RelativePath() object.
1257
1258 Used by RelativePath. Returns the initial directory and not the
1259 current one since the current directory 'cwd' validity is time-limited.
1260
1261 The validity is only guaranteed once all the logs are processed.
1262 """
1263 return self.initial_cwd.render()
1264
1265 def on_line(self, line):
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001266 assert isinstance(line, str)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001267 self._line_number += 1
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001268 try:
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001269 if self._done:
1270 raise TracingFailure(
1271 'Found a trace for a terminated process or corrupted log',
1272 None, None, None)
1273
1274 if self.RE_SIGNAL.match(line):
1275 # Ignore signals.
1276 return
1277
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001278 match = self.RE_KILLED.match(line)
1279 if match:
1280 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1281 self.handle_exit_group(match.group(1), None)
1282 return
1283
1284 match = self.RE_PROCESS_EXITED.match(line)
1285 if match:
1286 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1287 self.handle_exit_group(match.group(1), None)
1288 return
1289
1290 match = self.RE_UNFINISHED.match(line)
1291 if match:
1292 if match.group(1) in self._pending_calls:
1293 raise TracingFailure(
1294 'Found two unfinished calls for the same function',
1295 None, None, None,
1296 self._pending_calls)
1297 self._pending_calls[match.group(1)] = (
1298 match.group(1) + match.group(2))
1299 return
1300
1301 match = self.RE_UNAVAILABLE.match(line)
1302 if match:
1303 # This usually means a process was killed and a pending call was
1304 # canceled.
1305 # TODO(maruel): Look up the last exit_group() trace just above and
1306 # make sure any self._pending_calls[anything] is properly flushed.
1307 return
1308
1309 match = self.RE_RESUMED.match(line)
1310 if match:
1311 if match.group(1) not in self._pending_calls:
1312 raise TracingFailure(
1313 'Found a resumed call that was not logged as unfinished',
1314 None, None, None,
1315 self._pending_calls)
1316 pending = self._pending_calls.pop(match.group(1))
1317 # Reconstruct the line.
1318 line = pending + match.group(2)
1319
1320 match = self.RE_HEADER.match(line)
1321 if not match:
maruel@chromium.orgbcfc9e12013-03-19 18:47:17 +00001322 # The line is corrupted. It happens occasionally when a process is
1323 # killed forcibly with activity going on. Assume the process died.
1324 # No other line can be processed afterward.
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001325 logging.debug('%d is done: %s', self.pid, line)
maruel@chromium.orgbcfc9e12013-03-19 18:47:17 +00001326 self._done = True
1327 return
1328
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001329 if match.group(1) == self.UNNAMED_FUNCTION:
1330 return
1331
1332 # It's a valid line, handle it.
1333 handler = getattr(self, 'handle_%s' % match.group(1), None)
1334 if not handler:
1335 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1336 return handler(match.group(2), match.group(3))
1337 except TracingFailure, e:
1338 # Hack in the values since the handler could be a static function.
1339 e.pid = self.pid
1340 e.line = line
1341 e.line_number = self._line_number
1342 # Re-raise the modified exception.
1343 raise
1344 except (KeyError, NotImplementedError, ValueError), e:
1345 raise TracingFailure(
1346 'Trace generated a %s exception: %s' % (
1347 e.__class__.__name__, str(e)),
1348 self.pid,
1349 self._line_number,
1350 line,
1351 e)
1352
1353 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1354 def handle_access(self, args, _result):
1355 self._handle_file(args[0], True)
1356
1357 @parse_args(r'^\"(.+?)\"$', True)
1358 def handle_chdir(self, args, _result):
1359 """Updates cwd."""
1360 self.cwd = self.RelativePath(self, args[0])
1361 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1362
1363 def handle_clone(self, _args, result):
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001364 self._handling_forking('clone', result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001365
1366 def handle_close(self, _args, _result):
1367 pass
1368
1369 def handle_chmod(self, _args, _result):
1370 pass
1371
1372 def handle_creat(self, _args, _result):
1373 # Ignore files created, since they didn't need to exist.
1374 pass
1375
1376 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1377 def handle_execve(self, args, _result):
1378 # Even if in practice execve() doesn't returns when it succeeds, strace
1379 # still prints '0' as the result.
1380 filepath = args[0]
1381 self._handle_file(filepath, False)
1382 self.executable = self.RelativePath(self.get_cwd(), filepath)
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001383 try:
1384 self.command = strace_process_quoted_arguments(args[1])
1385 except ValueError as e:
1386 raise TracingFailure(
1387 'Failed to process command line argument:\n%s' % e.args[0],
1388 None, None, None)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001389
1390 def handle_exit_group(self, _args, _result):
1391 """Removes cwd."""
1392 self.cwd = None
1393
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001394 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True)
1395 def handle_faccessat(self, args, _results):
1396 if args[0] == 'AT_FDCWD':
1397 self._handle_file(args[1], True)
1398 else:
1399 raise Exception('Relative faccess not implemented.')
1400
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001401 def handle_fork(self, args, result):
1402 self._handle_unknown('fork', args, result)
1403
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001404 def handle_futex(self, _args, _result):
1405 pass
1406
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001407 def handle_getcwd(self, _args, _result):
1408 pass
1409
1410 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1411 def handle_link(self, args, _result):
1412 self._handle_file(args[0], False)
1413 self._handle_file(args[1], False)
1414
1415 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1416 def handle_lstat(self, args, _result):
1417 self._handle_file(args[0], True)
1418
1419 def handle_mkdir(self, _args, _result):
1420 pass
1421
1422 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1423 def handle_open(self, args, _result):
1424 if 'O_DIRECTORY' in args[1]:
1425 return
1426 self._handle_file(args[0], False)
1427
1428 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1429 def handle_openat(self, args, _result):
1430 if 'O_DIRECTORY' in args[2]:
1431 return
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001432 if args[0] == 'AT_FDCWD':
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001433 self._handle_file(args[1], False)
1434 else:
1435 # TODO(maruel): Implement relative open if necessary instead of the
1436 # AT_FDCWD flag, let's hope not since this means tracking all active
1437 # directory handles.
1438 raise Exception('Relative open via openat not implemented.')
1439
1440 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1441 def handle_readlink(self, args, _result):
1442 self._handle_file(args[0], False)
1443
1444 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1445 def handle_rename(self, args, _result):
1446 self._handle_file(args[0], False)
1447 self._handle_file(args[1], False)
1448
1449 def handle_rmdir(self, _args, _result):
1450 pass
1451
1452 def handle_setxattr(self, _args, _result):
1453 pass
1454
1455 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1456 def handle_stat(self, args, _result):
1457 self._handle_file(args[0], True)
1458
1459 def handle_symlink(self, _args, _result):
1460 pass
1461
csharp@chromium.orgfa672922012-11-02 17:43:31 +00001462 @parse_args(r'^\"(.+?)\", \d+', True)
1463 def handle_truncate(self, args, _result):
1464 self._handle_file(args[0], False)
1465
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001466 def handle_unlink(self, _args, _result):
1467 # In theory, the file had to be created anyway.
1468 pass
1469
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001470 def handle_unlinkat(self, _args, _result):
1471 # In theory, the file had to be created anyway.
1472 pass
1473
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001474 def handle_statfs(self, _args, _result):
1475 pass
1476
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001477 def handle_utimensat(self, _args, _result):
1478 pass
1479
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001480 def handle_vfork(self, _args, result):
1481 self._handling_forking('vfork', result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001482
1483 @staticmethod
1484 def _handle_unknown(function, args, result):
1485 raise TracingFailure(
1486 'Unexpected/unimplemented trace %s(%s)= %s' %
1487 (function, args, result),
1488 None, None, None)
1489
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001490 def _handling_forking(self, name, result):
1491 """Transfers cwd."""
1492 if result.startswith(('?', '-1')):
1493 # The call failed.
1494 return
1495 # Update the other process right away.
1496 childpid = int(result)
1497 child = self._root().get_or_set_proc(childpid)
1498 if child.parentid is not None or childpid in self.children:
1499 raise TracingFailure(
1500 'Found internal inconsitency in process lifetime detection '
1501 'during a %s() call' % name,
1502 None, None, None)
1503
1504 # Copy the cwd object.
1505 child.initial_cwd = self.get_cwd()
1506 child.parentid = self.pid
1507 # It is necessary because the logs are processed out of order.
1508 self.children.append(child)
1509
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001510 def _handle_file(self, filepath, touch_only):
1511 filepath = self.RelativePath(self.get_cwd(), filepath)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001512 #assert not touch_only, render(filepath)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001513 self.add_file(filepath, touch_only)
1514
1515 def __init__(self, blacklist, initial_cwd):
1516 super(Strace.Context, self).__init__(blacklist)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001517 assert_is_renderable(initial_cwd)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001518 self.initial_cwd = initial_cwd
1519
1520 def render(self):
1521 """Returns the string value of the initial cwd of the root process.
1522
1523 Used by RelativePath.
1524 """
1525 return self.initial_cwd
1526
1527 def on_line(self, pid, line):
1528 """Transfers control into the Process.on_line() function."""
1529 self.get_or_set_proc(pid).on_line(line.strip())
1530
1531 def to_results(self):
1532 """Finds back the root process and verify consistency."""
1533 # TODO(maruel): Absolutely unecessary, fix me.
1534 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1535 if len(root) != 1:
1536 raise TracingFailure(
1537 'Found internal inconsitency in process lifetime detection '
1538 'while finding the root process',
1539 None,
1540 None,
1541 None,
1542 sorted(p.pid for p in root))
1543 self.root_process = root[0]
1544 process = self.root_process.to_results_process()
1545 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1546 raise TracingFailure(
1547 'Found internal inconsitency in process lifetime detection '
1548 'while looking for len(tree) == len(list)',
1549 None,
1550 None,
1551 None,
1552 sorted(self._process_lookup),
1553 sorted(p.pid for p in process.all))
1554 return Results(process)
1555
1556 def get_or_set_proc(self, pid):
1557 """Returns the Context.Process instance for this pid or creates a new one.
1558 """
1559 if not pid or not isinstance(pid, int):
1560 raise TracingFailure(
1561 'Unpexpected value for pid: %r' % pid,
1562 pid,
1563 None,
1564 None,
1565 pid)
1566 if pid not in self._process_lookup:
1567 self._process_lookup[pid] = self.Process(self, pid)
1568 return self._process_lookup[pid]
1569
1570 @classmethod
1571 def traces(cls):
1572 """Returns the list of all handled traces to pass this as an argument to
1573 strace.
1574 """
1575 prefix = 'handle_'
1576 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1577
1578 class Tracer(ApiBase.Tracer):
1579 MAX_LEN = 256
1580
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00001581 def __init__(self, logname, use_sudo):
1582 super(Strace.Tracer, self).__init__(logname)
1583 self.use_sudo = use_sudo
1584
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001585 def trace(self, cmd, cwd, tracename, output):
1586 """Runs strace on an executable."""
1587 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1588 assert os.path.isabs(cmd[0]), cmd[0]
1589 assert os.path.isabs(cwd), cwd
1590 assert os.path.normpath(cwd) == cwd, cwd
1591 with self._lock:
1592 if not self._initialized:
1593 raise TracingFailure(
1594 'Called Tracer.trace() on an unitialized object',
1595 None, None, None, tracename)
1596 assert tracename not in (i['trace'] for i in self._traces)
1597 stdout = stderr = None
1598 if output:
1599 stdout = subprocess.PIPE
1600 stderr = subprocess.STDOUT
1601 # Ensure all file related APIs are hooked.
1602 traces = ','.join(Strace.Context.traces() + ['file'])
1603 trace_cmd = [
1604 'strace',
1605 '-ff',
1606 '-s', '%d' % self.MAX_LEN,
1607 '-e', 'trace=%s' % traces,
1608 '-o', self._logname + '.' + tracename,
1609 ]
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00001610 if self.use_sudo is True:
1611 trace_cmd.insert(0, 'sudo')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001612 child = subprocess.Popen(
1613 trace_cmd + cmd,
1614 cwd=cwd,
1615 stdin=subprocess.PIPE,
1616 stdout=stdout,
1617 stderr=stderr)
1618 out = child.communicate()[0]
1619 # TODO(maruel): Walk the logs and figure out the root process would
1620 # simplify parsing the logs a *lot*.
1621 with self._lock:
1622 assert tracename not in (i['trace'] for i in self._traces)
1623 self._traces.append(
1624 {
1625 'cmd': cmd,
1626 'cwd': cwd,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00001627 'output': out,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001628 # The pid of strace process, not very useful.
1629 'pid': child.pid,
1630 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001631 })
1632 return child.returncode, out
1633
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00001634 def __init__(self, use_sudo=None):
1635 super(Strace, self).__init__()
1636 self.use_sudo = use_sudo
1637
1638 def get_tracer(self, logname):
1639 return self.Tracer(logname, self.use_sudo)
1640
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001641 @staticmethod
1642 def clean_trace(logname):
1643 if os.path.isfile(logname):
1644 os.remove(logname)
1645 # Also delete any pid specific file from previous traces.
1646 for i in glob.iglob(logname + '.*'):
1647 if i.rsplit('.', 1)[1].isdigit():
1648 os.remove(i)
1649
1650 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001651 def parse_log(cls, logname, blacklist, trace_name):
1652 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001653 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001654 data = read_json(logname)
1655 out = []
1656 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001657 if trace_name and item['trace'] != trace_name:
1658 continue
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001659 result = {
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001660 'output': item['output'],
maruel@chromium.org964ddde2012-10-29 18:13:46 +00001661 'trace': item['trace'],
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001662 }
1663 try:
1664 context = cls.Context(blacklist, item['cwd'])
1665 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1666 pid = pidfile.rsplit('.', 1)[1]
1667 if pid.isdigit():
1668 pid = int(pid)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001669 for line in open(pidfile, 'rb'):
1670 context.on_line(pid, line)
1671 result['results'] = context.to_results()
maruel@chromium.org424f5692012-10-15 15:42:38 +00001672 except TracingFailure:
1673 result['exception'] = sys.exc_info()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001674 out.append(result)
1675 return out
1676
1677
1678class Dtrace(ApiBase):
1679 """Uses DTrace framework through dtrace. Requires root access.
1680
1681 Implies Mac OSX.
1682
1683 dtruss can't be used because it has compatibility issues with python.
1684
1685 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1686 get the absolute path of the 'cwd' dtrace variable from the probe.
1687
1688 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1689 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1690 manually.
1691
1692 errno is not printed in the log since this implementation currently only cares
1693 about files that were successfully opened.
1694 """
1695 class Context(ApiBase.Context):
1696 # Format: index pid function(args)
1697 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1698
1699 # Arguments parsing.
1700 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1701 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1702 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1703 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1704 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1705 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1706
1707 O_DIRECTORY = 0x100000
1708
1709 class Process(ApiBase.Context.Process):
1710 def __init__(self, *args):
1711 super(Dtrace.Context.Process, self).__init__(*args)
1712 self.cwd = self.initial_cwd
1713
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001714 def __init__(self, blacklist, thunk_pid, initial_cwd):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001715 logging.info(
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001716 '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001717 super(Dtrace.Context, self).__init__(blacklist)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001718 assert isinstance(initial_cwd, unicode), initial_cwd
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001719 # Process ID of the temporary script created by create_thunk().
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001720 self._thunk_pid = thunk_pid
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001721 self._initial_cwd = initial_cwd
1722 self._line_number = 0
1723
1724 def on_line(self, line):
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001725 assert isinstance(line, unicode), line
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001726 self._line_number += 1
1727 match = self.RE_HEADER.match(line)
1728 if not match:
1729 raise TracingFailure(
1730 'Found malformed line: %s' % line,
1731 None,
1732 self._line_number,
1733 line)
1734 fn = getattr(
1735 self,
1736 'handle_%s' % match.group(2).replace('-', '_'),
1737 self._handle_ignored)
1738 # It is guaranteed to succeed because of the regexp. Or at least I thought
1739 # it would.
1740 pid = int(match.group(1))
1741 try:
1742 return fn(pid, match.group(3))
1743 except TracingFailure, e:
1744 # Hack in the values since the handler could be a static function.
1745 e.pid = pid
1746 e.line = line
1747 e.line_number = self._line_number
1748 # Re-raise the modified exception.
1749 raise
1750 except (KeyError, NotImplementedError, ValueError), e:
1751 raise TracingFailure(
1752 'Trace generated a %s exception: %s' % (
1753 e.__class__.__name__, str(e)),
1754 pid,
1755 self._line_number,
1756 line,
1757 e)
1758
1759 def to_results(self):
1760 process = self.root_process.to_results_process()
1761 # Internal concistency check.
1762 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1763 raise TracingFailure(
1764 'Found internal inconsitency in process lifetime detection '
1765 'while looking for len(tree) == len(list)',
1766 None,
1767 None,
1768 None,
1769 sorted(self._process_lookup),
1770 sorted(p.pid for p in process.all))
1771 return Results(process)
1772
1773 def handle_dtrace_BEGIN(self, _pid, args):
1774 if not self.RE_DTRACE_BEGIN.match(args):
1775 raise TracingFailure(
1776 'Found internal inconsitency in dtrace_BEGIN log line',
1777 None, None, None)
1778
1779 def handle_proc_start(self, pid, args):
1780 """Transfers cwd.
1781
1782 The dtrace script already takes care of only tracing the processes that
1783 are child of the traced processes so there is no need to verify the
1784 process hierarchy.
1785 """
1786 if pid in self._process_lookup:
1787 raise TracingFailure(
1788 'Found internal inconsitency in proc_start: %d started two times' %
1789 pid,
1790 None, None, None)
1791 match = self.RE_PROC_START.match(args)
1792 if not match:
1793 raise TracingFailure(
1794 'Failed to parse arguments: %s' % args,
1795 None, None, None)
1796 ppid = int(match.group(1))
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001797 if ppid == self._thunk_pid and not self.root_process:
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001798 proc = self.root_process = self.Process(
1799 self.blacklist, pid, self._initial_cwd)
1800 elif ppid in self._process_lookup:
1801 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1802 self._process_lookup[ppid].children.append(proc)
1803 else:
1804 # Another process tree, ignore.
1805 return
1806 self._process_lookup[pid] = proc
1807 logging.debug(
1808 'New child: %s -> %d cwd:%s' %
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001809 (ppid, pid, render(proc.initial_cwd)))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001810
1811 def handle_proc_exit(self, pid, _args):
1812 """Removes cwd."""
1813 if pid in self._process_lookup:
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001814 # self._thunk_pid is not traced itself and other traces run neither.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001815 self._process_lookup[pid].cwd = None
1816
1817 def handle_execve(self, pid, args):
1818 """Sets the process' executable.
1819
1820 TODO(maruel): Read command line arguments. See
1821 https://discussions.apple.com/thread/1980539 for an example.
1822 https://gist.github.com/1242279
1823
1824 Will have to put the answer at http://stackoverflow.com/questions/7556249.
1825 :)
1826 """
1827 if not pid in self._process_lookup:
1828 # Another process tree, ignore.
1829 return
1830 match = self.RE_EXECVE.match(args)
1831 if not match:
1832 raise TracingFailure(
1833 'Failed to parse arguments: %r' % args,
1834 None, None, None)
1835 proc = self._process_lookup[pid]
1836 proc.executable = match.group(1)
1837 proc.command = self.process_escaped_arguments(match.group(3))
1838 if int(match.group(2)) != len(proc.command):
1839 raise TracingFailure(
1840 'Failed to parse execve() arguments: %s' % args,
1841 None, None, None)
1842
1843 def handle_chdir(self, pid, args):
1844 """Updates cwd."""
1845 if pid not in self._process_lookup:
1846 # Another process tree, ignore.
1847 return
1848 cwd = self.RE_CHDIR.match(args).group(1)
1849 if not cwd.startswith('/'):
1850 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1851 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1852 else:
1853 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1854 cwd2 = cwd
1855 self._process_lookup[pid].cwd = cwd2
1856
1857 def handle_open_nocancel(self, pid, args):
1858 """Redirects to handle_open()."""
1859 return self.handle_open(pid, args)
1860
1861 def handle_open(self, pid, args):
1862 if pid not in self._process_lookup:
1863 # Another process tree, ignore.
1864 return
1865 match = self.RE_OPEN.match(args)
1866 if not match:
1867 raise TracingFailure(
1868 'Failed to parse arguments: %s' % args,
1869 None, None, None)
1870 flag = int(match.group(2), 16)
1871 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1872 # Ignore directories.
1873 return
1874 self._handle_file(pid, match.group(1))
1875
1876 def handle_rename(self, pid, args):
1877 if pid not in self._process_lookup:
1878 # Another process tree, ignore.
1879 return
1880 match = self.RE_RENAME.match(args)
1881 if not match:
1882 raise TracingFailure(
1883 'Failed to parse arguments: %s' % args,
1884 None, None, None)
1885 self._handle_file(pid, match.group(1))
1886 self._handle_file(pid, match.group(2))
1887
1888 def _handle_file(self, pid, filepath):
1889 if not filepath.startswith('/'):
1890 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1891 # We can get '..' in the path.
1892 filepath = os.path.normpath(filepath)
1893 # Sadly, still need to filter out directories here;
1894 # saw open_nocancel(".", 0, 0) = 0 lines.
1895 if os.path.isdir(filepath):
1896 return
1897 self._process_lookup[pid].add_file(filepath, False)
1898
1899 def handle_ftruncate(self, pid, args):
1900 """Just used as a signal to kill dtrace, ignoring."""
1901 pass
1902
1903 @staticmethod
1904 def _handle_ignored(pid, args):
1905 """Is called for all the event traces that are not handled."""
1906 raise NotImplementedError('Please implement me')
1907
1908 @staticmethod
1909 def process_escaped_arguments(text):
1910 """Extracts escaped arguments on a string and return the arguments as a
1911 list.
1912
1913 Implemented as an automaton.
1914
1915 Example:
1916 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
1917 function will return ['python2.7', '-c', 'print("hi")]
1918 """
1919 if not text.endswith('\\0'):
1920 raise ValueError('String is not null terminated: %r' % text, text)
1921 text = text[:-2]
1922
1923 def unescape(x):
1924 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
1925 out = []
1926 escaped = False
1927 for i in x:
1928 if i == '\\' and not escaped:
1929 escaped = True
1930 continue
1931 escaped = False
1932 out.append(i)
1933 return ''.join(out)
1934
1935 return [unescape(i) for i in text.split('\\001')]
1936
1937 class Tracer(ApiBase.Tracer):
1938 # pylint: disable=C0301
1939 #
1940 # To understand the following code, you'll want to take a look at:
1941 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1942 # https://wikis.oracle.com/display/DTrace/Variables
1943 # http://docs.oracle.com/cd/E19205-01/820-4221/
1944 #
1945 # 0. Dump all the valid probes into a text file. It is important, you
1946 # want to redirect into a file and you don't want to constantly 'sudo'.
1947 # $ sudo dtrace -l > probes.txt
1948 #
1949 # 1. Count the number of probes:
1950 # $ wc -l probes.txt
1951 # 81823 # On OSX 10.7, including 1 header line.
1952 #
1953 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1954 # likes and skipping the header with NR>1:
1955 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1956 # dtrace
1957 # fbt
1958 # io
1959 # ip
1960 # lockstat
1961 # mach_trap
1962 # proc
1963 # profile
1964 # sched
1965 # syscall
1966 # tcp
1967 # vminfo
1968 #
1969 # 3. List of valid probes:
1970 # $ grep syscall probes.txt | less
1971 # or use dtrace directly:
1972 # $ sudo dtrace -l -P syscall | less
1973 #
1974 # trackedpid is an associative array where its value can be 0, 1 or 2.
1975 # 0 is for untracked processes and is the default value for items not
1976 # in the associative array.
1977 # 1 is for tracked processes.
1978 # 2 is for the script created by create_thunk() only. It is not tracked
1979 # itself but all its decendants are.
1980 #
1981 # The script will kill itself only once waiting_to_die == 1 and
1982 # current_processes == 0, so that both getlogin() was called and that
1983 # all traced processes exited.
1984 #
1985 # TODO(maruel): Use cacheable predicates. See
1986 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1987 D_CODE = """
1988 dtrace:::BEGIN {
1989 waiting_to_die = 0;
1990 current_processes = 0;
1991 logindex = 0;
1992 printf("%d %d %s_%s(\\"%s\\")\\n",
1993 logindex, PID, probeprov, probename, SCRIPT);
1994 logindex++;
1995 }
1996
1997 proc:::start /trackedpid[ppid]/ {
1998 trackedpid[pid] = 1;
1999 current_processes += 1;
2000 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
2001 logindex, pid, probeprov, probename,
2002 ppid,
2003 execname,
2004 current_processes);
2005 logindex++;
2006 }
2007 /* Should use SCRIPT but there is no access to this variable at that
2008 * point. */
2009 proc:::start /ppid == PID && execname == "Python"/ {
2010 trackedpid[pid] = 2;
2011 current_processes += 1;
2012 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
2013 logindex, pid, probeprov, probename,
2014 ppid,
2015 execname,
2016 current_processes);
2017 logindex++;
2018 }
2019 proc:::exit /trackedpid[pid] &&
2020 current_processes == 1 &&
2021 waiting_to_die == 1/ {
2022 trackedpid[pid] = 0;
2023 current_processes -= 1;
2024 printf("%d %d %s_%s(%d)\\n",
2025 logindex, pid, probeprov, probename,
2026 current_processes);
2027 logindex++;
2028 exit(0);
2029 }
2030 proc:::exit /trackedpid[pid]/ {
2031 trackedpid[pid] = 0;
2032 current_processes -= 1;
2033 printf("%d %d %s_%s(%d)\\n",
2034 logindex, pid, probeprov, probename,
2035 current_processes);
2036 logindex++;
2037 }
2038
2039 /* Use an arcane function to detect when we need to die */
2040 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
2041 waiting_to_die = 1;
2042 printf("%d %d %s()\\n", logindex, pid, probefunc);
2043 logindex++;
2044 }
2045 syscall::ftruncate:entry /
2046 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
2047 exit(0);
2048 }
2049
2050 syscall::open*:entry /trackedpid[pid] == 1/ {
2051 self->open_arg0 = arg0;
2052 self->open_arg1 = arg1;
2053 self->open_arg2 = arg2;
2054 }
2055 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
2056 this->open_arg0 = copyinstr(self->open_arg0);
2057 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
2058 logindex, pid, probefunc,
2059 this->open_arg0,
2060 self->open_arg1,
2061 self->open_arg2);
2062 logindex++;
2063 this->open_arg0 = 0;
2064 }
2065 syscall::open*:return /trackedpid[pid] == 1/ {
2066 self->open_arg0 = 0;
2067 self->open_arg1 = 0;
2068 self->open_arg2 = 0;
2069 }
2070
2071 syscall::rename:entry /trackedpid[pid] == 1/ {
2072 self->rename_arg0 = arg0;
2073 self->rename_arg1 = arg1;
2074 }
2075 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
2076 this->rename_arg0 = copyinstr(self->rename_arg0);
2077 this->rename_arg1 = copyinstr(self->rename_arg1);
2078 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
2079 logindex, pid, probefunc,
2080 this->rename_arg0,
2081 this->rename_arg1);
2082 logindex++;
2083 this->rename_arg0 = 0;
2084 this->rename_arg1 = 0;
2085 }
2086 syscall::rename:return /trackedpid[pid] == 1/ {
2087 self->rename_arg0 = 0;
2088 self->rename_arg1 = 0;
2089 }
2090
2091 /* Track chdir, it's painful because it is only receiving relative path.
2092 */
2093 syscall::chdir:entry /trackedpid[pid] == 1/ {
2094 self->chdir_arg0 = arg0;
2095 }
2096 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
2097 this->chdir_arg0 = copyinstr(self->chdir_arg0);
2098 printf("%d %d %s(\\"%s\\")\\n",
2099 logindex, pid, probefunc,
2100 this->chdir_arg0);
2101 logindex++;
2102 this->chdir_arg0 = 0;
2103 }
2104 syscall::chdir:return /trackedpid[pid] == 1/ {
2105 self->chdir_arg0 = 0;
2106 }
2107 """
2108
2109 # execve-specific code, tends to throw a lot of exceptions.
2110 D_CODE_EXECVE = """
2111 /* Finally what we care about! */
2112 syscall::exec*:entry /trackedpid[pid]/ {
2113 self->exec_arg0 = copyinstr(arg0);
2114 /* Incrementally probe for a NULL in the argv parameter of execve() to
2115 * figure out argc. */
2116 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
2117 * found. */
2118 self->exec_argc = 0;
2119 /* Probe for argc==1 */
2120 this->exec_argv = (user_addr_t*)copyin(
2121 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2122 self->exec_argc = this->exec_argv[self->exec_argc] ?
2123 (self->exec_argc + 1) : self->exec_argc;
2124
2125 /* Probe for argc==2 */
2126 this->exec_argv = (user_addr_t*)copyin(
2127 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2128 self->exec_argc = this->exec_argv[self->exec_argc] ?
2129 (self->exec_argc + 1) : self->exec_argc;
2130
2131 /* Probe for argc==3 */
2132 this->exec_argv = (user_addr_t*)copyin(
2133 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2134 self->exec_argc = this->exec_argv[self->exec_argc] ?
2135 (self->exec_argc + 1) : self->exec_argc;
2136
2137 /* Probe for argc==4 */
2138 this->exec_argv = (user_addr_t*)copyin(
2139 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2140 self->exec_argc = this->exec_argv[self->exec_argc] ?
2141 (self->exec_argc + 1) : self->exec_argc;
2142
2143 /* Copy the inputs strings since there is no guarantee they'll be
2144 * present after the call completed. */
2145 self->exec_argv0 = (self->exec_argc > 0) ?
2146 copyinstr(this->exec_argv[0]) : "";
2147 self->exec_argv1 = (self->exec_argc > 1) ?
2148 copyinstr(this->exec_argv[1]) : "";
2149 self->exec_argv2 = (self->exec_argc > 2) ?
2150 copyinstr(this->exec_argv[2]) : "";
2151 self->exec_argv3 = (self->exec_argc > 3) ?
2152 copyinstr(this->exec_argv[3]) : "";
2153 this->exec_argv = 0;
2154 }
2155 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2156 /* We need to join strings here, as using multiple printf() would
2157 * cause tearing when multiple threads/processes are traced.
2158 * Since it is impossible to escape a string and join it to another one,
2159 * like sprintf("%s%S", previous, more), use hackery.
2160 * Each of the elements are split with a \\1. \\0 cannot be used because
2161 * it is simply ignored. This will conflict with any program putting a
2162 * \\1 in their execve() string but this should be "rare enough" */
2163 this->args = "";
2164 /* Process exec_argv[0] */
2165 this->args = strjoin(
2166 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2167
2168 /* Process exec_argv[1] */
2169 this->args = strjoin(
2170 this->args, (self->exec_argc > 1) ? "\\1" : "");
2171 this->args = strjoin(
2172 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2173
2174 /* Process exec_argv[2] */
2175 this->args = strjoin(
2176 this->args, (self->exec_argc > 2) ? "\\1" : "");
2177 this->args = strjoin(
2178 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2179
2180 /* Process exec_argv[3] */
2181 this->args = strjoin(
2182 this->args, (self->exec_argc > 3) ? "\\1" : "");
2183 this->args = strjoin(
2184 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2185
2186 /* Prints self->exec_argc to permits verifying the internal
2187 * consistency since this code is quite fishy. */
2188 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2189 logindex, pid, probefunc,
2190 self->exec_arg0,
2191 self->exec_argc,
2192 this->args);
2193 logindex++;
2194 this->args = 0;
2195 }
2196 syscall::exec*:return /trackedpid[pid]/ {
2197 self->exec_arg0 = 0;
2198 self->exec_argc = 0;
2199 self->exec_argv0 = 0;
2200 self->exec_argv1 = 0;
2201 self->exec_argv2 = 0;
2202 self->exec_argv3 = 0;
2203 }
2204 """
2205
2206 # Code currently not used.
2207 D_EXTRANEOUS = """
2208 /* This is a good learning experience, since it traces a lot of things
2209 * related to the process and child processes.
2210 * Warning: it generates a gigantic log. For example, tracing
2211 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2212 * several minutes to execute.
2213 */
2214 /*
2215 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2216 printf("%d %d %s_%s() = %d\\n",
2217 logindex, pid, probeprov, probefunc, errno);
2218 logindex++;
2219 }
2220 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2221 printf("%d %d %s_%s() = %d\\n",
2222 logindex, pid, probeprov, probefunc, errno);
2223 logindex++;
2224 }
2225 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2226 printf("%d %d %s_%s() = %d\\n",
2227 logindex, pid, probeprov, probefunc, errno);
2228 logindex++;
2229 }
2230 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2231 printf("%d %d %s_%s() = %d\\n",
2232 logindex, pid, probeprov, probefunc, errno);
2233 logindex++;
2234 }
2235 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2236 printf("%d %d %s_%s() = %d\\n",
2237 logindex, pid, probeprov, probefunc, errno);
2238 logindex++;
2239 }
2240 */
2241 /* TODO(maruel): *stat* functions and friends
2242 syscall::access:return,
2243 syscall::chdir:return,
2244 syscall::chflags:return,
2245 syscall::chown:return,
2246 syscall::chroot:return,
2247 syscall::getattrlist:return,
2248 syscall::getxattr:return,
2249 syscall::lchown:return,
2250 syscall::lstat64:return,
2251 syscall::lstat:return,
2252 syscall::mkdir:return,
2253 syscall::pathconf:return,
2254 syscall::readlink:return,
2255 syscall::removexattr:return,
2256 syscall::setxattr:return,
2257 syscall::stat64:return,
2258 syscall::stat:return,
2259 syscall::truncate:return,
2260 syscall::unlink:return,
2261 syscall::utimes:return,
2262 */
2263 """
2264
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00002265 def __init__(self, logname, use_sudo):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002266 """Starts the log collection with dtrace.
2267
2268 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2269 this needs to wait for dtrace to be "warmed up".
2270 """
2271 super(Dtrace.Tracer, self).__init__(logname)
2272 self._script = create_thunk()
2273 # This unique dummy temp file is used to signal the dtrace script that it
2274 # should stop as soon as all the child processes are done. A bit hackish
2275 # but works fine enough.
2276 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2277 prefix='trace_signal_file')
2278
maruel@chromium.org9b2112a2013-04-05 18:01:02 +00002279 dtrace_path = '/usr/sbin/dtrace'
2280 if not os.path.isfile(dtrace_path):
2281 dtrace_path = 'dtrace'
2282 elif use_sudo is None and (os.stat(dtrace_path).st_mode & stat.S_ISUID):
2283 # No need to sudo. For those following at home, don't do that.
2284 use_sudo = False
2285
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002286 # Note: do not use the -p flag. It's useless if the initial process quits
2287 # too fast, resulting in missing traces from the grand-children. The D
2288 # code manages the dtrace lifetime itself.
2289 trace_cmd = [
maruel@chromium.org9b2112a2013-04-05 18:01:02 +00002290 dtrace_path,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002291 # Use a larger buffer if getting 'out of scratch space' errors.
2292 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2293 '-b', '10m',
2294 '-x', 'dynvarsize=10m',
2295 #'-x', 'dtrace_global_maxsize=1m',
2296 '-x', 'evaltime=exec',
2297 '-o', '/dev/stderr',
2298 '-q',
2299 '-n', self._get_dtrace_code(),
2300 ]
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00002301 if use_sudo is not False:
2302 trace_cmd.insert(0, 'sudo')
2303
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002304 with open(self._logname + '.log', 'wb') as logfile:
2305 self._dtrace = subprocess.Popen(
2306 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2307 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2308
2309 # Reads until one line is printed, which signifies dtrace is up and ready.
2310 with open(self._logname + '.log', 'rb') as logfile:
2311 while 'dtrace_BEGIN' not in logfile.readline():
2312 if self._dtrace.poll() is not None:
2313 # Do a busy wait. :/
2314 break
2315 logging.debug('dtrace started')
2316
2317 def _get_dtrace_code(self):
2318 """Setups the D code to implement child process tracking.
2319
2320 Injects the cookie in the script so it knows when to stop.
2321
2322 The script will detect any instance of the script created with
2323 create_thunk() and will start tracing it.
2324 """
maruel@chromium.orgbe75f282012-11-02 19:34:37 +00002325 out = (
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002326 'inline int PID = %d;\n'
2327 'inline string SCRIPT = "%s";\n'
2328 'inline int FILE_ID = %d;\n'
2329 '\n'
2330 '%s') % (
2331 os.getpid(),
2332 self._script,
2333 self._dummy_file_id,
maruel@chromium.orgbe75f282012-11-02 19:34:37 +00002334 self.D_CODE)
2335 if os.environ.get('TRACE_INPUTS_DTRACE_ENABLE_EXECVE') == '1':
2336 # Do not enable by default since it tends to spew dtrace: error lines
2337 # because the execve() parameters are not in valid memory at the time of
2338 # logging.
2339 # TODO(maruel): Find a way to make this reliable since it's useful but
2340 # only works in limited/trivial uses cases for now.
2341 out += self.D_CODE_EXECVE
2342 return out
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002343
2344 def trace(self, cmd, cwd, tracename, output):
2345 """Runs dtrace on an executable.
2346
2347 This dtruss is broken when it starts the process itself or when tracing
2348 child processes, this code starts a wrapper process
2349 generated with create_thunk() which starts the executable to trace.
2350 """
2351 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2352 assert os.path.isabs(cmd[0]), cmd[0]
2353 assert os.path.isabs(cwd), cwd
2354 assert os.path.normpath(cwd) == cwd, cwd
2355 with self._lock:
2356 if not self._initialized:
2357 raise TracingFailure(
2358 'Called Tracer.trace() on an unitialized object',
2359 None, None, None, tracename)
2360 assert tracename not in (i['trace'] for i in self._traces)
2361
2362 # Starts the script wrapper to start the child process. This signals the
2363 # dtrace script that this process is to be traced.
2364 stdout = stderr = None
2365 if output:
2366 stdout = subprocess.PIPE
2367 stderr = subprocess.STDOUT
2368 child_cmd = [
2369 sys.executable,
2370 self._script,
2371 tracename,
2372 ]
2373 # Call a dummy function so that dtrace knows I'm about to launch a process
2374 # that needs to be traced.
2375 # Yummy.
2376 child = subprocess.Popen(
2377 child_cmd + fix_python_path(cmd),
2378 stdin=subprocess.PIPE,
2379 stdout=stdout,
2380 stderr=stderr,
2381 cwd=cwd)
2382 logging.debug('Started child pid: %d' % child.pid)
2383
2384 out = child.communicate()[0]
2385 # This doesn't mean tracing is done, one of the grand-child process may
2386 # still be alive. It will be tracked with the dtrace script.
2387
2388 with self._lock:
2389 assert tracename not in (i['trace'] for i in self._traces)
2390 self._traces.append(
2391 {
2392 'cmd': cmd,
2393 'cwd': cwd,
2394 # The pid of strace process, not very useful.
2395 'pid': child.pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002396 'output': out,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002397 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002398 })
2399 return child.returncode, out
2400
2401 def close(self, timeout=None):
2402 """Terminates dtrace."""
2403 logging.debug('close(%s)' % timeout)
2404 try:
2405 try:
2406 super(Dtrace.Tracer, self).close(timeout)
2407 # Signal dtrace that it should stop now.
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00002408 # ftruncate doesn't exist on Windows.
2409 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002410 if timeout:
2411 start = time.time()
2412 # Use polling. :/
2413 while (self._dtrace.poll() is None and
2414 (time.time() - start) < timeout):
2415 time.sleep(0.1)
2416 self._dtrace.kill()
2417 self._dtrace.wait()
2418 finally:
2419 # Make sure to kill it in any case.
2420 if self._dtrace.poll() is None:
2421 try:
2422 self._dtrace.kill()
2423 self._dtrace.wait()
2424 except OSError:
2425 pass
2426
2427 if self._dtrace.returncode != 0:
2428 # Warn about any dtrace failure but basically ignore it.
2429 print 'dtrace failure: %s' % self._dtrace.returncode
2430 finally:
2431 os.close(self._dummy_file_id)
2432 os.remove(self._dummy_file_name)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002433
2434 def post_process_log(self):
2435 """Sorts the log back in order when each call occured.
2436
2437 dtrace doesn't save the buffer in strict order since it keeps one buffer
2438 per CPU.
2439 """
2440 super(Dtrace.Tracer, self).post_process_log()
2441 logname = self._logname + '.log'
2442 with open(logname, 'rb') as logfile:
2443 lines = [l for l in logfile if l.strip()]
2444 errors = [l for l in lines if l.startswith('dtrace:')]
2445 if errors:
2446 raise TracingFailure(
2447 'Found errors in the trace: %s' % '\n'.join(errors),
2448 None, None, None, logname)
2449 try:
2450 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2451 except ValueError:
2452 raise TracingFailure(
2453 'Found errors in the trace: %s' % '\n'.join(
2454 l for l in lines if l.split(' ', 1)[0].isdigit()),
2455 None, None, None, logname)
2456 with open(logname, 'wb') as logfile:
2457 logfile.write(''.join(lines))
2458
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00002459 def __init__(self, use_sudo=None):
2460 super(Dtrace, self).__init__()
2461 self.use_sudo = use_sudo
2462
2463 def get_tracer(self, logname):
2464 return self.Tracer(logname, self.use_sudo)
2465
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002466 @staticmethod
2467 def clean_trace(logname):
2468 for ext in ('', '.log'):
2469 if os.path.isfile(logname + ext):
2470 os.remove(logname + ext)
2471
2472 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002473 def parse_log(cls, logname, blacklist, trace_name):
2474 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00002475 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002476
2477 def blacklist_more(filepath):
2478 # All the HFS metadata is in the form /.vol/...
2479 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2480
2481 data = read_json(logname)
2482 out = []
2483 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002484 if trace_name and item['trace'] != trace_name:
2485 continue
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002486 result = {
2487 'output': item['output'],
2488 'trace': item['trace'],
2489 }
2490 try:
2491 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002492 # It's fine to assume the file as UTF-8: OSX enforces the file names to
2493 # be valid UTF-8 and we control the log output.
2494 for line in codecs.open(logname + '.log', 'rb', encoding='utf-8'):
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002495 context.on_line(line)
2496 result['results'] = context.to_results()
2497 except TracingFailure:
2498 result['exception'] = sys.exc_info()
2499 out.append(result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002500 return out
2501
2502
2503class LogmanTrace(ApiBase):
2504 """Uses the native Windows ETW based tracing functionality to trace a child
2505 process.
2506
2507 Caveat: this implementations doesn't track cwd or initial_cwd. It is because
2508 the Windows Kernel doesn't have a concept of 'current working directory' at
2509 all. A Win32 process has a map of current directories, one per drive letter
2510 and it is managed by the user mode kernel32.dll. In kernel, a file is always
2511 opened relative to another file_object or as an absolute path. All the current
2512 working directory logic is done in user mode.
2513 """
2514 class Context(ApiBase.Context):
2515 """Processes a ETW log line and keeps the list of existent and non
2516 existent files accessed.
2517
2518 Ignores directories.
2519 """
2520 # These indexes are for the stripped version in json.
2521 EVENT_NAME = 0
2522 TYPE = 1
2523 PID = 2
2524 TID = 3
2525 PROCESSOR_ID = 4
2526 TIMESTAMP = 5
2527 USER_DATA = 6
2528
2529 class Process(ApiBase.Context.Process):
2530 def __init__(self, *args):
2531 super(LogmanTrace.Context.Process, self).__init__(*args)
2532 # Handle file objects that succeeded.
2533 self.file_objects = {}
2534
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002535 def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd):
2536 logging.info(
2537 '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name,
2538 thunk_cmd)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002539 super(LogmanTrace.Context, self).__init__(blacklist)
2540 self._drive_map = DosDriveMap()
2541 # Threads mapping to the corresponding process id.
2542 self._threads_active = {}
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002543 # Process ID of the tracer, e.g. the temporary script created by
2544 # create_thunk(). This is tricky because the process id may have been
2545 # reused.
2546 self._thunk_pid = thunk_pid
2547 self._thunk_cmd = thunk_cmd
2548 self._trace_name = trace_name
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002549 self._line_number = 0
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002550 self._thunk_process = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002551
2552 def on_line(self, line):
2553 """Processes a json Event line."""
2554 self._line_number += 1
2555 try:
2556 # By Opcode
2557 handler = getattr(
2558 self,
2559 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2560 None)
2561 if not handler:
2562 raise TracingFailure(
2563 'Unexpected event %s_%s' % (
2564 line[self.EVENT_NAME], line[self.TYPE]),
2565 None, None, None)
2566 handler(line)
2567 except TracingFailure, e:
2568 # Hack in the values since the handler could be a static function.
2569 e.pid = line[self.PID]
2570 e.line = line
2571 e.line_number = self._line_number
2572 # Re-raise the modified exception.
2573 raise
2574 except (KeyError, NotImplementedError, ValueError), e:
2575 raise TracingFailure(
2576 'Trace generated a %s exception: %s' % (
2577 e.__class__.__name__, str(e)),
2578 line[self.PID],
2579 self._line_number,
2580 line,
2581 e)
2582
2583 def to_results(self):
2584 if not self.root_process:
2585 raise TracingFailure(
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002586 'Failed to detect the initial process %d' % self._thunk_pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002587 None, None, None)
2588 process = self.root_process.to_results_process()
2589 return Results(process)
2590
2591 def _thread_to_process(self, tid):
2592 """Finds the process from the thread id."""
2593 tid = int(tid, 16)
2594 pid = self._threads_active.get(tid)
2595 if not pid or not self._process_lookup.get(pid):
2596 return
2597 return self._process_lookup[pid]
2598
2599 @classmethod
2600 def handle_EventTrace_Header(cls, line):
2601 """Verifies no event was dropped, e.g. no buffer overrun occured."""
2602 BUFFER_SIZE = cls.USER_DATA
2603 #VERSION = cls.USER_DATA + 1
2604 #PROVIDER_VERSION = cls.USER_DATA + 2
2605 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2606 #END_TIME = cls.USER_DATA + 4
2607 #TIMER_RESOLUTION = cls.USER_DATA + 5
2608 #MAX_FILE_SIZE = cls.USER_DATA + 6
2609 #LOG_FILE_MODE = cls.USER_DATA + 7
2610 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2611 #START_BUFFERS = cls.USER_DATA + 9
2612 #POINTER_SIZE = cls.USER_DATA + 10
2613 EVENTS_LOST = cls.USER_DATA + 11
2614 #CPU_SPEED = cls.USER_DATA + 12
2615 #LOGGER_NAME = cls.USER_DATA + 13
2616 #LOG_FILE_NAME = cls.USER_DATA + 14
2617 #BOOT_TIME = cls.USER_DATA + 15
2618 #PERF_FREQ = cls.USER_DATA + 16
2619 #START_TIME = cls.USER_DATA + 17
2620 #RESERVED_FLAGS = cls.USER_DATA + 18
2621 #BUFFERS_LOST = cls.USER_DATA + 19
2622 #SESSION_NAME_STRING = cls.USER_DATA + 20
2623 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2624 if line[EVENTS_LOST] != '0':
2625 raise TracingFailure(
2626 ( '%s events were lost during trace, please increase the buffer '
2627 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2628 None, None, None)
2629
2630 def handle_FileIo_Cleanup(self, line):
2631 """General wisdom: if a file is closed, it's because it was opened.
2632
2633 Note that FileIo_Close is not used since if a file was opened properly but
2634 not closed before the process exits, only Cleanup will be logged.
2635 """
2636 #IRP = self.USER_DATA
2637 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2638 FILE_OBJECT = self.USER_DATA + 2
2639 #FILE_KEY = self.USER_DATA + 3
2640 proc = self._thread_to_process(line[TTID])
2641 if not proc:
2642 # Not a process we care about.
2643 return
2644 file_object = line[FILE_OBJECT]
2645 if file_object in proc.file_objects:
2646 proc.add_file(proc.file_objects.pop(file_object), False)
2647
2648 def handle_FileIo_Create(self, line):
2649 """Handles a file open.
2650
2651 All FileIo events are described at
2652 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2653 for some value of 'description'.
2654
2655 " (..) process and thread id values of the IO events (..) are not valid "
2656 http://msdn.microsoft.com/magazine/ee358703.aspx
2657
2658 The FileIo.Create event doesn't return if the CreateFile() call
2659 succeeded, so keep track of the file_object and check that it is
2660 eventually closed with FileIo_Cleanup.
2661 """
2662 #IRP = self.USER_DATA
2663 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2664 FILE_OBJECT = self.USER_DATA + 2
2665 #CREATE_OPTIONS = self.USER_DATA + 3
2666 #FILE_ATTRIBUTES = self.USER_DATA + 4
2667 #self.USER_DATA + SHARE_ACCESS = 5
2668 OPEN_PATH = self.USER_DATA + 6
2669
2670 proc = self._thread_to_process(line[TTID])
2671 if not proc:
2672 # Not a process we care about.
2673 return
2674
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002675 raw_path = line[OPEN_PATH]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002676 # Ignore directories and bare drive right away.
2677 if raw_path.endswith(os.path.sep):
2678 return
2679 filepath = self._drive_map.to_win32(raw_path)
2680 # Ignore bare drive right away. Some may still fall through with format
2681 # like '\\?\X:'
2682 if len(filepath) == 2:
2683 return
2684 file_object = line[FILE_OBJECT]
2685 if os.path.isdir(filepath):
2686 # There is no O_DIRECTORY equivalent on Windows. The closed is
2687 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2688 # simply discard directories are they are found.
2689 return
2690 # Override any stale file object
2691 proc.file_objects[file_object] = filepath
2692
2693 def handle_FileIo_Rename(self, line):
2694 # TODO(maruel): Handle?
2695 pass
2696
2697 def handle_Process_End(self, line):
2698 pid = line[self.PID]
2699 if self._process_lookup.get(pid):
2700 logging.info('Terminated: %d' % pid)
2701 self._process_lookup[pid] = None
2702 else:
2703 logging.debug('Terminated: %d' % pid)
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002704 if self._thunk_process and self._thunk_process.pid == pid:
2705 self._thunk_process = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002706
2707 def handle_Process_Start(self, line):
2708 """Handles a new child process started by PID."""
2709 #UNIQUE_PROCESS_KEY = self.USER_DATA
2710 PROCESS_ID = self.USER_DATA + 1
2711 #PARENT_PID = self.USER_DATA + 2
2712 #SESSION_ID = self.USER_DATA + 3
2713 #EXIT_STATUS = self.USER_DATA + 4
2714 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2715 #USER_SID = self.USER_DATA + 6
2716 IMAGE_FILE_NAME = self.USER_DATA + 7
2717 COMMAND_LINE = self.USER_DATA + 8
2718
2719 ppid = line[self.PID]
2720 pid = int(line[PROCESS_ID], 16)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002721 command_line = CommandLineToArgvW(line[COMMAND_LINE])
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002722 logging.debug(
2723 'New process %d->%d (%s) %s' %
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002724 (ppid, pid, line[IMAGE_FILE_NAME], command_line))
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002725
2726 if pid == self._thunk_pid:
2727 # Need to ignore processes we don't know about because the log is
2728 # system-wide. self._thunk_pid shall start only one process.
2729 # This is tricky though because Windows *loves* to reuse process id and
2730 # it happens often that the process ID of the thunk script created by
2731 # create_thunk() is reused. So just detecting the pid here is not
2732 # sufficient, we must confirm the command line.
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002733 if command_line[:len(self._thunk_cmd)] != self._thunk_cmd:
2734 logging.info(
2735 'Ignoring duplicate pid %d for %s: %s while searching for %s',
2736 pid, self._trace_name, command_line, self._thunk_cmd)
2737 return
2738
2739 # TODO(maruel): The check is quite weak. Add the thunk path.
2740 if self._thunk_process:
2741 raise TracingFailure(
2742 ( 'Parent process is _thunk_pid(%d) but thunk_process(%d) is '
2743 'already set') % (self._thunk_pid, self._thunk_process.pid),
2744 None, None, None)
2745 proc = self.Process(self.blacklist, pid, None)
2746 self._thunk_process = proc
2747 return
2748 elif ppid == self._thunk_pid and self._thunk_process:
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002749 proc = self.Process(self.blacklist, pid, None)
2750 self.root_process = proc
2751 ppid = None
2752 elif self._process_lookup.get(ppid):
2753 proc = self.Process(self.blacklist, pid, None)
2754 self._process_lookup[ppid].children.append(proc)
2755 else:
2756 # Ignore
2757 return
2758 self._process_lookup[pid] = proc
2759
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002760 proc.command = command_line
2761 proc.executable = line[IMAGE_FILE_NAME]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002762 # proc.command[0] may be the absolute path of 'executable' but it may be
2763 # anything else too. If it happens that command[0] ends with executable,
2764 # use it, otherwise defaults to the base name.
2765 cmd0 = proc.command[0].lower()
2766 if not cmd0.endswith('.exe'):
2767 # TODO(maruel): That's not strictly true either.
2768 cmd0 += '.exe'
2769 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2770 # Fix the path.
2771 cmd0 = cmd0.replace('/', os.path.sep)
2772 cmd0 = os.path.normpath(cmd0)
2773 proc.executable = get_native_path_case(cmd0)
2774 logging.info(
2775 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
2776
2777 def handle_Thread_End(self, line):
2778 """Has the same parameters as Thread_Start."""
2779 tid = int(line[self.TID], 16)
2780 self._threads_active.pop(tid, None)
2781
2782 def handle_Thread_Start(self, line):
2783 """Handles a new thread created.
2784
2785 Do not use self.PID here since a process' initial thread is created by
2786 the parent process.
2787 """
2788 PROCESS_ID = self.USER_DATA
2789 TTHREAD_ID = self.USER_DATA + 1
2790 #STACK_BASE = self.USER_DATA + 2
2791 #STACK_LIMIT = self.USER_DATA + 3
2792 #USER_STACK_BASE = self.USER_DATA + 4
2793 #USER_STACK_LIMIT = self.USER_DATA + 5
2794 #AFFINITY = self.USER_DATA + 6
2795 #WIN32_START_ADDR = self.USER_DATA + 7
2796 #TEB_BASE = self.USER_DATA + 8
2797 #SUB_PROCESS_TAG = self.USER_DATA + 9
2798 #BASE_PRIORITY = self.USER_DATA + 10
2799 #PAGE_PRIORITY = self.USER_DATA + 11
2800 #IO_PRIORITY = self.USER_DATA + 12
2801 #THREAD_FLAGS = self.USER_DATA + 13
2802 # Do not use self.PID here since a process' initial thread is created by
2803 # the parent process.
2804 pid = int(line[PROCESS_ID], 16)
2805 tid = int(line[TTHREAD_ID], 16)
2806 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2807 self._threads_active[tid] = pid
2808
2809 @classmethod
2810 def supported_events(cls):
2811 """Returns all the procesed events."""
2812 out = []
2813 for member in dir(cls):
2814 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2815 if match:
2816 out.append(match.groups())
2817 return out
2818
2819 class Tracer(ApiBase.Tracer):
2820 # The basic headers.
2821 EXPECTED_HEADER = [
2822 u'Event Name',
2823 u'Type',
2824 u'Event ID',
2825 u'Version',
2826 u'Channel',
2827 u'Level', # 5
2828 u'Opcode',
2829 u'Task',
2830 u'Keyword',
2831 u'PID',
2832 u'TID', # 10
2833 u'Processor Number',
2834 u'Instance ID',
2835 u'Parent Instance ID',
2836 u'Activity ID',
2837 u'Related Activity ID', # 15
2838 u'Clock-Time',
2839 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2840 u'User(ms)', # pretty much useless.
2841 u'User Data', # Extra arguments that are event-specific.
2842 ]
2843 # Only the useful headers common to all entries are listed there. Any column
2844 # at 19 or higher is dependent on the specific event.
2845 EVENT_NAME = 0
2846 TYPE = 1
2847 PID = 9
2848 TID = 10
2849 PROCESSOR_ID = 11
2850 TIMESTAMP = 16
2851 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2852 USER_DATA = 19
2853
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002854 class CsvReader(object):
2855 """CSV reader that reads files generated by tracerpt.exe.
2856
2857 csv.reader() fails to read them properly, it mangles file names quoted
2858 with "" with a comma in it.
2859 """
2860 # 0. Had a ',' or one of the following ' ' after a comma, next should
2861 # be ' ', '"' or string or ',' for an empty field.
2862 ( HAD_DELIMITER,
2863 # 1. Processing an unquoted field up to ','.
2864 IN_STR,
2865 # 2. Processing a new field starting with '"'.
2866 STARTING_STR_QUOTED,
2867 # 3. Second quote in a row at the start of a field. It could be either
2868 # '""foo""' or '""'. Who the hell thought it was a great idea to use
2869 # the same character for delimiting and escaping?
2870 STARTING_SECOND_QUOTE,
2871 # 4. A quote inside a quoted string where the previous character was
2872 # not a quote, so the string is not empty. Can be either: end of a
2873 # quoted string (a delimiter) or a quote escape. The next char must be
2874 # either '"' or ','.
2875 HAD_QUOTE_IN_QUOTED,
2876 # 5. Second quote inside a quoted string.
2877 HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED,
2878 # 6. Processing a field that started with '"'.
2879 IN_STR_QUOTED) = range(7)
2880
2881 def __init__(self, f):
2882 self.f = f
2883
2884 def __iter__(self):
2885 return self
2886
2887 def next(self):
2888 """Splits the line in fields."""
2889 line = self.f.readline()
2890 if not line:
2891 raise StopIteration()
2892 line = line.strip()
2893 fields = []
2894 state = self.HAD_DELIMITER
2895 for i, c in enumerate(line):
2896 if state == self.HAD_DELIMITER:
2897 if c == ',':
2898 # Empty field.
2899 fields.append('')
2900 elif c == ' ':
2901 # Ignore initial whitespaces
2902 pass
2903 elif c == '"':
2904 state = self.STARTING_STR_QUOTED
2905 fields.append('')
2906 else:
2907 # Start of a new field.
2908 state = self.IN_STR
2909 fields.append(c)
2910
2911 elif state == self.IN_STR:
2912 # Do not accept quote inside unquoted field.
2913 assert c != '"', (i, c, line, fields)
2914 if c == ',':
2915 fields[-1] = fields[-1].strip()
2916 state = self.HAD_DELIMITER
2917 else:
2918 fields[-1] = fields[-1] + c
2919
2920 elif state == self.STARTING_STR_QUOTED:
2921 if c == '"':
2922 # Do not store the character yet.
2923 state = self.STARTING_SECOND_QUOTE
2924 else:
2925 state = self.IN_STR_QUOTED
2926 fields[-1] = fields[-1] + c
2927
2928 elif state == self.STARTING_SECOND_QUOTE:
2929 if c == ',':
2930 # It was an empty field. '""' == ''.
2931 state = self.HAD_DELIMITER
2932 else:
2933 fields[-1] = fields[-1] + '"' + c
2934 state = self.IN_STR_QUOTED
2935
2936 elif state == self.HAD_QUOTE_IN_QUOTED:
2937 if c == ',':
2938 # End of the string.
2939 state = self.HAD_DELIMITER
2940 elif c == '"':
2941 state = self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED
2942 else:
2943 # The previous double-quote was just an unescaped quote.
2944 fields[-1] = fields[-1] + '"' + c
2945 state = self.IN_STR_QUOTED
2946
2947 elif state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
2948 if c == ',':
2949 # End of the string.
2950 state = self.HAD_DELIMITER
2951 fields[-1] = fields[-1] + '"'
2952 else:
maruel@chromium.org261d1352013-02-25 19:23:20 +00002953 # That's just how the logger rolls. Revert back to appending the
2954 # char and "guess" it was a quote in a double-quoted string.
2955 state = self.IN_STR_QUOTED
2956 fields[-1] = fields[-1] + '"' + c
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002957
2958 elif state == self.IN_STR_QUOTED:
2959 if c == '"':
2960 # Could be a delimiter or an escape.
2961 state = self.HAD_QUOTE_IN_QUOTED
2962 else:
2963 fields[-1] = fields[-1] + c
2964
2965 if state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
2966 fields[-1] = fields[-1] + '"'
2967 else:
2968 assert state in (
2969 # Terminated with a normal field.
2970 self.IN_STR,
2971 # Terminated with an empty field.
2972 self.STARTING_SECOND_QUOTE,
2973 # Terminated with a normal quoted field.
2974 self.HAD_QUOTE_IN_QUOTED), (
2975 line, state, fields)
2976 return fields
2977
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002978 def __init__(self, logname):
2979 """Starts the log collection.
2980
2981 Requires administrative access. logman.exe is synchronous so no need for a
2982 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
2983 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
2984
2985 One can get the list of potentially interesting providers with:
2986 "logman query providers | findstr /i file"
2987 """
2988 super(LogmanTrace.Tracer, self).__init__(logname)
2989 self._script = create_thunk()
2990 cmd_start = [
2991 'logman.exe',
2992 'start',
2993 'NT Kernel Logger',
2994 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2995 # splitio,fileiocompletion,syscall,file,cswitch,img
2996 '(process,fileio,thread)',
2997 '-o', self._logname + '.etl',
2998 '-ets', # Send directly to kernel
2999 # Values extracted out of thin air.
3000 # Event Trace Session buffer size in kb.
3001 '-bs', '10240',
3002 # Number of Event Trace Session buffers.
3003 '-nb', '16', '256',
3004 ]
3005 logging.debug('Running: %s' % cmd_start)
3006 try:
3007 subprocess.check_call(
3008 cmd_start,
3009 stdin=subprocess.PIPE,
3010 stdout=subprocess.PIPE,
3011 stderr=subprocess.STDOUT)
3012 except subprocess.CalledProcessError, e:
3013 if e.returncode == -2147024891:
3014 print >> sys.stderr, 'Please restart with an elevated admin prompt'
3015 elif e.returncode == -2144337737:
3016 print >> sys.stderr, (
3017 'A kernel trace was already running, stop it and try again')
3018 raise
3019
3020 def trace(self, cmd, cwd, tracename, output):
3021 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
3022 assert os.path.isabs(cmd[0]), cmd[0]
3023 assert os.path.isabs(cwd), cwd
3024 assert os.path.normpath(cwd) == cwd, cwd
3025 with self._lock:
3026 if not self._initialized:
3027 raise TracingFailure(
3028 'Called Tracer.trace() on an unitialized object',
3029 None, None, None, tracename)
3030 assert tracename not in (i['trace'] for i in self._traces)
3031
3032 # Use "logman -?" for help.
3033
3034 stdout = stderr = None
3035 if output:
3036 stdout = subprocess.PIPE
3037 stderr = subprocess.STDOUT
3038
3039 # Run the child process.
3040 logging.debug('Running: %s' % cmd)
3041 # Use the temporary script generated with create_thunk() so we have a
3042 # clear pid owner. Since trace_inputs.py can be used as a library and
3043 # could trace multiple processes simultaneously, it makes it more complex
3044 # if the executable to be traced is executed directly here. It also solves
3045 # issues related to logman.exe that needs to be executed to control the
3046 # kernel trace.
3047 child_cmd = [
3048 sys.executable,
3049 self._script,
3050 tracename,
3051 ]
3052 child = subprocess.Popen(
3053 child_cmd + fix_python_path(cmd),
3054 cwd=cwd,
3055 stdin=subprocess.PIPE,
3056 stdout=stdout,
3057 stderr=stderr)
3058 logging.debug('Started child pid: %d' % child.pid)
3059 out = child.communicate()[0]
3060 # This doesn't mean all the grand-children are done. Sadly, we don't have
3061 # a good way to determine that.
3062
3063 with self._lock:
3064 assert tracename not in (i['trace'] for i in self._traces)
3065 self._traces.append({
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00003066 'cmd': cmd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003067 'cwd': cwd,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00003068 'output': out,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003069 'pid': child.pid,
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00003070 # Used to figure out the real process when process ids are reused.
3071 'thunk_cmd': child_cmd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003072 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003073 })
3074
3075 return child.returncode, out
3076
3077 def close(self, _timeout=None):
3078 """Stops the kernel log collection and converts the traces to text
3079 representation.
3080 """
3081 with self._lock:
maruel@chromium.org808e3f22013-05-16 16:42:54 +00003082 try:
3083 super(LogmanTrace.Tracer, self).close()
3084 finally:
3085 cmd_stop = [
3086 'logman.exe',
3087 'stop',
3088 'NT Kernel Logger',
3089 '-ets', # Sends the command directly to the kernel.
3090 ]
3091 logging.debug('Running: %s' % cmd_stop)
3092 subprocess.check_call(
3093 cmd_stop,
3094 stdin=subprocess.PIPE,
3095 stdout=subprocess.PIPE,
3096 stderr=subprocess.STDOUT)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003097
3098 def post_process_log(self):
3099 """Converts the .etl file into .csv then into .json."""
3100 super(LogmanTrace.Tracer, self).post_process_log()
3101 logformat = 'csv'
3102 self._convert_log(logformat)
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003103 self._trim_log(logformat)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003104
maruel@chromium.org808e3f22013-05-16 16:42:54 +00003105 def _gen_logdata(self):
3106 return {
3107 'format': 'csv',
3108 'traces': self._traces,
3109 }
3110
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003111 def _trim_log(self, logformat):
3112 """Reduces the amount of data in original log by generating a 'reduced'
3113 log.
3114 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003115 if logformat == 'csv_utf16':
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003116 file_handle = codecs.open(
3117 self._logname + '.' + logformat, 'r', encoding='utf-16')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003118
3119 elif logformat == 'csv':
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003120 assert sys.getfilesystemencoding() == 'mbcs'
3121 file_handle = codecs.open(
3122 self._logname + '.' + logformat, 'r',
3123 encoding=get_current_encoding())
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003124
3125 supported_events = LogmanTrace.Context.supported_events()
3126
3127 def trim(generator):
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003128 """Loads items from the generator and returns the interesting data.
3129
3130 It filters out any uninteresting line and reduce the amount of data in
3131 the trace.
3132 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003133 for index, line in enumerate(generator):
3134 if not index:
3135 if line != self.EXPECTED_HEADER:
3136 raise TracingFailure(
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003137 'Found malformed header: %s' % line,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003138 None, None, None)
3139 continue
3140 # As you can see, the CSV is full of useful non-redundant information:
3141 if (line[2] != '0' or # Event ID
3142 line[3] not in ('2', '3') or # Version
3143 line[4] != '0' or # Channel
3144 line[5] != '0' or # Level
3145 line[7] != '0' or # Task
3146 line[8] != '0x0000000000000000' or # Keyword
3147 line[12] != '' or # Instance ID
3148 line[13] != '' or # Parent Instance ID
3149 line[14] != self.NULL_GUID or # Activity ID
3150 line[15] != ''): # Related Activity ID
3151 raise TracingFailure(
3152 'Found unexpected values in line: %s' % ' '.join(line),
3153 None, None, None)
3154
3155 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
3156 continue
3157
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003158 yield [
3159 line[self.EVENT_NAME],
3160 line[self.TYPE],
3161 line[self.PID],
3162 line[self.TID],
3163 line[self.PROCESSOR_ID],
3164 line[self.TIMESTAMP],
3165 ] + line[self.USER_DATA:]
3166
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003167 # must not convert the trim() call into a list, since it will use too much
3168 # memory for large trace. use a csv file as a workaround since the json
3169 # parser requires a complete in-memory file.
3170 with open('%s.preprocessed' % self._logname, 'wb') as f:
3171 # $ and * can't be used in file name on windows, reducing the likelihood
3172 # of having to escape a string.
3173 out = csv.writer(
3174 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003175 for line in trim(self.CsvReader(file_handle)):
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003176 out.writerow([s.encode('utf-8') for s in line])
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003177
3178 def _convert_log(self, logformat):
3179 """Converts the ETL trace to text representation.
3180
3181 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
3182 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
3183
3184 Arguments:
3185 - logformat: Text format to be generated, csv, csv_utf16 or xml.
3186
3187 Use "tracerpt -?" for help.
3188 """
3189 LOCALE_INVARIANT = 0x7F
3190 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
3191 cmd_convert = [
3192 'tracerpt.exe',
3193 '-l', self._logname + '.etl',
3194 '-o', self._logname + '.' + logformat,
3195 '-gmt', # Use UTC
3196 '-y', # No prompt
3197 # Use -of XML to get the header of each items after column 19, e.g. all
3198 # the actual headers of 'User Data'.
3199 ]
3200
3201 if logformat == 'csv':
3202 # tracerpt localizes the 'Type' column, for major brainfuck
3203 # entertainment. I can't imagine any sane reason to do that.
3204 cmd_convert.extend(['-of', 'CSV'])
3205 elif logformat == 'csv_utf16':
3206 # This causes it to use UTF-16, which doubles the log size but ensures
3207 # the log is readable for non-ASCII characters.
3208 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
3209 elif logformat == 'xml':
3210 cmd_convert.extend(['-of', 'XML'])
3211 else:
3212 raise ValueError('Unexpected log format \'%s\'' % logformat)
3213 logging.debug('Running: %s' % cmd_convert)
3214 # This can takes tens of minutes for large logs.
3215 # Redirects all output to stderr.
3216 subprocess.check_call(
3217 cmd_convert,
3218 stdin=subprocess.PIPE,
3219 stdout=sys.stderr,
3220 stderr=sys.stderr)
3221
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003222 def __init__(self, use_sudo=False): # pylint: disable=W0613
3223 super(LogmanTrace, self).__init__()
3224 # Ignore use_sudo. It's irrelevant on Windows but kept to simplify the API.
3225
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003226 @staticmethod
3227 def clean_trace(logname):
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003228 for ext in ('', '.csv', '.etl', '.json', '.xml', '.preprocessed'):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003229 if os.path.isfile(logname + ext):
3230 os.remove(logname + ext)
3231
3232 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003233 def parse_log(cls, logname, blacklist, trace_name):
3234 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00003235 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003236
3237 def blacklist_more(filepath):
3238 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
3239 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
3240
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003241 # Create a list of (Context, result_dict) tuples. This is necessary because
3242 # the csv file may be larger than the amount of available memory.
3243 contexes = [
3244 (
3245 cls.Context(
3246 blacklist_more, item['pid'], item['trace'], item['thunk_cmd']),
3247 {
3248 'output': item['output'],
3249 'trace': item['trace'],
3250 },
3251 )
3252 for item in read_json(logname)['traces']
3253 if not trace_name or item['trace'] == trace_name
3254 ]
3255
3256 # The log may be too large to fit in memory and it is not efficient to read
3257 # it multiple times, so multiplex the contexes instead, which is slightly
3258 # more awkward.
3259 with open('%s.preprocessed' % logname, 'rb') as f:
3260 lines = csv.reader(
3261 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
3262 for encoded in lines:
3263 line = [s.decode('utf-8') for s in encoded]
3264 # Convert the PID in-place from hex.
3265 line[cls.Context.PID] = int(line[cls.Context.PID], 16)
3266 for context in contexes:
3267 if 'exception' in context[1]:
3268 continue
3269 try:
3270 context[0].on_line(line)
3271 except TracingFailure:
3272 context[1]['exception'] = sys.exc_info()
3273
3274 for context in contexes:
3275 if 'exception' in context[1]:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003276 continue
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003277 context[1]['results'] = context[0].to_results()
3278
3279 return [context[1] for context in contexes]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003280
3281
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003282def get_api(**kwargs):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003283 """Returns the correct implementation for the current OS."""
3284 if sys.platform == 'cygwin':
3285 raise NotImplementedError(
3286 'Not implemented for cygwin, start the script from Win32 python')
3287 flavors = {
3288 'win32': LogmanTrace,
3289 'darwin': Dtrace,
3290 'sunos5': Dtrace,
3291 'freebsd7': Dtrace,
3292 'freebsd8': Dtrace,
3293 }
3294 # Defaults to strace.
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003295 return flavors.get(sys.platform, Strace)(**kwargs)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003296
3297
3298def extract_directories(root_dir, files, blacklist):
3299 """Detects if all the files in a directory are in |files| and if so, replace
3300 the individual files by a Results.Directory instance.
3301
3302 Takes a list of Results.File instances and returns a shorter list of
3303 Results.File and Results.Directory instances.
3304
3305 Arguments:
3306 - root_dir: Optional base directory that shouldn't be search further.
3307 - files: list of Results.File instances.
3308 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
3309 """
3310 logging.info(
3311 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
3312 assert not (root_dir or '').endswith(os.path.sep), root_dir
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003313 # It is important for root_dir to not be a symlinked path, make sure to call
3314 # os.path.realpath() as needed.
3315 assert not root_dir or (
3316 os.path.realpath(get_native_path_case(root_dir)) == root_dir)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003317 assert not any(isinstance(f, Results.Directory) for f in files)
3318 # Remove non existent files.
3319 files = [f for f in files if f.existent]
3320 if not files:
3321 return files
3322 # All files must share the same root, which can be None.
3323 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
3324
3325 # Creates a {directory: {filename: File}} mapping, up to root.
3326 buckets = {}
3327 if root_dir:
3328 buckets[root_dir] = {}
3329 for fileobj in files:
3330 path = fileobj.full_path
3331 directory = os.path.dirname(path)
3332 assert directory
3333 # Do not use os.path.basename() so trailing os.path.sep is kept.
3334 basename = path[len(directory)+1:]
3335 files_in_directory = buckets.setdefault(directory, {})
3336 files_in_directory[basename] = fileobj
3337 # Add all the directories recursively up to root.
3338 while True:
3339 old_d = directory
3340 directory = os.path.dirname(directory)
3341 if directory + os.path.sep == root_dir or directory == old_d:
3342 break
3343 buckets.setdefault(directory, {})
3344
3345 root_prefix = len(root_dir) + 1 if root_dir else 0
3346 for directory in sorted(buckets, reverse=True):
maruel@chromium.orgb1678e62013-03-19 15:55:17 +00003347 if not os.path.isdir(directory):
3348 logging.debug(
3349 '%s was a directory but doesn\'t exist anymore; ignoring', directory)
3350 continue
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003351 actual = set(f for f in os.listdir(directory) if not blacklist(f))
3352 expected = set(buckets[directory])
3353 if not (actual - expected):
3354 parent = os.path.dirname(directory)
3355 buckets[parent][os.path.basename(directory)] = Results.Directory(
3356 root_dir,
3357 directory[root_prefix:],
3358 False,
3359 sum(f.size for f in buckets[directory].itervalues()),
3360 sum(f.nb_files for f in buckets[directory].itervalues()))
3361 # Remove the whole bucket.
3362 del buckets[directory]
3363
3364 # Reverse the mapping with what remains. The original instances are returned,
3365 # so the cached meta data is kept.
3366 files = sum((x.values() for x in buckets.itervalues()), [])
3367 return sorted(files, key=lambda x: x.path)
3368
3369
3370def trace(logfile, cmd, cwd, api, output):
3371 """Traces an executable. Returns (returncode, output) from api.
3372
3373 Arguments:
3374 - logfile: file to write to.
3375 - cmd: command to run.
3376 - cwd: current directory to start the process in.
3377 - api: a tracing api instance.
3378 - output: if True, returns output, otherwise prints it at the console.
3379 """
3380 cmd = fix_python_path(cmd)
3381 api.clean_trace(logfile)
3382 with api.get_tracer(logfile) as tracer:
3383 return tracer.trace(cmd, cwd, 'default', output)
3384
3385
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003386def CMDclean(args):
3387 """Cleans up traces."""
3388 parser = OptionParserTraceInputs(command='clean')
3389 options, args = parser.parse_args(args)
3390 api = get_api()
3391 api.clean_trace(options.log)
3392 return 0
3393
3394
3395def CMDtrace(args):
3396 """Traces an executable."""
3397 parser = OptionParserTraceInputs(command='trace')
3398 parser.allow_interspersed_args = False
3399 parser.add_option(
3400 '-q', '--quiet', action='store_true',
3401 help='Redirects traced executable output to /dev/null')
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003402 parser.add_option(
3403 '-s', '--sudo', action='store_true',
3404 help='Use sudo when shelling out the tracer tool (ignored on Windows)')
3405 parser.add_option(
3406 '-n', '--no-sudo', action='store_false',
3407 help='Don\'t use sudo')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003408 options, args = parser.parse_args(args)
3409
3410 if not args:
3411 parser.error('Please provide a command to run')
3412
3413 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3414 args[0] = os.path.abspath(args[0])
3415
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003416 # options.sudo default value is None, which is to do whatever tracer defaults
3417 # do.
3418 api = get_api(use_sudo=options.sudo)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003419 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3420
3421
3422def CMDread(args):
3423 """Reads the logs and prints the result."""
3424 parser = OptionParserTraceInputs(command='read')
3425 parser.add_option(
3426 '-V', '--variable',
3427 nargs=2,
3428 action='append',
3429 dest='variables',
3430 metavar='VAR_NAME directory',
3431 default=[],
3432 help=('Variables to replace relative directories against. Example: '
3433 '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
3434 'home dir with $HOME') % getpass.getuser())
3435 parser.add_option(
3436 '--root-dir',
3437 help='Root directory to base everything off it. Anything outside of this '
3438 'this directory will not be reported')
3439 parser.add_option(
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003440 '--trace-name',
3441 help='Only reads one of the trace. Defaults to reading all traces')
3442 parser.add_option(
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003443 '-j', '--json', action='store_true',
3444 help='Outputs raw result data as json')
3445 parser.add_option(
3446 '-b', '--blacklist', action='append', default=[],
3447 help='List of regexp to use as blacklist filter')
3448 options, args = parser.parse_args(args)
3449
3450 if options.root_dir:
maruel@chromium.org7dc17b32013-04-02 22:27:05 +00003451 options.root_dir = get_native_path_case(
3452 unicode(os.path.abspath(options.root_dir)))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003453
3454 variables = dict(options.variables)
3455 api = get_api()
3456 def blacklist(f):
3457 return any(re.match(b, f) for b in options.blacklist)
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003458 data = api.parse_log(options.log, blacklist, options.trace_name)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003459 # Process each trace.
3460 output_as_json = []
3461 for item in data:
3462 if 'exception' in item:
maruel@chromium.org424f5692012-10-15 15:42:38 +00003463 # Do not abort the other traces.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003464 print >> sys.stderr, (
maruel@chromium.org424f5692012-10-15 15:42:38 +00003465 'Trace %s: Got an exception: %s' % (
3466 item['trace'], item['exception'][1]))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003467 continue
3468 results = item['results']
3469 if options.root_dir:
3470 results = results.strip_root(options.root_dir)
3471
3472 if options.json:
3473 output_as_json.append(results.flatten())
3474 else:
3475 simplified = extract_directories(
3476 options.root_dir, results.files, blacklist)
3477 simplified = [f.replace_variables(variables) for f in simplified]
3478 if len(data) > 1:
3479 print('Trace: %s' % item['trace'])
3480 print('Total: %d' % len(results.files))
3481 print('Non existent: %d' % len(results.non_existent))
3482 for f in results.non_existent:
3483 print(' %s' % f.path)
3484 print(
3485 'Interesting: %d reduced to %d' % (
3486 len(results.existent), len(simplified)))
3487 for f in simplified:
3488 print(' %s' % f.path)
3489
3490 if options.json:
3491 write_json(sys.stdout, output_as_json, False)
3492 return 0
3493
3494
3495class OptionParserWithLogging(optparse.OptionParser):
3496 """Adds --verbose option."""
3497 def __init__(self, verbose=0, **kwargs):
3498 optparse.OptionParser.__init__(self, **kwargs)
3499 self.add_option(
3500 '-v', '--verbose',
3501 action='count',
3502 default=verbose,
3503 help='Use multiple times to increase verbosity')
3504
3505 def parse_args(self, *args, **kwargs):
3506 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3507 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3508 logging.basicConfig(
3509 level=levels[min(len(levels)-1, options.verbose)],
3510 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3511 return options, args
3512
3513
3514class OptionParserWithNiceDescription(OptionParserWithLogging):
3515 """Generates the description with the command's docstring."""
3516 def __init__(self, **kwargs):
3517 """Sets 'description' and 'usage' if not already specified."""
3518 command = kwargs.pop('command', 'help')
3519 kwargs.setdefault(
3520 'description',
3521 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3522 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3523 OptionParserWithLogging.__init__(self, **kwargs)
3524
3525
3526class OptionParserTraceInputs(OptionParserWithNiceDescription):
3527 """Adds automatic --log handling."""
3528 def __init__(self, **kwargs):
3529 OptionParserWithNiceDescription.__init__(self, **kwargs)
3530 self.add_option(
3531 '-l', '--log', help='Log file to generate or read, required')
3532
3533 def parse_args(self, *args, **kwargs):
3534 """Makes sure the paths make sense.
3535
3536 On Windows, / and \ are often mixed together in a path.
3537 """
3538 options, args = OptionParserWithNiceDescription.parse_args(
3539 self, *args, **kwargs)
3540 if not options.log:
3541 self.error('Must supply a log file with -l')
3542 options.log = os.path.abspath(options.log)
3543 return options, args
3544
3545
3546def extract_documentation():
3547 """Returns a dict {command: description} for each of documented command."""
3548 commands = (
3549 fn[3:]
3550 for fn in dir(sys.modules['__main__'])
3551 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3552 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3553
3554
3555def CMDhelp(args):
3556 """Prints list of commands or help for a specific command."""
3557 doc = extract_documentation()
3558 # Calculates the optimal offset.
3559 offset = max(len(cmd) for cmd in doc)
3560 format_str = ' %-' + str(offset + 2) + 's %s'
3561 # Generate a one-liner documentation of each commands.
3562 commands_description = '\n'.join(
3563 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3564
3565 parser = OptionParserWithNiceDescription(
3566 usage='%prog <command> [options]',
3567 description='Commands are:\n%s\n' % commands_description)
3568 parser.format_description = lambda _: parser.description
3569
3570 # Strip out any -h or --help argument.
3571 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3572 if len(args) == 1:
3573 if not get_command_handler(args[0]):
3574 parser.error('Unknown command %s' % args[0])
3575 # The command was "%prog help command", replaces ourself with
3576 # "%prog command --help" so help is correctly printed out.
3577 return main(args + ['--help'])
3578 elif args:
3579 parser.error('Unknown argument "%s"' % ' '.join(args))
3580 parser.print_help()
3581 return 0
3582
3583
3584def get_command_handler(name):
3585 """Returns the command handler or CMDhelp if it doesn't exist."""
3586 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3587
3588
3589def main_impl(argv):
3590 command = get_command_handler(argv[0] if argv else 'help')
3591 if not command:
3592 return CMDhelp(argv)
3593 return command(argv[1:])
3594
3595def main(argv):
maruel@chromium.org46e61cc2013-03-25 19:55:34 +00003596 run_isolated.disable_buffering()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003597 try:
3598 main_impl(argv)
3599 except TracingFailure, e:
3600 sys.stderr.write('\nError: ')
3601 sys.stderr.write(str(e))
3602 sys.stderr.write('\n')
3603 return 1
3604
3605
3606if __name__ == '__main__':
3607 sys.exit(main(sys.argv[1:]))