blob: eef212ddbed20e76319159235a6945c61bf410b1 [file] [log] [blame]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001#!/usr/bin/env python
2# coding=utf-8
3# Copyright (c) 2012 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Traces an executable and its child processes and extract the files accessed
8by them.
9
10The implementation uses OS-specific API. The native Kernel logger and the ETL
11interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
12The OS-specific implementation is hidden in an 'API' interface.
13
14The results are embedded in a Results instance. The tracing is done in two
15phases, the first is to do the actual trace and generate an
16implementation-specific log file. Then the log file is parsed to extract the
17information, including the individual child processes and the files accessed
18from the log.
19"""
20
21import codecs
22import csv
23import getpass
24import glob
25import json
26import logging
27import optparse
28import os
29import re
30import subprocess
31import sys
32import tempfile
33import threading
34import time
35import weakref
36
37## OS-specific imports
38
39if sys.platform == 'win32':
40 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
41 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
42 from ctypes.wintypes import GetLastError # pylint: disable=E0611
43elif sys.platform == 'darwin':
44 import Carbon.File # pylint: disable=F0401
45 import MacOS # pylint: disable=F0401
46
47
48BASE_DIR = os.path.dirname(os.path.abspath(__file__))
49ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
50
51
52class TracingFailure(Exception):
53 """An exception occured during tracing."""
54 def __init__(self, description, pid, line_number, line, *args):
55 super(TracingFailure, self).__init__(
56 description, pid, line_number, line, *args)
57 self.description = description
58 self.pid = pid
59 self.line_number = line_number
60 self.line = line
61 self.extra = args
62
63 def __str__(self):
64 out = self.description
65 if self.pid:
66 out += '\npid: %d' % self.pid
67 if self.line_number:
68 out += '\nline: %d' % self.line_number
69 if self.line:
70 out += '\n%s' % self.line
71 if self.extra:
72 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
73 return out
74
75
76## OS-specific functions
77
78if sys.platform == 'win32':
79 def QueryDosDevice(drive_letter):
80 """Returns the Windows 'native' path for a DOS drive letter."""
81 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
82 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
83 chars = 1024
84 drive_letter = unicode(drive_letter)
85 p = create_unicode_buffer(chars)
86 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
87 err = GetLastError()
88 if err:
89 # pylint: disable=E0602
90 raise WindowsError(
91 err,
92 'QueryDosDevice(%s): %s (%d)' % (
93 str(drive_letter), FormatError(err), err))
94 return p.value
95
96
97 def GetShortPathName(long_path):
98 """Returns the Windows short path equivalent for a 'long' path."""
99 long_path = unicode(long_path)
100 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
101 # not enforced.
102 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
103 long_path = '\\\\?\\' + long_path
104 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
105 if chars:
106 p = create_unicode_buffer(chars)
107 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
108 return p.value
109
110 err = GetLastError()
111 if err:
112 # pylint: disable=E0602
113 raise WindowsError(
114 err,
115 'GetShortPathName(%s): %s (%d)' % (
116 str(long_path), FormatError(err), err))
117
118
119 def GetLongPathName(short_path):
120 """Returns the Windows long path equivalent for a 'short' path."""
121 short_path = unicode(short_path)
122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
123 # not enforced.
124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
125 short_path = '\\\\?\\' + short_path
126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
127 if chars:
128 p = create_unicode_buffer(chars)
129 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
130 return p.value
131
132 err = GetLastError()
133 if err:
134 # pylint: disable=E0602
135 raise WindowsError(
136 err,
137 'GetLongPathName(%s): %s (%d)' % (
138 str(short_path), FormatError(err), err))
139
140
141 def get_current_encoding():
142 """Returns the 'ANSI' code page associated to the process."""
143 return 'cp%d' % int(windll.kernel32.GetACP())
144
145
146 class DosDriveMap(object):
147 """Maps \Device\HarddiskVolumeN to N: on Windows."""
148 # Keep one global cache.
149 _MAPPING = {}
150
151 def __init__(self):
152 """Lazy loads the cache."""
153 if not self._MAPPING:
154 # This is related to UNC resolver on windows. Ignore that.
155 self._MAPPING['\\Device\\Mup'] = None
156 self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot']
157
158 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
159 try:
160 letter = '%s:' % letter
161 mapped = QueryDosDevice(letter)
162 if mapped in self._MAPPING:
163 logging.warn(
164 ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
165 '. Drive letters are a user-mode concept and the kernel '
166 'traces only have NT path, so all accesses will be '
167 'associated with the first drive letter, independent of the '
168 'actual letter used by the code') % (
169 self._MAPPING[mapped], letter))
170 else:
171 self._MAPPING[mapped] = letter
172 except WindowsError: # pylint: disable=E0602
173 pass
174
175 def to_win32(self, path):
176 """Converts a native NT path to Win32/DOS compatible path."""
177 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
178 if not match:
179 raise ValueError(
180 'Can\'t convert %s into a Win32 compatible path' % path,
181 path)
182 if not match.group(1) in self._MAPPING:
183 # Unmapped partitions may be accessed by windows for the
184 # fun of it while the test is running. Discard these.
185 return None
186 drive = self._MAPPING[match.group(1)]
187 if not drive or not match.group(2):
188 return drive
189 return drive + match.group(2)
190
191
192 def isabs(path):
193 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
194 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
195
196
197 def get_native_path_case(p):
198 """Returns the native path case for an existing file.
199
200 On Windows, removes any leading '\\?\'.
201 """
202 if not isabs(p):
203 raise ValueError(
204 'Can\'t get native path case for a non-absolute path: %s' % p,
205 p)
206 # Windows used to have an option to turn on case sensitivity on non Win32
207 # subsystem but that's out of scope here and isn't supported anymore.
208 # Go figure why GetShortPathName() is needed.
209 try:
210 out = GetLongPathName(GetShortPathName(p))
211 except OSError, e:
212 if e.args[0] in (2, 3, 5):
213 # The path does not exist. Try to recurse and reconstruct the path.
214 base = os.path.dirname(p)
215 rest = os.path.basename(p)
216 return os.path.join(get_native_path_case(base), rest)
217 raise
218 if out.startswith('\\\\?\\'):
219 out = out[4:]
220 # Always upper case the first letter since GetLongPathName() will return the
221 # drive letter in the case it was given.
222 return out[0].upper() + out[1:]
223
224
225 def CommandLineToArgvW(command_line):
226 """Splits a commandline into argv using CommandLineToArgvW()."""
227 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
228 size = c_int()
229 ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size))
230 try:
231 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
232 finally:
233 windll.kernel32.LocalFree(ptr)
234
235
236elif sys.platform == 'darwin':
237
238
239 # On non-windows, keep the stdlib behavior.
240 isabs = os.path.isabs
241
242
243 def _find_item_native_case(root_path, item):
244 """Gets the native path case of a single item based at root_path.
245
246 There is no API to get the native path case of symlinks on OSX. So it
247 needs to be done the slow way.
248 """
249 item = item.lower()
250 for element in os.listdir(root_path):
251 if element.lower() == item:
252 return element
253
254
255 def _native_case(p):
256 """Gets the native path case. Warning: this function resolves symlinks."""
257 logging.debug('native_case(%s)' % p)
258 try:
259 rel_ref, _ = Carbon.File.FSPathMakeRef(p)
260 out = rel_ref.FSRefMakePath()
261 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
262 return out + os.path.sep
263 return out
264 except MacOS.Error, e:
265 if e.args[0] in (-43, -120):
266 # The path does not exist. Try to recurse and reconstruct the path.
267 # -43 means file not found.
268 # -120 means directory not found.
269 base = os.path.dirname(p)
270 rest = os.path.basename(p)
271 return os.path.join(_native_case(base), rest)
272 raise OSError(
273 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
274
275
276 def _split_at_symlink_native(base_path, rest):
277 """Returns the native path for a symlink."""
278 base, symlink, rest = split_at_symlink(base_path, rest)
279 if symlink:
280 if not base_path:
281 base_path = base
282 else:
283 base_path = safe_join(base_path, base)
284 symlink = _find_item_native_case(base_path, symlink)
285 return base, symlink, rest
286
287
288 def get_native_path_case(path):
289 """Returns the native path case for an existing file.
290
291 Technically, it's only HFS+ on OSX that is case preserving and
292 insensitive. It's the default setting on HFS+ but can be changed.
293 """
294 if not isabs(path):
295 raise ValueError(
296 'Can\'t get native path case for a non-absolute path: %s' % path,
297 path)
298 if path.startswith('/dev'):
299 # /dev is not visible from Carbon, causing an exception.
300 return path
301
302 # Starts assuming there is no symlink along the path.
303 resolved = _native_case(path)
304 if resolved.lower() == path.lower():
305 # This code path is incredibly faster.
306 return resolved
307
308 # There was a symlink, process it.
309 base, symlink, rest = _split_at_symlink_native(None, path)
310 assert symlink, (path, base, symlink, rest, resolved)
311 prev = base
312 base = safe_join(_native_case(base), symlink)
313 assert len(base) > len(prev)
314 while rest:
315 prev = base
316 relbase, symlink, rest = _split_at_symlink_native(base, rest)
317 base = safe_join(base, relbase)
318 assert len(base) > len(prev), (prev, base, symlink)
319 if symlink:
320 base = safe_join(base, symlink)
321 assert len(base) > len(prev), (prev, base, symlink)
322 # Make sure no symlink was resolved.
323 assert base.lower() == path.lower(), (base, path)
324 return base
325
326
327else: # OSes other than Windows and OSX.
328
329
330 # On non-windows, keep the stdlib behavior.
331 isabs = os.path.isabs
332
333
334 def get_native_path_case(path):
335 """Returns the native path case for an existing file.
336
337 On OSes other than OSX and Windows, assume the file system is
338 case-sensitive.
339
340 TODO(maruel): This is not strictly true. Implement if necessary.
341 """
342 if not isabs(path):
343 raise ValueError(
344 'Can\'t get native path case for a non-absolute path: %s' % path,
345 path)
346 # Give up on cygwin, as GetLongPathName() can't be called.
347 # Linux traces tends to not be normalized so use this occasion to normalize
348 # it. This function implementation already normalizes the path on the other
349 # OS so this needs to be done here to be coherent between OSes.
350 out = os.path.normpath(path)
351 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
352 return out + os.path.sep
353 return out
354
355
356if sys.platform != 'win32': # All non-Windows OSes.
357
358
359 def safe_join(*args):
360 """Joins path elements like os.path.join() but doesn't abort on absolute
361 path.
362
363 os.path.join('foo', '/bar') == '/bar'
364 but safe_join('foo', '/bar') == 'foo/bar'.
365 """
366 out = ''
367 for element in args:
368 if element.startswith(os.path.sep):
369 if out.endswith(os.path.sep):
370 out += element[1:]
371 else:
372 out += element
373 else:
374 if out.endswith(os.path.sep):
375 out += element
376 else:
377 out += os.path.sep + element
378 return out
379
380
381 def split_at_symlink(base_dir, relfile):
382 """Scans each component of relfile and cut the string at the symlink if
383 there is any.
384
385 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
386 not symlink was found.
387 """
388 if base_dir:
389 assert relfile
390 assert os.path.isabs(base_dir)
391 index = 0
392 else:
393 assert os.path.isabs(relfile)
394 index = 1
395
396 def at_root(rest):
397 if base_dir:
398 return safe_join(base_dir, rest)
399 return rest
400
401 while True:
402 try:
403 index = relfile.index(os.path.sep, index)
404 except ValueError:
405 index = len(relfile)
406 full = at_root(relfile[:index])
407 if os.path.islink(full):
408 # A symlink!
409 base = os.path.dirname(relfile[:index])
410 symlink = os.path.basename(relfile[:index])
411 rest = relfile[index:]
412 logging.debug(
413 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
414 (base_dir, relfile, base, symlink, rest))
415 return base, symlink, rest
416 if index == len(relfile):
417 break
418 index += 1
419 return relfile, None, None
420
421
422def fix_python_path(cmd):
423 """Returns the fixed command line to call the right python executable."""
424 out = cmd[:]
425 if out[0] == 'python':
426 out[0] = sys.executable
427 elif out[0].endswith('.py'):
428 out.insert(0, sys.executable)
429 return out
430
431
432def create_thunk():
433 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
434 os.write(
435 handle,
436 (
437 'import subprocess\n'
438 'import sys\n'
439 'sys.exit(subprocess.call(sys.argv[2:]))\n'
440 ))
441 os.close(handle)
442 return name
443
444
445def strace_process_quoted_arguments(text):
446 """Extracts quoted arguments on a string and return the arguments as a list.
447
448 Implemented as an automaton. Supports incomplete strings in the form
449 '"foo"...'.
450
451 Example:
452 With text = '"foo", "bar"', the function will return ['foo', 'bar']
453
454 TODO(maruel): Implement escaping.
455 """
456 # All the possible states of the DFA.
457 ( NEED_QUOTE, # Begining of a new arguments.
458 INSIDE_STRING, # Inside an argument.
459 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
460 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
461 # a serie of 3 dots or a comma.
462 NEED_SPACE, # Right after a comma
463 NEED_DOT_2, # Found a dot, need a second one.
464 NEED_DOT_3, # Found second dot, need a third one.
465 NEED_COMMA, # Found third dot, need a comma.
466 ) = range(8)
467
468 state = NEED_QUOTE
469 out = []
470 for index, char in enumerate(text):
471 if char == '"':
472 if state == NEED_QUOTE:
473 state = INSIDE_STRING
474 # A new argument was found.
475 out.append('')
476 elif state == INSIDE_STRING:
477 # The argument is now closed.
478 state = NEED_COMMA_OR_DOT
479 elif state == ESCAPED:
480 out[-1] += char
481 state = INSIDE_STRING
482 else:
483 raise ValueError(
484 'Can\'t process char at column %d for: %r' % (index, text),
485 index,
486 text)
487 elif char == ',':
488 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
489 state = NEED_SPACE
490 elif state == INSIDE_STRING:
491 out[-1] += char
492 elif state == ESCAPED:
493 out[-1] += char
494 state = INSIDE_STRING
495 else:
496 raise ValueError(
497 'Can\'t process char at column %d for: %r' % (index, text),
498 index,
499 text)
500 elif char == ' ':
501 if state == NEED_SPACE:
502 state = NEED_QUOTE
503 elif state == INSIDE_STRING:
504 out[-1] += char
505 elif state == ESCAPED:
506 out[-1] += char
507 state = INSIDE_STRING
508 else:
509 raise ValueError(
510 'Can\'t process char at column %d for: %r' % (index, text),
511 index,
512 text)
513 elif char == '.':
514 if state == NEED_COMMA_OR_DOT:
515 # The string is incomplete, this mean the strace -s flag should be
516 # increased.
517 state = NEED_DOT_2
518 elif state == NEED_DOT_2:
519 state = NEED_DOT_3
520 elif state == NEED_DOT_3:
521 state = NEED_COMMA
522 elif state == INSIDE_STRING:
523 out[-1] += char
524 elif state == ESCAPED:
525 out[-1] += char
526 state = INSIDE_STRING
527 else:
528 raise ValueError(
529 'Can\'t process char at column %d for: %r' % (index, text),
530 index,
531 text)
532 elif char == '\\':
533 if state == ESCAPED:
534 out[-1] += char
535 state = INSIDE_STRING
536 elif state == INSIDE_STRING:
537 state = ESCAPED
538 else:
539 raise ValueError(
540 'Can\'t process char at column %d for: %r' % (index, text),
541 index,
542 text)
543 else:
544 if state == INSIDE_STRING:
545 out[-1] += char
546 else:
547 raise ValueError(
548 'Can\'t process char at column %d for: %r' % (index, text),
549 index,
550 text)
551 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
552 raise ValueError(
553 'String is incorrectly terminated: %r' % text,
554 text)
555 return out
556
557
558def read_json(filepath):
559 with open(filepath, 'r') as f:
560 return json.load(f)
561
562
563def write_json(filepath_or_handle, data, dense):
564 """Writes data into filepath or file handle encoded as json.
565
566 If dense is True, the json is packed. Otherwise, it is human readable.
567 """
568 if hasattr(filepath_or_handle, 'write'):
569 if dense:
570 filepath_or_handle.write(
571 json.dumps(data, sort_keys=True, separators=(',',':')))
572 else:
573 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
574 else:
575 with open(filepath_or_handle, 'wb') as f:
576 if dense:
577 json.dump(data, f, sort_keys=True, separators=(',',':'))
578 else:
579 json.dump(data, f, sort_keys=True, indent=2)
580
581
582class Results(object):
583 """Results of a trace session."""
584
585 class _TouchedObject(object):
586 """Something, a file or a directory, that was accessed."""
587 def __init__(self, root, path, tainted, size, nb_files):
588 logging.debug(
589 '%s(%s, %s, %s, %s, %s)' %
590 (self.__class__.__name__, root, path, tainted, size, nb_files))
591 self.root = root
592 self.path = path
593 self.tainted = tainted
594 self.nb_files = nb_files
595 # Can be used as a cache or a default value, depending on context.
596 self._size = size
597 # These are cache only.
598 self._real_path = None
599
600 # Check internal consistency.
601 assert path, path
602 assert tainted or bool(root) != bool(isabs(path)), (root, path)
603 assert tainted or (
604 not os.path.exists(self.full_path) or
605 (self.full_path == get_native_path_case(self.full_path))), (
606 tainted, self.full_path, get_native_path_case(self.full_path))
607
608 @property
609 def existent(self):
610 return self.size != -1
611
612 @property
613 def full_path(self):
614 if self.root:
615 return os.path.join(self.root, self.path)
616 return self.path
617
618 @property
619 def real_path(self):
620 """Returns the path with symlinks resolved."""
621 if not self._real_path:
622 self._real_path = os.path.realpath(self.full_path)
623 return self._real_path
624
625 @property
626 def size(self):
627 """File's size. -1 is not existent."""
628 if self._size is None and not self.tainted:
629 try:
630 self._size = os.stat(self.full_path).st_size
631 except OSError:
632 self._size = -1
633 return self._size
634
635 def flatten(self):
636 """Returns a dict representing this object.
637
638 A 'size' of 0 means the file was only touched and not read.
639 """
640 return {
641 'path': self.path,
642 'size': self.size,
643 }
644
645 def replace_variables(self, variables):
646 """Replaces the root of this File with one of the variables if it matches.
647
648 If a variable replacement occurs, the cloned object becomes tainted.
649 """
650 for variable, root_path in variables.iteritems():
651 if self.path.startswith(root_path):
652 return self._clone(
653 self.root, variable + self.path[len(root_path):], True)
654 # No need to clone, returns ourself.
655 return self
656
657 def strip_root(self, root):
658 """Returns a clone of itself with 'root' stripped off."""
659 # Check internal consistency.
660 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
661 if not self.full_path.startswith(root):
662 # Now try to resolve the symlinks to see if it can be reached this way.
663 # Only try *after* trying without resolving symlink.
664 if not self.real_path.startswith(root):
665 return None
666 path = self.real_path
667 else:
668 path = self.full_path
669 return self._clone(root, path[len(root):], self.tainted)
670
671 def _clone(self, new_root, new_path, tainted):
672 raise NotImplementedError(self.__class__.__name__)
673
674 class File(_TouchedObject):
675 """A file that was accessed. May not be present anymore.
676
677 If tainted is true, it means it is not a real path anymore as a variable
678 replacement occured.
679
680 If touched_only is True, this means the file was probed for existence, and
681 it is existent, but was never _opened_. If touched_only is True, the file
682 must have existed.
683 """
684 def __init__(self, root, path, tainted, size):
685 super(Results.File, self).__init__(root, path, tainted, size, 1)
686
687 def _clone(self, new_root, new_path, tainted):
688 """Clones itself keeping meta-data."""
689 # Keep the self.size and self._real_path caches for performance reason. It
690 # is also important when the file becomes tainted (with a variable instead
691 # of the real path) since self.path is not an on-disk path anymore so
692 # out._size cannot be updated.
693 out = self.__class__(new_root, new_path, tainted, self.size)
694 out._real_path = self._real_path
695 return out
696
697 class Directory(_TouchedObject):
698 """A directory of files. Must exist."""
699 def __init__(self, root, path, tainted, size, nb_files):
700 """path='.' is a valid value and must be handled appropriately."""
701 assert not path.endswith(os.path.sep), path
702 super(Results.Directory, self).__init__(
703 root, path + os.path.sep, tainted, size, nb_files)
704 # For a Directory instance, self.size is not a cache, it's an actual value
705 # that is never modified and represents the total size of the files
706 # contained in this directory. It is possible that the directory is empty
707 # so that size == 0; this happens if there's only an invalid symlink in
708 # it.
709
710 def flatten(self):
711 out = super(Results.Directory, self).flatten()
712 out['nb_files'] = self.nb_files
713 return out
714
715 def _clone(self, new_root, new_path, tainted):
716 """Clones itself keeping meta-data."""
717 out = self.__class__(
718 new_root,
719 new_path.rstrip(os.path.sep),
720 tainted,
721 self.size,
722 self.nb_files)
723 out._real_path = self._real_path
724 return out
725
726 class Process(object):
727 """A process that was traced.
728
729 Contains references to the files accessed by this process and its children.
730 """
731 def __init__(self, pid, files, executable, command, initial_cwd, children):
732 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
733 self.pid = pid
734 self.files = sorted(files, key=lambda x: x.path)
735 self.children = children
736 self.executable = executable
737 self.command = command
738 self.initial_cwd = initial_cwd
739
740 # Check internal consistency.
741 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
742 f.path for f in self.files)
743 assert isinstance(self.children, list)
744 assert isinstance(self.files, list)
745
746 @property
747 def all(self):
748 for child in self.children:
749 for i in child.all:
750 yield i
751 yield self
752
753 def flatten(self):
754 return {
755 'children': [c.flatten() for c in self.children],
756 'command': self.command,
757 'executable': self.executable,
758 'files': [f.flatten() for f in self.files],
759 'initial_cwd': self.initial_cwd,
760 'pid': self.pid,
761 }
762
763 def strip_root(self, root):
764 assert isabs(root) and root.endswith(os.path.sep), root
765 # Loads the files after since they are constructed as objects.
766 out = self.__class__(
767 self.pid,
768 filter(None, (f.strip_root(root) for f in self.files)),
769 self.executable,
770 self.command,
771 self.initial_cwd,
772 [c.strip_root(root) for c in self.children])
773 logging.debug(
774 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
775 return out
776
777 def __init__(self, process):
778 self.process = process
779 # Cache.
780 self._files = None
781
782 def flatten(self):
783 return {
784 'root': self.process.flatten(),
785 }
786
787 @property
788 def files(self):
789 if self._files is None:
790 self._files = sorted(
791 sum((p.files for p in self.process.all), []),
792 key=lambda x: x.path)
793 return self._files
794
795 @property
796 def existent(self):
797 return [f for f in self.files if f.existent]
798
799 @property
800 def non_existent(self):
801 return [f for f in self.files if not f.existent]
802
803 def strip_root(self, root):
804 """Returns a clone with all the files outside the directory |root| removed
805 and converts all the path to be relative paths.
806 """
807 # Resolve any symlink
808 root = os.path.realpath(root)
809 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
810 logging.debug('strip_root(%s)' % root)
811 return Results(self.process.strip_root(root))
812
813
814class ApiBase(object):
815 """OS-agnostic API to trace a process and its children."""
816 class Context(object):
817 """Processes one log line at a time and keeps the list of traced processes.
818
819 The parsing is complicated by the fact that logs are traced out of order for
820 strace but in-order for dtrace and logman. In addition, on Windows it is
821 very frequent that processids are reused so a flat list cannot be used. But
822 at the same time, it is impossible to faithfully construct a graph when the
823 logs are processed out of order. So both a tree and a flat mapping are used,
824 the tree is the real process tree, while the flat mapping stores the last
825 valid process for the corresponding processid. For the strace case, the
826 tree's head is guessed at the last moment.
827 """
828 class Process(object):
829 """Keeps context for one traced child process.
830
831 Logs all the files this process touched. Ignores directories.
832 """
833 def __init__(self, blacklist, pid, initial_cwd):
834 # Check internal consistency.
835 assert isinstance(pid, int), repr(pid)
836 self.pid = pid
837 # children are Process instances.
838 self.children = []
839 self.initial_cwd = initial_cwd
840 self.cwd = None
841 self.files = set()
842 self.only_touched = set()
843 self.executable = None
844 self.command = None
845 self._blacklist = blacklist
846
847 def to_results_process(self):
848 """Resolves file case sensitivity and or late-bound strings."""
849 # When resolving files, it's normal to get dupe because a file could be
850 # opened multiple times with different case. Resolve the deduplication
851 # here.
852 def fix_path(x):
853 """Returns the native file path case.
854
855 Converts late-bound strings.
856 """
857 if not x:
858 # Do not convert None instance to 'None'.
859 return x
860 # TODO(maruel): Do not upconvert to unicode here, on linux we don't
861 # know the file path encoding so they must be treated as bytes.
862 x = unicode(x)
863 if os.path.isabs(x):
864 # If the path is not absolute, which tends to happen occasionally on
865 # Windows, it is not possible to get the native path case so ignore
866 # that trace. It mostly happens for 'executable' value.
867 x = get_native_path_case(x)
868 return x
869
870 def fix_and_blacklist_path(x):
871 x = fix_path(x)
872 if not x:
873 return
874 # The blacklist needs to be reapplied, since path casing could
875 # influence blacklisting.
876 if self._blacklist(x):
877 return
878 return x
879
880 # Filters out directories. Some may have passed through.
881 files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
882 only_touched = set(
883 f for f in map(fix_and_blacklist_path, self.only_touched) if f)
884 only_touched -= files
885
886 files = [
887 Results.File(None, f, False, None) for f in files
888 if not os.path.isdir(f)
889 ]
890 # Using 0 as size means the file's content is ignored since the file was
891 # never opened for I/O.
892 files.extend(
893 Results.File(None, f, False, 0) for f in only_touched
894 if not os.path.isdir(f)
895 )
896 return Results.Process(
897 self.pid,
898 files,
899 fix_path(self.executable),
900 self.command,
901 fix_path(self.initial_cwd),
902 [c.to_results_process() for c in self.children])
903
904 def add_file(self, filepath, touch_only):
905 """Adds a file if it passes the blacklist."""
906 if self._blacklist(unicode(filepath)):
907 return
908 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
909 # Note that filepath and not unicode(filepath) is added. It is because
910 # filepath could be something else than a string, like a RelativePath
911 # instance for dtrace logs.
912 if touch_only:
913 self.only_touched.add(filepath)
914 else:
915 self.files.add(filepath)
916
917 def __init__(self, blacklist):
918 self.blacklist = blacklist
919 # Initial process.
920 self.root_process = None
921 # dict to accelerate process lookup, to not have to lookup the whole graph
922 # each time.
923 self._process_lookup = {}
924
925 class Tracer(object):
926 """During it's lifetime, the tracing subsystem is enabled."""
927 def __init__(self, logname):
928 self._logname = logname
929 self._lock = threading.Lock()
930 self._traces = []
931 self._initialized = True
932
933 def trace(self, cmd, cwd, tracename, output):
934 """Runs the OS-specific trace program on an executable.
935
936 Arguments:
937 - cmd: The command (a list) to run.
938 - cwd: Current directory to start the child process in.
939 - tracename: Name of the trace in the logname file.
940 - output: If False, redirects output to PIPEs.
941
942 Returns a tuple (resultcode, output) and updates the internal trace
943 entries.
944 """
945 # The implementation adds an item to self._traces.
946 raise NotImplementedError(self.__class__.__name__)
947
948 def close(self, _timeout=None):
949 """Saves the meta-data in the logname file.
950
951 For kernel-based tracing, stops the tracing subsystem.
952
953 Must not be used manually when using 'with' construct.
954 """
955 with self._lock:
956 assert self._initialized
957 try:
958 data = {
959 'traces': self._traces,
960 }
961 write_json(self._logname, data, False)
962 finally:
963 self._initialized = False
964
965 def post_process_log(self):
966 """Post-processes the log so it becomes faster to load afterward.
967
968 Must not be used manually when using 'with' construct.
969 """
970 assert not self._initialized, 'Must stop tracing first.'
971
972 def __enter__(self):
973 """Enables 'with' statement."""
974 return self
975
976 def __exit__(self, exc_type, exc_value, traceback):
977 """Enables 'with' statement."""
978 self.close()
979 # If an exception was thrown, do not process logs.
980 if not exc_type:
981 self.post_process_log()
982
983 def get_tracer(self, logname):
984 """Returns an ApiBase.Tracer instance.
985
986 Initializes the tracing subsystem, which is a requirement for kernel-based
987 tracers. Only one tracer instance should be live at a time!
988
989 logname is the filepath to the json file that will contain the meta-data
990 about the logs.
991 """
992 return self.Tracer(logname)
993
994 @staticmethod
995 def clean_trace(logname):
996 """Deletes an old log."""
997 raise NotImplementedError()
998
999 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001000 def parse_log(cls, logname, blacklist, trace_name):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001001 """Processes trace logs and returns the files opened and the files that do
1002 not exist.
1003
1004 It does not track directories.
1005
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001006 Arguments:
1007 - logname: must be an absolute path.
1008 - blacklist: must be a lambda.
1009 - trace_name: optional trace to read, defaults to reading all traces.
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001010
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001011 Most of the time, files that do not exist are temporary test files that
1012 should be put in /tmp instead. See http://crbug.com/116251.
1013
1014 Returns a list of dict with keys:
1015 - results: A Results instance.
1016 - trace: The corresponding tracename parameter provided to
1017 get_tracer().trace().
1018 - output: Output gathered during execution, if get_tracer().trace(...,
1019 output=False) was used.
1020 """
1021 raise NotImplementedError(cls.__class__.__name__)
1022
1023
1024class Strace(ApiBase):
1025 """strace implies linux."""
1026 class Context(ApiBase.Context):
1027 """Processes a strace log line and keeps the list of existent and non
1028 existent files accessed.
1029
1030 Ignores directories.
1031
1032 Uses late-binding to processes the cwd of each process. The problem is that
1033 strace generates one log file per process it traced but doesn't give any
1034 information about which process was started when and by who. So we don't
1035 even know which process is the initial one. So process the logs out of
1036 order and use late binding with RelativePath to be able to deduce the
1037 initial directory of each process once all the logs are parsed.
1038 """
1039 class Process(ApiBase.Context.Process):
1040 """Represents the state of a process.
1041
1042 Contains all the information retrieved from the pid-specific log.
1043 """
1044 # Function names are using ([a-z_0-9]+)
1045 # This is the most common format. function(args) = result
1046 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$')
1047 # An interrupted function call, only grab the minimal header.
1048 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001049 # An interrupted function call, with the process exiting. It must be the
1050 # last line in the log.
1051 RE_UNFINISHED_EXIT = re.compile(
1052 r'^([^\(]+)(.*) \<unfinished \.\.\.\ exit status \d+>$')
maruel@chromium.org4e506502012-10-18 15:02:58 +00001053 # An interrupted function call the hard way. Usually observed with futex()
1054 # on ubuntu 12.04.
csharp@chromium.org4d753552012-10-25 19:31:55 +00001055 RE_INTERRUPTED_HARD = re.compile(r'^([^\(]+)\([A-Z0-9a-fx\,\_\|\{\} ]*$')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001056 # A resumed function call.
1057 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1058 # A process received a signal.
1059 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1060 # A process didn't handle a signal. Ignore any junk appearing before,
1061 # because the process was forcibly killed so it won't open any new file.
1062 RE_KILLED = re.compile(
1063 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1064 # The process has exited.
1065 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1066 # A call was canceled. Ignore any prefix.
1067 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1068 # Happens when strace fails to even get the function name.
1069 UNNAMED_FUNCTION = '????'
1070
1071 # Corner-case in python, a class member function decorator must not be
1072 # @staticmethod.
1073 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1074 """Automatically convert the str 'args' into a list of processed
1075 arguments.
1076
1077 Arguments:
1078 - regexp is used to parse args.
1079 - expect_zero: one of True, False or None.
1080 - True: will check for result.startswith('0') first and will ignore
1081 the trace line completely otherwise. This is important because for
1082 many functions, the regexp will not process if the call failed.
1083 - False: will check for not result.startswith(('?', '-1')) for the
1084 same reason than with True.
1085 - None: ignore result.
1086 """
1087 def meta_hook(function):
1088 assert function.__name__.startswith('handle_')
1089 def hook(self, args, result):
1090 if expect_zero is True and not result.startswith('0'):
1091 return
1092 if expect_zero is False and result.startswith(('?', '-1')):
1093 return
1094 match = re.match(regexp, args)
1095 if not match:
1096 raise TracingFailure(
1097 'Failed to parse %s(%s) = %s' %
1098 (function.__name__[len('handle_'):], args, result),
1099 None, None, None)
1100 return function(self, match.groups(), result)
1101 return hook
1102 return meta_hook
1103
1104 class RelativePath(object):
1105 """A late-bound relative path."""
1106 def __init__(self, parent, value):
1107 self.parent = parent
1108 self.value = value
1109
1110 def render(self):
1111 """Returns the current directory this instance is representing.
1112
1113 This function is used to return the late-bound value.
1114 """
1115 if self.value and self.value.startswith(u'/'):
1116 # An absolute path.
1117 return self.value
1118 parent = self.parent.render() if self.parent else u'<None>'
1119 if self.value:
1120 return os.path.normpath(os.path.join(parent, self.value))
1121 return parent
1122
1123 def __unicode__(self):
1124 """Acts as a string whenever needed."""
1125 return unicode(self.render())
1126
1127 def __str__(self):
1128 """Acts as a string whenever needed."""
1129 return str(self.render())
1130
1131 def __init__(self, root, pid):
1132 """Keeps enough information to be able to guess the original process
1133 root.
1134
1135 strace doesn't store which process was the initial process. So more
1136 information needs to be kept so the graph can be reconstructed from the
1137 flat map.
1138 """
1139 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1140 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1141 assert isinstance(root, ApiBase.Context)
1142 self._root = weakref.ref(root)
1143 # The dict key is the function name of the pending call, like 'open'
1144 # or 'execve'.
1145 self._pending_calls = {}
1146 self._line_number = 0
1147 # Current directory when the process started.
1148 self.initial_cwd = self.RelativePath(self._root(), None)
1149 self.parentid = None
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001150 self._done = False
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001151
1152 def get_cwd(self):
1153 """Returns the best known value of cwd."""
1154 return self.cwd or self.initial_cwd
1155
1156 def render(self):
1157 """Returns the string value of the RelativePath() object.
1158
1159 Used by RelativePath. Returns the initial directory and not the
1160 current one since the current directory 'cwd' validity is time-limited.
1161
1162 The validity is only guaranteed once all the logs are processed.
1163 """
1164 return self.initial_cwd.render()
1165
1166 def on_line(self, line):
1167 self._line_number += 1
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001168 if self._done:
1169 raise TracingFailure(
1170 'Found a trace for a terminated process',
1171 None, None, None)
1172
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001173 if self.RE_SIGNAL.match(line):
1174 # Ignore signals.
1175 return
1176
1177 try:
1178 match = self.RE_KILLED.match(line)
1179 if match:
1180 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1181 self.handle_exit_group(match.group(1), None)
1182 return
1183
1184 match = self.RE_PROCESS_EXITED.match(line)
1185 if match:
1186 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1187 self.handle_exit_group(match.group(1), None)
1188 return
1189
1190 match = self.RE_UNFINISHED.match(line)
1191 if match:
1192 if match.group(1) in self._pending_calls:
1193 raise TracingFailure(
1194 'Found two unfinished calls for the same function',
1195 None, None, None,
1196 self._pending_calls)
1197 self._pending_calls[match.group(1)] = (
1198 match.group(1) + match.group(2))
1199 return
1200
maruel@chromium.org4e506502012-10-18 15:02:58 +00001201 match = (
1202 self.RE_UNFINISHED_EXIT.match(line) or
1203 self.RE_INTERRUPTED_HARD.match(line))
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001204 if match:
1205 # The process died. No other line can be processed afterward.
1206 self._done = True
1207 return
1208
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001209 match = self.RE_UNAVAILABLE.match(line)
1210 if match:
1211 # This usually means a process was killed and a pending call was
1212 # canceled.
1213 # TODO(maruel): Look up the last exit_group() trace just above and
1214 # make sure any self._pending_calls[anything] is properly flushed.
1215 return
1216
1217 match = self.RE_RESUMED.match(line)
1218 if match:
1219 if match.group(1) not in self._pending_calls:
1220 raise TracingFailure(
1221 'Found a resumed call that was not logged as unfinished',
1222 None, None, None,
1223 self._pending_calls)
1224 pending = self._pending_calls.pop(match.group(1))
1225 # Reconstruct the line.
1226 line = pending + match.group(2)
1227
1228 match = self.RE_HEADER.match(line)
1229 if not match:
1230 raise TracingFailure(
1231 'Found an invalid line: %s' % line,
1232 None, None, None)
1233 if match.group(1) == self.UNNAMED_FUNCTION:
1234 return
1235
1236 # It's a valid line, handle it.
1237 handler = getattr(self, 'handle_%s' % match.group(1), None)
1238 if not handler:
1239 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1240 return handler(match.group(2), match.group(3))
1241 except TracingFailure, e:
1242 # Hack in the values since the handler could be a static function.
1243 e.pid = self.pid
1244 e.line = line
1245 e.line_number = self._line_number
1246 # Re-raise the modified exception.
1247 raise
1248 except (KeyError, NotImplementedError, ValueError), e:
1249 raise TracingFailure(
1250 'Trace generated a %s exception: %s' % (
1251 e.__class__.__name__, str(e)),
1252 self.pid,
1253 self._line_number,
1254 line,
1255 e)
1256
1257 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1258 def handle_access(self, args, _result):
1259 self._handle_file(args[0], True)
1260
1261 @parse_args(r'^\"(.+?)\"$', True)
1262 def handle_chdir(self, args, _result):
1263 """Updates cwd."""
1264 self.cwd = self.RelativePath(self, args[0])
1265 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1266
1267 def handle_clone(self, _args, result):
1268 """Transfers cwd."""
1269 if result.startswith(('?', '-1')):
1270 # The call failed.
1271 return
1272 # Update the other process right away.
1273 childpid = int(result)
1274 child = self._root().get_or_set_proc(childpid)
1275 if child.parentid is not None or childpid in self.children:
1276 raise TracingFailure(
1277 'Found internal inconsitency in process lifetime detection '
1278 'during a clone() call',
1279 None, None, None)
1280
1281 # Copy the cwd object.
1282 child.initial_cwd = self.get_cwd()
1283 child.parentid = self.pid
1284 # It is necessary because the logs are processed out of order.
1285 self.children.append(child)
1286
1287 def handle_close(self, _args, _result):
1288 pass
1289
1290 def handle_chmod(self, _args, _result):
1291 pass
1292
1293 def handle_creat(self, _args, _result):
1294 # Ignore files created, since they didn't need to exist.
1295 pass
1296
1297 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1298 def handle_execve(self, args, _result):
1299 # Even if in practice execve() doesn't returns when it succeeds, strace
1300 # still prints '0' as the result.
1301 filepath = args[0]
1302 self._handle_file(filepath, False)
1303 self.executable = self.RelativePath(self.get_cwd(), filepath)
1304 self.command = strace_process_quoted_arguments(args[1])
1305
1306 def handle_exit_group(self, _args, _result):
1307 """Removes cwd."""
1308 self.cwd = None
1309
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001310 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True)
1311 def handle_faccessat(self, args, _results):
1312 if args[0] == 'AT_FDCWD':
1313 self._handle_file(args[1], True)
1314 else:
1315 raise Exception('Relative faccess not implemented.')
1316
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001317 def handle_fork(self, args, result):
1318 self._handle_unknown('fork', args, result)
1319
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001320 def handle_futex(self, _args, _result):
1321 pass
1322
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001323 def handle_getcwd(self, _args, _result):
1324 pass
1325
1326 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1327 def handle_link(self, args, _result):
1328 self._handle_file(args[0], False)
1329 self._handle_file(args[1], False)
1330
1331 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1332 def handle_lstat(self, args, _result):
1333 self._handle_file(args[0], True)
1334
1335 def handle_mkdir(self, _args, _result):
1336 pass
1337
1338 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1339 def handle_open(self, args, _result):
1340 if 'O_DIRECTORY' in args[1]:
1341 return
1342 self._handle_file(args[0], False)
1343
1344 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1345 def handle_openat(self, args, _result):
1346 if 'O_DIRECTORY' in args[2]:
1347 return
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001348 if args[0] == 'AT_FDCWD':
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001349 self._handle_file(args[1], False)
1350 else:
1351 # TODO(maruel): Implement relative open if necessary instead of the
1352 # AT_FDCWD flag, let's hope not since this means tracking all active
1353 # directory handles.
1354 raise Exception('Relative open via openat not implemented.')
1355
1356 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1357 def handle_readlink(self, args, _result):
1358 self._handle_file(args[0], False)
1359
1360 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1361 def handle_rename(self, args, _result):
1362 self._handle_file(args[0], False)
1363 self._handle_file(args[1], False)
1364
1365 def handle_rmdir(self, _args, _result):
1366 pass
1367
1368 def handle_setxattr(self, _args, _result):
1369 pass
1370
1371 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1372 def handle_stat(self, args, _result):
1373 self._handle_file(args[0], True)
1374
1375 def handle_symlink(self, _args, _result):
1376 pass
1377
1378 def handle_unlink(self, _args, _result):
1379 # In theory, the file had to be created anyway.
1380 pass
1381
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001382 def handle_unlinkat(self, _args, _result):
1383 # In theory, the file had to be created anyway.
1384 pass
1385
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001386 def handle_statfs(self, _args, _result):
1387 pass
1388
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001389 def handle_utimensat(self, _args, _result):
1390 pass
1391
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001392 def handle_vfork(self, args, result):
1393 self._handle_unknown('vfork', args, result)
1394
1395 @staticmethod
1396 def _handle_unknown(function, args, result):
1397 raise TracingFailure(
1398 'Unexpected/unimplemented trace %s(%s)= %s' %
1399 (function, args, result),
1400 None, None, None)
1401
1402 def _handle_file(self, filepath, touch_only):
1403 filepath = self.RelativePath(self.get_cwd(), filepath)
1404 #assert not touch_only, unicode(filepath)
1405 self.add_file(filepath, touch_only)
1406
1407 def __init__(self, blacklist, initial_cwd):
1408 super(Strace.Context, self).__init__(blacklist)
1409 self.initial_cwd = initial_cwd
1410
1411 def render(self):
1412 """Returns the string value of the initial cwd of the root process.
1413
1414 Used by RelativePath.
1415 """
1416 return self.initial_cwd
1417
1418 def on_line(self, pid, line):
1419 """Transfers control into the Process.on_line() function."""
1420 self.get_or_set_proc(pid).on_line(line.strip())
1421
1422 def to_results(self):
1423 """Finds back the root process and verify consistency."""
1424 # TODO(maruel): Absolutely unecessary, fix me.
1425 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1426 if len(root) != 1:
1427 raise TracingFailure(
1428 'Found internal inconsitency in process lifetime detection '
1429 'while finding the root process',
1430 None,
1431 None,
1432 None,
1433 sorted(p.pid for p in root))
1434 self.root_process = root[0]
1435 process = self.root_process.to_results_process()
1436 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1437 raise TracingFailure(
1438 'Found internal inconsitency in process lifetime detection '
1439 'while looking for len(tree) == len(list)',
1440 None,
1441 None,
1442 None,
1443 sorted(self._process_lookup),
1444 sorted(p.pid for p in process.all))
1445 return Results(process)
1446
1447 def get_or_set_proc(self, pid):
1448 """Returns the Context.Process instance for this pid or creates a new one.
1449 """
1450 if not pid or not isinstance(pid, int):
1451 raise TracingFailure(
1452 'Unpexpected value for pid: %r' % pid,
1453 pid,
1454 None,
1455 None,
1456 pid)
1457 if pid not in self._process_lookup:
1458 self._process_lookup[pid] = self.Process(self, pid)
1459 return self._process_lookup[pid]
1460
1461 @classmethod
1462 def traces(cls):
1463 """Returns the list of all handled traces to pass this as an argument to
1464 strace.
1465 """
1466 prefix = 'handle_'
1467 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1468
1469 class Tracer(ApiBase.Tracer):
1470 MAX_LEN = 256
1471
1472 def trace(self, cmd, cwd, tracename, output):
1473 """Runs strace on an executable."""
1474 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1475 assert os.path.isabs(cmd[0]), cmd[0]
1476 assert os.path.isabs(cwd), cwd
1477 assert os.path.normpath(cwd) == cwd, cwd
1478 with self._lock:
1479 if not self._initialized:
1480 raise TracingFailure(
1481 'Called Tracer.trace() on an unitialized object',
1482 None, None, None, tracename)
1483 assert tracename not in (i['trace'] for i in self._traces)
1484 stdout = stderr = None
1485 if output:
1486 stdout = subprocess.PIPE
1487 stderr = subprocess.STDOUT
1488 # Ensure all file related APIs are hooked.
1489 traces = ','.join(Strace.Context.traces() + ['file'])
1490 trace_cmd = [
1491 'strace',
1492 '-ff',
1493 '-s', '%d' % self.MAX_LEN,
1494 '-e', 'trace=%s' % traces,
1495 '-o', self._logname + '.' + tracename,
1496 ]
1497 child = subprocess.Popen(
1498 trace_cmd + cmd,
1499 cwd=cwd,
1500 stdin=subprocess.PIPE,
1501 stdout=stdout,
1502 stderr=stderr)
1503 out = child.communicate()[0]
1504 # TODO(maruel): Walk the logs and figure out the root process would
1505 # simplify parsing the logs a *lot*.
1506 with self._lock:
1507 assert tracename not in (i['trace'] for i in self._traces)
1508 self._traces.append(
1509 {
1510 'cmd': cmd,
1511 'cwd': cwd,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00001512 'output': out,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001513 # The pid of strace process, not very useful.
1514 'pid': child.pid,
1515 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001516 })
1517 return child.returncode, out
1518
1519 @staticmethod
1520 def clean_trace(logname):
1521 if os.path.isfile(logname):
1522 os.remove(logname)
1523 # Also delete any pid specific file from previous traces.
1524 for i in glob.iglob(logname + '.*'):
1525 if i.rsplit('.', 1)[1].isdigit():
1526 os.remove(i)
1527
1528 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001529 def parse_log(cls, logname, blacklist, trace_name):
1530 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001531 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001532 data = read_json(logname)
1533 out = []
1534 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001535 if trace_name and item['trace'] != trace_name:
1536 continue
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001537 result = {
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001538 'output': item['output'],
maruel@chromium.org964ddde2012-10-29 18:13:46 +00001539 'trace': item['trace'],
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001540 }
1541 try:
1542 context = cls.Context(blacklist, item['cwd'])
1543 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1544 pid = pidfile.rsplit('.', 1)[1]
1545 if pid.isdigit():
1546 pid = int(pid)
1547 # TODO(maruel): Load as utf-8
1548 for line in open(pidfile, 'rb'):
1549 context.on_line(pid, line)
1550 result['results'] = context.to_results()
maruel@chromium.org424f5692012-10-15 15:42:38 +00001551 except TracingFailure:
1552 result['exception'] = sys.exc_info()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001553 out.append(result)
1554 return out
1555
1556
1557class Dtrace(ApiBase):
1558 """Uses DTrace framework through dtrace. Requires root access.
1559
1560 Implies Mac OSX.
1561
1562 dtruss can't be used because it has compatibility issues with python.
1563
1564 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1565 get the absolute path of the 'cwd' dtrace variable from the probe.
1566
1567 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1568 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1569 manually.
1570
1571 errno is not printed in the log since this implementation currently only cares
1572 about files that were successfully opened.
1573 """
1574 class Context(ApiBase.Context):
1575 # Format: index pid function(args)
1576 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1577
1578 # Arguments parsing.
1579 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1580 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1581 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1582 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1583 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1584 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1585
1586 O_DIRECTORY = 0x100000
1587
1588 class Process(ApiBase.Context.Process):
1589 def __init__(self, *args):
1590 super(Dtrace.Context.Process, self).__init__(*args)
1591 self.cwd = self.initial_cwd
1592
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001593 def __init__(self, blacklist, thunk_pid, initial_cwd):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001594 logging.info(
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001595 '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001596 super(Dtrace.Context, self).__init__(blacklist)
1597 # Process ID of the temporary script created by create_thunk().
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001598 self._thunk_pid = thunk_pid
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001599 self._initial_cwd = initial_cwd
1600 self._line_number = 0
1601
1602 def on_line(self, line):
1603 self._line_number += 1
1604 match = self.RE_HEADER.match(line)
1605 if not match:
1606 raise TracingFailure(
1607 'Found malformed line: %s' % line,
1608 None,
1609 self._line_number,
1610 line)
1611 fn = getattr(
1612 self,
1613 'handle_%s' % match.group(2).replace('-', '_'),
1614 self._handle_ignored)
1615 # It is guaranteed to succeed because of the regexp. Or at least I thought
1616 # it would.
1617 pid = int(match.group(1))
1618 try:
1619 return fn(pid, match.group(3))
1620 except TracingFailure, e:
1621 # Hack in the values since the handler could be a static function.
1622 e.pid = pid
1623 e.line = line
1624 e.line_number = self._line_number
1625 # Re-raise the modified exception.
1626 raise
1627 except (KeyError, NotImplementedError, ValueError), e:
1628 raise TracingFailure(
1629 'Trace generated a %s exception: %s' % (
1630 e.__class__.__name__, str(e)),
1631 pid,
1632 self._line_number,
1633 line,
1634 e)
1635
1636 def to_results(self):
1637 process = self.root_process.to_results_process()
1638 # Internal concistency check.
1639 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1640 raise TracingFailure(
1641 'Found internal inconsitency in process lifetime detection '
1642 'while looking for len(tree) == len(list)',
1643 None,
1644 None,
1645 None,
1646 sorted(self._process_lookup),
1647 sorted(p.pid for p in process.all))
1648 return Results(process)
1649
1650 def handle_dtrace_BEGIN(self, _pid, args):
1651 if not self.RE_DTRACE_BEGIN.match(args):
1652 raise TracingFailure(
1653 'Found internal inconsitency in dtrace_BEGIN log line',
1654 None, None, None)
1655
1656 def handle_proc_start(self, pid, args):
1657 """Transfers cwd.
1658
1659 The dtrace script already takes care of only tracing the processes that
1660 are child of the traced processes so there is no need to verify the
1661 process hierarchy.
1662 """
1663 if pid in self._process_lookup:
1664 raise TracingFailure(
1665 'Found internal inconsitency in proc_start: %d started two times' %
1666 pid,
1667 None, None, None)
1668 match = self.RE_PROC_START.match(args)
1669 if not match:
1670 raise TracingFailure(
1671 'Failed to parse arguments: %s' % args,
1672 None, None, None)
1673 ppid = int(match.group(1))
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001674 if ppid == self._thunk_pid and not self.root_process:
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001675 proc = self.root_process = self.Process(
1676 self.blacklist, pid, self._initial_cwd)
1677 elif ppid in self._process_lookup:
1678 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1679 self._process_lookup[ppid].children.append(proc)
1680 else:
1681 # Another process tree, ignore.
1682 return
1683 self._process_lookup[pid] = proc
1684 logging.debug(
1685 'New child: %s -> %d cwd:%s' %
1686 (ppid, pid, unicode(proc.initial_cwd)))
1687
1688 def handle_proc_exit(self, pid, _args):
1689 """Removes cwd."""
1690 if pid in self._process_lookup:
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001691 # self._thunk_pid is not traced itself and other traces run neither.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001692 self._process_lookup[pid].cwd = None
1693
1694 def handle_execve(self, pid, args):
1695 """Sets the process' executable.
1696
1697 TODO(maruel): Read command line arguments. See
1698 https://discussions.apple.com/thread/1980539 for an example.
1699 https://gist.github.com/1242279
1700
1701 Will have to put the answer at http://stackoverflow.com/questions/7556249.
1702 :)
1703 """
1704 if not pid in self._process_lookup:
1705 # Another process tree, ignore.
1706 return
1707 match = self.RE_EXECVE.match(args)
1708 if not match:
1709 raise TracingFailure(
1710 'Failed to parse arguments: %r' % args,
1711 None, None, None)
1712 proc = self._process_lookup[pid]
1713 proc.executable = match.group(1)
1714 proc.command = self.process_escaped_arguments(match.group(3))
1715 if int(match.group(2)) != len(proc.command):
1716 raise TracingFailure(
1717 'Failed to parse execve() arguments: %s' % args,
1718 None, None, None)
1719
1720 def handle_chdir(self, pid, args):
1721 """Updates cwd."""
1722 if pid not in self._process_lookup:
1723 # Another process tree, ignore.
1724 return
1725 cwd = self.RE_CHDIR.match(args).group(1)
1726 if not cwd.startswith('/'):
1727 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1728 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1729 else:
1730 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1731 cwd2 = cwd
1732 self._process_lookup[pid].cwd = cwd2
1733
1734 def handle_open_nocancel(self, pid, args):
1735 """Redirects to handle_open()."""
1736 return self.handle_open(pid, args)
1737
1738 def handle_open(self, pid, args):
1739 if pid not in self._process_lookup:
1740 # Another process tree, ignore.
1741 return
1742 match = self.RE_OPEN.match(args)
1743 if not match:
1744 raise TracingFailure(
1745 'Failed to parse arguments: %s' % args,
1746 None, None, None)
1747 flag = int(match.group(2), 16)
1748 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1749 # Ignore directories.
1750 return
1751 self._handle_file(pid, match.group(1))
1752
1753 def handle_rename(self, pid, args):
1754 if pid not in self._process_lookup:
1755 # Another process tree, ignore.
1756 return
1757 match = self.RE_RENAME.match(args)
1758 if not match:
1759 raise TracingFailure(
1760 'Failed to parse arguments: %s' % args,
1761 None, None, None)
1762 self._handle_file(pid, match.group(1))
1763 self._handle_file(pid, match.group(2))
1764
1765 def _handle_file(self, pid, filepath):
1766 if not filepath.startswith('/'):
1767 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1768 # We can get '..' in the path.
1769 filepath = os.path.normpath(filepath)
1770 # Sadly, still need to filter out directories here;
1771 # saw open_nocancel(".", 0, 0) = 0 lines.
1772 if os.path.isdir(filepath):
1773 return
1774 self._process_lookup[pid].add_file(filepath, False)
1775
1776 def handle_ftruncate(self, pid, args):
1777 """Just used as a signal to kill dtrace, ignoring."""
1778 pass
1779
1780 @staticmethod
1781 def _handle_ignored(pid, args):
1782 """Is called for all the event traces that are not handled."""
1783 raise NotImplementedError('Please implement me')
1784
1785 @staticmethod
1786 def process_escaped_arguments(text):
1787 """Extracts escaped arguments on a string and return the arguments as a
1788 list.
1789
1790 Implemented as an automaton.
1791
1792 Example:
1793 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
1794 function will return ['python2.7', '-c', 'print("hi")]
1795 """
1796 if not text.endswith('\\0'):
1797 raise ValueError('String is not null terminated: %r' % text, text)
1798 text = text[:-2]
1799
1800 def unescape(x):
1801 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
1802 out = []
1803 escaped = False
1804 for i in x:
1805 if i == '\\' and not escaped:
1806 escaped = True
1807 continue
1808 escaped = False
1809 out.append(i)
1810 return ''.join(out)
1811
1812 return [unescape(i) for i in text.split('\\001')]
1813
1814 class Tracer(ApiBase.Tracer):
1815 # pylint: disable=C0301
1816 #
1817 # To understand the following code, you'll want to take a look at:
1818 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1819 # https://wikis.oracle.com/display/DTrace/Variables
1820 # http://docs.oracle.com/cd/E19205-01/820-4221/
1821 #
1822 # 0. Dump all the valid probes into a text file. It is important, you
1823 # want to redirect into a file and you don't want to constantly 'sudo'.
1824 # $ sudo dtrace -l > probes.txt
1825 #
1826 # 1. Count the number of probes:
1827 # $ wc -l probes.txt
1828 # 81823 # On OSX 10.7, including 1 header line.
1829 #
1830 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1831 # likes and skipping the header with NR>1:
1832 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1833 # dtrace
1834 # fbt
1835 # io
1836 # ip
1837 # lockstat
1838 # mach_trap
1839 # proc
1840 # profile
1841 # sched
1842 # syscall
1843 # tcp
1844 # vminfo
1845 #
1846 # 3. List of valid probes:
1847 # $ grep syscall probes.txt | less
1848 # or use dtrace directly:
1849 # $ sudo dtrace -l -P syscall | less
1850 #
1851 # trackedpid is an associative array where its value can be 0, 1 or 2.
1852 # 0 is for untracked processes and is the default value for items not
1853 # in the associative array.
1854 # 1 is for tracked processes.
1855 # 2 is for the script created by create_thunk() only. It is not tracked
1856 # itself but all its decendants are.
1857 #
1858 # The script will kill itself only once waiting_to_die == 1 and
1859 # current_processes == 0, so that both getlogin() was called and that
1860 # all traced processes exited.
1861 #
1862 # TODO(maruel): Use cacheable predicates. See
1863 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1864 D_CODE = """
1865 dtrace:::BEGIN {
1866 waiting_to_die = 0;
1867 current_processes = 0;
1868 logindex = 0;
1869 printf("%d %d %s_%s(\\"%s\\")\\n",
1870 logindex, PID, probeprov, probename, SCRIPT);
1871 logindex++;
1872 }
1873
1874 proc:::start /trackedpid[ppid]/ {
1875 trackedpid[pid] = 1;
1876 current_processes += 1;
1877 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1878 logindex, pid, probeprov, probename,
1879 ppid,
1880 execname,
1881 current_processes);
1882 logindex++;
1883 }
1884 /* Should use SCRIPT but there is no access to this variable at that
1885 * point. */
1886 proc:::start /ppid == PID && execname == "Python"/ {
1887 trackedpid[pid] = 2;
1888 current_processes += 1;
1889 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1890 logindex, pid, probeprov, probename,
1891 ppid,
1892 execname,
1893 current_processes);
1894 logindex++;
1895 }
1896 proc:::exit /trackedpid[pid] &&
1897 current_processes == 1 &&
1898 waiting_to_die == 1/ {
1899 trackedpid[pid] = 0;
1900 current_processes -= 1;
1901 printf("%d %d %s_%s(%d)\\n",
1902 logindex, pid, probeprov, probename,
1903 current_processes);
1904 logindex++;
1905 exit(0);
1906 }
1907 proc:::exit /trackedpid[pid]/ {
1908 trackedpid[pid] = 0;
1909 current_processes -= 1;
1910 printf("%d %d %s_%s(%d)\\n",
1911 logindex, pid, probeprov, probename,
1912 current_processes);
1913 logindex++;
1914 }
1915
1916 /* Use an arcane function to detect when we need to die */
1917 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
1918 waiting_to_die = 1;
1919 printf("%d %d %s()\\n", logindex, pid, probefunc);
1920 logindex++;
1921 }
1922 syscall::ftruncate:entry /
1923 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
1924 exit(0);
1925 }
1926
1927 syscall::open*:entry /trackedpid[pid] == 1/ {
1928 self->open_arg0 = arg0;
1929 self->open_arg1 = arg1;
1930 self->open_arg2 = arg2;
1931 }
1932 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
1933 this->open_arg0 = copyinstr(self->open_arg0);
1934 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
1935 logindex, pid, probefunc,
1936 this->open_arg0,
1937 self->open_arg1,
1938 self->open_arg2);
1939 logindex++;
1940 this->open_arg0 = 0;
1941 }
1942 syscall::open*:return /trackedpid[pid] == 1/ {
1943 self->open_arg0 = 0;
1944 self->open_arg1 = 0;
1945 self->open_arg2 = 0;
1946 }
1947
1948 syscall::rename:entry /trackedpid[pid] == 1/ {
1949 self->rename_arg0 = arg0;
1950 self->rename_arg1 = arg1;
1951 }
1952 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
1953 this->rename_arg0 = copyinstr(self->rename_arg0);
1954 this->rename_arg1 = copyinstr(self->rename_arg1);
1955 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
1956 logindex, pid, probefunc,
1957 this->rename_arg0,
1958 this->rename_arg1);
1959 logindex++;
1960 this->rename_arg0 = 0;
1961 this->rename_arg1 = 0;
1962 }
1963 syscall::rename:return /trackedpid[pid] == 1/ {
1964 self->rename_arg0 = 0;
1965 self->rename_arg1 = 0;
1966 }
1967
1968 /* Track chdir, it's painful because it is only receiving relative path.
1969 */
1970 syscall::chdir:entry /trackedpid[pid] == 1/ {
1971 self->chdir_arg0 = arg0;
1972 }
1973 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
1974 this->chdir_arg0 = copyinstr(self->chdir_arg0);
1975 printf("%d %d %s(\\"%s\\")\\n",
1976 logindex, pid, probefunc,
1977 this->chdir_arg0);
1978 logindex++;
1979 this->chdir_arg0 = 0;
1980 }
1981 syscall::chdir:return /trackedpid[pid] == 1/ {
1982 self->chdir_arg0 = 0;
1983 }
1984 """
1985
1986 # execve-specific code, tends to throw a lot of exceptions.
1987 D_CODE_EXECVE = """
1988 /* Finally what we care about! */
1989 syscall::exec*:entry /trackedpid[pid]/ {
1990 self->exec_arg0 = copyinstr(arg0);
1991 /* Incrementally probe for a NULL in the argv parameter of execve() to
1992 * figure out argc. */
1993 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
1994 * found. */
1995 self->exec_argc = 0;
1996 /* Probe for argc==1 */
1997 this->exec_argv = (user_addr_t*)copyin(
1998 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1999 self->exec_argc = this->exec_argv[self->exec_argc] ?
2000 (self->exec_argc + 1) : self->exec_argc;
2001
2002 /* Probe for argc==2 */
2003 this->exec_argv = (user_addr_t*)copyin(
2004 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2005 self->exec_argc = this->exec_argv[self->exec_argc] ?
2006 (self->exec_argc + 1) : self->exec_argc;
2007
2008 /* Probe for argc==3 */
2009 this->exec_argv = (user_addr_t*)copyin(
2010 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2011 self->exec_argc = this->exec_argv[self->exec_argc] ?
2012 (self->exec_argc + 1) : self->exec_argc;
2013
2014 /* Probe for argc==4 */
2015 this->exec_argv = (user_addr_t*)copyin(
2016 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2017 self->exec_argc = this->exec_argv[self->exec_argc] ?
2018 (self->exec_argc + 1) : self->exec_argc;
2019
2020 /* Copy the inputs strings since there is no guarantee they'll be
2021 * present after the call completed. */
2022 self->exec_argv0 = (self->exec_argc > 0) ?
2023 copyinstr(this->exec_argv[0]) : "";
2024 self->exec_argv1 = (self->exec_argc > 1) ?
2025 copyinstr(this->exec_argv[1]) : "";
2026 self->exec_argv2 = (self->exec_argc > 2) ?
2027 copyinstr(this->exec_argv[2]) : "";
2028 self->exec_argv3 = (self->exec_argc > 3) ?
2029 copyinstr(this->exec_argv[3]) : "";
2030 this->exec_argv = 0;
2031 }
2032 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2033 /* We need to join strings here, as using multiple printf() would
2034 * cause tearing when multiple threads/processes are traced.
2035 * Since it is impossible to escape a string and join it to another one,
2036 * like sprintf("%s%S", previous, more), use hackery.
2037 * Each of the elements are split with a \\1. \\0 cannot be used because
2038 * it is simply ignored. This will conflict with any program putting a
2039 * \\1 in their execve() string but this should be "rare enough" */
2040 this->args = "";
2041 /* Process exec_argv[0] */
2042 this->args = strjoin(
2043 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2044
2045 /* Process exec_argv[1] */
2046 this->args = strjoin(
2047 this->args, (self->exec_argc > 1) ? "\\1" : "");
2048 this->args = strjoin(
2049 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2050
2051 /* Process exec_argv[2] */
2052 this->args = strjoin(
2053 this->args, (self->exec_argc > 2) ? "\\1" : "");
2054 this->args = strjoin(
2055 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2056
2057 /* Process exec_argv[3] */
2058 this->args = strjoin(
2059 this->args, (self->exec_argc > 3) ? "\\1" : "");
2060 this->args = strjoin(
2061 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2062
2063 /* Prints self->exec_argc to permits verifying the internal
2064 * consistency since this code is quite fishy. */
2065 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2066 logindex, pid, probefunc,
2067 self->exec_arg0,
2068 self->exec_argc,
2069 this->args);
2070 logindex++;
2071 this->args = 0;
2072 }
2073 syscall::exec*:return /trackedpid[pid]/ {
2074 self->exec_arg0 = 0;
2075 self->exec_argc = 0;
2076 self->exec_argv0 = 0;
2077 self->exec_argv1 = 0;
2078 self->exec_argv2 = 0;
2079 self->exec_argv3 = 0;
2080 }
2081 """
2082
2083 # Code currently not used.
2084 D_EXTRANEOUS = """
2085 /* This is a good learning experience, since it traces a lot of things
2086 * related to the process and child processes.
2087 * Warning: it generates a gigantic log. For example, tracing
2088 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2089 * several minutes to execute.
2090 */
2091 /*
2092 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2093 printf("%d %d %s_%s() = %d\\n",
2094 logindex, pid, probeprov, probefunc, errno);
2095 logindex++;
2096 }
2097 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2098 printf("%d %d %s_%s() = %d\\n",
2099 logindex, pid, probeprov, probefunc, errno);
2100 logindex++;
2101 }
2102 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2103 printf("%d %d %s_%s() = %d\\n",
2104 logindex, pid, probeprov, probefunc, errno);
2105 logindex++;
2106 }
2107 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2108 printf("%d %d %s_%s() = %d\\n",
2109 logindex, pid, probeprov, probefunc, errno);
2110 logindex++;
2111 }
2112 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2113 printf("%d %d %s_%s() = %d\\n",
2114 logindex, pid, probeprov, probefunc, errno);
2115 logindex++;
2116 }
2117 */
2118 /* TODO(maruel): *stat* functions and friends
2119 syscall::access:return,
2120 syscall::chdir:return,
2121 syscall::chflags:return,
2122 syscall::chown:return,
2123 syscall::chroot:return,
2124 syscall::getattrlist:return,
2125 syscall::getxattr:return,
2126 syscall::lchown:return,
2127 syscall::lstat64:return,
2128 syscall::lstat:return,
2129 syscall::mkdir:return,
2130 syscall::pathconf:return,
2131 syscall::readlink:return,
2132 syscall::removexattr:return,
2133 syscall::setxattr:return,
2134 syscall::stat64:return,
2135 syscall::stat:return,
2136 syscall::truncate:return,
2137 syscall::unlink:return,
2138 syscall::utimes:return,
2139 */
2140 """
2141
2142 def __init__(self, logname):
2143 """Starts the log collection with dtrace.
2144
2145 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2146 this needs to wait for dtrace to be "warmed up".
2147 """
2148 super(Dtrace.Tracer, self).__init__(logname)
2149 self._script = create_thunk()
2150 # This unique dummy temp file is used to signal the dtrace script that it
2151 # should stop as soon as all the child processes are done. A bit hackish
2152 # but works fine enough.
2153 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2154 prefix='trace_signal_file')
2155
2156 # Note: do not use the -p flag. It's useless if the initial process quits
2157 # too fast, resulting in missing traces from the grand-children. The D
2158 # code manages the dtrace lifetime itself.
2159 trace_cmd = [
2160 'sudo',
2161 'dtrace',
2162 # Use a larger buffer if getting 'out of scratch space' errors.
2163 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2164 '-b', '10m',
2165 '-x', 'dynvarsize=10m',
2166 #'-x', 'dtrace_global_maxsize=1m',
2167 '-x', 'evaltime=exec',
2168 '-o', '/dev/stderr',
2169 '-q',
2170 '-n', self._get_dtrace_code(),
2171 ]
2172 with open(self._logname + '.log', 'wb') as logfile:
2173 self._dtrace = subprocess.Popen(
2174 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2175 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2176
2177 # Reads until one line is printed, which signifies dtrace is up and ready.
2178 with open(self._logname + '.log', 'rb') as logfile:
2179 while 'dtrace_BEGIN' not in logfile.readline():
2180 if self._dtrace.poll() is not None:
2181 # Do a busy wait. :/
2182 break
2183 logging.debug('dtrace started')
2184
2185 def _get_dtrace_code(self):
2186 """Setups the D code to implement child process tracking.
2187
2188 Injects the cookie in the script so it knows when to stop.
2189
2190 The script will detect any instance of the script created with
2191 create_thunk() and will start tracing it.
2192 """
2193 return (
2194 'inline int PID = %d;\n'
2195 'inline string SCRIPT = "%s";\n'
2196 'inline int FILE_ID = %d;\n'
2197 '\n'
2198 '%s') % (
2199 os.getpid(),
2200 self._script,
2201 self._dummy_file_id,
2202 self.D_CODE) + self.D_CODE_EXECVE
2203
2204 def trace(self, cmd, cwd, tracename, output):
2205 """Runs dtrace on an executable.
2206
2207 This dtruss is broken when it starts the process itself or when tracing
2208 child processes, this code starts a wrapper process
2209 generated with create_thunk() which starts the executable to trace.
2210 """
2211 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2212 assert os.path.isabs(cmd[0]), cmd[0]
2213 assert os.path.isabs(cwd), cwd
2214 assert os.path.normpath(cwd) == cwd, cwd
2215 with self._lock:
2216 if not self._initialized:
2217 raise TracingFailure(
2218 'Called Tracer.trace() on an unitialized object',
2219 None, None, None, tracename)
2220 assert tracename not in (i['trace'] for i in self._traces)
2221
2222 # Starts the script wrapper to start the child process. This signals the
2223 # dtrace script that this process is to be traced.
2224 stdout = stderr = None
2225 if output:
2226 stdout = subprocess.PIPE
2227 stderr = subprocess.STDOUT
2228 child_cmd = [
2229 sys.executable,
2230 self._script,
2231 tracename,
2232 ]
2233 # Call a dummy function so that dtrace knows I'm about to launch a process
2234 # that needs to be traced.
2235 # Yummy.
2236 child = subprocess.Popen(
2237 child_cmd + fix_python_path(cmd),
2238 stdin=subprocess.PIPE,
2239 stdout=stdout,
2240 stderr=stderr,
2241 cwd=cwd)
2242 logging.debug('Started child pid: %d' % child.pid)
2243
2244 out = child.communicate()[0]
2245 # This doesn't mean tracing is done, one of the grand-child process may
2246 # still be alive. It will be tracked with the dtrace script.
2247
2248 with self._lock:
2249 assert tracename not in (i['trace'] for i in self._traces)
2250 self._traces.append(
2251 {
2252 'cmd': cmd,
2253 'cwd': cwd,
2254 # The pid of strace process, not very useful.
2255 'pid': child.pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002256 'output': out,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002257 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002258 })
2259 return child.returncode, out
2260
2261 def close(self, timeout=None):
2262 """Terminates dtrace."""
2263 logging.debug('close(%s)' % timeout)
2264 try:
2265 try:
2266 super(Dtrace.Tracer, self).close(timeout)
2267 # Signal dtrace that it should stop now.
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00002268 # ftruncate doesn't exist on Windows.
2269 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002270 if timeout:
2271 start = time.time()
2272 # Use polling. :/
2273 while (self._dtrace.poll() is None and
2274 (time.time() - start) < timeout):
2275 time.sleep(0.1)
2276 self._dtrace.kill()
2277 self._dtrace.wait()
2278 finally:
2279 # Make sure to kill it in any case.
2280 if self._dtrace.poll() is None:
2281 try:
2282 self._dtrace.kill()
2283 self._dtrace.wait()
2284 except OSError:
2285 pass
2286
2287 if self._dtrace.returncode != 0:
2288 # Warn about any dtrace failure but basically ignore it.
2289 print 'dtrace failure: %s' % self._dtrace.returncode
2290 finally:
2291 os.close(self._dummy_file_id)
2292 os.remove(self._dummy_file_name)
2293 os.remove(self._script)
2294
2295 def post_process_log(self):
2296 """Sorts the log back in order when each call occured.
2297
2298 dtrace doesn't save the buffer in strict order since it keeps one buffer
2299 per CPU.
2300 """
2301 super(Dtrace.Tracer, self).post_process_log()
2302 logname = self._logname + '.log'
2303 with open(logname, 'rb') as logfile:
2304 lines = [l for l in logfile if l.strip()]
2305 errors = [l for l in lines if l.startswith('dtrace:')]
2306 if errors:
2307 raise TracingFailure(
2308 'Found errors in the trace: %s' % '\n'.join(errors),
2309 None, None, None, logname)
2310 try:
2311 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2312 except ValueError:
2313 raise TracingFailure(
2314 'Found errors in the trace: %s' % '\n'.join(
2315 l for l in lines if l.split(' ', 1)[0].isdigit()),
2316 None, None, None, logname)
2317 with open(logname, 'wb') as logfile:
2318 logfile.write(''.join(lines))
2319
2320 @staticmethod
2321 def clean_trace(logname):
2322 for ext in ('', '.log'):
2323 if os.path.isfile(logname + ext):
2324 os.remove(logname + ext)
2325
2326 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002327 def parse_log(cls, logname, blacklist, trace_name):
2328 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00002329 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002330
2331 def blacklist_more(filepath):
2332 # All the HFS metadata is in the form /.vol/...
2333 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2334
2335 data = read_json(logname)
2336 out = []
2337 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002338 if trace_name and item['trace'] != trace_name:
2339 continue
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002340 result = {
2341 'output': item['output'],
2342 'trace': item['trace'],
2343 }
2344 try:
2345 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
2346 for line in open(logname + '.log', 'rb'):
2347 context.on_line(line)
2348 result['results'] = context.to_results()
2349 except TracingFailure:
2350 result['exception'] = sys.exc_info()
2351 out.append(result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002352 return out
2353
2354
2355class LogmanTrace(ApiBase):
2356 """Uses the native Windows ETW based tracing functionality to trace a child
2357 process.
2358
2359 Caveat: this implementations doesn't track cwd or initial_cwd. It is because
2360 the Windows Kernel doesn't have a concept of 'current working directory' at
2361 all. A Win32 process has a map of current directories, one per drive letter
2362 and it is managed by the user mode kernel32.dll. In kernel, a file is always
2363 opened relative to another file_object or as an absolute path. All the current
2364 working directory logic is done in user mode.
2365 """
2366 class Context(ApiBase.Context):
2367 """Processes a ETW log line and keeps the list of existent and non
2368 existent files accessed.
2369
2370 Ignores directories.
2371 """
2372 # These indexes are for the stripped version in json.
2373 EVENT_NAME = 0
2374 TYPE = 1
2375 PID = 2
2376 TID = 3
2377 PROCESSOR_ID = 4
2378 TIMESTAMP = 5
2379 USER_DATA = 6
2380
2381 class Process(ApiBase.Context.Process):
2382 def __init__(self, *args):
2383 super(LogmanTrace.Context.Process, self).__init__(*args)
2384 # Handle file objects that succeeded.
2385 self.file_objects = {}
2386
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002387 def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd):
2388 logging.info(
2389 '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name,
2390 thunk_cmd)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002391 super(LogmanTrace.Context, self).__init__(blacklist)
2392 self._drive_map = DosDriveMap()
2393 # Threads mapping to the corresponding process id.
2394 self._threads_active = {}
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002395 # Process ID of the tracer, e.g. the temporary script created by
2396 # create_thunk(). This is tricky because the process id may have been
2397 # reused.
2398 self._thunk_pid = thunk_pid
2399 self._thunk_cmd = thunk_cmd
2400 self._trace_name = trace_name
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002401 self._line_number = 0
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002402 self._thunk_process = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002403
2404 def on_line(self, line):
2405 """Processes a json Event line."""
2406 self._line_number += 1
2407 try:
2408 # By Opcode
2409 handler = getattr(
2410 self,
2411 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2412 None)
2413 if not handler:
2414 raise TracingFailure(
2415 'Unexpected event %s_%s' % (
2416 line[self.EVENT_NAME], line[self.TYPE]),
2417 None, None, None)
2418 handler(line)
2419 except TracingFailure, e:
2420 # Hack in the values since the handler could be a static function.
2421 e.pid = line[self.PID]
2422 e.line = line
2423 e.line_number = self._line_number
2424 # Re-raise the modified exception.
2425 raise
2426 except (KeyError, NotImplementedError, ValueError), e:
2427 raise TracingFailure(
2428 'Trace generated a %s exception: %s' % (
2429 e.__class__.__name__, str(e)),
2430 line[self.PID],
2431 self._line_number,
2432 line,
2433 e)
2434
2435 def to_results(self):
2436 if not self.root_process:
2437 raise TracingFailure(
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002438 'Failed to detect the initial process %d' % self._thunk_pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002439 None, None, None)
2440 process = self.root_process.to_results_process()
2441 return Results(process)
2442
2443 def _thread_to_process(self, tid):
2444 """Finds the process from the thread id."""
2445 tid = int(tid, 16)
2446 pid = self._threads_active.get(tid)
2447 if not pid or not self._process_lookup.get(pid):
2448 return
2449 return self._process_lookup[pid]
2450
2451 @classmethod
2452 def handle_EventTrace_Header(cls, line):
2453 """Verifies no event was dropped, e.g. no buffer overrun occured."""
2454 BUFFER_SIZE = cls.USER_DATA
2455 #VERSION = cls.USER_DATA + 1
2456 #PROVIDER_VERSION = cls.USER_DATA + 2
2457 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2458 #END_TIME = cls.USER_DATA + 4
2459 #TIMER_RESOLUTION = cls.USER_DATA + 5
2460 #MAX_FILE_SIZE = cls.USER_DATA + 6
2461 #LOG_FILE_MODE = cls.USER_DATA + 7
2462 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2463 #START_BUFFERS = cls.USER_DATA + 9
2464 #POINTER_SIZE = cls.USER_DATA + 10
2465 EVENTS_LOST = cls.USER_DATA + 11
2466 #CPU_SPEED = cls.USER_DATA + 12
2467 #LOGGER_NAME = cls.USER_DATA + 13
2468 #LOG_FILE_NAME = cls.USER_DATA + 14
2469 #BOOT_TIME = cls.USER_DATA + 15
2470 #PERF_FREQ = cls.USER_DATA + 16
2471 #START_TIME = cls.USER_DATA + 17
2472 #RESERVED_FLAGS = cls.USER_DATA + 18
2473 #BUFFERS_LOST = cls.USER_DATA + 19
2474 #SESSION_NAME_STRING = cls.USER_DATA + 20
2475 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2476 if line[EVENTS_LOST] != '0':
2477 raise TracingFailure(
2478 ( '%s events were lost during trace, please increase the buffer '
2479 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2480 None, None, None)
2481
2482 def handle_FileIo_Cleanup(self, line):
2483 """General wisdom: if a file is closed, it's because it was opened.
2484
2485 Note that FileIo_Close is not used since if a file was opened properly but
2486 not closed before the process exits, only Cleanup will be logged.
2487 """
2488 #IRP = self.USER_DATA
2489 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2490 FILE_OBJECT = self.USER_DATA + 2
2491 #FILE_KEY = self.USER_DATA + 3
2492 proc = self._thread_to_process(line[TTID])
2493 if not proc:
2494 # Not a process we care about.
2495 return
2496 file_object = line[FILE_OBJECT]
2497 if file_object in proc.file_objects:
2498 proc.add_file(proc.file_objects.pop(file_object), False)
2499
2500 def handle_FileIo_Create(self, line):
2501 """Handles a file open.
2502
2503 All FileIo events are described at
2504 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2505 for some value of 'description'.
2506
2507 " (..) process and thread id values of the IO events (..) are not valid "
2508 http://msdn.microsoft.com/magazine/ee358703.aspx
2509
2510 The FileIo.Create event doesn't return if the CreateFile() call
2511 succeeded, so keep track of the file_object and check that it is
2512 eventually closed with FileIo_Cleanup.
2513 """
2514 #IRP = self.USER_DATA
2515 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2516 FILE_OBJECT = self.USER_DATA + 2
2517 #CREATE_OPTIONS = self.USER_DATA + 3
2518 #FILE_ATTRIBUTES = self.USER_DATA + 4
2519 #self.USER_DATA + SHARE_ACCESS = 5
2520 OPEN_PATH = self.USER_DATA + 6
2521
2522 proc = self._thread_to_process(line[TTID])
2523 if not proc:
2524 # Not a process we care about.
2525 return
2526
2527 match = re.match(r'^\"(.+)\"$', line[OPEN_PATH])
2528 raw_path = match.group(1)
2529 # Ignore directories and bare drive right away.
2530 if raw_path.endswith(os.path.sep):
2531 return
2532 filepath = self._drive_map.to_win32(raw_path)
2533 # Ignore bare drive right away. Some may still fall through with format
2534 # like '\\?\X:'
2535 if len(filepath) == 2:
2536 return
2537 file_object = line[FILE_OBJECT]
2538 if os.path.isdir(filepath):
2539 # There is no O_DIRECTORY equivalent on Windows. The closed is
2540 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2541 # simply discard directories are they are found.
2542 return
2543 # Override any stale file object
2544 proc.file_objects[file_object] = filepath
2545
2546 def handle_FileIo_Rename(self, line):
2547 # TODO(maruel): Handle?
2548 pass
2549
2550 def handle_Process_End(self, line):
2551 pid = line[self.PID]
2552 if self._process_lookup.get(pid):
2553 logging.info('Terminated: %d' % pid)
2554 self._process_lookup[pid] = None
2555 else:
2556 logging.debug('Terminated: %d' % pid)
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002557 if self._thunk_process and self._thunk_process.pid == pid:
2558 self._thunk_process = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002559
2560 def handle_Process_Start(self, line):
2561 """Handles a new child process started by PID."""
2562 #UNIQUE_PROCESS_KEY = self.USER_DATA
2563 PROCESS_ID = self.USER_DATA + 1
2564 #PARENT_PID = self.USER_DATA + 2
2565 #SESSION_ID = self.USER_DATA + 3
2566 #EXIT_STATUS = self.USER_DATA + 4
2567 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2568 #USER_SID = self.USER_DATA + 6
2569 IMAGE_FILE_NAME = self.USER_DATA + 7
2570 COMMAND_LINE = self.USER_DATA + 8
2571
2572 ppid = line[self.PID]
2573 pid = int(line[PROCESS_ID], 16)
2574 logging.debug(
2575 'New process %d->%d (%s) %s' %
2576 (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE]))
2577
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002578 command_line = None
2579 def get_command_line():
2580 # TODO(maruel): Process escapes.
2581 if (not line[COMMAND_LINE].startswith('"') or
2582 not line[COMMAND_LINE].endswith('"')):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002583 raise TracingFailure(
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002584 'Command line is not properly quoted: %s' % line[COMMAND_LINE],
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002585 None, None, None)
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002586 return CommandLineToArgvW(line[COMMAND_LINE][1:-1])
2587
2588 if pid == self._thunk_pid:
2589 # Need to ignore processes we don't know about because the log is
2590 # system-wide. self._thunk_pid shall start only one process.
2591 # This is tricky though because Windows *loves* to reuse process id and
2592 # it happens often that the process ID of the thunk script created by
2593 # create_thunk() is reused. So just detecting the pid here is not
2594 # sufficient, we must confirm the command line.
2595 command_line = get_command_line()
2596 if command_line[:len(self._thunk_cmd)] != self._thunk_cmd:
2597 logging.info(
2598 'Ignoring duplicate pid %d for %s: %s while searching for %s',
2599 pid, self._trace_name, command_line, self._thunk_cmd)
2600 return
2601
2602 # TODO(maruel): The check is quite weak. Add the thunk path.
2603 if self._thunk_process:
2604 raise TracingFailure(
2605 ( 'Parent process is _thunk_pid(%d) but thunk_process(%d) is '
2606 'already set') % (self._thunk_pid, self._thunk_process.pid),
2607 None, None, None)
2608 proc = self.Process(self.blacklist, pid, None)
2609 self._thunk_process = proc
2610 return
2611 elif ppid == self._thunk_pid and self._thunk_process:
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002612 proc = self.Process(self.blacklist, pid, None)
2613 self.root_process = proc
2614 ppid = None
2615 elif self._process_lookup.get(ppid):
2616 proc = self.Process(self.blacklist, pid, None)
2617 self._process_lookup[ppid].children.append(proc)
2618 else:
2619 # Ignore
2620 return
2621 self._process_lookup[pid] = proc
2622
2623 if (not line[IMAGE_FILE_NAME].startswith('"') or
2624 not line[IMAGE_FILE_NAME].endswith('"')):
2625 raise TracingFailure(
2626 'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME],
2627 None, None, None)
2628
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002629 proc.command = command_line or get_command_line()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002630 proc.executable = line[IMAGE_FILE_NAME][1:-1]
2631 # proc.command[0] may be the absolute path of 'executable' but it may be
2632 # anything else too. If it happens that command[0] ends with executable,
2633 # use it, otherwise defaults to the base name.
2634 cmd0 = proc.command[0].lower()
2635 if not cmd0.endswith('.exe'):
2636 # TODO(maruel): That's not strictly true either.
2637 cmd0 += '.exe'
2638 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2639 # Fix the path.
2640 cmd0 = cmd0.replace('/', os.path.sep)
2641 cmd0 = os.path.normpath(cmd0)
2642 proc.executable = get_native_path_case(cmd0)
2643 logging.info(
2644 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
2645
2646 def handle_Thread_End(self, line):
2647 """Has the same parameters as Thread_Start."""
2648 tid = int(line[self.TID], 16)
2649 self._threads_active.pop(tid, None)
2650
2651 def handle_Thread_Start(self, line):
2652 """Handles a new thread created.
2653
2654 Do not use self.PID here since a process' initial thread is created by
2655 the parent process.
2656 """
2657 PROCESS_ID = self.USER_DATA
2658 TTHREAD_ID = self.USER_DATA + 1
2659 #STACK_BASE = self.USER_DATA + 2
2660 #STACK_LIMIT = self.USER_DATA + 3
2661 #USER_STACK_BASE = self.USER_DATA + 4
2662 #USER_STACK_LIMIT = self.USER_DATA + 5
2663 #AFFINITY = self.USER_DATA + 6
2664 #WIN32_START_ADDR = self.USER_DATA + 7
2665 #TEB_BASE = self.USER_DATA + 8
2666 #SUB_PROCESS_TAG = self.USER_DATA + 9
2667 #BASE_PRIORITY = self.USER_DATA + 10
2668 #PAGE_PRIORITY = self.USER_DATA + 11
2669 #IO_PRIORITY = self.USER_DATA + 12
2670 #THREAD_FLAGS = self.USER_DATA + 13
2671 # Do not use self.PID here since a process' initial thread is created by
2672 # the parent process.
2673 pid = int(line[PROCESS_ID], 16)
2674 tid = int(line[TTHREAD_ID], 16)
2675 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2676 self._threads_active[tid] = pid
2677
2678 @classmethod
2679 def supported_events(cls):
2680 """Returns all the procesed events."""
2681 out = []
2682 for member in dir(cls):
2683 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2684 if match:
2685 out.append(match.groups())
2686 return out
2687
2688 class Tracer(ApiBase.Tracer):
2689 # The basic headers.
2690 EXPECTED_HEADER = [
2691 u'Event Name',
2692 u'Type',
2693 u'Event ID',
2694 u'Version',
2695 u'Channel',
2696 u'Level', # 5
2697 u'Opcode',
2698 u'Task',
2699 u'Keyword',
2700 u'PID',
2701 u'TID', # 10
2702 u'Processor Number',
2703 u'Instance ID',
2704 u'Parent Instance ID',
2705 u'Activity ID',
2706 u'Related Activity ID', # 15
2707 u'Clock-Time',
2708 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2709 u'User(ms)', # pretty much useless.
2710 u'User Data', # Extra arguments that are event-specific.
2711 ]
2712 # Only the useful headers common to all entries are listed there. Any column
2713 # at 19 or higher is dependent on the specific event.
2714 EVENT_NAME = 0
2715 TYPE = 1
2716 PID = 9
2717 TID = 10
2718 PROCESSOR_ID = 11
2719 TIMESTAMP = 16
2720 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2721 USER_DATA = 19
2722
2723 def __init__(self, logname):
2724 """Starts the log collection.
2725
2726 Requires administrative access. logman.exe is synchronous so no need for a
2727 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
2728 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
2729
2730 One can get the list of potentially interesting providers with:
2731 "logman query providers | findstr /i file"
2732 """
2733 super(LogmanTrace.Tracer, self).__init__(logname)
2734 self._script = create_thunk()
2735 cmd_start = [
2736 'logman.exe',
2737 'start',
2738 'NT Kernel Logger',
2739 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2740 # splitio,fileiocompletion,syscall,file,cswitch,img
2741 '(process,fileio,thread)',
2742 '-o', self._logname + '.etl',
2743 '-ets', # Send directly to kernel
2744 # Values extracted out of thin air.
2745 # Event Trace Session buffer size in kb.
2746 '-bs', '10240',
2747 # Number of Event Trace Session buffers.
2748 '-nb', '16', '256',
2749 ]
2750 logging.debug('Running: %s' % cmd_start)
2751 try:
2752 subprocess.check_call(
2753 cmd_start,
2754 stdin=subprocess.PIPE,
2755 stdout=subprocess.PIPE,
2756 stderr=subprocess.STDOUT)
2757 except subprocess.CalledProcessError, e:
2758 if e.returncode == -2147024891:
2759 print >> sys.stderr, 'Please restart with an elevated admin prompt'
2760 elif e.returncode == -2144337737:
2761 print >> sys.stderr, (
2762 'A kernel trace was already running, stop it and try again')
2763 raise
2764
2765 def trace(self, cmd, cwd, tracename, output):
2766 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2767 assert os.path.isabs(cmd[0]), cmd[0]
2768 assert os.path.isabs(cwd), cwd
2769 assert os.path.normpath(cwd) == cwd, cwd
2770 with self._lock:
2771 if not self._initialized:
2772 raise TracingFailure(
2773 'Called Tracer.trace() on an unitialized object',
2774 None, None, None, tracename)
2775 assert tracename not in (i['trace'] for i in self._traces)
2776
2777 # Use "logman -?" for help.
2778
2779 stdout = stderr = None
2780 if output:
2781 stdout = subprocess.PIPE
2782 stderr = subprocess.STDOUT
2783
2784 # Run the child process.
2785 logging.debug('Running: %s' % cmd)
2786 # Use the temporary script generated with create_thunk() so we have a
2787 # clear pid owner. Since trace_inputs.py can be used as a library and
2788 # could trace multiple processes simultaneously, it makes it more complex
2789 # if the executable to be traced is executed directly here. It also solves
2790 # issues related to logman.exe that needs to be executed to control the
2791 # kernel trace.
2792 child_cmd = [
2793 sys.executable,
2794 self._script,
2795 tracename,
2796 ]
2797 child = subprocess.Popen(
2798 child_cmd + fix_python_path(cmd),
2799 cwd=cwd,
2800 stdin=subprocess.PIPE,
2801 stdout=stdout,
2802 stderr=stderr)
2803 logging.debug('Started child pid: %d' % child.pid)
2804 out = child.communicate()[0]
2805 # This doesn't mean all the grand-children are done. Sadly, we don't have
2806 # a good way to determine that.
2807
2808 with self._lock:
2809 assert tracename not in (i['trace'] for i in self._traces)
2810 self._traces.append({
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00002811 'cmd': cmd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002812 'cwd': cwd,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002813 'output': out,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002814 'pid': child.pid,
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002815 # Used to figure out the real process when process ids are reused.
2816 'thunk_cmd': child_cmd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002817 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002818 })
2819
2820 return child.returncode, out
2821
2822 def close(self, _timeout=None):
2823 """Stops the kernel log collection and converts the traces to text
2824 representation.
2825 """
2826 with self._lock:
2827 if not self._initialized:
2828 raise TracingFailure(
2829 'Called Tracer.close() on an unitialized object',
2830 None, None, None)
2831 os.remove(self._script)
2832 # Save metadata, add 'format' key..
2833 data = {
2834 'format': 'csv',
2835 'traces': self._traces,
2836 }
2837 write_json(self._logname, data, False)
2838
2839 cmd_stop = [
2840 'logman.exe',
2841 'stop',
2842 'NT Kernel Logger',
2843 '-ets', # Sends the command directly to the kernel.
2844 ]
2845 logging.debug('Running: %s' % cmd_stop)
2846 subprocess.check_call(
2847 cmd_stop,
2848 stdin=subprocess.PIPE,
2849 stdout=subprocess.PIPE,
2850 stderr=subprocess.STDOUT)
2851 self._initialized = False
2852
2853 def post_process_log(self):
2854 """Converts the .etl file into .csv then into .json."""
2855 super(LogmanTrace.Tracer, self).post_process_log()
2856 logformat = 'csv'
2857 self._convert_log(logformat)
2858
2859 if logformat == 'csv_utf16':
2860 def load_file():
2861 def utf_8_encoder(unicode_csv_data):
2862 """Encodes the unicode object as utf-8 encoded str instance"""
2863 for line in unicode_csv_data:
2864 yield line.encode('utf-8')
2865
2866 def unicode_csv_reader(unicode_csv_data, **kwargs):
2867 """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
2868 """
2869 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
2870 for row in csv_reader:
2871 # Decode str utf-8 instances back to unicode instances, cell by
2872 # cell:
2873 yield [cell.decode('utf-8') for cell in row]
2874
2875 # The CSV file is UTF-16 so use codecs.open() to load the file into
2876 # the python internal unicode format (utf-8). Then explicitly
2877 # re-encode as utf8 as str instances so csv can parse it fine. Then
2878 # decode the utf-8 str back into python unicode instances. This
2879 # sounds about right.
2880 for line in unicode_csv_reader(
2881 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
2882 # line is a list of unicode objects
2883 # So much white space!
2884 yield [i.strip() for i in line]
2885
2886 elif logformat == 'csv':
2887 def load_file():
2888 def ansi_csv_reader(ansi_csv_data, **kwargs):
2889 """Loads an 'ANSI' code page and returns unicode() objects."""
2890 assert sys.getfilesystemencoding() == 'mbcs'
2891 encoding = get_current_encoding()
2892 for row in csv.reader(ansi_csv_data, **kwargs):
2893 # Decode str 'ansi' instances to unicode instances, cell by cell:
2894 yield [cell.decode(encoding) for cell in row]
2895
2896 # The fastest and smallest format but only supports 'ANSI' file paths.
2897 # E.g. the filenames are encoding in the 'current' encoding.
2898 for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
2899 # line is a list of unicode objects.
2900 yield [i.strip() for i in line]
2901
2902 supported_events = LogmanTrace.Context.supported_events()
2903
2904 def trim(generator):
2905 for index, line in enumerate(generator):
2906 if not index:
2907 if line != self.EXPECTED_HEADER:
2908 raise TracingFailure(
2909 'Found malformed header: %s' % ' '.join(line),
2910 None, None, None)
2911 continue
2912 # As you can see, the CSV is full of useful non-redundant information:
2913 if (line[2] != '0' or # Event ID
2914 line[3] not in ('2', '3') or # Version
2915 line[4] != '0' or # Channel
2916 line[5] != '0' or # Level
2917 line[7] != '0' or # Task
2918 line[8] != '0x0000000000000000' or # Keyword
2919 line[12] != '' or # Instance ID
2920 line[13] != '' or # Parent Instance ID
2921 line[14] != self.NULL_GUID or # Activity ID
2922 line[15] != ''): # Related Activity ID
2923 raise TracingFailure(
2924 'Found unexpected values in line: %s' % ' '.join(line),
2925 None, None, None)
2926
2927 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
2928 continue
2929
2930 # Convert the PID in-place from hex.
2931 line[self.PID] = int(line[self.PID], 16)
2932
2933 yield [
2934 line[self.EVENT_NAME],
2935 line[self.TYPE],
2936 line[self.PID],
2937 line[self.TID],
2938 line[self.PROCESSOR_ID],
2939 line[self.TIMESTAMP],
2940 ] + line[self.USER_DATA:]
2941
2942 write_json('%s.json' % self._logname, list(trim(load_file())), True)
2943
2944 def _convert_log(self, logformat):
2945 """Converts the ETL trace to text representation.
2946
2947 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
2948 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
2949
2950 Arguments:
2951 - logformat: Text format to be generated, csv, csv_utf16 or xml.
2952
2953 Use "tracerpt -?" for help.
2954 """
2955 LOCALE_INVARIANT = 0x7F
2956 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
2957 cmd_convert = [
2958 'tracerpt.exe',
2959 '-l', self._logname + '.etl',
2960 '-o', self._logname + '.' + logformat,
2961 '-gmt', # Use UTC
2962 '-y', # No prompt
2963 # Use -of XML to get the header of each items after column 19, e.g. all
2964 # the actual headers of 'User Data'.
2965 ]
2966
2967 if logformat == 'csv':
2968 # tracerpt localizes the 'Type' column, for major brainfuck
2969 # entertainment. I can't imagine any sane reason to do that.
2970 cmd_convert.extend(['-of', 'CSV'])
2971 elif logformat == 'csv_utf16':
2972 # This causes it to use UTF-16, which doubles the log size but ensures
2973 # the log is readable for non-ASCII characters.
2974 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
2975 elif logformat == 'xml':
2976 cmd_convert.extend(['-of', 'XML'])
2977 else:
2978 raise ValueError('Unexpected log format \'%s\'' % logformat)
2979 logging.debug('Running: %s' % cmd_convert)
2980 # This can takes tens of minutes for large logs.
2981 # Redirects all output to stderr.
2982 subprocess.check_call(
2983 cmd_convert,
2984 stdin=subprocess.PIPE,
2985 stdout=sys.stderr,
2986 stderr=sys.stderr)
2987
2988 @staticmethod
2989 def clean_trace(logname):
2990 for ext in ('', '.csv', '.etl', '.json', '.xml'):
2991 if os.path.isfile(logname + ext):
2992 os.remove(logname + ext)
2993
2994 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002995 def parse_log(cls, logname, blacklist, trace_name):
2996 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00002997 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002998
2999 def blacklist_more(filepath):
3000 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
3001 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
3002
3003 data = read_json(logname)
3004 lines = read_json(logname + '.json')
3005 out = []
3006 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003007 if trace_name and item['trace'] != trace_name:
3008 continue
maruel@chromium.org964ddde2012-10-29 18:13:46 +00003009 result = {
3010 'output': item['output'],
3011 'trace': item['trace'],
3012 }
3013 try:
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00003014 context = cls.Context(
3015 blacklist_more, item['pid'], item['trace'], item['thunk_cmd'])
maruel@chromium.org964ddde2012-10-29 18:13:46 +00003016 for line in lines:
3017 context.on_line(line)
3018 result['results'] = context.to_results()
3019 except TracingFailure:
3020 result['exception'] = sys.exc_info()
3021 out.append(result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003022 return out
3023
3024
3025def get_api():
3026 """Returns the correct implementation for the current OS."""
3027 if sys.platform == 'cygwin':
3028 raise NotImplementedError(
3029 'Not implemented for cygwin, start the script from Win32 python')
3030 flavors = {
3031 'win32': LogmanTrace,
3032 'darwin': Dtrace,
3033 'sunos5': Dtrace,
3034 'freebsd7': Dtrace,
3035 'freebsd8': Dtrace,
3036 }
3037 # Defaults to strace.
3038 return flavors.get(sys.platform, Strace)()
3039
3040
3041def extract_directories(root_dir, files, blacklist):
3042 """Detects if all the files in a directory are in |files| and if so, replace
3043 the individual files by a Results.Directory instance.
3044
3045 Takes a list of Results.File instances and returns a shorter list of
3046 Results.File and Results.Directory instances.
3047
3048 Arguments:
3049 - root_dir: Optional base directory that shouldn't be search further.
3050 - files: list of Results.File instances.
3051 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
3052 """
3053 logging.info(
3054 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
3055 assert not (root_dir or '').endswith(os.path.sep), root_dir
3056 assert not root_dir or (get_native_path_case(root_dir) == root_dir)
3057 assert not any(isinstance(f, Results.Directory) for f in files)
3058 # Remove non existent files.
3059 files = [f for f in files if f.existent]
3060 if not files:
3061 return files
3062 # All files must share the same root, which can be None.
3063 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
3064
3065 # Creates a {directory: {filename: File}} mapping, up to root.
3066 buckets = {}
3067 if root_dir:
3068 buckets[root_dir] = {}
3069 for fileobj in files:
3070 path = fileobj.full_path
3071 directory = os.path.dirname(path)
3072 assert directory
3073 # Do not use os.path.basename() so trailing os.path.sep is kept.
3074 basename = path[len(directory)+1:]
3075 files_in_directory = buckets.setdefault(directory, {})
3076 files_in_directory[basename] = fileobj
3077 # Add all the directories recursively up to root.
3078 while True:
3079 old_d = directory
3080 directory = os.path.dirname(directory)
3081 if directory + os.path.sep == root_dir or directory == old_d:
3082 break
3083 buckets.setdefault(directory, {})
3084
3085 root_prefix = len(root_dir) + 1 if root_dir else 0
3086 for directory in sorted(buckets, reverse=True):
3087 actual = set(f for f in os.listdir(directory) if not blacklist(f))
3088 expected = set(buckets[directory])
3089 if not (actual - expected):
3090 parent = os.path.dirname(directory)
3091 buckets[parent][os.path.basename(directory)] = Results.Directory(
3092 root_dir,
3093 directory[root_prefix:],
3094 False,
3095 sum(f.size for f in buckets[directory].itervalues()),
3096 sum(f.nb_files for f in buckets[directory].itervalues()))
3097 # Remove the whole bucket.
3098 del buckets[directory]
3099
3100 # Reverse the mapping with what remains. The original instances are returned,
3101 # so the cached meta data is kept.
3102 files = sum((x.values() for x in buckets.itervalues()), [])
3103 return sorted(files, key=lambda x: x.path)
3104
3105
3106def trace(logfile, cmd, cwd, api, output):
3107 """Traces an executable. Returns (returncode, output) from api.
3108
3109 Arguments:
3110 - logfile: file to write to.
3111 - cmd: command to run.
3112 - cwd: current directory to start the process in.
3113 - api: a tracing api instance.
3114 - output: if True, returns output, otherwise prints it at the console.
3115 """
3116 cmd = fix_python_path(cmd)
3117 api.clean_trace(logfile)
3118 with api.get_tracer(logfile) as tracer:
3119 return tracer.trace(cmd, cwd, 'default', output)
3120
3121
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003122def CMDclean(args):
3123 """Cleans up traces."""
3124 parser = OptionParserTraceInputs(command='clean')
3125 options, args = parser.parse_args(args)
3126 api = get_api()
3127 api.clean_trace(options.log)
3128 return 0
3129
3130
3131def CMDtrace(args):
3132 """Traces an executable."""
3133 parser = OptionParserTraceInputs(command='trace')
3134 parser.allow_interspersed_args = False
3135 parser.add_option(
3136 '-q', '--quiet', action='store_true',
3137 help='Redirects traced executable output to /dev/null')
3138 options, args = parser.parse_args(args)
3139
3140 if not args:
3141 parser.error('Please provide a command to run')
3142
3143 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3144 args[0] = os.path.abspath(args[0])
3145
3146 api = get_api()
3147 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3148
3149
3150def CMDread(args):
3151 """Reads the logs and prints the result."""
3152 parser = OptionParserTraceInputs(command='read')
3153 parser.add_option(
3154 '-V', '--variable',
3155 nargs=2,
3156 action='append',
3157 dest='variables',
3158 metavar='VAR_NAME directory',
3159 default=[],
3160 help=('Variables to replace relative directories against. Example: '
3161 '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
3162 'home dir with $HOME') % getpass.getuser())
3163 parser.add_option(
3164 '--root-dir',
3165 help='Root directory to base everything off it. Anything outside of this '
3166 'this directory will not be reported')
3167 parser.add_option(
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003168 '--trace-name',
3169 help='Only reads one of the trace. Defaults to reading all traces')
3170 parser.add_option(
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003171 '-j', '--json', action='store_true',
3172 help='Outputs raw result data as json')
3173 parser.add_option(
3174 '-b', '--blacklist', action='append', default=[],
3175 help='List of regexp to use as blacklist filter')
3176 options, args = parser.parse_args(args)
3177
3178 if options.root_dir:
3179 options.root_dir = os.path.abspath(options.root_dir)
3180
3181 variables = dict(options.variables)
3182 api = get_api()
3183 def blacklist(f):
3184 return any(re.match(b, f) for b in options.blacklist)
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003185 data = api.parse_log(options.log, blacklist, options.trace_name)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003186 # Process each trace.
3187 output_as_json = []
3188 for item in data:
3189 if 'exception' in item:
maruel@chromium.org424f5692012-10-15 15:42:38 +00003190 # Do not abort the other traces.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003191 print >> sys.stderr, (
maruel@chromium.org424f5692012-10-15 15:42:38 +00003192 'Trace %s: Got an exception: %s' % (
3193 item['trace'], item['exception'][1]))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003194 continue
3195 results = item['results']
3196 if options.root_dir:
3197 results = results.strip_root(options.root_dir)
3198
3199 if options.json:
3200 output_as_json.append(results.flatten())
3201 else:
3202 simplified = extract_directories(
3203 options.root_dir, results.files, blacklist)
3204 simplified = [f.replace_variables(variables) for f in simplified]
3205 if len(data) > 1:
3206 print('Trace: %s' % item['trace'])
3207 print('Total: %d' % len(results.files))
3208 print('Non existent: %d' % len(results.non_existent))
3209 for f in results.non_existent:
3210 print(' %s' % f.path)
3211 print(
3212 'Interesting: %d reduced to %d' % (
3213 len(results.existent), len(simplified)))
3214 for f in simplified:
3215 print(' %s' % f.path)
3216
3217 if options.json:
3218 write_json(sys.stdout, output_as_json, False)
3219 return 0
3220
3221
3222class OptionParserWithLogging(optparse.OptionParser):
3223 """Adds --verbose option."""
3224 def __init__(self, verbose=0, **kwargs):
3225 optparse.OptionParser.__init__(self, **kwargs)
3226 self.add_option(
3227 '-v', '--verbose',
3228 action='count',
3229 default=verbose,
3230 help='Use multiple times to increase verbosity')
3231
3232 def parse_args(self, *args, **kwargs):
3233 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3234 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3235 logging.basicConfig(
3236 level=levels[min(len(levels)-1, options.verbose)],
3237 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3238 return options, args
3239
3240
3241class OptionParserWithNiceDescription(OptionParserWithLogging):
3242 """Generates the description with the command's docstring."""
3243 def __init__(self, **kwargs):
3244 """Sets 'description' and 'usage' if not already specified."""
3245 command = kwargs.pop('command', 'help')
3246 kwargs.setdefault(
3247 'description',
3248 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3249 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3250 OptionParserWithLogging.__init__(self, **kwargs)
3251
3252
3253class OptionParserTraceInputs(OptionParserWithNiceDescription):
3254 """Adds automatic --log handling."""
3255 def __init__(self, **kwargs):
3256 OptionParserWithNiceDescription.__init__(self, **kwargs)
3257 self.add_option(
3258 '-l', '--log', help='Log file to generate or read, required')
3259
3260 def parse_args(self, *args, **kwargs):
3261 """Makes sure the paths make sense.
3262
3263 On Windows, / and \ are often mixed together in a path.
3264 """
3265 options, args = OptionParserWithNiceDescription.parse_args(
3266 self, *args, **kwargs)
3267 if not options.log:
3268 self.error('Must supply a log file with -l')
3269 options.log = os.path.abspath(options.log)
3270 return options, args
3271
3272
3273def extract_documentation():
3274 """Returns a dict {command: description} for each of documented command."""
3275 commands = (
3276 fn[3:]
3277 for fn in dir(sys.modules['__main__'])
3278 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3279 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3280
3281
3282def CMDhelp(args):
3283 """Prints list of commands or help for a specific command."""
3284 doc = extract_documentation()
3285 # Calculates the optimal offset.
3286 offset = max(len(cmd) for cmd in doc)
3287 format_str = ' %-' + str(offset + 2) + 's %s'
3288 # Generate a one-liner documentation of each commands.
3289 commands_description = '\n'.join(
3290 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3291
3292 parser = OptionParserWithNiceDescription(
3293 usage='%prog <command> [options]',
3294 description='Commands are:\n%s\n' % commands_description)
3295 parser.format_description = lambda _: parser.description
3296
3297 # Strip out any -h or --help argument.
3298 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3299 if len(args) == 1:
3300 if not get_command_handler(args[0]):
3301 parser.error('Unknown command %s' % args[0])
3302 # The command was "%prog help command", replaces ourself with
3303 # "%prog command --help" so help is correctly printed out.
3304 return main(args + ['--help'])
3305 elif args:
3306 parser.error('Unknown argument "%s"' % ' '.join(args))
3307 parser.print_help()
3308 return 0
3309
3310
3311def get_command_handler(name):
3312 """Returns the command handler or CMDhelp if it doesn't exist."""
3313 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3314
3315
3316def main_impl(argv):
3317 command = get_command_handler(argv[0] if argv else 'help')
3318 if not command:
3319 return CMDhelp(argv)
3320 return command(argv[1:])
3321
3322def main(argv):
3323 try:
3324 main_impl(argv)
3325 except TracingFailure, e:
3326 sys.stderr.write('\nError: ')
3327 sys.stderr.write(str(e))
3328 sys.stderr.write('\n')
3329 return 1
3330
3331
3332if __name__ == '__main__':
3333 sys.exit(main(sys.argv[1:]))