blob: 79092b5901781d0341307be0fb9ae027b8c867b9 [file] [log] [blame]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001#!/usr/bin/env python
2# coding=utf-8
3# Copyright (c) 2012 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Traces an executable and its child processes and extract the files accessed
8by them.
9
10The implementation uses OS-specific API. The native Kernel logger and the ETL
11interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
12The OS-specific implementation is hidden in an 'API' interface.
13
14The results are embedded in a Results instance. The tracing is done in two
15phases, the first is to do the actual trace and generate an
16implementation-specific log file. Then the log file is parsed to extract the
17information, including the individual child processes and the files accessed
18from the log.
19"""
20
21import codecs
22import csv
23import getpass
24import glob
25import json
26import logging
27import optparse
28import os
29import re
30import subprocess
31import sys
32import tempfile
33import threading
34import time
35import weakref
36
37## OS-specific imports
38
39if sys.platform == 'win32':
40 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
41 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
42 from ctypes.wintypes import GetLastError # pylint: disable=E0611
43elif sys.platform == 'darwin':
44 import Carbon.File # pylint: disable=F0401
45 import MacOS # pylint: disable=F0401
46
47
48BASE_DIR = os.path.dirname(os.path.abspath(__file__))
49ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
50
51
52class TracingFailure(Exception):
53 """An exception occured during tracing."""
54 def __init__(self, description, pid, line_number, line, *args):
55 super(TracingFailure, self).__init__(
56 description, pid, line_number, line, *args)
57 self.description = description
58 self.pid = pid
59 self.line_number = line_number
60 self.line = line
61 self.extra = args
62
63 def __str__(self):
64 out = self.description
65 if self.pid:
66 out += '\npid: %d' % self.pid
67 if self.line_number:
68 out += '\nline: %d' % self.line_number
69 if self.line:
70 out += '\n%s' % self.line
71 if self.extra:
72 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
73 return out
74
75
76## OS-specific functions
77
78if sys.platform == 'win32':
79 def QueryDosDevice(drive_letter):
80 """Returns the Windows 'native' path for a DOS drive letter."""
81 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
82 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
83 chars = 1024
84 drive_letter = unicode(drive_letter)
85 p = create_unicode_buffer(chars)
86 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
87 err = GetLastError()
88 if err:
89 # pylint: disable=E0602
90 raise WindowsError(
91 err,
92 'QueryDosDevice(%s): %s (%d)' % (
93 str(drive_letter), FormatError(err), err))
94 return p.value
95
96
97 def GetShortPathName(long_path):
98 """Returns the Windows short path equivalent for a 'long' path."""
99 long_path = unicode(long_path)
100 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
101 # not enforced.
102 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
103 long_path = '\\\\?\\' + long_path
104 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
105 if chars:
106 p = create_unicode_buffer(chars)
107 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
108 return p.value
109
110 err = GetLastError()
111 if err:
112 # pylint: disable=E0602
113 raise WindowsError(
114 err,
115 'GetShortPathName(%s): %s (%d)' % (
116 str(long_path), FormatError(err), err))
117
118
119 def GetLongPathName(short_path):
120 """Returns the Windows long path equivalent for a 'short' path."""
121 short_path = unicode(short_path)
122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
123 # not enforced.
124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
125 short_path = '\\\\?\\' + short_path
126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
127 if chars:
128 p = create_unicode_buffer(chars)
129 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
130 return p.value
131
132 err = GetLastError()
133 if err:
134 # pylint: disable=E0602
135 raise WindowsError(
136 err,
137 'GetLongPathName(%s): %s (%d)' % (
138 str(short_path), FormatError(err), err))
139
140
141 def get_current_encoding():
142 """Returns the 'ANSI' code page associated to the process."""
143 return 'cp%d' % int(windll.kernel32.GetACP())
144
145
146 class DosDriveMap(object):
147 """Maps \Device\HarddiskVolumeN to N: on Windows."""
148 # Keep one global cache.
149 _MAPPING = {}
150
151 def __init__(self):
152 """Lazy loads the cache."""
153 if not self._MAPPING:
154 # This is related to UNC resolver on windows. Ignore that.
155 self._MAPPING['\\Device\\Mup'] = None
156 self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot']
157
158 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
159 try:
160 letter = '%s:' % letter
161 mapped = QueryDosDevice(letter)
162 if mapped in self._MAPPING:
163 logging.warn(
164 ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
165 '. Drive letters are a user-mode concept and the kernel '
166 'traces only have NT path, so all accesses will be '
167 'associated with the first drive letter, independent of the '
168 'actual letter used by the code') % (
169 self._MAPPING[mapped], letter))
170 else:
171 self._MAPPING[mapped] = letter
172 except WindowsError: # pylint: disable=E0602
173 pass
174
175 def to_win32(self, path):
176 """Converts a native NT path to Win32/DOS compatible path."""
177 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
178 if not match:
179 raise ValueError(
180 'Can\'t convert %s into a Win32 compatible path' % path,
181 path)
182 if not match.group(1) in self._MAPPING:
183 # Unmapped partitions may be accessed by windows for the
184 # fun of it while the test is running. Discard these.
185 return None
186 drive = self._MAPPING[match.group(1)]
187 if not drive or not match.group(2):
188 return drive
189 return drive + match.group(2)
190
191
192 def isabs(path):
193 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
194 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
195
196
197 def get_native_path_case(p):
198 """Returns the native path case for an existing file.
199
200 On Windows, removes any leading '\\?\'.
201 """
202 if not isabs(p):
203 raise ValueError(
204 'Can\'t get native path case for a non-absolute path: %s' % p,
205 p)
206 # Windows used to have an option to turn on case sensitivity on non Win32
207 # subsystem but that's out of scope here and isn't supported anymore.
208 # Go figure why GetShortPathName() is needed.
209 try:
210 out = GetLongPathName(GetShortPathName(p))
211 except OSError, e:
212 if e.args[0] in (2, 3, 5):
213 # The path does not exist. Try to recurse and reconstruct the path.
214 base = os.path.dirname(p)
215 rest = os.path.basename(p)
216 return os.path.join(get_native_path_case(base), rest)
217 raise
218 if out.startswith('\\\\?\\'):
219 out = out[4:]
220 # Always upper case the first letter since GetLongPathName() will return the
221 # drive letter in the case it was given.
222 return out[0].upper() + out[1:]
223
224
225 def CommandLineToArgvW(command_line):
226 """Splits a commandline into argv using CommandLineToArgvW()."""
227 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
228 size = c_int()
229 ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size))
230 try:
231 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
232 finally:
233 windll.kernel32.LocalFree(ptr)
234
235
236elif sys.platform == 'darwin':
237
238
239 # On non-windows, keep the stdlib behavior.
240 isabs = os.path.isabs
241
242
243 def _find_item_native_case(root_path, item):
244 """Gets the native path case of a single item based at root_path.
245
246 There is no API to get the native path case of symlinks on OSX. So it
247 needs to be done the slow way.
248 """
249 item = item.lower()
250 for element in os.listdir(root_path):
251 if element.lower() == item:
252 return element
253
254
255 def _native_case(p):
256 """Gets the native path case. Warning: this function resolves symlinks."""
257 logging.debug('native_case(%s)' % p)
258 try:
259 rel_ref, _ = Carbon.File.FSPathMakeRef(p)
260 out = rel_ref.FSRefMakePath()
261 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
262 return out + os.path.sep
263 return out
264 except MacOS.Error, e:
265 if e.args[0] in (-43, -120):
266 # The path does not exist. Try to recurse and reconstruct the path.
267 # -43 means file not found.
268 # -120 means directory not found.
269 base = os.path.dirname(p)
270 rest = os.path.basename(p)
271 return os.path.join(_native_case(base), rest)
272 raise OSError(
273 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
274
275
276 def _split_at_symlink_native(base_path, rest):
277 """Returns the native path for a symlink."""
278 base, symlink, rest = split_at_symlink(base_path, rest)
279 if symlink:
280 if not base_path:
281 base_path = base
282 else:
283 base_path = safe_join(base_path, base)
284 symlink = _find_item_native_case(base_path, symlink)
285 return base, symlink, rest
286
287
288 def get_native_path_case(path):
289 """Returns the native path case for an existing file.
290
291 Technically, it's only HFS+ on OSX that is case preserving and
292 insensitive. It's the default setting on HFS+ but can be changed.
293 """
294 if not isabs(path):
295 raise ValueError(
296 'Can\'t get native path case for a non-absolute path: %s' % path,
297 path)
298 if path.startswith('/dev'):
299 # /dev is not visible from Carbon, causing an exception.
300 return path
301
302 # Starts assuming there is no symlink along the path.
303 resolved = _native_case(path)
304 if resolved.lower() == path.lower():
305 # This code path is incredibly faster.
306 return resolved
307
308 # There was a symlink, process it.
309 base, symlink, rest = _split_at_symlink_native(None, path)
310 assert symlink, (path, base, symlink, rest, resolved)
311 prev = base
312 base = safe_join(_native_case(base), symlink)
313 assert len(base) > len(prev)
314 while rest:
315 prev = base
316 relbase, symlink, rest = _split_at_symlink_native(base, rest)
317 base = safe_join(base, relbase)
318 assert len(base) > len(prev), (prev, base, symlink)
319 if symlink:
320 base = safe_join(base, symlink)
321 assert len(base) > len(prev), (prev, base, symlink)
322 # Make sure no symlink was resolved.
323 assert base.lower() == path.lower(), (base, path)
324 return base
325
326
327else: # OSes other than Windows and OSX.
328
329
330 # On non-windows, keep the stdlib behavior.
331 isabs = os.path.isabs
332
333
334 def get_native_path_case(path):
335 """Returns the native path case for an existing file.
336
337 On OSes other than OSX and Windows, assume the file system is
338 case-sensitive.
339
340 TODO(maruel): This is not strictly true. Implement if necessary.
341 """
342 if not isabs(path):
343 raise ValueError(
344 'Can\'t get native path case for a non-absolute path: %s' % path,
345 path)
346 # Give up on cygwin, as GetLongPathName() can't be called.
347 # Linux traces tends to not be normalized so use this occasion to normalize
348 # it. This function implementation already normalizes the path on the other
349 # OS so this needs to be done here to be coherent between OSes.
350 out = os.path.normpath(path)
351 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
352 return out + os.path.sep
353 return out
354
355
356if sys.platform != 'win32': # All non-Windows OSes.
357
358
359 def safe_join(*args):
360 """Joins path elements like os.path.join() but doesn't abort on absolute
361 path.
362
363 os.path.join('foo', '/bar') == '/bar'
364 but safe_join('foo', '/bar') == 'foo/bar'.
365 """
366 out = ''
367 for element in args:
368 if element.startswith(os.path.sep):
369 if out.endswith(os.path.sep):
370 out += element[1:]
371 else:
372 out += element
373 else:
374 if out.endswith(os.path.sep):
375 out += element
376 else:
377 out += os.path.sep + element
378 return out
379
380
381 def split_at_symlink(base_dir, relfile):
382 """Scans each component of relfile and cut the string at the symlink if
383 there is any.
384
385 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
386 not symlink was found.
387 """
388 if base_dir:
389 assert relfile
390 assert os.path.isabs(base_dir)
391 index = 0
392 else:
393 assert os.path.isabs(relfile)
394 index = 1
395
396 def at_root(rest):
397 if base_dir:
398 return safe_join(base_dir, rest)
399 return rest
400
401 while True:
402 try:
403 index = relfile.index(os.path.sep, index)
404 except ValueError:
405 index = len(relfile)
406 full = at_root(relfile[:index])
407 if os.path.islink(full):
408 # A symlink!
409 base = os.path.dirname(relfile[:index])
410 symlink = os.path.basename(relfile[:index])
411 rest = relfile[index:]
412 logging.debug(
413 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
414 (base_dir, relfile, base, symlink, rest))
415 return base, symlink, rest
416 if index == len(relfile):
417 break
418 index += 1
419 return relfile, None, None
420
421
422def fix_python_path(cmd):
423 """Returns the fixed command line to call the right python executable."""
424 out = cmd[:]
425 if out[0] == 'python':
426 out[0] = sys.executable
427 elif out[0].endswith('.py'):
428 out.insert(0, sys.executable)
429 return out
430
431
432def create_thunk():
433 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
434 os.write(
435 handle,
436 (
437 'import subprocess\n'
438 'import sys\n'
439 'sys.exit(subprocess.call(sys.argv[2:]))\n'
440 ))
441 os.close(handle)
442 return name
443
444
445def strace_process_quoted_arguments(text):
446 """Extracts quoted arguments on a string and return the arguments as a list.
447
448 Implemented as an automaton. Supports incomplete strings in the form
449 '"foo"...'.
450
451 Example:
452 With text = '"foo", "bar"', the function will return ['foo', 'bar']
453
454 TODO(maruel): Implement escaping.
455 """
456 # All the possible states of the DFA.
457 ( NEED_QUOTE, # Begining of a new arguments.
458 INSIDE_STRING, # Inside an argument.
459 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
460 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
461 # a serie of 3 dots or a comma.
462 NEED_SPACE, # Right after a comma
463 NEED_DOT_2, # Found a dot, need a second one.
464 NEED_DOT_3, # Found second dot, need a third one.
465 NEED_COMMA, # Found third dot, need a comma.
466 ) = range(8)
467
468 state = NEED_QUOTE
469 out = []
470 for index, char in enumerate(text):
471 if char == '"':
472 if state == NEED_QUOTE:
473 state = INSIDE_STRING
474 # A new argument was found.
475 out.append('')
476 elif state == INSIDE_STRING:
477 # The argument is now closed.
478 state = NEED_COMMA_OR_DOT
479 elif state == ESCAPED:
480 out[-1] += char
481 state = INSIDE_STRING
482 else:
483 raise ValueError(
484 'Can\'t process char at column %d for: %r' % (index, text),
485 index,
486 text)
487 elif char == ',':
488 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
489 state = NEED_SPACE
490 elif state == INSIDE_STRING:
491 out[-1] += char
492 elif state == ESCAPED:
493 out[-1] += char
494 state = INSIDE_STRING
495 else:
496 raise ValueError(
497 'Can\'t process char at column %d for: %r' % (index, text),
498 index,
499 text)
500 elif char == ' ':
501 if state == NEED_SPACE:
502 state = NEED_QUOTE
503 elif state == INSIDE_STRING:
504 out[-1] += char
505 elif state == ESCAPED:
506 out[-1] += char
507 state = INSIDE_STRING
508 else:
509 raise ValueError(
510 'Can\'t process char at column %d for: %r' % (index, text),
511 index,
512 text)
513 elif char == '.':
514 if state == NEED_COMMA_OR_DOT:
515 # The string is incomplete, this mean the strace -s flag should be
516 # increased.
517 state = NEED_DOT_2
518 elif state == NEED_DOT_2:
519 state = NEED_DOT_3
520 elif state == NEED_DOT_3:
521 state = NEED_COMMA
522 elif state == INSIDE_STRING:
523 out[-1] += char
524 elif state == ESCAPED:
525 out[-1] += char
526 state = INSIDE_STRING
527 else:
528 raise ValueError(
529 'Can\'t process char at column %d for: %r' % (index, text),
530 index,
531 text)
532 elif char == '\\':
533 if state == ESCAPED:
534 out[-1] += char
535 state = INSIDE_STRING
536 elif state == INSIDE_STRING:
537 state = ESCAPED
538 else:
539 raise ValueError(
540 'Can\'t process char at column %d for: %r' % (index, text),
541 index,
542 text)
543 else:
544 if state == INSIDE_STRING:
545 out[-1] += char
546 else:
547 raise ValueError(
548 'Can\'t process char at column %d for: %r' % (index, text),
549 index,
550 text)
551 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
552 raise ValueError(
553 'String is incorrectly terminated: %r' % text,
554 text)
555 return out
556
557
558def read_json(filepath):
559 with open(filepath, 'r') as f:
560 return json.load(f)
561
562
563def write_json(filepath_or_handle, data, dense):
564 """Writes data into filepath or file handle encoded as json.
565
566 If dense is True, the json is packed. Otherwise, it is human readable.
567 """
568 if hasattr(filepath_or_handle, 'write'):
569 if dense:
570 filepath_or_handle.write(
571 json.dumps(data, sort_keys=True, separators=(',',':')))
572 else:
573 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
574 else:
575 with open(filepath_or_handle, 'wb') as f:
576 if dense:
577 json.dump(data, f, sort_keys=True, separators=(',',':'))
578 else:
579 json.dump(data, f, sort_keys=True, indent=2)
580
581
582class Results(object):
583 """Results of a trace session."""
584
585 class _TouchedObject(object):
586 """Something, a file or a directory, that was accessed."""
587 def __init__(self, root, path, tainted, size, nb_files):
588 logging.debug(
589 '%s(%s, %s, %s, %s, %s)' %
590 (self.__class__.__name__, root, path, tainted, size, nb_files))
591 self.root = root
592 self.path = path
593 self.tainted = tainted
594 self.nb_files = nb_files
595 # Can be used as a cache or a default value, depending on context.
596 self._size = size
597 # These are cache only.
598 self._real_path = None
599
600 # Check internal consistency.
601 assert path, path
602 assert tainted or bool(root) != bool(isabs(path)), (root, path)
603 assert tainted or (
604 not os.path.exists(self.full_path) or
605 (self.full_path == get_native_path_case(self.full_path))), (
606 tainted, self.full_path, get_native_path_case(self.full_path))
607
608 @property
609 def existent(self):
610 return self.size != -1
611
612 @property
613 def full_path(self):
614 if self.root:
615 return os.path.join(self.root, self.path)
616 return self.path
617
618 @property
619 def real_path(self):
620 """Returns the path with symlinks resolved."""
621 if not self._real_path:
622 self._real_path = os.path.realpath(self.full_path)
623 return self._real_path
624
625 @property
626 def size(self):
627 """File's size. -1 is not existent."""
628 if self._size is None and not self.tainted:
629 try:
630 self._size = os.stat(self.full_path).st_size
631 except OSError:
632 self._size = -1
633 return self._size
634
635 def flatten(self):
636 """Returns a dict representing this object.
637
638 A 'size' of 0 means the file was only touched and not read.
639 """
640 return {
641 'path': self.path,
642 'size': self.size,
643 }
644
645 def replace_variables(self, variables):
646 """Replaces the root of this File with one of the variables if it matches.
647
648 If a variable replacement occurs, the cloned object becomes tainted.
649 """
650 for variable, root_path in variables.iteritems():
651 if self.path.startswith(root_path):
652 return self._clone(
653 self.root, variable + self.path[len(root_path):], True)
654 # No need to clone, returns ourself.
655 return self
656
657 def strip_root(self, root):
658 """Returns a clone of itself with 'root' stripped off."""
659 # Check internal consistency.
660 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
661 if not self.full_path.startswith(root):
662 # Now try to resolve the symlinks to see if it can be reached this way.
663 # Only try *after* trying without resolving symlink.
664 if not self.real_path.startswith(root):
665 return None
666 path = self.real_path
667 else:
668 path = self.full_path
669 return self._clone(root, path[len(root):], self.tainted)
670
671 def _clone(self, new_root, new_path, tainted):
672 raise NotImplementedError(self.__class__.__name__)
673
674 class File(_TouchedObject):
675 """A file that was accessed. May not be present anymore.
676
677 If tainted is true, it means it is not a real path anymore as a variable
678 replacement occured.
679
680 If touched_only is True, this means the file was probed for existence, and
681 it is existent, but was never _opened_. If touched_only is True, the file
682 must have existed.
683 """
684 def __init__(self, root, path, tainted, size):
685 super(Results.File, self).__init__(root, path, tainted, size, 1)
686
687 def _clone(self, new_root, new_path, tainted):
688 """Clones itself keeping meta-data."""
689 # Keep the self.size and self._real_path caches for performance reason. It
690 # is also important when the file becomes tainted (with a variable instead
691 # of the real path) since self.path is not an on-disk path anymore so
692 # out._size cannot be updated.
693 out = self.__class__(new_root, new_path, tainted, self.size)
694 out._real_path = self._real_path
695 return out
696
697 class Directory(_TouchedObject):
698 """A directory of files. Must exist."""
699 def __init__(self, root, path, tainted, size, nb_files):
700 """path='.' is a valid value and must be handled appropriately."""
701 assert not path.endswith(os.path.sep), path
702 super(Results.Directory, self).__init__(
703 root, path + os.path.sep, tainted, size, nb_files)
704 # For a Directory instance, self.size is not a cache, it's an actual value
705 # that is never modified and represents the total size of the files
706 # contained in this directory. It is possible that the directory is empty
707 # so that size == 0; this happens if there's only an invalid symlink in
708 # it.
709
710 def flatten(self):
711 out = super(Results.Directory, self).flatten()
712 out['nb_files'] = self.nb_files
713 return out
714
715 def _clone(self, new_root, new_path, tainted):
716 """Clones itself keeping meta-data."""
717 out = self.__class__(
718 new_root,
719 new_path.rstrip(os.path.sep),
720 tainted,
721 self.size,
722 self.nb_files)
723 out._real_path = self._real_path
724 return out
725
726 class Process(object):
727 """A process that was traced.
728
729 Contains references to the files accessed by this process and its children.
730 """
731 def __init__(self, pid, files, executable, command, initial_cwd, children):
732 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
733 self.pid = pid
734 self.files = sorted(files, key=lambda x: x.path)
735 self.children = children
736 self.executable = executable
737 self.command = command
738 self.initial_cwd = initial_cwd
739
740 # Check internal consistency.
741 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
742 f.path for f in self.files)
743 assert isinstance(self.children, list)
744 assert isinstance(self.files, list)
745
746 @property
747 def all(self):
748 for child in self.children:
749 for i in child.all:
750 yield i
751 yield self
752
753 def flatten(self):
754 return {
755 'children': [c.flatten() for c in self.children],
756 'command': self.command,
757 'executable': self.executable,
758 'files': [f.flatten() for f in self.files],
759 'initial_cwd': self.initial_cwd,
760 'pid': self.pid,
761 }
762
763 def strip_root(self, root):
764 assert isabs(root) and root.endswith(os.path.sep), root
765 # Loads the files after since they are constructed as objects.
766 out = self.__class__(
767 self.pid,
768 filter(None, (f.strip_root(root) for f in self.files)),
769 self.executable,
770 self.command,
771 self.initial_cwd,
772 [c.strip_root(root) for c in self.children])
773 logging.debug(
774 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
775 return out
776
777 def __init__(self, process):
778 self.process = process
779 # Cache.
780 self._files = None
781
782 def flatten(self):
783 return {
784 'root': self.process.flatten(),
785 }
786
787 @property
788 def files(self):
789 if self._files is None:
790 self._files = sorted(
791 sum((p.files for p in self.process.all), []),
792 key=lambda x: x.path)
793 return self._files
794
795 @property
796 def existent(self):
797 return [f for f in self.files if f.existent]
798
799 @property
800 def non_existent(self):
801 return [f for f in self.files if not f.existent]
802
803 def strip_root(self, root):
804 """Returns a clone with all the files outside the directory |root| removed
805 and converts all the path to be relative paths.
806 """
807 # Resolve any symlink
808 root = os.path.realpath(root)
809 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
810 logging.debug('strip_root(%s)' % root)
811 return Results(self.process.strip_root(root))
812
813
814class ApiBase(object):
815 """OS-agnostic API to trace a process and its children."""
816 class Context(object):
817 """Processes one log line at a time and keeps the list of traced processes.
818
819 The parsing is complicated by the fact that logs are traced out of order for
820 strace but in-order for dtrace and logman. In addition, on Windows it is
821 very frequent that processids are reused so a flat list cannot be used. But
822 at the same time, it is impossible to faithfully construct a graph when the
823 logs are processed out of order. So both a tree and a flat mapping are used,
824 the tree is the real process tree, while the flat mapping stores the last
825 valid process for the corresponding processid. For the strace case, the
826 tree's head is guessed at the last moment.
827 """
828 class Process(object):
829 """Keeps context for one traced child process.
830
831 Logs all the files this process touched. Ignores directories.
832 """
833 def __init__(self, blacklist, pid, initial_cwd):
834 # Check internal consistency.
835 assert isinstance(pid, int), repr(pid)
836 self.pid = pid
837 # children are Process instances.
838 self.children = []
839 self.initial_cwd = initial_cwd
840 self.cwd = None
841 self.files = set()
842 self.only_touched = set()
843 self.executable = None
844 self.command = None
845 self._blacklist = blacklist
846
847 def to_results_process(self):
848 """Resolves file case sensitivity and or late-bound strings."""
849 # When resolving files, it's normal to get dupe because a file could be
850 # opened multiple times with different case. Resolve the deduplication
851 # here.
852 def fix_path(x):
853 """Returns the native file path case.
854
855 Converts late-bound strings.
856 """
857 if not x:
858 # Do not convert None instance to 'None'.
859 return x
860 # TODO(maruel): Do not upconvert to unicode here, on linux we don't
861 # know the file path encoding so they must be treated as bytes.
862 x = unicode(x)
863 if os.path.isabs(x):
864 # If the path is not absolute, which tends to happen occasionally on
865 # Windows, it is not possible to get the native path case so ignore
866 # that trace. It mostly happens for 'executable' value.
867 x = get_native_path_case(x)
868 return x
869
870 def fix_and_blacklist_path(x):
871 x = fix_path(x)
872 if not x:
873 return
874 # The blacklist needs to be reapplied, since path casing could
875 # influence blacklisting.
876 if self._blacklist(x):
877 return
878 return x
879
880 # Filters out directories. Some may have passed through.
881 files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
882 only_touched = set(
883 f for f in map(fix_and_blacklist_path, self.only_touched) if f)
884 only_touched -= files
885
886 files = [
887 Results.File(None, f, False, None) for f in files
888 if not os.path.isdir(f)
889 ]
890 # Using 0 as size means the file's content is ignored since the file was
891 # never opened for I/O.
892 files.extend(
893 Results.File(None, f, False, 0) for f in only_touched
894 if not os.path.isdir(f)
895 )
896 return Results.Process(
897 self.pid,
898 files,
899 fix_path(self.executable),
900 self.command,
901 fix_path(self.initial_cwd),
902 [c.to_results_process() for c in self.children])
903
904 def add_file(self, filepath, touch_only):
905 """Adds a file if it passes the blacklist."""
906 if self._blacklist(unicode(filepath)):
907 return
908 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
909 # Note that filepath and not unicode(filepath) is added. It is because
910 # filepath could be something else than a string, like a RelativePath
911 # instance for dtrace logs.
912 if touch_only:
913 self.only_touched.add(filepath)
914 else:
915 self.files.add(filepath)
916
917 def __init__(self, blacklist):
918 self.blacklist = blacklist
919 # Initial process.
920 self.root_process = None
921 # dict to accelerate process lookup, to not have to lookup the whole graph
922 # each time.
923 self._process_lookup = {}
924
925 class Tracer(object):
926 """During it's lifetime, the tracing subsystem is enabled."""
927 def __init__(self, logname):
928 self._logname = logname
929 self._lock = threading.Lock()
930 self._traces = []
931 self._initialized = True
932
933 def trace(self, cmd, cwd, tracename, output):
934 """Runs the OS-specific trace program on an executable.
935
936 Arguments:
937 - cmd: The command (a list) to run.
938 - cwd: Current directory to start the child process in.
939 - tracename: Name of the trace in the logname file.
940 - output: If False, redirects output to PIPEs.
941
942 Returns a tuple (resultcode, output) and updates the internal trace
943 entries.
944 """
945 # The implementation adds an item to self._traces.
946 raise NotImplementedError(self.__class__.__name__)
947
948 def close(self, _timeout=None):
949 """Saves the meta-data in the logname file.
950
951 For kernel-based tracing, stops the tracing subsystem.
952
953 Must not be used manually when using 'with' construct.
954 """
955 with self._lock:
956 assert self._initialized
957 try:
958 data = {
959 'traces': self._traces,
960 }
961 write_json(self._logname, data, False)
962 finally:
963 self._initialized = False
964
965 def post_process_log(self):
966 """Post-processes the log so it becomes faster to load afterward.
967
968 Must not be used manually when using 'with' construct.
969 """
970 assert not self._initialized, 'Must stop tracing first.'
971
972 def __enter__(self):
973 """Enables 'with' statement."""
974 return self
975
976 def __exit__(self, exc_type, exc_value, traceback):
977 """Enables 'with' statement."""
978 self.close()
979 # If an exception was thrown, do not process logs.
980 if not exc_type:
981 self.post_process_log()
982
983 def get_tracer(self, logname):
984 """Returns an ApiBase.Tracer instance.
985
986 Initializes the tracing subsystem, which is a requirement for kernel-based
987 tracers. Only one tracer instance should be live at a time!
988
989 logname is the filepath to the json file that will contain the meta-data
990 about the logs.
991 """
992 return self.Tracer(logname)
993
994 @staticmethod
995 def clean_trace(logname):
996 """Deletes an old log."""
997 raise NotImplementedError()
998
999 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001000 def parse_log(cls, logname, blacklist, trace_name):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001001 """Processes trace logs and returns the files opened and the files that do
1002 not exist.
1003
1004 It does not track directories.
1005
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001006 Arguments:
1007 - logname: must be an absolute path.
1008 - blacklist: must be a lambda.
1009 - trace_name: optional trace to read, defaults to reading all traces.
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001010
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001011 Most of the time, files that do not exist are temporary test files that
1012 should be put in /tmp instead. See http://crbug.com/116251.
1013
1014 Returns a list of dict with keys:
1015 - results: A Results instance.
1016 - trace: The corresponding tracename parameter provided to
1017 get_tracer().trace().
1018 - output: Output gathered during execution, if get_tracer().trace(...,
1019 output=False) was used.
1020 """
1021 raise NotImplementedError(cls.__class__.__name__)
1022
1023
1024class Strace(ApiBase):
1025 """strace implies linux."""
1026 class Context(ApiBase.Context):
1027 """Processes a strace log line and keeps the list of existent and non
1028 existent files accessed.
1029
1030 Ignores directories.
1031
1032 Uses late-binding to processes the cwd of each process. The problem is that
1033 strace generates one log file per process it traced but doesn't give any
1034 information about which process was started when and by who. So we don't
1035 even know which process is the initial one. So process the logs out of
1036 order and use late binding with RelativePath to be able to deduce the
1037 initial directory of each process once all the logs are parsed.
1038 """
1039 class Process(ApiBase.Context.Process):
1040 """Represents the state of a process.
1041
1042 Contains all the information retrieved from the pid-specific log.
1043 """
1044 # Function names are using ([a-z_0-9]+)
1045 # This is the most common format. function(args) = result
1046 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$')
1047 # An interrupted function call, only grab the minimal header.
1048 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001049 # An interrupted function call, with the process exiting. It must be the
1050 # last line in the log.
1051 RE_UNFINISHED_EXIT = re.compile(
1052 r'^([^\(]+)(.*) \<unfinished \.\.\.\ exit status \d+>$')
maruel@chromium.org4e506502012-10-18 15:02:58 +00001053 # An interrupted function call the hard way. Usually observed with futex()
1054 # on ubuntu 12.04.
csharp@chromium.org4d753552012-10-25 19:31:55 +00001055 RE_INTERRUPTED_HARD = re.compile(r'^([^\(]+)\([A-Z0-9a-fx\,\_\|\{\} ]*$')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001056 # A resumed function call.
1057 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1058 # A process received a signal.
1059 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1060 # A process didn't handle a signal. Ignore any junk appearing before,
1061 # because the process was forcibly killed so it won't open any new file.
1062 RE_KILLED = re.compile(
1063 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1064 # The process has exited.
1065 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1066 # A call was canceled. Ignore any prefix.
1067 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1068 # Happens when strace fails to even get the function name.
1069 UNNAMED_FUNCTION = '????'
1070
1071 # Corner-case in python, a class member function decorator must not be
1072 # @staticmethod.
1073 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1074 """Automatically convert the str 'args' into a list of processed
1075 arguments.
1076
1077 Arguments:
1078 - regexp is used to parse args.
1079 - expect_zero: one of True, False or None.
1080 - True: will check for result.startswith('0') first and will ignore
1081 the trace line completely otherwise. This is important because for
1082 many functions, the regexp will not process if the call failed.
1083 - False: will check for not result.startswith(('?', '-1')) for the
1084 same reason than with True.
1085 - None: ignore result.
1086 """
1087 def meta_hook(function):
1088 assert function.__name__.startswith('handle_')
1089 def hook(self, args, result):
1090 if expect_zero is True and not result.startswith('0'):
1091 return
1092 if expect_zero is False and result.startswith(('?', '-1')):
1093 return
1094 match = re.match(regexp, args)
1095 if not match:
1096 raise TracingFailure(
1097 'Failed to parse %s(%s) = %s' %
1098 (function.__name__[len('handle_'):], args, result),
1099 None, None, None)
1100 return function(self, match.groups(), result)
1101 return hook
1102 return meta_hook
1103
1104 class RelativePath(object):
1105 """A late-bound relative path."""
1106 def __init__(self, parent, value):
1107 self.parent = parent
1108 self.value = value
1109
1110 def render(self):
1111 """Returns the current directory this instance is representing.
1112
1113 This function is used to return the late-bound value.
1114 """
1115 if self.value and self.value.startswith(u'/'):
1116 # An absolute path.
1117 return self.value
1118 parent = self.parent.render() if self.parent else u'<None>'
1119 if self.value:
1120 return os.path.normpath(os.path.join(parent, self.value))
1121 return parent
1122
1123 def __unicode__(self):
1124 """Acts as a string whenever needed."""
1125 return unicode(self.render())
1126
1127 def __str__(self):
1128 """Acts as a string whenever needed."""
1129 return str(self.render())
1130
1131 def __init__(self, root, pid):
1132 """Keeps enough information to be able to guess the original process
1133 root.
1134
1135 strace doesn't store which process was the initial process. So more
1136 information needs to be kept so the graph can be reconstructed from the
1137 flat map.
1138 """
1139 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1140 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1141 assert isinstance(root, ApiBase.Context)
1142 self._root = weakref.ref(root)
1143 # The dict key is the function name of the pending call, like 'open'
1144 # or 'execve'.
1145 self._pending_calls = {}
1146 self._line_number = 0
1147 # Current directory when the process started.
1148 self.initial_cwd = self.RelativePath(self._root(), None)
1149 self.parentid = None
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001150 self._done = False
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001151
1152 def get_cwd(self):
1153 """Returns the best known value of cwd."""
1154 return self.cwd or self.initial_cwd
1155
1156 def render(self):
1157 """Returns the string value of the RelativePath() object.
1158
1159 Used by RelativePath. Returns the initial directory and not the
1160 current one since the current directory 'cwd' validity is time-limited.
1161
1162 The validity is only guaranteed once all the logs are processed.
1163 """
1164 return self.initial_cwd.render()
1165
1166 def on_line(self, line):
1167 self._line_number += 1
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001168 if self._done:
1169 raise TracingFailure(
1170 'Found a trace for a terminated process',
1171 None, None, None)
1172
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001173 if self.RE_SIGNAL.match(line):
1174 # Ignore signals.
1175 return
1176
1177 try:
1178 match = self.RE_KILLED.match(line)
1179 if match:
1180 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1181 self.handle_exit_group(match.group(1), None)
1182 return
1183
1184 match = self.RE_PROCESS_EXITED.match(line)
1185 if match:
1186 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1187 self.handle_exit_group(match.group(1), None)
1188 return
1189
1190 match = self.RE_UNFINISHED.match(line)
1191 if match:
1192 if match.group(1) in self._pending_calls:
1193 raise TracingFailure(
1194 'Found two unfinished calls for the same function',
1195 None, None, None,
1196 self._pending_calls)
1197 self._pending_calls[match.group(1)] = (
1198 match.group(1) + match.group(2))
1199 return
1200
maruel@chromium.org4e506502012-10-18 15:02:58 +00001201 match = (
1202 self.RE_UNFINISHED_EXIT.match(line) or
1203 self.RE_INTERRUPTED_HARD.match(line))
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001204 if match:
1205 # The process died. No other line can be processed afterward.
1206 self._done = True
1207 return
1208
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001209 match = self.RE_UNAVAILABLE.match(line)
1210 if match:
1211 # This usually means a process was killed and a pending call was
1212 # canceled.
1213 # TODO(maruel): Look up the last exit_group() trace just above and
1214 # make sure any self._pending_calls[anything] is properly flushed.
1215 return
1216
1217 match = self.RE_RESUMED.match(line)
1218 if match:
1219 if match.group(1) not in self._pending_calls:
1220 raise TracingFailure(
1221 'Found a resumed call that was not logged as unfinished',
1222 None, None, None,
1223 self._pending_calls)
1224 pending = self._pending_calls.pop(match.group(1))
1225 # Reconstruct the line.
1226 line = pending + match.group(2)
1227
1228 match = self.RE_HEADER.match(line)
1229 if not match:
1230 raise TracingFailure(
1231 'Found an invalid line: %s' % line,
1232 None, None, None)
1233 if match.group(1) == self.UNNAMED_FUNCTION:
1234 return
1235
1236 # It's a valid line, handle it.
1237 handler = getattr(self, 'handle_%s' % match.group(1), None)
1238 if not handler:
1239 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1240 return handler(match.group(2), match.group(3))
1241 except TracingFailure, e:
1242 # Hack in the values since the handler could be a static function.
1243 e.pid = self.pid
1244 e.line = line
1245 e.line_number = self._line_number
1246 # Re-raise the modified exception.
1247 raise
1248 except (KeyError, NotImplementedError, ValueError), e:
1249 raise TracingFailure(
1250 'Trace generated a %s exception: %s' % (
1251 e.__class__.__name__, str(e)),
1252 self.pid,
1253 self._line_number,
1254 line,
1255 e)
1256
1257 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1258 def handle_access(self, args, _result):
1259 self._handle_file(args[0], True)
1260
1261 @parse_args(r'^\"(.+?)\"$', True)
1262 def handle_chdir(self, args, _result):
1263 """Updates cwd."""
1264 self.cwd = self.RelativePath(self, args[0])
1265 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1266
1267 def handle_clone(self, _args, result):
1268 """Transfers cwd."""
1269 if result.startswith(('?', '-1')):
1270 # The call failed.
1271 return
1272 # Update the other process right away.
1273 childpid = int(result)
1274 child = self._root().get_or_set_proc(childpid)
1275 if child.parentid is not None or childpid in self.children:
1276 raise TracingFailure(
1277 'Found internal inconsitency in process lifetime detection '
1278 'during a clone() call',
1279 None, None, None)
1280
1281 # Copy the cwd object.
1282 child.initial_cwd = self.get_cwd()
1283 child.parentid = self.pid
1284 # It is necessary because the logs are processed out of order.
1285 self.children.append(child)
1286
1287 def handle_close(self, _args, _result):
1288 pass
1289
1290 def handle_chmod(self, _args, _result):
1291 pass
1292
1293 def handle_creat(self, _args, _result):
1294 # Ignore files created, since they didn't need to exist.
1295 pass
1296
1297 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1298 def handle_execve(self, args, _result):
1299 # Even if in practice execve() doesn't returns when it succeeds, strace
1300 # still prints '0' as the result.
1301 filepath = args[0]
1302 self._handle_file(filepath, False)
1303 self.executable = self.RelativePath(self.get_cwd(), filepath)
1304 self.command = strace_process_quoted_arguments(args[1])
1305
1306 def handle_exit_group(self, _args, _result):
1307 """Removes cwd."""
1308 self.cwd = None
1309
1310 def handle_fork(self, args, result):
1311 self._handle_unknown('fork', args, result)
1312
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001313 def handle_futex(self, _args, _result):
1314 pass
1315
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001316 def handle_getcwd(self, _args, _result):
1317 pass
1318
1319 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1320 def handle_link(self, args, _result):
1321 self._handle_file(args[0], False)
1322 self._handle_file(args[1], False)
1323
1324 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1325 def handle_lstat(self, args, _result):
1326 self._handle_file(args[0], True)
1327
1328 def handle_mkdir(self, _args, _result):
1329 pass
1330
1331 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1332 def handle_open(self, args, _result):
1333 if 'O_DIRECTORY' in args[1]:
1334 return
1335 self._handle_file(args[0], False)
1336
1337 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1338 def handle_openat(self, args, _result):
1339 if 'O_DIRECTORY' in args[2]:
1340 return
1341 if args[1] == 'AT_FDCWD':
1342 self._handle_file(args[1], False)
1343 else:
1344 # TODO(maruel): Implement relative open if necessary instead of the
1345 # AT_FDCWD flag, let's hope not since this means tracking all active
1346 # directory handles.
1347 raise Exception('Relative open via openat not implemented.')
1348
1349 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1350 def handle_readlink(self, args, _result):
1351 self._handle_file(args[0], False)
1352
1353 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1354 def handle_rename(self, args, _result):
1355 self._handle_file(args[0], False)
1356 self._handle_file(args[1], False)
1357
1358 def handle_rmdir(self, _args, _result):
1359 pass
1360
1361 def handle_setxattr(self, _args, _result):
1362 pass
1363
1364 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1365 def handle_stat(self, args, _result):
1366 self._handle_file(args[0], True)
1367
1368 def handle_symlink(self, _args, _result):
1369 pass
1370
1371 def handle_unlink(self, _args, _result):
1372 # In theory, the file had to be created anyway.
1373 pass
1374
1375 def handle_statfs(self, _args, _result):
1376 pass
1377
1378 def handle_vfork(self, args, result):
1379 self._handle_unknown('vfork', args, result)
1380
1381 @staticmethod
1382 def _handle_unknown(function, args, result):
1383 raise TracingFailure(
1384 'Unexpected/unimplemented trace %s(%s)= %s' %
1385 (function, args, result),
1386 None, None, None)
1387
1388 def _handle_file(self, filepath, touch_only):
1389 filepath = self.RelativePath(self.get_cwd(), filepath)
1390 #assert not touch_only, unicode(filepath)
1391 self.add_file(filepath, touch_only)
1392
1393 def __init__(self, blacklist, initial_cwd):
1394 super(Strace.Context, self).__init__(blacklist)
1395 self.initial_cwd = initial_cwd
1396
1397 def render(self):
1398 """Returns the string value of the initial cwd of the root process.
1399
1400 Used by RelativePath.
1401 """
1402 return self.initial_cwd
1403
1404 def on_line(self, pid, line):
1405 """Transfers control into the Process.on_line() function."""
1406 self.get_or_set_proc(pid).on_line(line.strip())
1407
1408 def to_results(self):
1409 """Finds back the root process and verify consistency."""
1410 # TODO(maruel): Absolutely unecessary, fix me.
1411 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1412 if len(root) != 1:
1413 raise TracingFailure(
1414 'Found internal inconsitency in process lifetime detection '
1415 'while finding the root process',
1416 None,
1417 None,
1418 None,
1419 sorted(p.pid for p in root))
1420 self.root_process = root[0]
1421 process = self.root_process.to_results_process()
1422 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1423 raise TracingFailure(
1424 'Found internal inconsitency in process lifetime detection '
1425 'while looking for len(tree) == len(list)',
1426 None,
1427 None,
1428 None,
1429 sorted(self._process_lookup),
1430 sorted(p.pid for p in process.all))
1431 return Results(process)
1432
1433 def get_or_set_proc(self, pid):
1434 """Returns the Context.Process instance for this pid or creates a new one.
1435 """
1436 if not pid or not isinstance(pid, int):
1437 raise TracingFailure(
1438 'Unpexpected value for pid: %r' % pid,
1439 pid,
1440 None,
1441 None,
1442 pid)
1443 if pid not in self._process_lookup:
1444 self._process_lookup[pid] = self.Process(self, pid)
1445 return self._process_lookup[pid]
1446
1447 @classmethod
1448 def traces(cls):
1449 """Returns the list of all handled traces to pass this as an argument to
1450 strace.
1451 """
1452 prefix = 'handle_'
1453 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1454
1455 class Tracer(ApiBase.Tracer):
1456 MAX_LEN = 256
1457
1458 def trace(self, cmd, cwd, tracename, output):
1459 """Runs strace on an executable."""
1460 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1461 assert os.path.isabs(cmd[0]), cmd[0]
1462 assert os.path.isabs(cwd), cwd
1463 assert os.path.normpath(cwd) == cwd, cwd
1464 with self._lock:
1465 if not self._initialized:
1466 raise TracingFailure(
1467 'Called Tracer.trace() on an unitialized object',
1468 None, None, None, tracename)
1469 assert tracename not in (i['trace'] for i in self._traces)
1470 stdout = stderr = None
1471 if output:
1472 stdout = subprocess.PIPE
1473 stderr = subprocess.STDOUT
1474 # Ensure all file related APIs are hooked.
1475 traces = ','.join(Strace.Context.traces() + ['file'])
1476 trace_cmd = [
1477 'strace',
1478 '-ff',
1479 '-s', '%d' % self.MAX_LEN,
1480 '-e', 'trace=%s' % traces,
1481 '-o', self._logname + '.' + tracename,
1482 ]
1483 child = subprocess.Popen(
1484 trace_cmd + cmd,
1485 cwd=cwd,
1486 stdin=subprocess.PIPE,
1487 stdout=stdout,
1488 stderr=stderr)
1489 out = child.communicate()[0]
1490 # TODO(maruel): Walk the logs and figure out the root process would
1491 # simplify parsing the logs a *lot*.
1492 with self._lock:
1493 assert tracename not in (i['trace'] for i in self._traces)
1494 self._traces.append(
1495 {
1496 'cmd': cmd,
1497 'cwd': cwd,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00001498 'output': out,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001499 # The pid of strace process, not very useful.
1500 'pid': child.pid,
1501 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001502 })
1503 return child.returncode, out
1504
1505 @staticmethod
1506 def clean_trace(logname):
1507 if os.path.isfile(logname):
1508 os.remove(logname)
1509 # Also delete any pid specific file from previous traces.
1510 for i in glob.iglob(logname + '.*'):
1511 if i.rsplit('.', 1)[1].isdigit():
1512 os.remove(i)
1513
1514 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001515 def parse_log(cls, logname, blacklist, trace_name):
1516 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001517 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001518 data = read_json(logname)
1519 out = []
1520 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001521 if trace_name and item['trace'] != trace_name:
1522 continue
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001523 result = {
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001524 'output': item['output'],
maruel@chromium.org964ddde2012-10-29 18:13:46 +00001525 'trace': item['trace'],
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001526 }
1527 try:
1528 context = cls.Context(blacklist, item['cwd'])
1529 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1530 pid = pidfile.rsplit('.', 1)[1]
1531 if pid.isdigit():
1532 pid = int(pid)
1533 # TODO(maruel): Load as utf-8
1534 for line in open(pidfile, 'rb'):
1535 context.on_line(pid, line)
1536 result['results'] = context.to_results()
maruel@chromium.org424f5692012-10-15 15:42:38 +00001537 except TracingFailure:
1538 result['exception'] = sys.exc_info()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001539 out.append(result)
1540 return out
1541
1542
1543class Dtrace(ApiBase):
1544 """Uses DTrace framework through dtrace. Requires root access.
1545
1546 Implies Mac OSX.
1547
1548 dtruss can't be used because it has compatibility issues with python.
1549
1550 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1551 get the absolute path of the 'cwd' dtrace variable from the probe.
1552
1553 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1554 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1555 manually.
1556
1557 errno is not printed in the log since this implementation currently only cares
1558 about files that were successfully opened.
1559 """
1560 class Context(ApiBase.Context):
1561 # Format: index pid function(args)
1562 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1563
1564 # Arguments parsing.
1565 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1566 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1567 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1568 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1569 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1570 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1571
1572 O_DIRECTORY = 0x100000
1573
1574 class Process(ApiBase.Context.Process):
1575 def __init__(self, *args):
1576 super(Dtrace.Context.Process, self).__init__(*args)
1577 self.cwd = self.initial_cwd
1578
1579 def __init__(self, blacklist, tracer_pid, initial_cwd):
1580 logging.info(
1581 '%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd))
1582 super(Dtrace.Context, self).__init__(blacklist)
1583 # Process ID of the temporary script created by create_thunk().
1584 self._tracer_pid = tracer_pid
1585 self._initial_cwd = initial_cwd
1586 self._line_number = 0
1587
1588 def on_line(self, line):
1589 self._line_number += 1
1590 match = self.RE_HEADER.match(line)
1591 if not match:
1592 raise TracingFailure(
1593 'Found malformed line: %s' % line,
1594 None,
1595 self._line_number,
1596 line)
1597 fn = getattr(
1598 self,
1599 'handle_%s' % match.group(2).replace('-', '_'),
1600 self._handle_ignored)
1601 # It is guaranteed to succeed because of the regexp. Or at least I thought
1602 # it would.
1603 pid = int(match.group(1))
1604 try:
1605 return fn(pid, match.group(3))
1606 except TracingFailure, e:
1607 # Hack in the values since the handler could be a static function.
1608 e.pid = pid
1609 e.line = line
1610 e.line_number = self._line_number
1611 # Re-raise the modified exception.
1612 raise
1613 except (KeyError, NotImplementedError, ValueError), e:
1614 raise TracingFailure(
1615 'Trace generated a %s exception: %s' % (
1616 e.__class__.__name__, str(e)),
1617 pid,
1618 self._line_number,
1619 line,
1620 e)
1621
1622 def to_results(self):
1623 process = self.root_process.to_results_process()
1624 # Internal concistency check.
1625 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1626 raise TracingFailure(
1627 'Found internal inconsitency in process lifetime detection '
1628 'while looking for len(tree) == len(list)',
1629 None,
1630 None,
1631 None,
1632 sorted(self._process_lookup),
1633 sorted(p.pid for p in process.all))
1634 return Results(process)
1635
1636 def handle_dtrace_BEGIN(self, _pid, args):
1637 if not self.RE_DTRACE_BEGIN.match(args):
1638 raise TracingFailure(
1639 'Found internal inconsitency in dtrace_BEGIN log line',
1640 None, None, None)
1641
1642 def handle_proc_start(self, pid, args):
1643 """Transfers cwd.
1644
1645 The dtrace script already takes care of only tracing the processes that
1646 are child of the traced processes so there is no need to verify the
1647 process hierarchy.
1648 """
1649 if pid in self._process_lookup:
1650 raise TracingFailure(
1651 'Found internal inconsitency in proc_start: %d started two times' %
1652 pid,
1653 None, None, None)
1654 match = self.RE_PROC_START.match(args)
1655 if not match:
1656 raise TracingFailure(
1657 'Failed to parse arguments: %s' % args,
1658 None, None, None)
1659 ppid = int(match.group(1))
1660 if ppid == self._tracer_pid and not self.root_process:
1661 proc = self.root_process = self.Process(
1662 self.blacklist, pid, self._initial_cwd)
1663 elif ppid in self._process_lookup:
1664 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1665 self._process_lookup[ppid].children.append(proc)
1666 else:
1667 # Another process tree, ignore.
1668 return
1669 self._process_lookup[pid] = proc
1670 logging.debug(
1671 'New child: %s -> %d cwd:%s' %
1672 (ppid, pid, unicode(proc.initial_cwd)))
1673
1674 def handle_proc_exit(self, pid, _args):
1675 """Removes cwd."""
1676 if pid in self._process_lookup:
1677 # self._tracer_pid is not traced itself and other traces run neither.
1678 self._process_lookup[pid].cwd = None
1679
1680 def handle_execve(self, pid, args):
1681 """Sets the process' executable.
1682
1683 TODO(maruel): Read command line arguments. See
1684 https://discussions.apple.com/thread/1980539 for an example.
1685 https://gist.github.com/1242279
1686
1687 Will have to put the answer at http://stackoverflow.com/questions/7556249.
1688 :)
1689 """
1690 if not pid in self._process_lookup:
1691 # Another process tree, ignore.
1692 return
1693 match = self.RE_EXECVE.match(args)
1694 if not match:
1695 raise TracingFailure(
1696 'Failed to parse arguments: %r' % args,
1697 None, None, None)
1698 proc = self._process_lookup[pid]
1699 proc.executable = match.group(1)
1700 proc.command = self.process_escaped_arguments(match.group(3))
1701 if int(match.group(2)) != len(proc.command):
1702 raise TracingFailure(
1703 'Failed to parse execve() arguments: %s' % args,
1704 None, None, None)
1705
1706 def handle_chdir(self, pid, args):
1707 """Updates cwd."""
1708 if pid not in self._process_lookup:
1709 # Another process tree, ignore.
1710 return
1711 cwd = self.RE_CHDIR.match(args).group(1)
1712 if not cwd.startswith('/'):
1713 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1714 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1715 else:
1716 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1717 cwd2 = cwd
1718 self._process_lookup[pid].cwd = cwd2
1719
1720 def handle_open_nocancel(self, pid, args):
1721 """Redirects to handle_open()."""
1722 return self.handle_open(pid, args)
1723
1724 def handle_open(self, pid, args):
1725 if pid not in self._process_lookup:
1726 # Another process tree, ignore.
1727 return
1728 match = self.RE_OPEN.match(args)
1729 if not match:
1730 raise TracingFailure(
1731 'Failed to parse arguments: %s' % args,
1732 None, None, None)
1733 flag = int(match.group(2), 16)
1734 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1735 # Ignore directories.
1736 return
1737 self._handle_file(pid, match.group(1))
1738
1739 def handle_rename(self, pid, args):
1740 if pid not in self._process_lookup:
1741 # Another process tree, ignore.
1742 return
1743 match = self.RE_RENAME.match(args)
1744 if not match:
1745 raise TracingFailure(
1746 'Failed to parse arguments: %s' % args,
1747 None, None, None)
1748 self._handle_file(pid, match.group(1))
1749 self._handle_file(pid, match.group(2))
1750
1751 def _handle_file(self, pid, filepath):
1752 if not filepath.startswith('/'):
1753 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1754 # We can get '..' in the path.
1755 filepath = os.path.normpath(filepath)
1756 # Sadly, still need to filter out directories here;
1757 # saw open_nocancel(".", 0, 0) = 0 lines.
1758 if os.path.isdir(filepath):
1759 return
1760 self._process_lookup[pid].add_file(filepath, False)
1761
1762 def handle_ftruncate(self, pid, args):
1763 """Just used as a signal to kill dtrace, ignoring."""
1764 pass
1765
1766 @staticmethod
1767 def _handle_ignored(pid, args):
1768 """Is called for all the event traces that are not handled."""
1769 raise NotImplementedError('Please implement me')
1770
1771 @staticmethod
1772 def process_escaped_arguments(text):
1773 """Extracts escaped arguments on a string and return the arguments as a
1774 list.
1775
1776 Implemented as an automaton.
1777
1778 Example:
1779 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
1780 function will return ['python2.7', '-c', 'print("hi")]
1781 """
1782 if not text.endswith('\\0'):
1783 raise ValueError('String is not null terminated: %r' % text, text)
1784 text = text[:-2]
1785
1786 def unescape(x):
1787 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
1788 out = []
1789 escaped = False
1790 for i in x:
1791 if i == '\\' and not escaped:
1792 escaped = True
1793 continue
1794 escaped = False
1795 out.append(i)
1796 return ''.join(out)
1797
1798 return [unescape(i) for i in text.split('\\001')]
1799
1800 class Tracer(ApiBase.Tracer):
1801 # pylint: disable=C0301
1802 #
1803 # To understand the following code, you'll want to take a look at:
1804 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1805 # https://wikis.oracle.com/display/DTrace/Variables
1806 # http://docs.oracle.com/cd/E19205-01/820-4221/
1807 #
1808 # 0. Dump all the valid probes into a text file. It is important, you
1809 # want to redirect into a file and you don't want to constantly 'sudo'.
1810 # $ sudo dtrace -l > probes.txt
1811 #
1812 # 1. Count the number of probes:
1813 # $ wc -l probes.txt
1814 # 81823 # On OSX 10.7, including 1 header line.
1815 #
1816 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1817 # likes and skipping the header with NR>1:
1818 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1819 # dtrace
1820 # fbt
1821 # io
1822 # ip
1823 # lockstat
1824 # mach_trap
1825 # proc
1826 # profile
1827 # sched
1828 # syscall
1829 # tcp
1830 # vminfo
1831 #
1832 # 3. List of valid probes:
1833 # $ grep syscall probes.txt | less
1834 # or use dtrace directly:
1835 # $ sudo dtrace -l -P syscall | less
1836 #
1837 # trackedpid is an associative array where its value can be 0, 1 or 2.
1838 # 0 is for untracked processes and is the default value for items not
1839 # in the associative array.
1840 # 1 is for tracked processes.
1841 # 2 is for the script created by create_thunk() only. It is not tracked
1842 # itself but all its decendants are.
1843 #
1844 # The script will kill itself only once waiting_to_die == 1 and
1845 # current_processes == 0, so that both getlogin() was called and that
1846 # all traced processes exited.
1847 #
1848 # TODO(maruel): Use cacheable predicates. See
1849 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1850 D_CODE = """
1851 dtrace:::BEGIN {
1852 waiting_to_die = 0;
1853 current_processes = 0;
1854 logindex = 0;
1855 printf("%d %d %s_%s(\\"%s\\")\\n",
1856 logindex, PID, probeprov, probename, SCRIPT);
1857 logindex++;
1858 }
1859
1860 proc:::start /trackedpid[ppid]/ {
1861 trackedpid[pid] = 1;
1862 current_processes += 1;
1863 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1864 logindex, pid, probeprov, probename,
1865 ppid,
1866 execname,
1867 current_processes);
1868 logindex++;
1869 }
1870 /* Should use SCRIPT but there is no access to this variable at that
1871 * point. */
1872 proc:::start /ppid == PID && execname == "Python"/ {
1873 trackedpid[pid] = 2;
1874 current_processes += 1;
1875 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1876 logindex, pid, probeprov, probename,
1877 ppid,
1878 execname,
1879 current_processes);
1880 logindex++;
1881 }
1882 proc:::exit /trackedpid[pid] &&
1883 current_processes == 1 &&
1884 waiting_to_die == 1/ {
1885 trackedpid[pid] = 0;
1886 current_processes -= 1;
1887 printf("%d %d %s_%s(%d)\\n",
1888 logindex, pid, probeprov, probename,
1889 current_processes);
1890 logindex++;
1891 exit(0);
1892 }
1893 proc:::exit /trackedpid[pid]/ {
1894 trackedpid[pid] = 0;
1895 current_processes -= 1;
1896 printf("%d %d %s_%s(%d)\\n",
1897 logindex, pid, probeprov, probename,
1898 current_processes);
1899 logindex++;
1900 }
1901
1902 /* Use an arcane function to detect when we need to die */
1903 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
1904 waiting_to_die = 1;
1905 printf("%d %d %s()\\n", logindex, pid, probefunc);
1906 logindex++;
1907 }
1908 syscall::ftruncate:entry /
1909 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
1910 exit(0);
1911 }
1912
1913 syscall::open*:entry /trackedpid[pid] == 1/ {
1914 self->open_arg0 = arg0;
1915 self->open_arg1 = arg1;
1916 self->open_arg2 = arg2;
1917 }
1918 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
1919 this->open_arg0 = copyinstr(self->open_arg0);
1920 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
1921 logindex, pid, probefunc,
1922 this->open_arg0,
1923 self->open_arg1,
1924 self->open_arg2);
1925 logindex++;
1926 this->open_arg0 = 0;
1927 }
1928 syscall::open*:return /trackedpid[pid] == 1/ {
1929 self->open_arg0 = 0;
1930 self->open_arg1 = 0;
1931 self->open_arg2 = 0;
1932 }
1933
1934 syscall::rename:entry /trackedpid[pid] == 1/ {
1935 self->rename_arg0 = arg0;
1936 self->rename_arg1 = arg1;
1937 }
1938 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
1939 this->rename_arg0 = copyinstr(self->rename_arg0);
1940 this->rename_arg1 = copyinstr(self->rename_arg1);
1941 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
1942 logindex, pid, probefunc,
1943 this->rename_arg0,
1944 this->rename_arg1);
1945 logindex++;
1946 this->rename_arg0 = 0;
1947 this->rename_arg1 = 0;
1948 }
1949 syscall::rename:return /trackedpid[pid] == 1/ {
1950 self->rename_arg0 = 0;
1951 self->rename_arg1 = 0;
1952 }
1953
1954 /* Track chdir, it's painful because it is only receiving relative path.
1955 */
1956 syscall::chdir:entry /trackedpid[pid] == 1/ {
1957 self->chdir_arg0 = arg0;
1958 }
1959 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
1960 this->chdir_arg0 = copyinstr(self->chdir_arg0);
1961 printf("%d %d %s(\\"%s\\")\\n",
1962 logindex, pid, probefunc,
1963 this->chdir_arg0);
1964 logindex++;
1965 this->chdir_arg0 = 0;
1966 }
1967 syscall::chdir:return /trackedpid[pid] == 1/ {
1968 self->chdir_arg0 = 0;
1969 }
1970 """
1971
1972 # execve-specific code, tends to throw a lot of exceptions.
1973 D_CODE_EXECVE = """
1974 /* Finally what we care about! */
1975 syscall::exec*:entry /trackedpid[pid]/ {
1976 self->exec_arg0 = copyinstr(arg0);
1977 /* Incrementally probe for a NULL in the argv parameter of execve() to
1978 * figure out argc. */
1979 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
1980 * found. */
1981 self->exec_argc = 0;
1982 /* Probe for argc==1 */
1983 this->exec_argv = (user_addr_t*)copyin(
1984 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1985 self->exec_argc = this->exec_argv[self->exec_argc] ?
1986 (self->exec_argc + 1) : self->exec_argc;
1987
1988 /* Probe for argc==2 */
1989 this->exec_argv = (user_addr_t*)copyin(
1990 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1991 self->exec_argc = this->exec_argv[self->exec_argc] ?
1992 (self->exec_argc + 1) : self->exec_argc;
1993
1994 /* Probe for argc==3 */
1995 this->exec_argv = (user_addr_t*)copyin(
1996 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1997 self->exec_argc = this->exec_argv[self->exec_argc] ?
1998 (self->exec_argc + 1) : self->exec_argc;
1999
2000 /* Probe for argc==4 */
2001 this->exec_argv = (user_addr_t*)copyin(
2002 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2003 self->exec_argc = this->exec_argv[self->exec_argc] ?
2004 (self->exec_argc + 1) : self->exec_argc;
2005
2006 /* Copy the inputs strings since there is no guarantee they'll be
2007 * present after the call completed. */
2008 self->exec_argv0 = (self->exec_argc > 0) ?
2009 copyinstr(this->exec_argv[0]) : "";
2010 self->exec_argv1 = (self->exec_argc > 1) ?
2011 copyinstr(this->exec_argv[1]) : "";
2012 self->exec_argv2 = (self->exec_argc > 2) ?
2013 copyinstr(this->exec_argv[2]) : "";
2014 self->exec_argv3 = (self->exec_argc > 3) ?
2015 copyinstr(this->exec_argv[3]) : "";
2016 this->exec_argv = 0;
2017 }
2018 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2019 /* We need to join strings here, as using multiple printf() would
2020 * cause tearing when multiple threads/processes are traced.
2021 * Since it is impossible to escape a string and join it to another one,
2022 * like sprintf("%s%S", previous, more), use hackery.
2023 * Each of the elements are split with a \\1. \\0 cannot be used because
2024 * it is simply ignored. This will conflict with any program putting a
2025 * \\1 in their execve() string but this should be "rare enough" */
2026 this->args = "";
2027 /* Process exec_argv[0] */
2028 this->args = strjoin(
2029 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2030
2031 /* Process exec_argv[1] */
2032 this->args = strjoin(
2033 this->args, (self->exec_argc > 1) ? "\\1" : "");
2034 this->args = strjoin(
2035 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2036
2037 /* Process exec_argv[2] */
2038 this->args = strjoin(
2039 this->args, (self->exec_argc > 2) ? "\\1" : "");
2040 this->args = strjoin(
2041 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2042
2043 /* Process exec_argv[3] */
2044 this->args = strjoin(
2045 this->args, (self->exec_argc > 3) ? "\\1" : "");
2046 this->args = strjoin(
2047 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2048
2049 /* Prints self->exec_argc to permits verifying the internal
2050 * consistency since this code is quite fishy. */
2051 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2052 logindex, pid, probefunc,
2053 self->exec_arg0,
2054 self->exec_argc,
2055 this->args);
2056 logindex++;
2057 this->args = 0;
2058 }
2059 syscall::exec*:return /trackedpid[pid]/ {
2060 self->exec_arg0 = 0;
2061 self->exec_argc = 0;
2062 self->exec_argv0 = 0;
2063 self->exec_argv1 = 0;
2064 self->exec_argv2 = 0;
2065 self->exec_argv3 = 0;
2066 }
2067 """
2068
2069 # Code currently not used.
2070 D_EXTRANEOUS = """
2071 /* This is a good learning experience, since it traces a lot of things
2072 * related to the process and child processes.
2073 * Warning: it generates a gigantic log. For example, tracing
2074 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2075 * several minutes to execute.
2076 */
2077 /*
2078 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2079 printf("%d %d %s_%s() = %d\\n",
2080 logindex, pid, probeprov, probefunc, errno);
2081 logindex++;
2082 }
2083 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2084 printf("%d %d %s_%s() = %d\\n",
2085 logindex, pid, probeprov, probefunc, errno);
2086 logindex++;
2087 }
2088 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2089 printf("%d %d %s_%s() = %d\\n",
2090 logindex, pid, probeprov, probefunc, errno);
2091 logindex++;
2092 }
2093 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2094 printf("%d %d %s_%s() = %d\\n",
2095 logindex, pid, probeprov, probefunc, errno);
2096 logindex++;
2097 }
2098 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2099 printf("%d %d %s_%s() = %d\\n",
2100 logindex, pid, probeprov, probefunc, errno);
2101 logindex++;
2102 }
2103 */
2104 /* TODO(maruel): *stat* functions and friends
2105 syscall::access:return,
2106 syscall::chdir:return,
2107 syscall::chflags:return,
2108 syscall::chown:return,
2109 syscall::chroot:return,
2110 syscall::getattrlist:return,
2111 syscall::getxattr:return,
2112 syscall::lchown:return,
2113 syscall::lstat64:return,
2114 syscall::lstat:return,
2115 syscall::mkdir:return,
2116 syscall::pathconf:return,
2117 syscall::readlink:return,
2118 syscall::removexattr:return,
2119 syscall::setxattr:return,
2120 syscall::stat64:return,
2121 syscall::stat:return,
2122 syscall::truncate:return,
2123 syscall::unlink:return,
2124 syscall::utimes:return,
2125 */
2126 """
2127
2128 def __init__(self, logname):
2129 """Starts the log collection with dtrace.
2130
2131 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2132 this needs to wait for dtrace to be "warmed up".
2133 """
2134 super(Dtrace.Tracer, self).__init__(logname)
2135 self._script = create_thunk()
2136 # This unique dummy temp file is used to signal the dtrace script that it
2137 # should stop as soon as all the child processes are done. A bit hackish
2138 # but works fine enough.
2139 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2140 prefix='trace_signal_file')
2141
2142 # Note: do not use the -p flag. It's useless if the initial process quits
2143 # too fast, resulting in missing traces from the grand-children. The D
2144 # code manages the dtrace lifetime itself.
2145 trace_cmd = [
2146 'sudo',
2147 'dtrace',
2148 # Use a larger buffer if getting 'out of scratch space' errors.
2149 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2150 '-b', '10m',
2151 '-x', 'dynvarsize=10m',
2152 #'-x', 'dtrace_global_maxsize=1m',
2153 '-x', 'evaltime=exec',
2154 '-o', '/dev/stderr',
2155 '-q',
2156 '-n', self._get_dtrace_code(),
2157 ]
2158 with open(self._logname + '.log', 'wb') as logfile:
2159 self._dtrace = subprocess.Popen(
2160 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2161 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2162
2163 # Reads until one line is printed, which signifies dtrace is up and ready.
2164 with open(self._logname + '.log', 'rb') as logfile:
2165 while 'dtrace_BEGIN' not in logfile.readline():
2166 if self._dtrace.poll() is not None:
2167 # Do a busy wait. :/
2168 break
2169 logging.debug('dtrace started')
2170
2171 def _get_dtrace_code(self):
2172 """Setups the D code to implement child process tracking.
2173
2174 Injects the cookie in the script so it knows when to stop.
2175
2176 The script will detect any instance of the script created with
2177 create_thunk() and will start tracing it.
2178 """
2179 return (
2180 'inline int PID = %d;\n'
2181 'inline string SCRIPT = "%s";\n'
2182 'inline int FILE_ID = %d;\n'
2183 '\n'
2184 '%s') % (
2185 os.getpid(),
2186 self._script,
2187 self._dummy_file_id,
2188 self.D_CODE) + self.D_CODE_EXECVE
2189
2190 def trace(self, cmd, cwd, tracename, output):
2191 """Runs dtrace on an executable.
2192
2193 This dtruss is broken when it starts the process itself or when tracing
2194 child processes, this code starts a wrapper process
2195 generated with create_thunk() which starts the executable to trace.
2196 """
2197 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2198 assert os.path.isabs(cmd[0]), cmd[0]
2199 assert os.path.isabs(cwd), cwd
2200 assert os.path.normpath(cwd) == cwd, cwd
2201 with self._lock:
2202 if not self._initialized:
2203 raise TracingFailure(
2204 'Called Tracer.trace() on an unitialized object',
2205 None, None, None, tracename)
2206 assert tracename not in (i['trace'] for i in self._traces)
2207
2208 # Starts the script wrapper to start the child process. This signals the
2209 # dtrace script that this process is to be traced.
2210 stdout = stderr = None
2211 if output:
2212 stdout = subprocess.PIPE
2213 stderr = subprocess.STDOUT
2214 child_cmd = [
2215 sys.executable,
2216 self._script,
2217 tracename,
2218 ]
2219 # Call a dummy function so that dtrace knows I'm about to launch a process
2220 # that needs to be traced.
2221 # Yummy.
2222 child = subprocess.Popen(
2223 child_cmd + fix_python_path(cmd),
2224 stdin=subprocess.PIPE,
2225 stdout=stdout,
2226 stderr=stderr,
2227 cwd=cwd)
2228 logging.debug('Started child pid: %d' % child.pid)
2229
2230 out = child.communicate()[0]
2231 # This doesn't mean tracing is done, one of the grand-child process may
2232 # still be alive. It will be tracked with the dtrace script.
2233
2234 with self._lock:
2235 assert tracename not in (i['trace'] for i in self._traces)
2236 self._traces.append(
2237 {
2238 'cmd': cmd,
2239 'cwd': cwd,
2240 # The pid of strace process, not very useful.
2241 'pid': child.pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002242 'output': out,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002243 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002244 })
2245 return child.returncode, out
2246
2247 def close(self, timeout=None):
2248 """Terminates dtrace."""
2249 logging.debug('close(%s)' % timeout)
2250 try:
2251 try:
2252 super(Dtrace.Tracer, self).close(timeout)
2253 # Signal dtrace that it should stop now.
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00002254 # ftruncate doesn't exist on Windows.
2255 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002256 if timeout:
2257 start = time.time()
2258 # Use polling. :/
2259 while (self._dtrace.poll() is None and
2260 (time.time() - start) < timeout):
2261 time.sleep(0.1)
2262 self._dtrace.kill()
2263 self._dtrace.wait()
2264 finally:
2265 # Make sure to kill it in any case.
2266 if self._dtrace.poll() is None:
2267 try:
2268 self._dtrace.kill()
2269 self._dtrace.wait()
2270 except OSError:
2271 pass
2272
2273 if self._dtrace.returncode != 0:
2274 # Warn about any dtrace failure but basically ignore it.
2275 print 'dtrace failure: %s' % self._dtrace.returncode
2276 finally:
2277 os.close(self._dummy_file_id)
2278 os.remove(self._dummy_file_name)
2279 os.remove(self._script)
2280
2281 def post_process_log(self):
2282 """Sorts the log back in order when each call occured.
2283
2284 dtrace doesn't save the buffer in strict order since it keeps one buffer
2285 per CPU.
2286 """
2287 super(Dtrace.Tracer, self).post_process_log()
2288 logname = self._logname + '.log'
2289 with open(logname, 'rb') as logfile:
2290 lines = [l for l in logfile if l.strip()]
2291 errors = [l for l in lines if l.startswith('dtrace:')]
2292 if errors:
2293 raise TracingFailure(
2294 'Found errors in the trace: %s' % '\n'.join(errors),
2295 None, None, None, logname)
2296 try:
2297 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2298 except ValueError:
2299 raise TracingFailure(
2300 'Found errors in the trace: %s' % '\n'.join(
2301 l for l in lines if l.split(' ', 1)[0].isdigit()),
2302 None, None, None, logname)
2303 with open(logname, 'wb') as logfile:
2304 logfile.write(''.join(lines))
2305
2306 @staticmethod
2307 def clean_trace(logname):
2308 for ext in ('', '.log'):
2309 if os.path.isfile(logname + ext):
2310 os.remove(logname + ext)
2311
2312 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002313 def parse_log(cls, logname, blacklist, trace_name):
2314 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00002315 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002316
2317 def blacklist_more(filepath):
2318 # All the HFS metadata is in the form /.vol/...
2319 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2320
2321 data = read_json(logname)
2322 out = []
2323 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002324 if trace_name and item['trace'] != trace_name:
2325 continue
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002326 result = {
2327 'output': item['output'],
2328 'trace': item['trace'],
2329 }
2330 try:
2331 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
2332 for line in open(logname + '.log', 'rb'):
2333 context.on_line(line)
2334 result['results'] = context.to_results()
2335 except TracingFailure:
2336 result['exception'] = sys.exc_info()
2337 out.append(result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002338 return out
2339
2340
2341class LogmanTrace(ApiBase):
2342 """Uses the native Windows ETW based tracing functionality to trace a child
2343 process.
2344
2345 Caveat: this implementations doesn't track cwd or initial_cwd. It is because
2346 the Windows Kernel doesn't have a concept of 'current working directory' at
2347 all. A Win32 process has a map of current directories, one per drive letter
2348 and it is managed by the user mode kernel32.dll. In kernel, a file is always
2349 opened relative to another file_object or as an absolute path. All the current
2350 working directory logic is done in user mode.
2351 """
2352 class Context(ApiBase.Context):
2353 """Processes a ETW log line and keeps the list of existent and non
2354 existent files accessed.
2355
2356 Ignores directories.
2357 """
2358 # These indexes are for the stripped version in json.
2359 EVENT_NAME = 0
2360 TYPE = 1
2361 PID = 2
2362 TID = 3
2363 PROCESSOR_ID = 4
2364 TIMESTAMP = 5
2365 USER_DATA = 6
2366
2367 class Process(ApiBase.Context.Process):
2368 def __init__(self, *args):
2369 super(LogmanTrace.Context.Process, self).__init__(*args)
2370 # Handle file objects that succeeded.
2371 self.file_objects = {}
2372
2373 def __init__(self, blacklist, tracer_pid):
2374 logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid))
2375 super(LogmanTrace.Context, self).__init__(blacklist)
2376 self._drive_map = DosDriveMap()
2377 # Threads mapping to the corresponding process id.
2378 self._threads_active = {}
2379 # Process ID of the tracer, e.g. tracer_inputs.py
2380 self._tracer_pid = tracer_pid
2381 self._line_number = 0
2382
2383 def on_line(self, line):
2384 """Processes a json Event line."""
2385 self._line_number += 1
2386 try:
2387 # By Opcode
2388 handler = getattr(
2389 self,
2390 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2391 None)
2392 if not handler:
2393 raise TracingFailure(
2394 'Unexpected event %s_%s' % (
2395 line[self.EVENT_NAME], line[self.TYPE]),
2396 None, None, None)
2397 handler(line)
2398 except TracingFailure, e:
2399 # Hack in the values since the handler could be a static function.
2400 e.pid = line[self.PID]
2401 e.line = line
2402 e.line_number = self._line_number
2403 # Re-raise the modified exception.
2404 raise
2405 except (KeyError, NotImplementedError, ValueError), e:
2406 raise TracingFailure(
2407 'Trace generated a %s exception: %s' % (
2408 e.__class__.__name__, str(e)),
2409 line[self.PID],
2410 self._line_number,
2411 line,
2412 e)
2413
2414 def to_results(self):
2415 if not self.root_process:
2416 raise TracingFailure(
2417 'Failed to detect the initial process',
2418 None, None, None)
2419 process = self.root_process.to_results_process()
2420 return Results(process)
2421
2422 def _thread_to_process(self, tid):
2423 """Finds the process from the thread id."""
2424 tid = int(tid, 16)
2425 pid = self._threads_active.get(tid)
2426 if not pid or not self._process_lookup.get(pid):
2427 return
2428 return self._process_lookup[pid]
2429
2430 @classmethod
2431 def handle_EventTrace_Header(cls, line):
2432 """Verifies no event was dropped, e.g. no buffer overrun occured."""
2433 BUFFER_SIZE = cls.USER_DATA
2434 #VERSION = cls.USER_DATA + 1
2435 #PROVIDER_VERSION = cls.USER_DATA + 2
2436 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2437 #END_TIME = cls.USER_DATA + 4
2438 #TIMER_RESOLUTION = cls.USER_DATA + 5
2439 #MAX_FILE_SIZE = cls.USER_DATA + 6
2440 #LOG_FILE_MODE = cls.USER_DATA + 7
2441 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2442 #START_BUFFERS = cls.USER_DATA + 9
2443 #POINTER_SIZE = cls.USER_DATA + 10
2444 EVENTS_LOST = cls.USER_DATA + 11
2445 #CPU_SPEED = cls.USER_DATA + 12
2446 #LOGGER_NAME = cls.USER_DATA + 13
2447 #LOG_FILE_NAME = cls.USER_DATA + 14
2448 #BOOT_TIME = cls.USER_DATA + 15
2449 #PERF_FREQ = cls.USER_DATA + 16
2450 #START_TIME = cls.USER_DATA + 17
2451 #RESERVED_FLAGS = cls.USER_DATA + 18
2452 #BUFFERS_LOST = cls.USER_DATA + 19
2453 #SESSION_NAME_STRING = cls.USER_DATA + 20
2454 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2455 if line[EVENTS_LOST] != '0':
2456 raise TracingFailure(
2457 ( '%s events were lost during trace, please increase the buffer '
2458 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2459 None, None, None)
2460
2461 def handle_FileIo_Cleanup(self, line):
2462 """General wisdom: if a file is closed, it's because it was opened.
2463
2464 Note that FileIo_Close is not used since if a file was opened properly but
2465 not closed before the process exits, only Cleanup will be logged.
2466 """
2467 #IRP = self.USER_DATA
2468 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2469 FILE_OBJECT = self.USER_DATA + 2
2470 #FILE_KEY = self.USER_DATA + 3
2471 proc = self._thread_to_process(line[TTID])
2472 if not proc:
2473 # Not a process we care about.
2474 return
2475 file_object = line[FILE_OBJECT]
2476 if file_object in proc.file_objects:
2477 proc.add_file(proc.file_objects.pop(file_object), False)
2478
2479 def handle_FileIo_Create(self, line):
2480 """Handles a file open.
2481
2482 All FileIo events are described at
2483 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2484 for some value of 'description'.
2485
2486 " (..) process and thread id values of the IO events (..) are not valid "
2487 http://msdn.microsoft.com/magazine/ee358703.aspx
2488
2489 The FileIo.Create event doesn't return if the CreateFile() call
2490 succeeded, so keep track of the file_object and check that it is
2491 eventually closed with FileIo_Cleanup.
2492 """
2493 #IRP = self.USER_DATA
2494 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2495 FILE_OBJECT = self.USER_DATA + 2
2496 #CREATE_OPTIONS = self.USER_DATA + 3
2497 #FILE_ATTRIBUTES = self.USER_DATA + 4
2498 #self.USER_DATA + SHARE_ACCESS = 5
2499 OPEN_PATH = self.USER_DATA + 6
2500
2501 proc = self._thread_to_process(line[TTID])
2502 if not proc:
2503 # Not a process we care about.
2504 return
2505
2506 match = re.match(r'^\"(.+)\"$', line[OPEN_PATH])
2507 raw_path = match.group(1)
2508 # Ignore directories and bare drive right away.
2509 if raw_path.endswith(os.path.sep):
2510 return
2511 filepath = self._drive_map.to_win32(raw_path)
2512 # Ignore bare drive right away. Some may still fall through with format
2513 # like '\\?\X:'
2514 if len(filepath) == 2:
2515 return
2516 file_object = line[FILE_OBJECT]
2517 if os.path.isdir(filepath):
2518 # There is no O_DIRECTORY equivalent on Windows. The closed is
2519 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2520 # simply discard directories are they are found.
2521 return
2522 # Override any stale file object
2523 proc.file_objects[file_object] = filepath
2524
2525 def handle_FileIo_Rename(self, line):
2526 # TODO(maruel): Handle?
2527 pass
2528
2529 def handle_Process_End(self, line):
2530 pid = line[self.PID]
2531 if self._process_lookup.get(pid):
2532 logging.info('Terminated: %d' % pid)
2533 self._process_lookup[pid] = None
2534 else:
2535 logging.debug('Terminated: %d' % pid)
2536
2537 def handle_Process_Start(self, line):
2538 """Handles a new child process started by PID."""
2539 #UNIQUE_PROCESS_KEY = self.USER_DATA
2540 PROCESS_ID = self.USER_DATA + 1
2541 #PARENT_PID = self.USER_DATA + 2
2542 #SESSION_ID = self.USER_DATA + 3
2543 #EXIT_STATUS = self.USER_DATA + 4
2544 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2545 #USER_SID = self.USER_DATA + 6
2546 IMAGE_FILE_NAME = self.USER_DATA + 7
2547 COMMAND_LINE = self.USER_DATA + 8
2548
2549 ppid = line[self.PID]
2550 pid = int(line[PROCESS_ID], 16)
2551 logging.debug(
2552 'New process %d->%d (%s) %s' %
2553 (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE]))
2554
2555 if ppid == self._tracer_pid:
2556 # Need to ignore processes we don't know about because the log is
2557 # system-wide. self._tracer_pid shall start only one process.
2558 if self.root_process:
2559 raise TracingFailure(
2560 ( 'Parent process is _tracer_pid(%d) but root_process(%d) is '
2561 'already set') % (self._tracer_pid, self.root_process.pid),
2562 None, None, None)
2563 proc = self.Process(self.blacklist, pid, None)
2564 self.root_process = proc
2565 ppid = None
2566 elif self._process_lookup.get(ppid):
2567 proc = self.Process(self.blacklist, pid, None)
2568 self._process_lookup[ppid].children.append(proc)
2569 else:
2570 # Ignore
2571 return
2572 self._process_lookup[pid] = proc
2573
2574 if (not line[IMAGE_FILE_NAME].startswith('"') or
2575 not line[IMAGE_FILE_NAME].endswith('"')):
2576 raise TracingFailure(
2577 'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME],
2578 None, None, None)
2579
2580 # TODO(maruel): Process escapes.
2581 if (not line[COMMAND_LINE].startswith('"') or
2582 not line[COMMAND_LINE].endswith('"')):
2583 raise TracingFailure(
2584 'Command line is not properly quoted: %s' % line[COMMAND_LINE],
2585 None, None, None)
2586 proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1])
2587 proc.executable = line[IMAGE_FILE_NAME][1:-1]
2588 # proc.command[0] may be the absolute path of 'executable' but it may be
2589 # anything else too. If it happens that command[0] ends with executable,
2590 # use it, otherwise defaults to the base name.
2591 cmd0 = proc.command[0].lower()
2592 if not cmd0.endswith('.exe'):
2593 # TODO(maruel): That's not strictly true either.
2594 cmd0 += '.exe'
2595 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2596 # Fix the path.
2597 cmd0 = cmd0.replace('/', os.path.sep)
2598 cmd0 = os.path.normpath(cmd0)
2599 proc.executable = get_native_path_case(cmd0)
2600 logging.info(
2601 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
2602
2603 def handle_Thread_End(self, line):
2604 """Has the same parameters as Thread_Start."""
2605 tid = int(line[self.TID], 16)
2606 self._threads_active.pop(tid, None)
2607
2608 def handle_Thread_Start(self, line):
2609 """Handles a new thread created.
2610
2611 Do not use self.PID here since a process' initial thread is created by
2612 the parent process.
2613 """
2614 PROCESS_ID = self.USER_DATA
2615 TTHREAD_ID = self.USER_DATA + 1
2616 #STACK_BASE = self.USER_DATA + 2
2617 #STACK_LIMIT = self.USER_DATA + 3
2618 #USER_STACK_BASE = self.USER_DATA + 4
2619 #USER_STACK_LIMIT = self.USER_DATA + 5
2620 #AFFINITY = self.USER_DATA + 6
2621 #WIN32_START_ADDR = self.USER_DATA + 7
2622 #TEB_BASE = self.USER_DATA + 8
2623 #SUB_PROCESS_TAG = self.USER_DATA + 9
2624 #BASE_PRIORITY = self.USER_DATA + 10
2625 #PAGE_PRIORITY = self.USER_DATA + 11
2626 #IO_PRIORITY = self.USER_DATA + 12
2627 #THREAD_FLAGS = self.USER_DATA + 13
2628 # Do not use self.PID here since a process' initial thread is created by
2629 # the parent process.
2630 pid = int(line[PROCESS_ID], 16)
2631 tid = int(line[TTHREAD_ID], 16)
2632 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2633 self._threads_active[tid] = pid
2634
2635 @classmethod
2636 def supported_events(cls):
2637 """Returns all the procesed events."""
2638 out = []
2639 for member in dir(cls):
2640 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2641 if match:
2642 out.append(match.groups())
2643 return out
2644
2645 class Tracer(ApiBase.Tracer):
2646 # The basic headers.
2647 EXPECTED_HEADER = [
2648 u'Event Name',
2649 u'Type',
2650 u'Event ID',
2651 u'Version',
2652 u'Channel',
2653 u'Level', # 5
2654 u'Opcode',
2655 u'Task',
2656 u'Keyword',
2657 u'PID',
2658 u'TID', # 10
2659 u'Processor Number',
2660 u'Instance ID',
2661 u'Parent Instance ID',
2662 u'Activity ID',
2663 u'Related Activity ID', # 15
2664 u'Clock-Time',
2665 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2666 u'User(ms)', # pretty much useless.
2667 u'User Data', # Extra arguments that are event-specific.
2668 ]
2669 # Only the useful headers common to all entries are listed there. Any column
2670 # at 19 or higher is dependent on the specific event.
2671 EVENT_NAME = 0
2672 TYPE = 1
2673 PID = 9
2674 TID = 10
2675 PROCESSOR_ID = 11
2676 TIMESTAMP = 16
2677 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2678 USER_DATA = 19
2679
2680 def __init__(self, logname):
2681 """Starts the log collection.
2682
2683 Requires administrative access. logman.exe is synchronous so no need for a
2684 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
2685 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
2686
2687 One can get the list of potentially interesting providers with:
2688 "logman query providers | findstr /i file"
2689 """
2690 super(LogmanTrace.Tracer, self).__init__(logname)
2691 self._script = create_thunk()
2692 cmd_start = [
2693 'logman.exe',
2694 'start',
2695 'NT Kernel Logger',
2696 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2697 # splitio,fileiocompletion,syscall,file,cswitch,img
2698 '(process,fileio,thread)',
2699 '-o', self._logname + '.etl',
2700 '-ets', # Send directly to kernel
2701 # Values extracted out of thin air.
2702 # Event Trace Session buffer size in kb.
2703 '-bs', '10240',
2704 # Number of Event Trace Session buffers.
2705 '-nb', '16', '256',
2706 ]
2707 logging.debug('Running: %s' % cmd_start)
2708 try:
2709 subprocess.check_call(
2710 cmd_start,
2711 stdin=subprocess.PIPE,
2712 stdout=subprocess.PIPE,
2713 stderr=subprocess.STDOUT)
2714 except subprocess.CalledProcessError, e:
2715 if e.returncode == -2147024891:
2716 print >> sys.stderr, 'Please restart with an elevated admin prompt'
2717 elif e.returncode == -2144337737:
2718 print >> sys.stderr, (
2719 'A kernel trace was already running, stop it and try again')
2720 raise
2721
2722 def trace(self, cmd, cwd, tracename, output):
2723 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2724 assert os.path.isabs(cmd[0]), cmd[0]
2725 assert os.path.isabs(cwd), cwd
2726 assert os.path.normpath(cwd) == cwd, cwd
2727 with self._lock:
2728 if not self._initialized:
2729 raise TracingFailure(
2730 'Called Tracer.trace() on an unitialized object',
2731 None, None, None, tracename)
2732 assert tracename not in (i['trace'] for i in self._traces)
2733
2734 # Use "logman -?" for help.
2735
2736 stdout = stderr = None
2737 if output:
2738 stdout = subprocess.PIPE
2739 stderr = subprocess.STDOUT
2740
2741 # Run the child process.
2742 logging.debug('Running: %s' % cmd)
2743 # Use the temporary script generated with create_thunk() so we have a
2744 # clear pid owner. Since trace_inputs.py can be used as a library and
2745 # could trace multiple processes simultaneously, it makes it more complex
2746 # if the executable to be traced is executed directly here. It also solves
2747 # issues related to logman.exe that needs to be executed to control the
2748 # kernel trace.
2749 child_cmd = [
2750 sys.executable,
2751 self._script,
2752 tracename,
2753 ]
2754 child = subprocess.Popen(
2755 child_cmd + fix_python_path(cmd),
2756 cwd=cwd,
2757 stdin=subprocess.PIPE,
2758 stdout=stdout,
2759 stderr=stderr)
2760 logging.debug('Started child pid: %d' % child.pid)
2761 out = child.communicate()[0]
2762 # This doesn't mean all the grand-children are done. Sadly, we don't have
2763 # a good way to determine that.
2764
2765 with self._lock:
2766 assert tracename not in (i['trace'] for i in self._traces)
2767 self._traces.append({
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00002768 'cmd': cmd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002769 'cwd': cwd,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002770 'output': out,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002771 'pid': child.pid,
2772 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002773 })
2774
2775 return child.returncode, out
2776
2777 def close(self, _timeout=None):
2778 """Stops the kernel log collection and converts the traces to text
2779 representation.
2780 """
2781 with self._lock:
2782 if not self._initialized:
2783 raise TracingFailure(
2784 'Called Tracer.close() on an unitialized object',
2785 None, None, None)
2786 os.remove(self._script)
2787 # Save metadata, add 'format' key..
2788 data = {
2789 'format': 'csv',
2790 'traces': self._traces,
2791 }
2792 write_json(self._logname, data, False)
2793
2794 cmd_stop = [
2795 'logman.exe',
2796 'stop',
2797 'NT Kernel Logger',
2798 '-ets', # Sends the command directly to the kernel.
2799 ]
2800 logging.debug('Running: %s' % cmd_stop)
2801 subprocess.check_call(
2802 cmd_stop,
2803 stdin=subprocess.PIPE,
2804 stdout=subprocess.PIPE,
2805 stderr=subprocess.STDOUT)
2806 self._initialized = False
2807
2808 def post_process_log(self):
2809 """Converts the .etl file into .csv then into .json."""
2810 super(LogmanTrace.Tracer, self).post_process_log()
2811 logformat = 'csv'
2812 self._convert_log(logformat)
2813
2814 if logformat == 'csv_utf16':
2815 def load_file():
2816 def utf_8_encoder(unicode_csv_data):
2817 """Encodes the unicode object as utf-8 encoded str instance"""
2818 for line in unicode_csv_data:
2819 yield line.encode('utf-8')
2820
2821 def unicode_csv_reader(unicode_csv_data, **kwargs):
2822 """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
2823 """
2824 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
2825 for row in csv_reader:
2826 # Decode str utf-8 instances back to unicode instances, cell by
2827 # cell:
2828 yield [cell.decode('utf-8') for cell in row]
2829
2830 # The CSV file is UTF-16 so use codecs.open() to load the file into
2831 # the python internal unicode format (utf-8). Then explicitly
2832 # re-encode as utf8 as str instances so csv can parse it fine. Then
2833 # decode the utf-8 str back into python unicode instances. This
2834 # sounds about right.
2835 for line in unicode_csv_reader(
2836 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
2837 # line is a list of unicode objects
2838 # So much white space!
2839 yield [i.strip() for i in line]
2840
2841 elif logformat == 'csv':
2842 def load_file():
2843 def ansi_csv_reader(ansi_csv_data, **kwargs):
2844 """Loads an 'ANSI' code page and returns unicode() objects."""
2845 assert sys.getfilesystemencoding() == 'mbcs'
2846 encoding = get_current_encoding()
2847 for row in csv.reader(ansi_csv_data, **kwargs):
2848 # Decode str 'ansi' instances to unicode instances, cell by cell:
2849 yield [cell.decode(encoding) for cell in row]
2850
2851 # The fastest and smallest format but only supports 'ANSI' file paths.
2852 # E.g. the filenames are encoding in the 'current' encoding.
2853 for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
2854 # line is a list of unicode objects.
2855 yield [i.strip() for i in line]
2856
2857 supported_events = LogmanTrace.Context.supported_events()
2858
2859 def trim(generator):
2860 for index, line in enumerate(generator):
2861 if not index:
2862 if line != self.EXPECTED_HEADER:
2863 raise TracingFailure(
2864 'Found malformed header: %s' % ' '.join(line),
2865 None, None, None)
2866 continue
2867 # As you can see, the CSV is full of useful non-redundant information:
2868 if (line[2] != '0' or # Event ID
2869 line[3] not in ('2', '3') or # Version
2870 line[4] != '0' or # Channel
2871 line[5] != '0' or # Level
2872 line[7] != '0' or # Task
2873 line[8] != '0x0000000000000000' or # Keyword
2874 line[12] != '' or # Instance ID
2875 line[13] != '' or # Parent Instance ID
2876 line[14] != self.NULL_GUID or # Activity ID
2877 line[15] != ''): # Related Activity ID
2878 raise TracingFailure(
2879 'Found unexpected values in line: %s' % ' '.join(line),
2880 None, None, None)
2881
2882 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
2883 continue
2884
2885 # Convert the PID in-place from hex.
2886 line[self.PID] = int(line[self.PID], 16)
2887
2888 yield [
2889 line[self.EVENT_NAME],
2890 line[self.TYPE],
2891 line[self.PID],
2892 line[self.TID],
2893 line[self.PROCESSOR_ID],
2894 line[self.TIMESTAMP],
2895 ] + line[self.USER_DATA:]
2896
2897 write_json('%s.json' % self._logname, list(trim(load_file())), True)
2898
2899 def _convert_log(self, logformat):
2900 """Converts the ETL trace to text representation.
2901
2902 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
2903 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
2904
2905 Arguments:
2906 - logformat: Text format to be generated, csv, csv_utf16 or xml.
2907
2908 Use "tracerpt -?" for help.
2909 """
2910 LOCALE_INVARIANT = 0x7F
2911 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
2912 cmd_convert = [
2913 'tracerpt.exe',
2914 '-l', self._logname + '.etl',
2915 '-o', self._logname + '.' + logformat,
2916 '-gmt', # Use UTC
2917 '-y', # No prompt
2918 # Use -of XML to get the header of each items after column 19, e.g. all
2919 # the actual headers of 'User Data'.
2920 ]
2921
2922 if logformat == 'csv':
2923 # tracerpt localizes the 'Type' column, for major brainfuck
2924 # entertainment. I can't imagine any sane reason to do that.
2925 cmd_convert.extend(['-of', 'CSV'])
2926 elif logformat == 'csv_utf16':
2927 # This causes it to use UTF-16, which doubles the log size but ensures
2928 # the log is readable for non-ASCII characters.
2929 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
2930 elif logformat == 'xml':
2931 cmd_convert.extend(['-of', 'XML'])
2932 else:
2933 raise ValueError('Unexpected log format \'%s\'' % logformat)
2934 logging.debug('Running: %s' % cmd_convert)
2935 # This can takes tens of minutes for large logs.
2936 # Redirects all output to stderr.
2937 subprocess.check_call(
2938 cmd_convert,
2939 stdin=subprocess.PIPE,
2940 stdout=sys.stderr,
2941 stderr=sys.stderr)
2942
2943 @staticmethod
2944 def clean_trace(logname):
2945 for ext in ('', '.csv', '.etl', '.json', '.xml'):
2946 if os.path.isfile(logname + ext):
2947 os.remove(logname + ext)
2948
2949 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002950 def parse_log(cls, logname, blacklist, trace_name):
2951 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00002952 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002953
2954 def blacklist_more(filepath):
2955 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
2956 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
2957
2958 data = read_json(logname)
2959 lines = read_json(logname + '.json')
2960 out = []
2961 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002962 if trace_name and item['trace'] != trace_name:
2963 continue
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002964 result = {
2965 'output': item['output'],
2966 'trace': item['trace'],
2967 }
2968 try:
2969 context = cls.Context(blacklist_more, item['pid'], item['trace'])
2970 for line in lines:
2971 context.on_line(line)
2972 result['results'] = context.to_results()
2973 except TracingFailure:
2974 result['exception'] = sys.exc_info()
2975 out.append(result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002976 return out
2977
2978
2979def get_api():
2980 """Returns the correct implementation for the current OS."""
2981 if sys.platform == 'cygwin':
2982 raise NotImplementedError(
2983 'Not implemented for cygwin, start the script from Win32 python')
2984 flavors = {
2985 'win32': LogmanTrace,
2986 'darwin': Dtrace,
2987 'sunos5': Dtrace,
2988 'freebsd7': Dtrace,
2989 'freebsd8': Dtrace,
2990 }
2991 # Defaults to strace.
2992 return flavors.get(sys.platform, Strace)()
2993
2994
2995def extract_directories(root_dir, files, blacklist):
2996 """Detects if all the files in a directory are in |files| and if so, replace
2997 the individual files by a Results.Directory instance.
2998
2999 Takes a list of Results.File instances and returns a shorter list of
3000 Results.File and Results.Directory instances.
3001
3002 Arguments:
3003 - root_dir: Optional base directory that shouldn't be search further.
3004 - files: list of Results.File instances.
3005 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
3006 """
3007 logging.info(
3008 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
3009 assert not (root_dir or '').endswith(os.path.sep), root_dir
3010 assert not root_dir or (get_native_path_case(root_dir) == root_dir)
3011 assert not any(isinstance(f, Results.Directory) for f in files)
3012 # Remove non existent files.
3013 files = [f for f in files if f.existent]
3014 if not files:
3015 return files
3016 # All files must share the same root, which can be None.
3017 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
3018
3019 # Creates a {directory: {filename: File}} mapping, up to root.
3020 buckets = {}
3021 if root_dir:
3022 buckets[root_dir] = {}
3023 for fileobj in files:
3024 path = fileobj.full_path
3025 directory = os.path.dirname(path)
3026 assert directory
3027 # Do not use os.path.basename() so trailing os.path.sep is kept.
3028 basename = path[len(directory)+1:]
3029 files_in_directory = buckets.setdefault(directory, {})
3030 files_in_directory[basename] = fileobj
3031 # Add all the directories recursively up to root.
3032 while True:
3033 old_d = directory
3034 directory = os.path.dirname(directory)
3035 if directory + os.path.sep == root_dir or directory == old_d:
3036 break
3037 buckets.setdefault(directory, {})
3038
3039 root_prefix = len(root_dir) + 1 if root_dir else 0
3040 for directory in sorted(buckets, reverse=True):
3041 actual = set(f for f in os.listdir(directory) if not blacklist(f))
3042 expected = set(buckets[directory])
3043 if not (actual - expected):
3044 parent = os.path.dirname(directory)
3045 buckets[parent][os.path.basename(directory)] = Results.Directory(
3046 root_dir,
3047 directory[root_prefix:],
3048 False,
3049 sum(f.size for f in buckets[directory].itervalues()),
3050 sum(f.nb_files for f in buckets[directory].itervalues()))
3051 # Remove the whole bucket.
3052 del buckets[directory]
3053
3054 # Reverse the mapping with what remains. The original instances are returned,
3055 # so the cached meta data is kept.
3056 files = sum((x.values() for x in buckets.itervalues()), [])
3057 return sorted(files, key=lambda x: x.path)
3058
3059
3060def trace(logfile, cmd, cwd, api, output):
3061 """Traces an executable. Returns (returncode, output) from api.
3062
3063 Arguments:
3064 - logfile: file to write to.
3065 - cmd: command to run.
3066 - cwd: current directory to start the process in.
3067 - api: a tracing api instance.
3068 - output: if True, returns output, otherwise prints it at the console.
3069 """
3070 cmd = fix_python_path(cmd)
3071 api.clean_trace(logfile)
3072 with api.get_tracer(logfile) as tracer:
3073 return tracer.trace(cmd, cwd, 'default', output)
3074
3075
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003076def CMDclean(args):
3077 """Cleans up traces."""
3078 parser = OptionParserTraceInputs(command='clean')
3079 options, args = parser.parse_args(args)
3080 api = get_api()
3081 api.clean_trace(options.log)
3082 return 0
3083
3084
3085def CMDtrace(args):
3086 """Traces an executable."""
3087 parser = OptionParserTraceInputs(command='trace')
3088 parser.allow_interspersed_args = False
3089 parser.add_option(
3090 '-q', '--quiet', action='store_true',
3091 help='Redirects traced executable output to /dev/null')
3092 options, args = parser.parse_args(args)
3093
3094 if not args:
3095 parser.error('Please provide a command to run')
3096
3097 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3098 args[0] = os.path.abspath(args[0])
3099
3100 api = get_api()
3101 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3102
3103
3104def CMDread(args):
3105 """Reads the logs and prints the result."""
3106 parser = OptionParserTraceInputs(command='read')
3107 parser.add_option(
3108 '-V', '--variable',
3109 nargs=2,
3110 action='append',
3111 dest='variables',
3112 metavar='VAR_NAME directory',
3113 default=[],
3114 help=('Variables to replace relative directories against. Example: '
3115 '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
3116 'home dir with $HOME') % getpass.getuser())
3117 parser.add_option(
3118 '--root-dir',
3119 help='Root directory to base everything off it. Anything outside of this '
3120 'this directory will not be reported')
3121 parser.add_option(
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003122 '--trace-name',
3123 help='Only reads one of the trace. Defaults to reading all traces')
3124 parser.add_option(
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003125 '-j', '--json', action='store_true',
3126 help='Outputs raw result data as json')
3127 parser.add_option(
3128 '-b', '--blacklist', action='append', default=[],
3129 help='List of regexp to use as blacklist filter')
3130 options, args = parser.parse_args(args)
3131
3132 if options.root_dir:
3133 options.root_dir = os.path.abspath(options.root_dir)
3134
3135 variables = dict(options.variables)
3136 api = get_api()
3137 def blacklist(f):
3138 return any(re.match(b, f) for b in options.blacklist)
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003139 data = api.parse_log(options.log, blacklist, options.trace_name)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003140 # Process each trace.
3141 output_as_json = []
3142 for item in data:
3143 if 'exception' in item:
maruel@chromium.org424f5692012-10-15 15:42:38 +00003144 # Do not abort the other traces.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003145 print >> sys.stderr, (
maruel@chromium.org424f5692012-10-15 15:42:38 +00003146 'Trace %s: Got an exception: %s' % (
3147 item['trace'], item['exception'][1]))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003148 continue
3149 results = item['results']
3150 if options.root_dir:
3151 results = results.strip_root(options.root_dir)
3152
3153 if options.json:
3154 output_as_json.append(results.flatten())
3155 else:
3156 simplified = extract_directories(
3157 options.root_dir, results.files, blacklist)
3158 simplified = [f.replace_variables(variables) for f in simplified]
3159 if len(data) > 1:
3160 print('Trace: %s' % item['trace'])
3161 print('Total: %d' % len(results.files))
3162 print('Non existent: %d' % len(results.non_existent))
3163 for f in results.non_existent:
3164 print(' %s' % f.path)
3165 print(
3166 'Interesting: %d reduced to %d' % (
3167 len(results.existent), len(simplified)))
3168 for f in simplified:
3169 print(' %s' % f.path)
3170
3171 if options.json:
3172 write_json(sys.stdout, output_as_json, False)
3173 return 0
3174
3175
3176class OptionParserWithLogging(optparse.OptionParser):
3177 """Adds --verbose option."""
3178 def __init__(self, verbose=0, **kwargs):
3179 optparse.OptionParser.__init__(self, **kwargs)
3180 self.add_option(
3181 '-v', '--verbose',
3182 action='count',
3183 default=verbose,
3184 help='Use multiple times to increase verbosity')
3185
3186 def parse_args(self, *args, **kwargs):
3187 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3188 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3189 logging.basicConfig(
3190 level=levels[min(len(levels)-1, options.verbose)],
3191 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3192 return options, args
3193
3194
3195class OptionParserWithNiceDescription(OptionParserWithLogging):
3196 """Generates the description with the command's docstring."""
3197 def __init__(self, **kwargs):
3198 """Sets 'description' and 'usage' if not already specified."""
3199 command = kwargs.pop('command', 'help')
3200 kwargs.setdefault(
3201 'description',
3202 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3203 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3204 OptionParserWithLogging.__init__(self, **kwargs)
3205
3206
3207class OptionParserTraceInputs(OptionParserWithNiceDescription):
3208 """Adds automatic --log handling."""
3209 def __init__(self, **kwargs):
3210 OptionParserWithNiceDescription.__init__(self, **kwargs)
3211 self.add_option(
3212 '-l', '--log', help='Log file to generate or read, required')
3213
3214 def parse_args(self, *args, **kwargs):
3215 """Makes sure the paths make sense.
3216
3217 On Windows, / and \ are often mixed together in a path.
3218 """
3219 options, args = OptionParserWithNiceDescription.parse_args(
3220 self, *args, **kwargs)
3221 if not options.log:
3222 self.error('Must supply a log file with -l')
3223 options.log = os.path.abspath(options.log)
3224 return options, args
3225
3226
3227def extract_documentation():
3228 """Returns a dict {command: description} for each of documented command."""
3229 commands = (
3230 fn[3:]
3231 for fn in dir(sys.modules['__main__'])
3232 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3233 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3234
3235
3236def CMDhelp(args):
3237 """Prints list of commands or help for a specific command."""
3238 doc = extract_documentation()
3239 # Calculates the optimal offset.
3240 offset = max(len(cmd) for cmd in doc)
3241 format_str = ' %-' + str(offset + 2) + 's %s'
3242 # Generate a one-liner documentation of each commands.
3243 commands_description = '\n'.join(
3244 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3245
3246 parser = OptionParserWithNiceDescription(
3247 usage='%prog <command> [options]',
3248 description='Commands are:\n%s\n' % commands_description)
3249 parser.format_description = lambda _: parser.description
3250
3251 # Strip out any -h or --help argument.
3252 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3253 if len(args) == 1:
3254 if not get_command_handler(args[0]):
3255 parser.error('Unknown command %s' % args[0])
3256 # The command was "%prog help command", replaces ourself with
3257 # "%prog command --help" so help is correctly printed out.
3258 return main(args + ['--help'])
3259 elif args:
3260 parser.error('Unknown argument "%s"' % ' '.join(args))
3261 parser.print_help()
3262 return 0
3263
3264
3265def get_command_handler(name):
3266 """Returns the command handler or CMDhelp if it doesn't exist."""
3267 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3268
3269
3270def main_impl(argv):
3271 command = get_command_handler(argv[0] if argv else 'help')
3272 if not command:
3273 return CMDhelp(argv)
3274 return command(argv[1:])
3275
3276def main(argv):
3277 try:
3278 main_impl(argv)
3279 except TracingFailure, e:
3280 sys.stderr.write('\nError: ')
3281 sys.stderr.write(str(e))
3282 sys.stderr.write('\n')
3283 return 1
3284
3285
3286if __name__ == '__main__':
3287 sys.exit(main(sys.argv[1:]))