blob: 93fd4add1079216d0c8e79777e1bda7be7bed69c [file] [log] [blame]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001#!/usr/bin/env python
2# coding=utf-8
3# Copyright (c) 2012 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Traces an executable and its child processes and extract the files accessed
8by them.
9
10The implementation uses OS-specific API. The native Kernel logger and the ETL
11interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
12The OS-specific implementation is hidden in an 'API' interface.
13
14The results are embedded in a Results instance. The tracing is done in two
15phases, the first is to do the actual trace and generate an
16implementation-specific log file. Then the log file is parsed to extract the
17information, including the individual child processes and the files accessed
18from the log.
19"""
20
21import codecs
22import csv
maruel@chromium.org9e56b592013-05-21 16:56:05 +000023import errno
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000024import getpass
25import glob
26import json
27import logging
28import optparse
29import os
30import re
maruel@chromium.org9b2112a2013-04-05 18:01:02 +000031import stat
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000032import subprocess
33import sys
34import tempfile
35import threading
36import time
maruel@chromium.org9b2112a2013-04-05 18:01:02 +000037import unicodedata
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000038import weakref
39
40## OS-specific imports
41
42if sys.platform == 'win32':
43 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
44 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
45 from ctypes.wintypes import GetLastError # pylint: disable=E0611
46elif sys.platform == 'darwin':
47 import Carbon.File # pylint: disable=F0401
48 import MacOS # pylint: disable=F0401
49
50
51BASE_DIR = os.path.dirname(os.path.abspath(__file__))
52ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
53
54
55class TracingFailure(Exception):
56 """An exception occured during tracing."""
57 def __init__(self, description, pid, line_number, line, *args):
58 super(TracingFailure, self).__init__(
59 description, pid, line_number, line, *args)
60 self.description = description
61 self.pid = pid
62 self.line_number = line_number
63 self.line = line
64 self.extra = args
65
66 def __str__(self):
67 out = self.description
68 if self.pid:
69 out += '\npid: %d' % self.pid
70 if self.line_number:
71 out += '\nline: %d' % self.line_number
72 if self.line:
73 out += '\n%s' % self.line
74 if self.extra:
75 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
76 return out
77
78
79## OS-specific functions
80
81if sys.platform == 'win32':
82 def QueryDosDevice(drive_letter):
83 """Returns the Windows 'native' path for a DOS drive letter."""
84 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
maruel@chromium.org306e0e72012-11-02 18:22:03 +000085 assert isinstance(drive_letter, unicode)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000086 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
87 chars = 1024
maruel@chromium.org306e0e72012-11-02 18:22:03 +000088 drive_letter = drive_letter
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000089 p = create_unicode_buffer(chars)
90 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
91 err = GetLastError()
92 if err:
93 # pylint: disable=E0602
maruel@chromium.orgf37c2472012-11-02 20:06:13 +000094 msg = u'QueryDosDevice(%s): %s (%d)' % (
95 drive_letter, FormatError(err), err)
96 raise WindowsError(err, msg.encode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +000097 return p.value
98
99
100 def GetShortPathName(long_path):
101 """Returns the Windows short path equivalent for a 'long' path."""
maruel@chromium.org68bb1482013-02-15 22:36:24 +0000102 assert isinstance(long_path, unicode), repr(long_path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000103 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
104 # not enforced.
105 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
106 long_path = '\\\\?\\' + long_path
107 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
108 if chars:
109 p = create_unicode_buffer(chars)
110 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
111 return p.value
112
113 err = GetLastError()
114 if err:
115 # pylint: disable=E0602
maruel@chromium.orgf37c2472012-11-02 20:06:13 +0000116 msg = u'GetShortPathName(%s): %s (%d)' % (
117 long_path, FormatError(err), err)
118 raise WindowsError(err, msg.encode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000119
120
121 def GetLongPathName(short_path):
122 """Returns the Windows long path equivalent for a 'short' path."""
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000123 assert isinstance(short_path, unicode)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000124 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
125 # not enforced.
126 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
127 short_path = '\\\\?\\' + short_path
128 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
129 if chars:
130 p = create_unicode_buffer(chars)
131 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
132 return p.value
133
134 err = GetLastError()
135 if err:
136 # pylint: disable=E0602
maruel@chromium.orgf37c2472012-11-02 20:06:13 +0000137 msg = u'GetLongPathName(%s): %s (%d)' % (
138 short_path, FormatError(err), err)
139 raise WindowsError(err, msg.encode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000140
141
142 def get_current_encoding():
143 """Returns the 'ANSI' code page associated to the process."""
144 return 'cp%d' % int(windll.kernel32.GetACP())
145
146
147 class DosDriveMap(object):
148 """Maps \Device\HarddiskVolumeN to N: on Windows."""
149 # Keep one global cache.
150 _MAPPING = {}
151
152 def __init__(self):
153 """Lazy loads the cache."""
154 if not self._MAPPING:
155 # This is related to UNC resolver on windows. Ignore that.
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000156 self._MAPPING[u'\\Device\\Mup'] = None
157 self._MAPPING[u'\\SystemRoot'] = os.environ[u'SystemRoot']
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000158
159 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
160 try:
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000161 letter = u'%s:' % letter
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000162 mapped = QueryDosDevice(letter)
163 if mapped in self._MAPPING:
164 logging.warn(
165 ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
166 '. Drive letters are a user-mode concept and the kernel '
167 'traces only have NT path, so all accesses will be '
168 'associated with the first drive letter, independent of the '
169 'actual letter used by the code') % (
170 self._MAPPING[mapped], letter))
171 else:
172 self._MAPPING[mapped] = letter
173 except WindowsError: # pylint: disable=E0602
174 pass
175
176 def to_win32(self, path):
177 """Converts a native NT path to Win32/DOS compatible path."""
178 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
179 if not match:
180 raise ValueError(
181 'Can\'t convert %s into a Win32 compatible path' % path,
182 path)
183 if not match.group(1) in self._MAPPING:
184 # Unmapped partitions may be accessed by windows for the
185 # fun of it while the test is running. Discard these.
186 return None
187 drive = self._MAPPING[match.group(1)]
188 if not drive or not match.group(2):
189 return drive
190 return drive + match.group(2)
191
192
193 def isabs(path):
194 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
195 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
196
197
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000198 def find_item_native_case(root, item):
199 """Gets the native path case of a single item based at root_path."""
200 if item == '..':
201 return item
202
203 root = get_native_path_case(root)
204 return os.path.basename(get_native_path_case(os.path.join(root, item)))
205
206
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000207 def get_native_path_case(p):
208 """Returns the native path case for an existing file.
209
210 On Windows, removes any leading '\\?\'.
211 """
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000212 assert isinstance(p, unicode), repr(p)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000213 if not isabs(p):
214 raise ValueError(
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +0000215 'get_native_path_case(%r): Require an absolute path' % p, p)
216
maruel@chromium.org037758d2012-12-10 17:59:46 +0000217 # Make sure it is normalized to os.path.sep. Do not do it here to keep the
218 # function fast
219 assert '/' not in p, p
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +0000220 suffix = ''
221 count = p.count(':')
222 if count > 1:
223 # This means it has an alternate-data stream. There could be 3 ':', since
224 # it could be the $DATA datastream of an ADS. Split the whole ADS suffix
225 # off and add it back afterward. There is no way to know the native path
226 # case of an alternate data stream.
227 items = p.split(':')
228 p = ':'.join(items[0:2])
229 suffix = ''.join(':' + i for i in items[2:])
230
maruel@chromium.org42b091d2013-04-02 22:27:02 +0000231 # TODO(maruel): Use os.path.normpath?
232 if p.endswith('.\\'):
233 p = p[:-2]
234
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000235 # Windows used to have an option to turn on case sensitivity on non Win32
236 # subsystem but that's out of scope here and isn't supported anymore.
237 # Go figure why GetShortPathName() is needed.
238 try:
239 out = GetLongPathName(GetShortPathName(p))
240 except OSError, e:
241 if e.args[0] in (2, 3, 5):
242 # The path does not exist. Try to recurse and reconstruct the path.
243 base = os.path.dirname(p)
244 rest = os.path.basename(p)
245 return os.path.join(get_native_path_case(base), rest)
246 raise
247 if out.startswith('\\\\?\\'):
248 out = out[4:]
249 # Always upper case the first letter since GetLongPathName() will return the
250 # drive letter in the case it was given.
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +0000251 return out[0].upper() + out[1:] + suffix
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000252
253
254 def CommandLineToArgvW(command_line):
255 """Splits a commandline into argv using CommandLineToArgvW()."""
256 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
257 size = c_int()
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000258 assert isinstance(command_line, unicode)
259 ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000260 try:
261 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
262 finally:
263 windll.kernel32.LocalFree(ptr)
264
265
266elif sys.platform == 'darwin':
267
268
269 # On non-windows, keep the stdlib behavior.
270 isabs = os.path.isabs
271
272
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000273 def _native_case(p):
274 """Gets the native path case. Warning: this function resolves symlinks."""
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000275 try:
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000276 rel_ref, _ = Carbon.File.FSPathMakeRef(p.encode('utf-8'))
277 # The OSX underlying code uses NFD but python strings are in NFC. This
278 # will cause issues with os.listdir() for example. Since the dtrace log
279 # *is* in NFC, normalize it here.
csharp@chromium.org5a2c3732013-03-07 14:44:56 +0000280 out = unicodedata.normalize(
281 'NFC', rel_ref.FSRefMakePath().decode('utf-8'))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000282 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
283 return out + os.path.sep
284 return out
285 except MacOS.Error, e:
286 if e.args[0] in (-43, -120):
287 # The path does not exist. Try to recurse and reconstruct the path.
288 # -43 means file not found.
289 # -120 means directory not found.
290 base = os.path.dirname(p)
291 rest = os.path.basename(p)
csharp@chromium.org5a2c3732013-03-07 14:44:56 +0000292 return os.path.join(_native_case(base), rest)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000293 raise OSError(
294 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
295
296
297 def _split_at_symlink_native(base_path, rest):
298 """Returns the native path for a symlink."""
299 base, symlink, rest = split_at_symlink(base_path, rest)
300 if symlink:
301 if not base_path:
302 base_path = base
303 else:
304 base_path = safe_join(base_path, base)
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000305 symlink = find_item_native_case(base_path, symlink)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000306 return base, symlink, rest
307
308
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000309 def find_item_native_case(root_path, item):
310 """Gets the native path case of a single item based at root_path.
311
312 There is no API to get the native path case of symlinks on OSX. So it
313 needs to be done the slow way.
314 """
315 if item == '..':
316 return item
317
318 item = item.lower()
319 for element in os.listdir(root_path):
320 if element.lower() == item:
321 return element
322
323
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000324 def get_native_path_case(path):
325 """Returns the native path case for an existing file.
326
327 Technically, it's only HFS+ on OSX that is case preserving and
328 insensitive. It's the default setting on HFS+ but can be changed.
329 """
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000330 assert isinstance(path, unicode), repr(path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000331 if not isabs(path):
332 raise ValueError(
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000333 'get_native_path_case(%r): Require an absolute path' % path, path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000334 if path.startswith('/dev'):
335 # /dev is not visible from Carbon, causing an exception.
336 return path
337
338 # Starts assuming there is no symlink along the path.
339 resolved = _native_case(path)
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000340 if path.lower() in (resolved.lower(), resolved.lower() + './'):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000341 # This code path is incredibly faster.
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000342 logging.debug('get_native_path_case(%s) = %s' % (path, resolved))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000343 return resolved
344
345 # There was a symlink, process it.
346 base, symlink, rest = _split_at_symlink_native(None, path)
347 assert symlink, (path, base, symlink, rest, resolved)
348 prev = base
349 base = safe_join(_native_case(base), symlink)
350 assert len(base) > len(prev)
351 while rest:
352 prev = base
353 relbase, symlink, rest = _split_at_symlink_native(base, rest)
354 base = safe_join(base, relbase)
355 assert len(base) > len(prev), (prev, base, symlink)
356 if symlink:
357 base = safe_join(base, symlink)
358 assert len(base) > len(prev), (prev, base, symlink)
359 # Make sure no symlink was resolved.
360 assert base.lower() == path.lower(), (base, path)
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000361 logging.debug('get_native_path_case(%s) = %s' % (path, base))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000362 return base
363
364
365else: # OSes other than Windows and OSX.
366
367
368 # On non-windows, keep the stdlib behavior.
369 isabs = os.path.isabs
370
371
csharp@chromium.orgf2eacff2013-04-04 14:20:20 +0000372 def find_item_native_case(root, item):
373 """Gets the native path case of a single item based at root_path."""
374 if item == '..':
375 return item
376
377 root = get_native_path_case(root)
378 return os.path.basename(get_native_path_case(os.path.join(root, item)))
379
380
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000381 def get_native_path_case(path):
382 """Returns the native path case for an existing file.
383
384 On OSes other than OSX and Windows, assume the file system is
385 case-sensitive.
386
387 TODO(maruel): This is not strictly true. Implement if necessary.
388 """
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000389 assert isinstance(path, unicode), repr(path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000390 if not isabs(path):
391 raise ValueError(
maruel@chromium.orgd1e9fbd2013-02-26 14:42:24 +0000392 'get_native_path_case(%r): Require an absolute path' % path, path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000393 # Give up on cygwin, as GetLongPathName() can't be called.
394 # Linux traces tends to not be normalized so use this occasion to normalize
395 # it. This function implementation already normalizes the path on the other
396 # OS so this needs to be done here to be coherent between OSes.
397 out = os.path.normpath(path)
398 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
399 return out + os.path.sep
400 return out
401
402
403if sys.platform != 'win32': # All non-Windows OSes.
404
405
406 def safe_join(*args):
407 """Joins path elements like os.path.join() but doesn't abort on absolute
408 path.
409
410 os.path.join('foo', '/bar') == '/bar'
411 but safe_join('foo', '/bar') == 'foo/bar'.
412 """
413 out = ''
414 for element in args:
415 if element.startswith(os.path.sep):
416 if out.endswith(os.path.sep):
417 out += element[1:]
418 else:
419 out += element
420 else:
421 if out.endswith(os.path.sep):
422 out += element
423 else:
424 out += os.path.sep + element
425 return out
426
427
428 def split_at_symlink(base_dir, relfile):
429 """Scans each component of relfile and cut the string at the symlink if
430 there is any.
431
432 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
433 not symlink was found.
434 """
435 if base_dir:
436 assert relfile
437 assert os.path.isabs(base_dir)
438 index = 0
439 else:
440 assert os.path.isabs(relfile)
441 index = 1
442
443 def at_root(rest):
444 if base_dir:
445 return safe_join(base_dir, rest)
446 return rest
447
448 while True:
449 try:
450 index = relfile.index(os.path.sep, index)
451 except ValueError:
452 index = len(relfile)
453 full = at_root(relfile[:index])
454 if os.path.islink(full):
455 # A symlink!
456 base = os.path.dirname(relfile[:index])
457 symlink = os.path.basename(relfile[:index])
458 rest = relfile[index:]
459 logging.debug(
460 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
461 (base_dir, relfile, base, symlink, rest))
462 return base, symlink, rest
463 if index == len(relfile):
464 break
465 index += 1
466 return relfile, None, None
467
468
maruel@chromium.orgac36fb72013-05-21 14:50:53 +0000469class Unbuffered(object):
470 """Disable buffering on a file object."""
471 def __init__(self, stream):
472 self.stream = stream
473
474 def write(self, data):
475 self.stream.write(data)
476 if '\n' in data:
477 self.stream.flush()
478
479 def __getattr__(self, attr):
480 return getattr(self.stream, attr)
481
482
483def disable_buffering():
484 """Makes this process and child processes stdout unbuffered."""
485 if not os.environ.get('PYTHONUNBUFFERED'):
486 # Since sys.stdout is a C++ object, it's impossible to do
487 # sys.stdout.write = lambda...
488 sys.stdout = Unbuffered(sys.stdout)
489 os.environ['PYTHONUNBUFFERED'] = 'x'
490
491
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000492def fix_python_path(cmd):
493 """Returns the fixed command line to call the right python executable."""
494 out = cmd[:]
495 if out[0] == 'python':
496 out[0] = sys.executable
497 elif out[0].endswith('.py'):
498 out.insert(0, sys.executable)
499 return out
500
501
maruel@chromium.org2909b7c2013-05-29 18:52:59 +0000502def create_subprocess_thunk():
503 """Creates a small temporary script to start the child process.
504
505 This thunk doesn't block, its unique name is used to identify it as the
506 parent.
507 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000508 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
maruel@chromium.org2909b7c2013-05-29 18:52:59 +0000509 try:
510 os.write(
511 handle,
512 (
513 'import subprocess, sys\n'
514 'sys.exit(subprocess.call(sys.argv[2:]))\n'
515 ))
516 finally:
517 os.close(handle)
518 return name
519
520
521def create_exec_thunk():
522 """Creates a small temporary script to start the child executable.
523
524 Reads from the file handle provided as the fisrt argument to block, then
525 execv() the command to be traced.
526 """
527 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
528 try:
529 os.write(
530 handle,
531 (
532 'import os, sys\n'
533 'fd = int(sys.argv[1])\n'
534 # This will block until the controlling process writes a byte on the
535 # pipe. It will do so once the tracing tool, e.g. strace, is ready to
536 # trace.
537 'os.read(fd, 1)\n'
538 'os.close(fd)\n'
539 'os.execv(sys.argv[2], sys.argv[2:])\n'
540 ))
541 finally:
542 os.close(handle)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000543 return name
544
545
546def strace_process_quoted_arguments(text):
547 """Extracts quoted arguments on a string and return the arguments as a list.
548
549 Implemented as an automaton. Supports incomplete strings in the form
550 '"foo"...'.
551
552 Example:
553 With text = '"foo", "bar"', the function will return ['foo', 'bar']
554
555 TODO(maruel): Implement escaping.
556 """
557 # All the possible states of the DFA.
558 ( NEED_QUOTE, # Begining of a new arguments.
559 INSIDE_STRING, # Inside an argument.
560 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
561 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
562 # a serie of 3 dots or a comma.
563 NEED_SPACE, # Right after a comma
564 NEED_DOT_2, # Found a dot, need a second one.
565 NEED_DOT_3, # Found second dot, need a third one.
566 NEED_COMMA, # Found third dot, need a comma.
567 ) = range(8)
568
569 state = NEED_QUOTE
570 out = []
571 for index, char in enumerate(text):
572 if char == '"':
573 if state == NEED_QUOTE:
574 state = INSIDE_STRING
575 # A new argument was found.
576 out.append('')
577 elif state == INSIDE_STRING:
578 # The argument is now closed.
579 state = NEED_COMMA_OR_DOT
580 elif state == ESCAPED:
581 out[-1] += char
582 state = INSIDE_STRING
583 else:
584 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000585 'Can\'t process char \'%s\' at column %d for: %r' % (
586 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000587 index,
588 text)
589 elif char == ',':
590 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
591 state = NEED_SPACE
592 elif state == INSIDE_STRING:
593 out[-1] += char
594 elif state == ESCAPED:
595 out[-1] += char
596 state = INSIDE_STRING
597 else:
598 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000599 'Can\'t process char \'%s\' at column %d for: %r' % (
600 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000601 index,
602 text)
603 elif char == ' ':
604 if state == NEED_SPACE:
605 state = NEED_QUOTE
606 elif state == INSIDE_STRING:
607 out[-1] += char
608 elif state == ESCAPED:
609 out[-1] += char
610 state = INSIDE_STRING
611 else:
612 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000613 'Can\'t process char \'%s\' at column %d for: %r' % (
614 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000615 index,
616 text)
617 elif char == '.':
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000618 if state in (NEED_QUOTE, NEED_COMMA_OR_DOT):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000619 # The string is incomplete, this mean the strace -s flag should be
620 # increased.
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000621 # For NEED_QUOTE, the input string would look like '"foo", ...'.
622 # For NEED_COMMA_OR_DOT, the input string would look like '"foo"...'
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000623 state = NEED_DOT_2
624 elif state == NEED_DOT_2:
625 state = NEED_DOT_3
626 elif state == NEED_DOT_3:
627 state = NEED_COMMA
628 elif state == INSIDE_STRING:
629 out[-1] += char
630 elif state == ESCAPED:
631 out[-1] += char
632 state = INSIDE_STRING
633 else:
634 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000635 'Can\'t process char \'%s\' at column %d for: %r' % (
636 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000637 index,
638 text)
639 elif char == '\\':
640 if state == ESCAPED:
641 out[-1] += char
642 state = INSIDE_STRING
643 elif state == INSIDE_STRING:
644 state = ESCAPED
645 else:
646 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000647 'Can\'t process char \'%s\' at column %d for: %r' % (
648 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000649 index,
650 text)
651 else:
652 if state == INSIDE_STRING:
653 out[-1] += char
654 else:
655 raise ValueError(
maruel@chromium.org588e09d2013-05-21 14:38:57 +0000656 'Can\'t process char \'%s\' at column %d for: %r' % (
657 char, index, text),
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000658 index,
659 text)
660 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
661 raise ValueError(
662 'String is incorrectly terminated: %r' % text,
663 text)
664 return out
665
666
667def read_json(filepath):
668 with open(filepath, 'r') as f:
669 return json.load(f)
670
671
672def write_json(filepath_or_handle, data, dense):
673 """Writes data into filepath or file handle encoded as json.
674
675 If dense is True, the json is packed. Otherwise, it is human readable.
676 """
677 if hasattr(filepath_or_handle, 'write'):
678 if dense:
679 filepath_or_handle.write(
680 json.dumps(data, sort_keys=True, separators=(',',':')))
681 else:
682 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
683 else:
684 with open(filepath_or_handle, 'wb') as f:
685 if dense:
686 json.dump(data, f, sort_keys=True, separators=(',',':'))
687 else:
688 json.dump(data, f, sort_keys=True, indent=2)
689
690
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000691def assert_is_renderable(pseudo_string):
692 """Asserts the input is a valid object to be processed by render()."""
693 assert (
maruel@chromium.orgbea76be2013-06-03 20:59:25 +0000694 pseudo_string is None or
695 isinstance(pseudo_string, unicode) or
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000696 hasattr(pseudo_string, 'render')), repr(pseudo_string)
697
698
699def render(pseudo_string):
700 """Converts the pseudo-string to an unicode string."""
maruel@chromium.orgbea76be2013-06-03 20:59:25 +0000701 if pseudo_string is None or isinstance(pseudo_string, unicode):
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000702 return pseudo_string
703 return pseudo_string.render()
704
705
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000706class Results(object):
707 """Results of a trace session."""
708
709 class _TouchedObject(object):
710 """Something, a file or a directory, that was accessed."""
711 def __init__(self, root, path, tainted, size, nb_files):
712 logging.debug(
713 '%s(%s, %s, %s, %s, %s)' %
714 (self.__class__.__name__, root, path, tainted, size, nb_files))
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000715 assert_is_renderable(root)
716 assert_is_renderable(path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000717 self.root = root
718 self.path = path
719 self.tainted = tainted
720 self.nb_files = nb_files
maruel@chromium.org538141b2013-06-03 20:57:17 +0000721 # Can be used as a cache or a default value, depending on context. In
722 # particular, once self.tainted is True, because the path was replaced
723 # with a variable, it is not possible to look up the file size.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000724 self._size = size
725 # These are cache only.
726 self._real_path = None
727
728 # Check internal consistency.
729 assert path, path
730 assert tainted or bool(root) != bool(isabs(path)), (root, path)
731 assert tainted or (
732 not os.path.exists(self.full_path) or
733 (self.full_path == get_native_path_case(self.full_path))), (
734 tainted, self.full_path, get_native_path_case(self.full_path))
735
736 @property
737 def existent(self):
738 return self.size != -1
739
740 @property
741 def full_path(self):
742 if self.root:
743 return os.path.join(self.root, self.path)
744 return self.path
745
746 @property
747 def real_path(self):
748 """Returns the path with symlinks resolved."""
749 if not self._real_path:
750 self._real_path = os.path.realpath(self.full_path)
751 return self._real_path
752
753 @property
754 def size(self):
maruel@chromium.org538141b2013-06-03 20:57:17 +0000755 """File's size. -1 is not existent.
756
757 Once tainted, it is not possible the retrieve the file size anymore since
758 the path is composed of variables.
759 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000760 if self._size is None and not self.tainted:
761 try:
762 self._size = os.stat(self.full_path).st_size
763 except OSError:
764 self._size = -1
765 return self._size
766
767 def flatten(self):
768 """Returns a dict representing this object.
769
770 A 'size' of 0 means the file was only touched and not read.
771 """
772 return {
773 'path': self.path,
774 'size': self.size,
775 }
776
777 def replace_variables(self, variables):
778 """Replaces the root of this File with one of the variables if it matches.
779
780 If a variable replacement occurs, the cloned object becomes tainted.
781 """
782 for variable, root_path in variables.iteritems():
783 if self.path.startswith(root_path):
784 return self._clone(
785 self.root, variable + self.path[len(root_path):], True)
786 # No need to clone, returns ourself.
787 return self
788
789 def strip_root(self, root):
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000790 """Returns a clone of itself with 'root' stripped off.
791
792 Note that the file is kept if it is either accessible from a symlinked
793 path that was used to access the file or through the real path.
794 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000795 # Check internal consistency.
796 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
797 if not self.full_path.startswith(root):
798 # Now try to resolve the symlinks to see if it can be reached this way.
799 # Only try *after* trying without resolving symlink.
800 if not self.real_path.startswith(root):
801 return None
802 path = self.real_path
803 else:
804 path = self.full_path
805 return self._clone(root, path[len(root):], self.tainted)
806
807 def _clone(self, new_root, new_path, tainted):
808 raise NotImplementedError(self.__class__.__name__)
809
810 class File(_TouchedObject):
811 """A file that was accessed. May not be present anymore.
812
813 If tainted is true, it means it is not a real path anymore as a variable
814 replacement occured.
815
maruel@chromium.org538141b2013-06-03 20:57:17 +0000816 |mode| can be one of None, TOUCHED, READ or WRITE.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000817 """
maruel@chromium.org538141b2013-06-03 20:57:17 +0000818 # Was probed for existence, and it is existent, but was never _opened_.
819 TOUCHED = 't'
820 # Opened for read only and guaranteed to not have been written to.
821 READ = 'r'
822 # Opened for write.
823 WRITE = 'w'
824
825 # They are listed in order of priority. E.g. if a file is traced as TOUCHED
826 # then as WRITE, only keep WRITE. None means no idea, which is a problem on
827 # Windows.
828 ACCEPTABLE_MODES = (None, TOUCHED, READ, WRITE)
829
830 def __init__(self, root, path, tainted, size, mode):
831 assert mode in self.ACCEPTABLE_MODES
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000832 super(Results.File, self).__init__(root, path, tainted, size, 1)
maruel@chromium.org538141b2013-06-03 20:57:17 +0000833 self.mode = mode
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000834
835 def _clone(self, new_root, new_path, tainted):
836 """Clones itself keeping meta-data."""
837 # Keep the self.size and self._real_path caches for performance reason. It
838 # is also important when the file becomes tainted (with a variable instead
839 # of the real path) since self.path is not an on-disk path anymore so
840 # out._size cannot be updated.
maruel@chromium.org538141b2013-06-03 20:57:17 +0000841 out = self.__class__(new_root, new_path, tainted, self.size, self.mode)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000842 out._real_path = self._real_path
843 return out
844
maruel@chromium.org538141b2013-06-03 20:57:17 +0000845 def flatten(self):
846 out = super(Results.File, self).flatten()
847 out['mode'] = self.mode
848 return out
849
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000850 class Directory(_TouchedObject):
maruel@chromium.org538141b2013-06-03 20:57:17 +0000851 """A directory of files. Must exist.
852
853 For a Directory instance, self.size is not a cache, it's an actual value
854 that is never modified and represents the total size of the files contained
855 in this directory. It is possible that the directory is empty so that
856 size==0; this happens if there's only an invalid symlink in it.
857 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000858 def __init__(self, root, path, tainted, size, nb_files):
859 """path='.' is a valid value and must be handled appropriately."""
860 assert not path.endswith(os.path.sep), path
861 super(Results.Directory, self).__init__(
862 root, path + os.path.sep, tainted, size, nb_files)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000863
864 def flatten(self):
865 out = super(Results.Directory, self).flatten()
866 out['nb_files'] = self.nb_files
867 return out
868
869 def _clone(self, new_root, new_path, tainted):
870 """Clones itself keeping meta-data."""
871 out = self.__class__(
872 new_root,
873 new_path.rstrip(os.path.sep),
874 tainted,
875 self.size,
876 self.nb_files)
877 out._real_path = self._real_path
878 return out
879
880 class Process(object):
881 """A process that was traced.
882
883 Contains references to the files accessed by this process and its children.
884 """
885 def __init__(self, pid, files, executable, command, initial_cwd, children):
886 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
887 self.pid = pid
888 self.files = sorted(files, key=lambda x: x.path)
889 self.children = children
890 self.executable = executable
891 self.command = command
892 self.initial_cwd = initial_cwd
893
894 # Check internal consistency.
895 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
896 f.path for f in self.files)
897 assert isinstance(self.children, list)
898 assert isinstance(self.files, list)
899
900 @property
901 def all(self):
902 for child in self.children:
903 for i in child.all:
904 yield i
905 yield self
906
907 def flatten(self):
908 return {
909 'children': [c.flatten() for c in self.children],
910 'command': self.command,
911 'executable': self.executable,
912 'files': [f.flatten() for f in self.files],
913 'initial_cwd': self.initial_cwd,
914 'pid': self.pid,
915 }
916
917 def strip_root(self, root):
918 assert isabs(root) and root.endswith(os.path.sep), root
919 # Loads the files after since they are constructed as objects.
920 out = self.__class__(
921 self.pid,
922 filter(None, (f.strip_root(root) for f in self.files)),
923 self.executable,
924 self.command,
925 self.initial_cwd,
926 [c.strip_root(root) for c in self.children])
927 logging.debug(
928 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
929 return out
930
931 def __init__(self, process):
932 self.process = process
933 # Cache.
934 self._files = None
935
936 def flatten(self):
937 return {
938 'root': self.process.flatten(),
939 }
940
941 @property
942 def files(self):
943 if self._files is None:
944 self._files = sorted(
945 sum((p.files for p in self.process.all), []),
946 key=lambda x: x.path)
947 return self._files
948
949 @property
950 def existent(self):
951 return [f for f in self.files if f.existent]
952
953 @property
954 def non_existent(self):
955 return [f for f in self.files if not f.existent]
956
957 def strip_root(self, root):
958 """Returns a clone with all the files outside the directory |root| removed
959 and converts all the path to be relative paths.
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000960
961 It keeps files accessible through the |root| directory or that have been
962 accessed through any symlink which points to the same directory.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000963 """
964 # Resolve any symlink
965 root = os.path.realpath(root)
966 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
967 logging.debug('strip_root(%s)' % root)
968 return Results(self.process.strip_root(root))
969
970
971class ApiBase(object):
972 """OS-agnostic API to trace a process and its children."""
973 class Context(object):
974 """Processes one log line at a time and keeps the list of traced processes.
975
976 The parsing is complicated by the fact that logs are traced out of order for
977 strace but in-order for dtrace and logman. In addition, on Windows it is
978 very frequent that processids are reused so a flat list cannot be used. But
979 at the same time, it is impossible to faithfully construct a graph when the
980 logs are processed out of order. So both a tree and a flat mapping are used,
981 the tree is the real process tree, while the flat mapping stores the last
982 valid process for the corresponding processid. For the strace case, the
983 tree's head is guessed at the last moment.
984 """
985 class Process(object):
986 """Keeps context for one traced child process.
987
988 Logs all the files this process touched. Ignores directories.
989 """
990 def __init__(self, blacklist, pid, initial_cwd):
991 # Check internal consistency.
992 assert isinstance(pid, int), repr(pid)
maruel@chromium.org306e0e72012-11-02 18:22:03 +0000993 assert_is_renderable(initial_cwd)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +0000994 self.pid = pid
995 # children are Process instances.
996 self.children = []
997 self.initial_cwd = initial_cwd
998 self.cwd = None
maruel@chromium.org538141b2013-06-03 20:57:17 +0000999 self.files = {}
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001000 self.executable = None
1001 self.command = None
1002 self._blacklist = blacklist
1003
1004 def to_results_process(self):
1005 """Resolves file case sensitivity and or late-bound strings."""
1006 # When resolving files, it's normal to get dupe because a file could be
1007 # opened multiple times with different case. Resolve the deduplication
1008 # here.
1009 def fix_path(x):
1010 """Returns the native file path case.
1011
1012 Converts late-bound strings.
1013 """
1014 if not x:
1015 # Do not convert None instance to 'None'.
1016 return x
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001017 x = render(x)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001018 if os.path.isabs(x):
1019 # If the path is not absolute, which tends to happen occasionally on
1020 # Windows, it is not possible to get the native path case so ignore
1021 # that trace. It mostly happens for 'executable' value.
1022 x = get_native_path_case(x)
1023 return x
1024
maruel@chromium.org538141b2013-06-03 20:57:17 +00001025 def fix_and_blacklist_path(x, m):
1026 """Receives a tuple (filepath, mode) and processes filepath."""
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001027 x = fix_path(x)
1028 if not x:
1029 return
1030 # The blacklist needs to be reapplied, since path casing could
1031 # influence blacklisting.
1032 if self._blacklist(x):
1033 return
maruel@chromium.org538141b2013-06-03 20:57:17 +00001034 # Filters out directories. Some may have passed through.
1035 if os.path.isdir(x):
1036 return
1037 return x, m
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001038
maruel@chromium.org538141b2013-06-03 20:57:17 +00001039 # Renders all the files as strings, as some could be RelativePath
1040 # instances. It is important to do it first since there could still be
1041 # multiple entries with the same path but different modes.
1042 rendered = (
1043 fix_and_blacklist_path(f, m) for f, m in self.files.iteritems())
1044 files = sorted(
1045 (f for f in rendered if f),
1046 key=lambda x: (x[0], Results.File.ACCEPTABLE_MODES.index(x[1])))
1047 # Then converting into a dict will automatically clean up lesser
1048 # important values.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001049 files = [
maruel@chromium.org538141b2013-06-03 20:57:17 +00001050 Results.File(None, f, False, None, m)
1051 for f, m in dict(files).iteritems()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001052 ]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001053 return Results.Process(
1054 self.pid,
1055 files,
1056 fix_path(self.executable),
1057 self.command,
1058 fix_path(self.initial_cwd),
1059 [c.to_results_process() for c in self.children])
1060
maruel@chromium.org538141b2013-06-03 20:57:17 +00001061 def add_file(self, filepath, mode):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001062 """Adds a file if it passes the blacklist."""
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001063 if self._blacklist(render(filepath)):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001064 return
maruel@chromium.org538141b2013-06-03 20:57:17 +00001065 logging.debug('add_file(%d, %s, %s)', self.pid, filepath, mode)
1066 # Note that filepath and not render(filepath) is added. It is because
1067 # filepath could be something else than a string, like a RelativePath
1068 # instance for dtrace logs.
1069 modes = Results.File.ACCEPTABLE_MODES
1070 old_mode = self.files.setdefault(filepath, mode)
1071 if old_mode != mode and modes.index(old_mode) < modes.index(mode):
1072 # Take the highest value.
1073 self.files[filepath] = mode
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001074
1075 def __init__(self, blacklist):
1076 self.blacklist = blacklist
1077 # Initial process.
1078 self.root_process = None
1079 # dict to accelerate process lookup, to not have to lookup the whole graph
1080 # each time.
1081 self._process_lookup = {}
1082
1083 class Tracer(object):
1084 """During it's lifetime, the tracing subsystem is enabled."""
1085 def __init__(self, logname):
1086 self._logname = logname
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001087 self._lock = threading.RLock()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001088 self._traces = []
1089 self._initialized = True
maruel@chromium.org4c9b4fa2013-05-29 19:22:14 +00001090 self._scripts_to_cleanup = []
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001091
1092 def trace(self, cmd, cwd, tracename, output):
1093 """Runs the OS-specific trace program on an executable.
1094
1095 Arguments:
1096 - cmd: The command (a list) to run.
1097 - cwd: Current directory to start the child process in.
1098 - tracename: Name of the trace in the logname file.
1099 - output: If False, redirects output to PIPEs.
1100
1101 Returns a tuple (resultcode, output) and updates the internal trace
1102 entries.
1103 """
1104 # The implementation adds an item to self._traces.
1105 raise NotImplementedError(self.__class__.__name__)
1106
1107 def close(self, _timeout=None):
1108 """Saves the meta-data in the logname file.
1109
1110 For kernel-based tracing, stops the tracing subsystem.
1111
1112 Must not be used manually when using 'with' construct.
1113 """
1114 with self._lock:
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001115 if not self._initialized:
1116 raise TracingFailure(
1117 'Called %s.close() on an unitialized object' %
1118 self.__class__.__name__,
1119 None, None, None)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001120 try:
maruel@chromium.org4c9b4fa2013-05-29 19:22:14 +00001121 while self._scripts_to_cleanup:
1122 try:
1123 os.remove(self._scripts_to_cleanup.pop())
1124 except OSError as e:
1125 logging.error('Failed to delete a temporary script: %s', e)
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001126 write_json(self._logname, self._gen_logdata(), False)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001127 finally:
1128 self._initialized = False
1129
1130 def post_process_log(self):
1131 """Post-processes the log so it becomes faster to load afterward.
1132
1133 Must not be used manually when using 'with' construct.
1134 """
1135 assert not self._initialized, 'Must stop tracing first.'
1136
maruel@chromium.org808e3f22013-05-16 16:42:54 +00001137 def _gen_logdata(self):
1138 """Returns the data to be saved in the trace file."""
1139 return {
1140 'traces': self._traces,
1141 }
1142
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001143 def __enter__(self):
1144 """Enables 'with' statement."""
1145 return self
1146
1147 def __exit__(self, exc_type, exc_value, traceback):
1148 """Enables 'with' statement."""
1149 self.close()
1150 # If an exception was thrown, do not process logs.
1151 if not exc_type:
1152 self.post_process_log()
1153
1154 def get_tracer(self, logname):
1155 """Returns an ApiBase.Tracer instance.
1156
1157 Initializes the tracing subsystem, which is a requirement for kernel-based
1158 tracers. Only one tracer instance should be live at a time!
1159
1160 logname is the filepath to the json file that will contain the meta-data
1161 about the logs.
1162 """
1163 return self.Tracer(logname)
1164
1165 @staticmethod
1166 def clean_trace(logname):
1167 """Deletes an old log."""
1168 raise NotImplementedError()
1169
1170 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001171 def parse_log(cls, logname, blacklist, trace_name):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001172 """Processes trace logs and returns the files opened and the files that do
1173 not exist.
1174
1175 It does not track directories.
1176
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001177 Arguments:
1178 - logname: must be an absolute path.
1179 - blacklist: must be a lambda.
1180 - trace_name: optional trace to read, defaults to reading all traces.
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001181
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001182 Most of the time, files that do not exist are temporary test files that
1183 should be put in /tmp instead. See http://crbug.com/116251.
1184
1185 Returns a list of dict with keys:
1186 - results: A Results instance.
1187 - trace: The corresponding tracename parameter provided to
1188 get_tracer().trace().
1189 - output: Output gathered during execution, if get_tracer().trace(...,
1190 output=False) was used.
1191 """
1192 raise NotImplementedError(cls.__class__.__name__)
1193
1194
1195class Strace(ApiBase):
1196 """strace implies linux."""
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001197 @staticmethod
1198 def load_filename(filename):
1199 """Parses a filename in a log."""
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001200 # TODO(maruel): Be compatible with strace -x.
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001201 assert isinstance(filename, str)
1202 out = ''
1203 i = 0
1204 while i < len(filename):
1205 c = filename[i]
1206 if c == '\\':
1207 out += chr(int(filename[i+1:i+4], 8))
1208 i += 4
1209 else:
1210 out += c
1211 i += 1
1212 # TODO(maruel): That's not necessarily true that the current code page is
1213 # utf-8.
1214 return out.decode('utf-8')
1215
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001216 class Context(ApiBase.Context):
1217 """Processes a strace log line and keeps the list of existent and non
1218 existent files accessed.
1219
1220 Ignores directories.
1221
1222 Uses late-binding to processes the cwd of each process. The problem is that
1223 strace generates one log file per process it traced but doesn't give any
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001224 information about which process was started when and by who. So process the
1225 logs out of order and use late binding with RelativePath to be able to
1226 deduce the initial directory of each process once all the logs are parsed.
1227
1228 TODO(maruel): Use the script even in the non-sudo case, so log parsing can
1229 be done in two phase: first find the root process, then process the child
1230 processes in order. With that, it should be possible to not use RelativePath
1231 anymore. This would significantly simplify the code!
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001232 """
1233 class Process(ApiBase.Context.Process):
1234 """Represents the state of a process.
1235
1236 Contains all the information retrieved from the pid-specific log.
1237 """
1238 # Function names are using ([a-z_0-9]+)
1239 # This is the most common format. function(args) = result
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001240 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.*?)\)\s+= (.+)$')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001241 # An interrupted function call, only grab the minimal header.
1242 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
1243 # A resumed function call.
1244 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1245 # A process received a signal.
1246 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1247 # A process didn't handle a signal. Ignore any junk appearing before,
1248 # because the process was forcibly killed so it won't open any new file.
1249 RE_KILLED = re.compile(
1250 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1251 # The process has exited.
1252 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1253 # A call was canceled. Ignore any prefix.
1254 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1255 # Happens when strace fails to even get the function name.
1256 UNNAMED_FUNCTION = '????'
1257
1258 # Corner-case in python, a class member function decorator must not be
1259 # @staticmethod.
1260 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1261 """Automatically convert the str 'args' into a list of processed
1262 arguments.
1263
1264 Arguments:
1265 - regexp is used to parse args.
1266 - expect_zero: one of True, False or None.
1267 - True: will check for result.startswith('0') first and will ignore
1268 the trace line completely otherwise. This is important because for
1269 many functions, the regexp will not process if the call failed.
1270 - False: will check for not result.startswith(('?', '-1')) for the
1271 same reason than with True.
1272 - None: ignore result.
1273 """
1274 def meta_hook(function):
1275 assert function.__name__.startswith('handle_')
1276 def hook(self, args, result):
1277 if expect_zero is True and not result.startswith('0'):
1278 return
1279 if expect_zero is False and result.startswith(('?', '-1')):
1280 return
1281 match = re.match(regexp, args)
1282 if not match:
1283 raise TracingFailure(
1284 'Failed to parse %s(%s) = %s' %
1285 (function.__name__[len('handle_'):], args, result),
1286 None, None, None)
1287 return function(self, match.groups(), result)
1288 return hook
1289 return meta_hook
1290
1291 class RelativePath(object):
1292 """A late-bound relative path."""
1293 def __init__(self, parent, value):
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001294 assert_is_renderable(parent)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001295 self.parent = parent
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001296 assert (
1297 value is None or
1298 (isinstance(value, unicode) and not os.path.isabs(value)))
1299 self.value = value
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001300 if self.value:
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001301 # TODO(maruel): On POSIX, '\\' is a valid character so remove this
1302 # assert.
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001303 assert '\\' not in self.value, value
1304 assert '\\' not in self.value, (repr(value), repr(self.value))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001305
1306 def render(self):
1307 """Returns the current directory this instance is representing.
1308
1309 This function is used to return the late-bound value.
1310 """
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001311 assert self.parent is not None
1312 parent = render(self.parent)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001313 if self.value:
1314 return os.path.normpath(os.path.join(parent, self.value))
1315 return parent
1316
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001317 def __init__(self, root, pid):
1318 """Keeps enough information to be able to guess the original process
1319 root.
1320
1321 strace doesn't store which process was the initial process. So more
1322 information needs to be kept so the graph can be reconstructed from the
1323 flat map.
1324 """
1325 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1326 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1327 assert isinstance(root, ApiBase.Context)
1328 self._root = weakref.ref(root)
1329 # The dict key is the function name of the pending call, like 'open'
1330 # or 'execve'.
1331 self._pending_calls = {}
1332 self._line_number = 0
1333 # Current directory when the process started.
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001334 if isinstance(self._root(), unicode):
1335 self.initial_cwd = self._root()
1336 else:
1337 self.initial_cwd = self.RelativePath(self._root(), None)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001338 self.parentid = None
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001339 self._done = False
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001340
1341 def get_cwd(self):
1342 """Returns the best known value of cwd."""
1343 return self.cwd or self.initial_cwd
1344
1345 def render(self):
1346 """Returns the string value of the RelativePath() object.
1347
1348 Used by RelativePath. Returns the initial directory and not the
1349 current one since the current directory 'cwd' validity is time-limited.
1350
1351 The validity is only guaranteed once all the logs are processed.
1352 """
1353 return self.initial_cwd.render()
1354
1355 def on_line(self, line):
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001356 assert isinstance(line, str)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001357 self._line_number += 1
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001358 try:
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001359 if self._done:
1360 raise TracingFailure(
1361 'Found a trace for a terminated process or corrupted log',
1362 None, None, None)
1363
1364 if self.RE_SIGNAL.match(line):
1365 # Ignore signals.
1366 return
1367
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001368 match = self.RE_KILLED.match(line)
1369 if match:
1370 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1371 self.handle_exit_group(match.group(1), None)
1372 return
1373
1374 match = self.RE_PROCESS_EXITED.match(line)
1375 if match:
1376 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1377 self.handle_exit_group(match.group(1), None)
1378 return
1379
1380 match = self.RE_UNFINISHED.match(line)
1381 if match:
1382 if match.group(1) in self._pending_calls:
1383 raise TracingFailure(
1384 'Found two unfinished calls for the same function',
1385 None, None, None,
1386 self._pending_calls)
1387 self._pending_calls[match.group(1)] = (
1388 match.group(1) + match.group(2))
1389 return
1390
1391 match = self.RE_UNAVAILABLE.match(line)
1392 if match:
1393 # This usually means a process was killed and a pending call was
1394 # canceled.
1395 # TODO(maruel): Look up the last exit_group() trace just above and
1396 # make sure any self._pending_calls[anything] is properly flushed.
1397 return
1398
1399 match = self.RE_RESUMED.match(line)
1400 if match:
1401 if match.group(1) not in self._pending_calls:
1402 raise TracingFailure(
1403 'Found a resumed call that was not logged as unfinished',
1404 None, None, None,
1405 self._pending_calls)
1406 pending = self._pending_calls.pop(match.group(1))
1407 # Reconstruct the line.
1408 line = pending + match.group(2)
1409
1410 match = self.RE_HEADER.match(line)
1411 if not match:
maruel@chromium.orgbcfc9e12013-03-19 18:47:17 +00001412 # The line is corrupted. It happens occasionally when a process is
1413 # killed forcibly with activity going on. Assume the process died.
1414 # No other line can be processed afterward.
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001415 logging.debug('%d is done: %s', self.pid, line)
maruel@chromium.orgbcfc9e12013-03-19 18:47:17 +00001416 self._done = True
1417 return
1418
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001419 if match.group(1) == self.UNNAMED_FUNCTION:
1420 return
1421
1422 # It's a valid line, handle it.
1423 handler = getattr(self, 'handle_%s' % match.group(1), None)
1424 if not handler:
1425 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1426 return handler(match.group(2), match.group(3))
1427 except TracingFailure, e:
1428 # Hack in the values since the handler could be a static function.
1429 e.pid = self.pid
1430 e.line = line
1431 e.line_number = self._line_number
1432 # Re-raise the modified exception.
1433 raise
1434 except (KeyError, NotImplementedError, ValueError), e:
1435 raise TracingFailure(
1436 'Trace generated a %s exception: %s' % (
1437 e.__class__.__name__, str(e)),
1438 self.pid,
1439 self._line_number,
1440 line,
1441 e)
1442
1443 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1444 def handle_access(self, args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001445 self._handle_file(args[0], Results.File.TOUCHED)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001446
1447 @parse_args(r'^\"(.+?)\"$', True)
1448 def handle_chdir(self, args, _result):
1449 """Updates cwd."""
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001450 self.cwd = self._mangle(args[0])
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001451 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1452
maruel@chromium.org538141b2013-06-03 20:57:17 +00001453 @parse_args(r'^\"(.+?)\", (\d+), (\d+)$', False)
1454 def handle_chown(self, args, _result):
1455 # TODO(maruel): Look at result?
1456 self._handle_file(args[0], Results.File.WRITE)
maruel@chromium.org55550722013-05-28 14:43:14 +00001457
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001458 def handle_clone(self, _args, result):
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001459 self._handling_forking('clone', result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001460
1461 def handle_close(self, _args, _result):
1462 pass
1463
maruel@chromium.org538141b2013-06-03 20:57:17 +00001464 @parse_args(r'^\"(.+?)\", (\d+)$', False)
1465 def handle_chmod(self, args, _result):
1466 self._handle_file(args[0], Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001467
maruel@chromium.org538141b2013-06-03 20:57:17 +00001468 @parse_args(r'^\"(.+?)\", (\d+)$', False)
1469 def handle_creat(self, args, _result):
1470 self._handle_file(args[0], Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001471
1472 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1473 def handle_execve(self, args, _result):
1474 # Even if in practice execve() doesn't returns when it succeeds, strace
1475 # still prints '0' as the result.
1476 filepath = args[0]
maruel@chromium.org538141b2013-06-03 20:57:17 +00001477 self._handle_file(filepath, Results.File.READ)
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001478 self.executable = self._mangle(filepath)
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001479 try:
1480 self.command = strace_process_quoted_arguments(args[1])
1481 except ValueError as e:
1482 raise TracingFailure(
1483 'Failed to process command line argument:\n%s' % e.args[0],
1484 None, None, None)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001485
1486 def handle_exit_group(self, _args, _result):
1487 """Removes cwd."""
1488 self.cwd = None
1489
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001490 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True)
1491 def handle_faccessat(self, args, _results):
1492 if args[0] == 'AT_FDCWD':
maruel@chromium.org538141b2013-06-03 20:57:17 +00001493 self._handle_file(args[1], Results.File.TOUCHED)
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001494 else:
1495 raise Exception('Relative faccess not implemented.')
1496
maruel@chromium.org0781f322013-05-28 19:45:49 +00001497 def handle_fallocate(self, _args, result):
1498 pass
1499
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001500 def handle_fork(self, args, result):
1501 self._handle_unknown('fork', args, result)
1502
maruel@chromium.org9f72e0f2012-10-18 14:48:49 +00001503 def handle_futex(self, _args, _result):
1504 pass
1505
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001506 @parse_args(r'^\"(.+?)\", (\d+)$', False)
1507 def handle_getcwd(self, args, _result):
1508 if os.path.isabs(args[0]):
1509 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1510 if not isinstance(self.cwd, unicode):
1511 # Take the occasion to reset the path.
1512 self.cwd = self._mangle(args[0])
1513 else:
1514 # It should always match.
1515 assert self.cwd == Strace.load_filename(args[0]), (
1516 self.cwd, args[0])
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001517
1518 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1519 def handle_link(self, args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001520 self._handle_file(args[0], Results.File.READ)
1521 self._handle_file(args[1], Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001522
1523 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1524 def handle_lstat(self, args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001525 self._handle_file(args[0], Results.File.TOUCHED)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001526
1527 def handle_mkdir(self, _args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001528 # We track content, not directories.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001529 pass
1530
1531 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1532 def handle_open(self, args, _result):
1533 if 'O_DIRECTORY' in args[1]:
1534 return
maruel@chromium.org538141b2013-06-03 20:57:17 +00001535 self._handle_file(
1536 args[0],
1537 Results.File.READ if 'O_RDONLY' in args[1] else Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001538
1539 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1540 def handle_openat(self, args, _result):
1541 if 'O_DIRECTORY' in args[2]:
1542 return
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001543 if args[0] == 'AT_FDCWD':
maruel@chromium.org538141b2013-06-03 20:57:17 +00001544 self._handle_file(
1545 args[1],
1546 Results.File.READ if 'O_RDONLY' in args[2]
1547 else Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001548 else:
1549 # TODO(maruel): Implement relative open if necessary instead of the
1550 # AT_FDCWD flag, let's hope not since this means tracking all active
1551 # directory handles.
maruel@chromium.org538141b2013-06-03 20:57:17 +00001552 raise NotImplementedError('Relative open via openat not implemented.')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001553
1554 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1555 def handle_readlink(self, args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001556 self._handle_file(args[0], Results.File.READ)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001557
1558 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1559 def handle_rename(self, args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001560 self._handle_file(args[0], Results.File.READ)
1561 self._handle_file(args[1], Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001562
1563 def handle_rmdir(self, _args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001564 # We track content, not directories.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001565 pass
1566
1567 def handle_setxattr(self, _args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001568 # TODO(maruel): self._handle_file(args[0], Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001569 pass
1570
1571 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1572 def handle_stat(self, args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001573 self._handle_file(args[0], Results.File.TOUCHED)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001574
maruel@chromium.org538141b2013-06-03 20:57:17 +00001575 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1576 def handle_symlink(self, args, _result):
1577 self._handle_file(args[0], Results.File.TOUCHED)
1578 self._handle_file(args[1], Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001579
csharp@chromium.orgfa672922012-11-02 17:43:31 +00001580 @parse_args(r'^\"(.+?)\", \d+', True)
1581 def handle_truncate(self, args, _result):
maruel@chromium.org538141b2013-06-03 20:57:17 +00001582 self._handle_file(args[0], Results.File.WRITE)
csharp@chromium.orgfa672922012-11-02 17:43:31 +00001583
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001584 def handle_unlink(self, _args, _result):
1585 # In theory, the file had to be created anyway.
1586 pass
1587
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001588 def handle_unlinkat(self, _args, _result):
1589 # In theory, the file had to be created anyway.
1590 pass
1591
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001592 def handle_statfs(self, _args, _result):
1593 pass
1594
csharp@chromium.org11ae0672012-10-30 14:53:17 +00001595 def handle_utimensat(self, _args, _result):
1596 pass
1597
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001598 def handle_vfork(self, _args, result):
1599 self._handling_forking('vfork', result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001600
1601 @staticmethod
1602 def _handle_unknown(function, args, result):
1603 raise TracingFailure(
1604 'Unexpected/unimplemented trace %s(%s)= %s' %
1605 (function, args, result),
1606 None, None, None)
1607
maruel@chromium.org588e09d2013-05-21 14:38:57 +00001608 def _handling_forking(self, name, result):
1609 """Transfers cwd."""
1610 if result.startswith(('?', '-1')):
1611 # The call failed.
1612 return
1613 # Update the other process right away.
1614 childpid = int(result)
1615 child = self._root().get_or_set_proc(childpid)
1616 if child.parentid is not None or childpid in self.children:
1617 raise TracingFailure(
1618 'Found internal inconsitency in process lifetime detection '
1619 'during a %s() call' % name,
1620 None, None, None)
1621
1622 # Copy the cwd object.
1623 child.initial_cwd = self.get_cwd()
1624 child.parentid = self.pid
1625 # It is necessary because the logs are processed out of order.
1626 self.children.append(child)
1627
maruel@chromium.org538141b2013-06-03 20:57:17 +00001628 def _handle_file(self, filepath, mode):
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001629 filepath = self._mangle(filepath)
maruel@chromium.org538141b2013-06-03 20:57:17 +00001630 self.add_file(filepath, mode)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001631
maruel@chromium.orgbea76be2013-06-03 20:59:25 +00001632 def _mangle(self, filepath):
1633 """Decodes a filepath found in the log and convert it to a late-bound
1634 path if necessary.
1635
1636 |filepath| is an strace 'encoded' string and the returned value is
1637 either an unicode string if the path was absolute or a late bound path
1638 otherwise.
1639 """
1640 filepath = Strace.load_filename(filepath)
1641 if os.path.isabs(filepath):
1642 return filepath
1643 else:
1644 if isinstance(self.get_cwd(), unicode):
1645 return os.path.normpath(os.path.join(self.get_cwd(), filepath))
1646 return self.RelativePath(self.get_cwd(), filepath)
1647
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001648 def __init__(self, blacklist, root_pid, initial_cwd):
1649 """|root_pid| may be None when the root process is not known.
1650
1651 In that case, a search is done after reading all the logs to figure out
1652 the root process.
1653 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001654 super(Strace.Context, self).__init__(blacklist)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001655 assert_is_renderable(initial_cwd)
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001656 self.root_pid = root_pid
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001657 self.initial_cwd = initial_cwd
1658
1659 def render(self):
1660 """Returns the string value of the initial cwd of the root process.
1661
1662 Used by RelativePath.
1663 """
1664 return self.initial_cwd
1665
1666 def on_line(self, pid, line):
1667 """Transfers control into the Process.on_line() function."""
1668 self.get_or_set_proc(pid).on_line(line.strip())
1669
1670 def to_results(self):
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001671 """If necessary, finds back the root process and verify consistency."""
1672 if not self.root_pid:
1673 # The non-sudo case. The traced process was started by strace itself,
1674 # so the pid of the traced process is not known.
1675 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1676 if len(root) == 1:
1677 self.root_process = root[0]
1678 # Save it for later.
1679 self.root_pid = self.root_process.pid
1680 else:
1681 # The sudo case. The traced process was started manually so its pid is
1682 # known.
1683 self.root_process = self._process_lookup.get(self.root_pid)
1684 if not self.root_process:
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001685 raise TracingFailure(
1686 'Found internal inconsitency in process lifetime detection '
1687 'while finding the root process',
1688 None,
1689 None,
1690 None,
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001691 self.root_pid,
1692 sorted(self._process_lookup))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001693 process = self.root_process.to_results_process()
1694 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1695 raise TracingFailure(
1696 'Found internal inconsitency in process lifetime detection '
1697 'while looking for len(tree) == len(list)',
1698 None,
1699 None,
1700 None,
1701 sorted(self._process_lookup),
1702 sorted(p.pid for p in process.all))
1703 return Results(process)
1704
1705 def get_or_set_proc(self, pid):
1706 """Returns the Context.Process instance for this pid or creates a new one.
1707 """
1708 if not pid or not isinstance(pid, int):
1709 raise TracingFailure(
1710 'Unpexpected value for pid: %r' % pid,
1711 pid,
1712 None,
1713 None,
1714 pid)
1715 if pid not in self._process_lookup:
1716 self._process_lookup[pid] = self.Process(self, pid)
1717 return self._process_lookup[pid]
1718
1719 @classmethod
1720 def traces(cls):
1721 """Returns the list of all handled traces to pass this as an argument to
1722 strace.
1723 """
1724 prefix = 'handle_'
1725 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1726
1727 class Tracer(ApiBase.Tracer):
1728 MAX_LEN = 256
1729
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00001730 def __init__(self, logname, use_sudo):
1731 super(Strace.Tracer, self).__init__(logname)
1732 self.use_sudo = use_sudo
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001733 if use_sudo:
1734 # TODO(maruel): Use the jump script systematically to make it easy to
1735 # figure out the root process, so RelativePath is not necessary anymore.
maruel@chromium.org4c9b4fa2013-05-29 19:22:14 +00001736 self._child_script = create_exec_thunk()
1737 self._scripts_to_cleanup.append(self._child_script)
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00001738
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001739 def trace(self, cmd, cwd, tracename, output):
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001740 """Runs strace on an executable.
1741
1742 When use_sudo=True, it is a 3-phases process: start the thunk, start
1743 sudo strace with the pid of the thunk and then have the thunk os.execve()
1744 the process to trace.
1745 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001746 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1747 assert os.path.isabs(cmd[0]), cmd[0]
1748 assert os.path.isabs(cwd), cwd
1749 assert os.path.normpath(cwd) == cwd, cwd
1750 with self._lock:
1751 if not self._initialized:
1752 raise TracingFailure(
1753 'Called Tracer.trace() on an unitialized object',
1754 None, None, None, tracename)
1755 assert tracename not in (i['trace'] for i in self._traces)
1756 stdout = stderr = None
1757 if output:
1758 stdout = subprocess.PIPE
1759 stderr = subprocess.STDOUT
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001760
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001761 # Ensure all file related APIs are hooked.
1762 traces = ','.join(Strace.Context.traces() + ['file'])
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001763 flags = [
maruel@chromium.orga5cfdc32013-05-28 14:46:15 +00001764 # Each child process has its own trace file. It is necessary because
1765 # strace may generate corrupted log file if multiple processes are
1766 # heavily doing syscalls simultaneously.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001767 '-ff',
maruel@chromium.orga5cfdc32013-05-28 14:46:15 +00001768 # Reduce whitespace usage.
1769 '-a1',
1770 # hex encode non-ascii strings.
1771 # TODO(maruel): '-x',
1772 # TODO(maruel): '-ttt',
1773 # Signals are unnecessary noise here. Note the parser can cope with them
1774 # but reduce the unnecessary output.
1775 '-esignal=none',
1776 # Print as much data as wanted.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001777 '-s', '%d' % self.MAX_LEN,
1778 '-e', 'trace=%s' % traces,
1779 '-o', self._logname + '.' + tracename,
1780 ]
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001781
1782 if self.use_sudo:
1783 pipe_r, pipe_w = os.pipe()
1784 # Start the child process paused.
maruel@chromium.org4c9b4fa2013-05-29 19:22:14 +00001785 target_cmd = [sys.executable, self._child_script, str(pipe_r)] + cmd
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001786 logging.debug(' '.join(target_cmd))
1787 child_proc = subprocess.Popen(
1788 target_cmd,
1789 stdin=subprocess.PIPE,
1790 stdout=stdout,
1791 stderr=stderr,
1792 cwd=cwd)
1793
1794 # TODO(maruel): both processes must use the same UID for it to work
1795 # without sudo. Look why -p is failing at the moment without sudo.
1796 trace_cmd = [
1797 'sudo',
1798 'strace',
1799 '-p', str(child_proc.pid),
1800 ] + flags
1801 logging.debug(' '.join(trace_cmd))
1802 strace_proc = subprocess.Popen(
1803 trace_cmd,
1804 cwd=cwd,
1805 stdin=subprocess.PIPE,
1806 stdout=subprocess.PIPE,
1807 stderr=subprocess.PIPE)
1808
1809 line = strace_proc.stderr.readline()
1810 if not re.match(r'^Process \d+ attached \- interrupt to quit$', line):
1811 # TODO(maruel): Raise an exception.
1812 assert False, line
1813
1814 # Now fire the child process.
1815 os.write(pipe_w, 'x')
1816
1817 out = child_proc.communicate()[0]
1818 strace_out = strace_proc.communicate()[0]
1819
1820 # TODO(maruel): if strace_proc.returncode: Add an exception.
1821 saved_out = strace_out if strace_proc.returncode else out
1822 root_pid = child_proc.pid
1823 else:
1824 # Non-sudo case.
1825 trace_cmd = [
1826 'strace',
1827 ] + flags + cmd
1828 logging.debug(' '.join(trace_cmd))
1829 child_proc = subprocess.Popen(
1830 trace_cmd,
1831 cwd=cwd,
1832 stdin=subprocess.PIPE,
1833 stdout=stdout,
1834 stderr=stderr)
1835 out = child_proc.communicate()[0]
1836 # TODO(maruel): Walk the logs and figure out the root process would
1837 # simplify parsing the logs a *lot*.
1838 saved_out = out
1839 # The trace reader will have to figure out.
1840 root_pid = None
1841
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001842 with self._lock:
1843 assert tracename not in (i['trace'] for i in self._traces)
1844 self._traces.append(
1845 {
1846 'cmd': cmd,
1847 'cwd': cwd,
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001848 'output': saved_out,
1849 'pid': root_pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001850 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001851 })
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001852 return child_proc.returncode, out
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001853
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00001854 def __init__(self, use_sudo=None):
1855 super(Strace, self).__init__()
1856 self.use_sudo = use_sudo
1857
1858 def get_tracer(self, logname):
1859 return self.Tracer(logname, self.use_sudo)
1860
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001861 @staticmethod
1862 def clean_trace(logname):
1863 if os.path.isfile(logname):
1864 os.remove(logname)
1865 # Also delete any pid specific file from previous traces.
1866 for i in glob.iglob(logname + '.*'):
1867 if i.rsplit('.', 1)[1].isdigit():
1868 os.remove(i)
1869
1870 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001871 def parse_log(cls, logname, blacklist, trace_name):
1872 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00001873 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001874 data = read_json(logname)
1875 out = []
1876 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00001877 if trace_name and item['trace'] != trace_name:
1878 continue
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001879 result = {
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001880 'output': item['output'],
maruel@chromium.org964ddde2012-10-29 18:13:46 +00001881 'trace': item['trace'],
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001882 }
1883 try:
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001884 context = cls.Context(blacklist, item['pid'], item['cwd'])
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001885 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001886 logging.debug('Reading %s', pidfile)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001887 pid = pidfile.rsplit('.', 1)[1]
1888 if pid.isdigit():
1889 pid = int(pid)
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001890 found_line = False
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001891 for line in open(pidfile, 'rb'):
1892 context.on_line(pid, line)
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001893 found_line = True
1894 if not found_line:
1895 # Ensures that a completely empty trace still creates the
1896 # corresponding Process instance by logging a dummy line.
1897 context.on_line(pid, '')
1898 else:
1899 logging.warning('Found unexpected file %s', pidfile)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001900 result['results'] = context.to_results()
maruel@chromium.org424f5692012-10-15 15:42:38 +00001901 except TracingFailure:
1902 result['exception'] = sys.exc_info()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001903 out.append(result)
1904 return out
1905
1906
1907class Dtrace(ApiBase):
1908 """Uses DTrace framework through dtrace. Requires root access.
1909
1910 Implies Mac OSX.
1911
1912 dtruss can't be used because it has compatibility issues with python.
1913
1914 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1915 get the absolute path of the 'cwd' dtrace variable from the probe.
1916
1917 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1918 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1919 manually.
1920
1921 errno is not printed in the log since this implementation currently only cares
1922 about files that were successfully opened.
1923 """
1924 class Context(ApiBase.Context):
1925 # Format: index pid function(args)
1926 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1927
1928 # Arguments parsing.
1929 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1930 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1931 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1932 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1933 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1934 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1935
maruel@chromium.orgac361162013-06-04 15:54:06 +00001936 # O_DIRECTORY is not defined on Windows and dtrace doesn't exist on Windows.
1937 O_DIRECTORY = os.O_DIRECTORY if hasattr(os, 'O_DIRECTORY') else None
maruel@chromium.org538141b2013-06-03 20:57:17 +00001938 O_RDWR = os.O_RDWR
1939 O_WRONLY = os.O_WRONLY
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001940
1941 class Process(ApiBase.Context.Process):
1942 def __init__(self, *args):
1943 super(Dtrace.Context.Process, self).__init__(*args)
1944 self.cwd = self.initial_cwd
1945
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001946 def __init__(self, blacklist, thunk_pid, initial_cwd):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001947 logging.info(
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001948 '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001949 super(Dtrace.Context, self).__init__(blacklist)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001950 assert isinstance(initial_cwd, unicode), initial_cwd
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00001951 # Process ID of the temporary script created by create_subprocess_thunk().
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00001952 self._thunk_pid = thunk_pid
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001953 self._initial_cwd = initial_cwd
1954 self._line_number = 0
1955
1956 def on_line(self, line):
maruel@chromium.org306e0e72012-11-02 18:22:03 +00001957 assert isinstance(line, unicode), line
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00001958 self._line_number += 1
1959 match = self.RE_HEADER.match(line)
1960 if not match:
1961 raise TracingFailure(
1962 'Found malformed line: %s' % line,
1963 None,
1964 self._line_number,
1965 line)
1966 fn = getattr(
1967 self,
1968 'handle_%s' % match.group(2).replace('-', '_'),
1969 self._handle_ignored)
1970 # It is guaranteed to succeed because of the regexp. Or at least I thought
1971 # it would.
1972 pid = int(match.group(1))
1973 try:
1974 return fn(pid, match.group(3))
1975 except TracingFailure, e:
1976 # Hack in the values since the handler could be a static function.
1977 e.pid = pid
1978 e.line = line
1979 e.line_number = self._line_number
1980 # Re-raise the modified exception.
1981 raise
1982 except (KeyError, NotImplementedError, ValueError), e:
1983 raise TracingFailure(
1984 'Trace generated a %s exception: %s' % (
1985 e.__class__.__name__, str(e)),
1986 pid,
1987 self._line_number,
1988 line,
1989 e)
1990
1991 def to_results(self):
1992 process = self.root_process.to_results_process()
1993 # Internal concistency check.
1994 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1995 raise TracingFailure(
1996 'Found internal inconsitency in process lifetime detection '
1997 'while looking for len(tree) == len(list)',
1998 None,
1999 None,
2000 None,
2001 sorted(self._process_lookup),
2002 sorted(p.pid for p in process.all))
2003 return Results(process)
2004
2005 def handle_dtrace_BEGIN(self, _pid, args):
2006 if not self.RE_DTRACE_BEGIN.match(args):
2007 raise TracingFailure(
2008 'Found internal inconsitency in dtrace_BEGIN log line',
2009 None, None, None)
2010
2011 def handle_proc_start(self, pid, args):
2012 """Transfers cwd.
2013
2014 The dtrace script already takes care of only tracing the processes that
2015 are child of the traced processes so there is no need to verify the
2016 process hierarchy.
2017 """
2018 if pid in self._process_lookup:
2019 raise TracingFailure(
2020 'Found internal inconsitency in proc_start: %d started two times' %
2021 pid,
2022 None, None, None)
2023 match = self.RE_PROC_START.match(args)
2024 if not match:
2025 raise TracingFailure(
2026 'Failed to parse arguments: %s' % args,
2027 None, None, None)
2028 ppid = int(match.group(1))
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002029 if ppid == self._thunk_pid and not self.root_process:
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002030 proc = self.root_process = self.Process(
2031 self.blacklist, pid, self._initial_cwd)
2032 elif ppid in self._process_lookup:
2033 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
2034 self._process_lookup[ppid].children.append(proc)
2035 else:
2036 # Another process tree, ignore.
2037 return
2038 self._process_lookup[pid] = proc
2039 logging.debug(
2040 'New child: %s -> %d cwd:%s' %
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002041 (ppid, pid, render(proc.initial_cwd)))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002042
2043 def handle_proc_exit(self, pid, _args):
2044 """Removes cwd."""
2045 if pid in self._process_lookup:
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002046 # self._thunk_pid is not traced itself and other traces run neither.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002047 self._process_lookup[pid].cwd = None
2048
2049 def handle_execve(self, pid, args):
2050 """Sets the process' executable.
2051
2052 TODO(maruel): Read command line arguments. See
2053 https://discussions.apple.com/thread/1980539 for an example.
2054 https://gist.github.com/1242279
2055
2056 Will have to put the answer at http://stackoverflow.com/questions/7556249.
2057 :)
2058 """
2059 if not pid in self._process_lookup:
2060 # Another process tree, ignore.
2061 return
2062 match = self.RE_EXECVE.match(args)
2063 if not match:
2064 raise TracingFailure(
2065 'Failed to parse arguments: %r' % args,
2066 None, None, None)
2067 proc = self._process_lookup[pid]
2068 proc.executable = match.group(1)
maruel@chromium.org538141b2013-06-03 20:57:17 +00002069 self._handle_file(pid, proc.executable, Results.File.READ)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002070 proc.command = self.process_escaped_arguments(match.group(3))
2071 if int(match.group(2)) != len(proc.command):
2072 raise TracingFailure(
2073 'Failed to parse execve() arguments: %s' % args,
2074 None, None, None)
2075
2076 def handle_chdir(self, pid, args):
2077 """Updates cwd."""
2078 if pid not in self._process_lookup:
2079 # Another process tree, ignore.
2080 return
2081 cwd = self.RE_CHDIR.match(args).group(1)
2082 if not cwd.startswith('/'):
2083 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
2084 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
2085 else:
2086 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
2087 cwd2 = cwd
2088 self._process_lookup[pid].cwd = cwd2
2089
2090 def handle_open_nocancel(self, pid, args):
2091 """Redirects to handle_open()."""
2092 return self.handle_open(pid, args)
2093
2094 def handle_open(self, pid, args):
2095 if pid not in self._process_lookup:
2096 # Another process tree, ignore.
2097 return
2098 match = self.RE_OPEN.match(args)
2099 if not match:
2100 raise TracingFailure(
2101 'Failed to parse arguments: %s' % args,
2102 None, None, None)
2103 flag = int(match.group(2), 16)
2104 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
2105 # Ignore directories.
2106 return
maruel@chromium.org538141b2013-06-03 20:57:17 +00002107 self._handle_file(
2108 pid,
2109 match.group(1),
2110 Results.File.READ if not ((self.O_RDWR | self.O_WRONLY) & flag)
2111 else Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002112
2113 def handle_rename(self, pid, args):
2114 if pid not in self._process_lookup:
2115 # Another process tree, ignore.
2116 return
2117 match = self.RE_RENAME.match(args)
2118 if not match:
2119 raise TracingFailure(
2120 'Failed to parse arguments: %s' % args,
2121 None, None, None)
maruel@chromium.org538141b2013-06-03 20:57:17 +00002122 self._handle_file(pid, match.group(1), Results.File.READ)
2123 self._handle_file(pid, match.group(2), Results.File.WRITE)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002124
maruel@chromium.org538141b2013-06-03 20:57:17 +00002125 def _handle_file(self, pid, filepath, mode):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002126 if not filepath.startswith('/'):
2127 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
2128 # We can get '..' in the path.
2129 filepath = os.path.normpath(filepath)
2130 # Sadly, still need to filter out directories here;
2131 # saw open_nocancel(".", 0, 0) = 0 lines.
2132 if os.path.isdir(filepath):
2133 return
maruel@chromium.org538141b2013-06-03 20:57:17 +00002134 self._process_lookup[pid].add_file(filepath, mode)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002135
2136 def handle_ftruncate(self, pid, args):
2137 """Just used as a signal to kill dtrace, ignoring."""
2138 pass
2139
2140 @staticmethod
2141 def _handle_ignored(pid, args):
2142 """Is called for all the event traces that are not handled."""
2143 raise NotImplementedError('Please implement me')
2144
2145 @staticmethod
2146 def process_escaped_arguments(text):
2147 """Extracts escaped arguments on a string and return the arguments as a
2148 list.
2149
2150 Implemented as an automaton.
2151
2152 Example:
2153 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
2154 function will return ['python2.7', '-c', 'print("hi")]
2155 """
2156 if not text.endswith('\\0'):
2157 raise ValueError('String is not null terminated: %r' % text, text)
2158 text = text[:-2]
2159
2160 def unescape(x):
2161 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
2162 out = []
2163 escaped = False
2164 for i in x:
2165 if i == '\\' and not escaped:
2166 escaped = True
2167 continue
2168 escaped = False
2169 out.append(i)
2170 return ''.join(out)
2171
2172 return [unescape(i) for i in text.split('\\001')]
2173
2174 class Tracer(ApiBase.Tracer):
2175 # pylint: disable=C0301
2176 #
2177 # To understand the following code, you'll want to take a look at:
2178 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
2179 # https://wikis.oracle.com/display/DTrace/Variables
2180 # http://docs.oracle.com/cd/E19205-01/820-4221/
2181 #
2182 # 0. Dump all the valid probes into a text file. It is important, you
2183 # want to redirect into a file and you don't want to constantly 'sudo'.
2184 # $ sudo dtrace -l > probes.txt
2185 #
2186 # 1. Count the number of probes:
2187 # $ wc -l probes.txt
2188 # 81823 # On OSX 10.7, including 1 header line.
2189 #
2190 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
2191 # likes and skipping the header with NR>1:
2192 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
2193 # dtrace
2194 # fbt
2195 # io
2196 # ip
2197 # lockstat
2198 # mach_trap
2199 # proc
2200 # profile
2201 # sched
2202 # syscall
2203 # tcp
2204 # vminfo
2205 #
2206 # 3. List of valid probes:
2207 # $ grep syscall probes.txt | less
2208 # or use dtrace directly:
2209 # $ sudo dtrace -l -P syscall | less
2210 #
2211 # trackedpid is an associative array where its value can be 0, 1 or 2.
2212 # 0 is for untracked processes and is the default value for items not
2213 # in the associative array.
2214 # 1 is for tracked processes.
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00002215 # 2 is for the script created by create_subprocess_thunk() only. It is not
2216 # tracked itself but all its decendants are.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002217 #
2218 # The script will kill itself only once waiting_to_die == 1 and
2219 # current_processes == 0, so that both getlogin() was called and that
2220 # all traced processes exited.
2221 #
2222 # TODO(maruel): Use cacheable predicates. See
2223 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
2224 D_CODE = """
2225 dtrace:::BEGIN {
2226 waiting_to_die = 0;
2227 current_processes = 0;
2228 logindex = 0;
2229 printf("%d %d %s_%s(\\"%s\\")\\n",
2230 logindex, PID, probeprov, probename, SCRIPT);
2231 logindex++;
2232 }
2233
2234 proc:::start /trackedpid[ppid]/ {
2235 trackedpid[pid] = 1;
2236 current_processes += 1;
2237 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
2238 logindex, pid, probeprov, probename,
2239 ppid,
2240 execname,
2241 current_processes);
2242 logindex++;
2243 }
2244 /* Should use SCRIPT but there is no access to this variable at that
2245 * point. */
2246 proc:::start /ppid == PID && execname == "Python"/ {
2247 trackedpid[pid] = 2;
2248 current_processes += 1;
2249 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
2250 logindex, pid, probeprov, probename,
2251 ppid,
2252 execname,
2253 current_processes);
2254 logindex++;
2255 }
2256 proc:::exit /trackedpid[pid] &&
2257 current_processes == 1 &&
2258 waiting_to_die == 1/ {
2259 trackedpid[pid] = 0;
2260 current_processes -= 1;
2261 printf("%d %d %s_%s(%d)\\n",
2262 logindex, pid, probeprov, probename,
2263 current_processes);
2264 logindex++;
2265 exit(0);
2266 }
2267 proc:::exit /trackedpid[pid]/ {
2268 trackedpid[pid] = 0;
2269 current_processes -= 1;
2270 printf("%d %d %s_%s(%d)\\n",
2271 logindex, pid, probeprov, probename,
2272 current_processes);
2273 logindex++;
2274 }
2275
2276 /* Use an arcane function to detect when we need to die */
2277 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
2278 waiting_to_die = 1;
2279 printf("%d %d %s()\\n", logindex, pid, probefunc);
2280 logindex++;
2281 }
2282 syscall::ftruncate:entry /
2283 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
2284 exit(0);
2285 }
2286
2287 syscall::open*:entry /trackedpid[pid] == 1/ {
2288 self->open_arg0 = arg0;
2289 self->open_arg1 = arg1;
2290 self->open_arg2 = arg2;
2291 }
2292 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
2293 this->open_arg0 = copyinstr(self->open_arg0);
2294 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
2295 logindex, pid, probefunc,
2296 this->open_arg0,
2297 self->open_arg1,
2298 self->open_arg2);
2299 logindex++;
2300 this->open_arg0 = 0;
2301 }
2302 syscall::open*:return /trackedpid[pid] == 1/ {
2303 self->open_arg0 = 0;
2304 self->open_arg1 = 0;
2305 self->open_arg2 = 0;
2306 }
2307
2308 syscall::rename:entry /trackedpid[pid] == 1/ {
2309 self->rename_arg0 = arg0;
2310 self->rename_arg1 = arg1;
2311 }
2312 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
2313 this->rename_arg0 = copyinstr(self->rename_arg0);
2314 this->rename_arg1 = copyinstr(self->rename_arg1);
2315 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
2316 logindex, pid, probefunc,
2317 this->rename_arg0,
2318 this->rename_arg1);
2319 logindex++;
2320 this->rename_arg0 = 0;
2321 this->rename_arg1 = 0;
2322 }
2323 syscall::rename:return /trackedpid[pid] == 1/ {
2324 self->rename_arg0 = 0;
2325 self->rename_arg1 = 0;
2326 }
2327
2328 /* Track chdir, it's painful because it is only receiving relative path.
2329 */
2330 syscall::chdir:entry /trackedpid[pid] == 1/ {
2331 self->chdir_arg0 = arg0;
2332 }
2333 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
2334 this->chdir_arg0 = copyinstr(self->chdir_arg0);
2335 printf("%d %d %s(\\"%s\\")\\n",
2336 logindex, pid, probefunc,
2337 this->chdir_arg0);
2338 logindex++;
2339 this->chdir_arg0 = 0;
2340 }
2341 syscall::chdir:return /trackedpid[pid] == 1/ {
2342 self->chdir_arg0 = 0;
2343 }
2344 """
2345
2346 # execve-specific code, tends to throw a lot of exceptions.
2347 D_CODE_EXECVE = """
2348 /* Finally what we care about! */
2349 syscall::exec*:entry /trackedpid[pid]/ {
2350 self->exec_arg0 = copyinstr(arg0);
2351 /* Incrementally probe for a NULL in the argv parameter of execve() to
2352 * figure out argc. */
2353 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
2354 * found. */
2355 self->exec_argc = 0;
2356 /* Probe for argc==1 */
2357 this->exec_argv = (user_addr_t*)copyin(
2358 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2359 self->exec_argc = this->exec_argv[self->exec_argc] ?
2360 (self->exec_argc + 1) : self->exec_argc;
2361
2362 /* Probe for argc==2 */
2363 this->exec_argv = (user_addr_t*)copyin(
2364 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2365 self->exec_argc = this->exec_argv[self->exec_argc] ?
2366 (self->exec_argc + 1) : self->exec_argc;
2367
2368 /* Probe for argc==3 */
2369 this->exec_argv = (user_addr_t*)copyin(
2370 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2371 self->exec_argc = this->exec_argv[self->exec_argc] ?
2372 (self->exec_argc + 1) : self->exec_argc;
2373
2374 /* Probe for argc==4 */
2375 this->exec_argv = (user_addr_t*)copyin(
2376 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
2377 self->exec_argc = this->exec_argv[self->exec_argc] ?
2378 (self->exec_argc + 1) : self->exec_argc;
2379
2380 /* Copy the inputs strings since there is no guarantee they'll be
2381 * present after the call completed. */
2382 self->exec_argv0 = (self->exec_argc > 0) ?
2383 copyinstr(this->exec_argv[0]) : "";
2384 self->exec_argv1 = (self->exec_argc > 1) ?
2385 copyinstr(this->exec_argv[1]) : "";
2386 self->exec_argv2 = (self->exec_argc > 2) ?
2387 copyinstr(this->exec_argv[2]) : "";
2388 self->exec_argv3 = (self->exec_argc > 3) ?
2389 copyinstr(this->exec_argv[3]) : "";
2390 this->exec_argv = 0;
2391 }
2392 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2393 /* We need to join strings here, as using multiple printf() would
2394 * cause tearing when multiple threads/processes are traced.
2395 * Since it is impossible to escape a string and join it to another one,
2396 * like sprintf("%s%S", previous, more), use hackery.
2397 * Each of the elements are split with a \\1. \\0 cannot be used because
2398 * it is simply ignored. This will conflict with any program putting a
2399 * \\1 in their execve() string but this should be "rare enough" */
2400 this->args = "";
2401 /* Process exec_argv[0] */
2402 this->args = strjoin(
2403 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2404
2405 /* Process exec_argv[1] */
2406 this->args = strjoin(
2407 this->args, (self->exec_argc > 1) ? "\\1" : "");
2408 this->args = strjoin(
2409 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2410
2411 /* Process exec_argv[2] */
2412 this->args = strjoin(
2413 this->args, (self->exec_argc > 2) ? "\\1" : "");
2414 this->args = strjoin(
2415 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2416
2417 /* Process exec_argv[3] */
2418 this->args = strjoin(
2419 this->args, (self->exec_argc > 3) ? "\\1" : "");
2420 this->args = strjoin(
2421 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2422
2423 /* Prints self->exec_argc to permits verifying the internal
2424 * consistency since this code is quite fishy. */
2425 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2426 logindex, pid, probefunc,
2427 self->exec_arg0,
2428 self->exec_argc,
2429 this->args);
2430 logindex++;
2431 this->args = 0;
2432 }
2433 syscall::exec*:return /trackedpid[pid]/ {
2434 self->exec_arg0 = 0;
2435 self->exec_argc = 0;
2436 self->exec_argv0 = 0;
2437 self->exec_argv1 = 0;
2438 self->exec_argv2 = 0;
2439 self->exec_argv3 = 0;
2440 }
2441 """
2442
2443 # Code currently not used.
2444 D_EXTRANEOUS = """
2445 /* This is a good learning experience, since it traces a lot of things
2446 * related to the process and child processes.
2447 * Warning: it generates a gigantic log. For example, tracing
2448 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2449 * several minutes to execute.
2450 */
2451 /*
2452 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2453 printf("%d %d %s_%s() = %d\\n",
2454 logindex, pid, probeprov, probefunc, errno);
2455 logindex++;
2456 }
2457 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2458 printf("%d %d %s_%s() = %d\\n",
2459 logindex, pid, probeprov, probefunc, errno);
2460 logindex++;
2461 }
2462 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2463 printf("%d %d %s_%s() = %d\\n",
2464 logindex, pid, probeprov, probefunc, errno);
2465 logindex++;
2466 }
2467 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2468 printf("%d %d %s_%s() = %d\\n",
2469 logindex, pid, probeprov, probefunc, errno);
2470 logindex++;
2471 }
2472 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2473 printf("%d %d %s_%s() = %d\\n",
2474 logindex, pid, probeprov, probefunc, errno);
2475 logindex++;
2476 }
2477 */
2478 /* TODO(maruel): *stat* functions and friends
2479 syscall::access:return,
2480 syscall::chdir:return,
2481 syscall::chflags:return,
2482 syscall::chown:return,
2483 syscall::chroot:return,
2484 syscall::getattrlist:return,
2485 syscall::getxattr:return,
2486 syscall::lchown:return,
2487 syscall::lstat64:return,
2488 syscall::lstat:return,
2489 syscall::mkdir:return,
2490 syscall::pathconf:return,
2491 syscall::readlink:return,
2492 syscall::removexattr:return,
2493 syscall::setxattr:return,
2494 syscall::stat64:return,
2495 syscall::stat:return,
2496 syscall::truncate:return,
2497 syscall::unlink:return,
2498 syscall::utimes:return,
2499 */
2500 """
2501
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00002502 def __init__(self, logname, use_sudo):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002503 """Starts the log collection with dtrace.
2504
2505 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2506 this needs to wait for dtrace to be "warmed up".
2507 """
2508 super(Dtrace.Tracer, self).__init__(logname)
maruel@chromium.org4c9b4fa2013-05-29 19:22:14 +00002509 # This script is used as a signal to figure out the root process.
2510 self._signal_script = create_subprocess_thunk()
2511 self._scripts_to_cleanup.append(self._signal_script)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002512 # This unique dummy temp file is used to signal the dtrace script that it
2513 # should stop as soon as all the child processes are done. A bit hackish
2514 # but works fine enough.
2515 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2516 prefix='trace_signal_file')
2517
maruel@chromium.org9b2112a2013-04-05 18:01:02 +00002518 dtrace_path = '/usr/sbin/dtrace'
2519 if not os.path.isfile(dtrace_path):
2520 dtrace_path = 'dtrace'
2521 elif use_sudo is None and (os.stat(dtrace_path).st_mode & stat.S_ISUID):
2522 # No need to sudo. For those following at home, don't do that.
2523 use_sudo = False
2524
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002525 # Note: do not use the -p flag. It's useless if the initial process quits
2526 # too fast, resulting in missing traces from the grand-children. The D
2527 # code manages the dtrace lifetime itself.
2528 trace_cmd = [
maruel@chromium.org9b2112a2013-04-05 18:01:02 +00002529 dtrace_path,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002530 # Use a larger buffer if getting 'out of scratch space' errors.
2531 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2532 '-b', '10m',
2533 '-x', 'dynvarsize=10m',
2534 #'-x', 'dtrace_global_maxsize=1m',
2535 '-x', 'evaltime=exec',
2536 '-o', '/dev/stderr',
2537 '-q',
2538 '-n', self._get_dtrace_code(),
2539 ]
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00002540 if use_sudo is not False:
2541 trace_cmd.insert(0, 'sudo')
2542
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002543 with open(self._logname + '.log', 'wb') as logfile:
2544 self._dtrace = subprocess.Popen(
2545 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2546 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2547
2548 # Reads until one line is printed, which signifies dtrace is up and ready.
2549 with open(self._logname + '.log', 'rb') as logfile:
2550 while 'dtrace_BEGIN' not in logfile.readline():
2551 if self._dtrace.poll() is not None:
2552 # Do a busy wait. :/
2553 break
2554 logging.debug('dtrace started')
2555
2556 def _get_dtrace_code(self):
2557 """Setups the D code to implement child process tracking.
2558
2559 Injects the cookie in the script so it knows when to stop.
2560
2561 The script will detect any instance of the script created with
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00002562 create_subprocess_thunk() and will start tracing it.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002563 """
maruel@chromium.orgbe75f282012-11-02 19:34:37 +00002564 out = (
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002565 'inline int PID = %d;\n'
2566 'inline string SCRIPT = "%s";\n'
2567 'inline int FILE_ID = %d;\n'
2568 '\n'
2569 '%s') % (
2570 os.getpid(),
maruel@chromium.org4c9b4fa2013-05-29 19:22:14 +00002571 self._signal_script,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002572 self._dummy_file_id,
maruel@chromium.orgbe75f282012-11-02 19:34:37 +00002573 self.D_CODE)
2574 if os.environ.get('TRACE_INPUTS_DTRACE_ENABLE_EXECVE') == '1':
2575 # Do not enable by default since it tends to spew dtrace: error lines
2576 # because the execve() parameters are not in valid memory at the time of
2577 # logging.
2578 # TODO(maruel): Find a way to make this reliable since it's useful but
2579 # only works in limited/trivial uses cases for now.
2580 out += self.D_CODE_EXECVE
2581 return out
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002582
2583 def trace(self, cmd, cwd, tracename, output):
2584 """Runs dtrace on an executable.
2585
2586 This dtruss is broken when it starts the process itself or when tracing
2587 child processes, this code starts a wrapper process
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00002588 generated with create_subprocess_thunk() which starts the executable to
2589 trace.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002590 """
2591 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2592 assert os.path.isabs(cmd[0]), cmd[0]
2593 assert os.path.isabs(cwd), cwd
2594 assert os.path.normpath(cwd) == cwd, cwd
2595 with self._lock:
2596 if not self._initialized:
2597 raise TracingFailure(
2598 'Called Tracer.trace() on an unitialized object',
2599 None, None, None, tracename)
2600 assert tracename not in (i['trace'] for i in self._traces)
2601
2602 # Starts the script wrapper to start the child process. This signals the
2603 # dtrace script that this process is to be traced.
2604 stdout = stderr = None
2605 if output:
2606 stdout = subprocess.PIPE
2607 stderr = subprocess.STDOUT
2608 child_cmd = [
2609 sys.executable,
maruel@chromium.org4c9b4fa2013-05-29 19:22:14 +00002610 self._signal_script,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002611 tracename,
2612 ]
2613 # Call a dummy function so that dtrace knows I'm about to launch a process
2614 # that needs to be traced.
2615 # Yummy.
2616 child = subprocess.Popen(
2617 child_cmd + fix_python_path(cmd),
2618 stdin=subprocess.PIPE,
2619 stdout=stdout,
2620 stderr=stderr,
2621 cwd=cwd)
2622 logging.debug('Started child pid: %d' % child.pid)
2623
2624 out = child.communicate()[0]
2625 # This doesn't mean tracing is done, one of the grand-child process may
2626 # still be alive. It will be tracked with the dtrace script.
2627
2628 with self._lock:
2629 assert tracename not in (i['trace'] for i in self._traces)
2630 self._traces.append(
2631 {
2632 'cmd': cmd,
2633 'cwd': cwd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002634 'pid': child.pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002635 'output': out,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002636 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002637 })
2638 return child.returncode, out
2639
2640 def close(self, timeout=None):
2641 """Terminates dtrace."""
2642 logging.debug('close(%s)' % timeout)
2643 try:
2644 try:
2645 super(Dtrace.Tracer, self).close(timeout)
2646 # Signal dtrace that it should stop now.
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00002647 # ftruncate doesn't exist on Windows.
2648 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002649 if timeout:
2650 start = time.time()
2651 # Use polling. :/
2652 while (self._dtrace.poll() is None and
2653 (time.time() - start) < timeout):
2654 time.sleep(0.1)
2655 self._dtrace.kill()
2656 self._dtrace.wait()
2657 finally:
2658 # Make sure to kill it in any case.
2659 if self._dtrace.poll() is None:
2660 try:
2661 self._dtrace.kill()
2662 self._dtrace.wait()
2663 except OSError:
2664 pass
2665
2666 if self._dtrace.returncode != 0:
2667 # Warn about any dtrace failure but basically ignore it.
2668 print 'dtrace failure: %s' % self._dtrace.returncode
2669 finally:
2670 os.close(self._dummy_file_id)
2671 os.remove(self._dummy_file_name)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002672
2673 def post_process_log(self):
2674 """Sorts the log back in order when each call occured.
2675
2676 dtrace doesn't save the buffer in strict order since it keeps one buffer
2677 per CPU.
2678 """
2679 super(Dtrace.Tracer, self).post_process_log()
2680 logname = self._logname + '.log'
2681 with open(logname, 'rb') as logfile:
2682 lines = [l for l in logfile if l.strip()]
2683 errors = [l for l in lines if l.startswith('dtrace:')]
2684 if errors:
2685 raise TracingFailure(
2686 'Found errors in the trace: %s' % '\n'.join(errors),
2687 None, None, None, logname)
2688 try:
2689 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2690 except ValueError:
2691 raise TracingFailure(
2692 'Found errors in the trace: %s' % '\n'.join(
2693 l for l in lines if l.split(' ', 1)[0].isdigit()),
2694 None, None, None, logname)
2695 with open(logname, 'wb') as logfile:
2696 logfile.write(''.join(lines))
2697
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00002698 def __init__(self, use_sudo=None):
2699 super(Dtrace, self).__init__()
2700 self.use_sudo = use_sudo
2701
2702 def get_tracer(self, logname):
2703 return self.Tracer(logname, self.use_sudo)
2704
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002705 @staticmethod
2706 def clean_trace(logname):
2707 for ext in ('', '.log'):
2708 if os.path.isfile(logname + ext):
2709 os.remove(logname + ext)
2710
2711 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002712 def parse_log(cls, logname, blacklist, trace_name):
2713 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00002714 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002715
2716 def blacklist_more(filepath):
2717 # All the HFS metadata is in the form /.vol/...
2718 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2719
2720 data = read_json(logname)
2721 out = []
2722 for item in data['traces']:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00002723 if trace_name and item['trace'] != trace_name:
2724 continue
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002725 result = {
2726 'output': item['output'],
2727 'trace': item['trace'],
2728 }
2729 try:
2730 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002731 # It's fine to assume the file as UTF-8: OSX enforces the file names to
2732 # be valid UTF-8 and we control the log output.
2733 for line in codecs.open(logname + '.log', 'rb', encoding='utf-8'):
maruel@chromium.org964ddde2012-10-29 18:13:46 +00002734 context.on_line(line)
2735 result['results'] = context.to_results()
2736 except TracingFailure:
2737 result['exception'] = sys.exc_info()
2738 out.append(result)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002739 return out
2740
2741
2742class LogmanTrace(ApiBase):
2743 """Uses the native Windows ETW based tracing functionality to trace a child
2744 process.
2745
2746 Caveat: this implementations doesn't track cwd or initial_cwd. It is because
2747 the Windows Kernel doesn't have a concept of 'current working directory' at
2748 all. A Win32 process has a map of current directories, one per drive letter
2749 and it is managed by the user mode kernel32.dll. In kernel, a file is always
2750 opened relative to another file_object or as an absolute path. All the current
2751 working directory logic is done in user mode.
2752 """
2753 class Context(ApiBase.Context):
2754 """Processes a ETW log line and keeps the list of existent and non
2755 existent files accessed.
2756
2757 Ignores directories.
2758 """
2759 # These indexes are for the stripped version in json.
2760 EVENT_NAME = 0
2761 TYPE = 1
2762 PID = 2
2763 TID = 3
2764 PROCESSOR_ID = 4
2765 TIMESTAMP = 5
2766 USER_DATA = 6
2767
2768 class Process(ApiBase.Context.Process):
2769 def __init__(self, *args):
2770 super(LogmanTrace.Context.Process, self).__init__(*args)
2771 # Handle file objects that succeeded.
2772 self.file_objects = {}
2773
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002774 def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd):
2775 logging.info(
2776 '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name,
2777 thunk_cmd)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002778 super(LogmanTrace.Context, self).__init__(blacklist)
2779 self._drive_map = DosDriveMap()
2780 # Threads mapping to the corresponding process id.
2781 self._threads_active = {}
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002782 # Process ID of the tracer, e.g. the temporary script created by
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00002783 # create_subprocess_thunk(). This is tricky because the process id may
2784 # have been reused.
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002785 self._thunk_pid = thunk_pid
2786 self._thunk_cmd = thunk_cmd
2787 self._trace_name = trace_name
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002788 self._line_number = 0
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002789 self._thunk_process = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002790
2791 def on_line(self, line):
2792 """Processes a json Event line."""
2793 self._line_number += 1
2794 try:
2795 # By Opcode
2796 handler = getattr(
2797 self,
2798 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2799 None)
2800 if not handler:
2801 raise TracingFailure(
2802 'Unexpected event %s_%s' % (
2803 line[self.EVENT_NAME], line[self.TYPE]),
2804 None, None, None)
2805 handler(line)
2806 except TracingFailure, e:
2807 # Hack in the values since the handler could be a static function.
2808 e.pid = line[self.PID]
2809 e.line = line
2810 e.line_number = self._line_number
2811 # Re-raise the modified exception.
2812 raise
2813 except (KeyError, NotImplementedError, ValueError), e:
2814 raise TracingFailure(
2815 'Trace generated a %s exception: %s' % (
2816 e.__class__.__name__, str(e)),
2817 line[self.PID],
2818 self._line_number,
2819 line,
2820 e)
2821
2822 def to_results(self):
2823 if not self.root_process:
2824 raise TracingFailure(
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002825 'Failed to detect the initial process %d' % self._thunk_pid,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002826 None, None, None)
2827 process = self.root_process.to_results_process()
2828 return Results(process)
2829
2830 def _thread_to_process(self, tid):
2831 """Finds the process from the thread id."""
2832 tid = int(tid, 16)
2833 pid = self._threads_active.get(tid)
2834 if not pid or not self._process_lookup.get(pid):
2835 return
2836 return self._process_lookup[pid]
2837
2838 @classmethod
2839 def handle_EventTrace_Header(cls, line):
2840 """Verifies no event was dropped, e.g. no buffer overrun occured."""
2841 BUFFER_SIZE = cls.USER_DATA
2842 #VERSION = cls.USER_DATA + 1
2843 #PROVIDER_VERSION = cls.USER_DATA + 2
2844 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2845 #END_TIME = cls.USER_DATA + 4
2846 #TIMER_RESOLUTION = cls.USER_DATA + 5
2847 #MAX_FILE_SIZE = cls.USER_DATA + 6
2848 #LOG_FILE_MODE = cls.USER_DATA + 7
2849 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2850 #START_BUFFERS = cls.USER_DATA + 9
2851 #POINTER_SIZE = cls.USER_DATA + 10
2852 EVENTS_LOST = cls.USER_DATA + 11
2853 #CPU_SPEED = cls.USER_DATA + 12
2854 #LOGGER_NAME = cls.USER_DATA + 13
2855 #LOG_FILE_NAME = cls.USER_DATA + 14
2856 #BOOT_TIME = cls.USER_DATA + 15
2857 #PERF_FREQ = cls.USER_DATA + 16
2858 #START_TIME = cls.USER_DATA + 17
2859 #RESERVED_FLAGS = cls.USER_DATA + 18
2860 #BUFFERS_LOST = cls.USER_DATA + 19
2861 #SESSION_NAME_STRING = cls.USER_DATA + 20
2862 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2863 if line[EVENTS_LOST] != '0':
2864 raise TracingFailure(
2865 ( '%s events were lost during trace, please increase the buffer '
2866 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2867 None, None, None)
2868
2869 def handle_FileIo_Cleanup(self, line):
2870 """General wisdom: if a file is closed, it's because it was opened.
2871
2872 Note that FileIo_Close is not used since if a file was opened properly but
2873 not closed before the process exits, only Cleanup will be logged.
2874 """
2875 #IRP = self.USER_DATA
2876 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2877 FILE_OBJECT = self.USER_DATA + 2
2878 #FILE_KEY = self.USER_DATA + 3
2879 proc = self._thread_to_process(line[TTID])
2880 if not proc:
2881 # Not a process we care about.
2882 return
2883 file_object = line[FILE_OBJECT]
2884 if file_object in proc.file_objects:
maruel@chromium.org538141b2013-06-03 20:57:17 +00002885 filepath, access_type = proc.file_objects.pop(file_object)
2886 proc.add_file(filepath, access_type)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002887
2888 def handle_FileIo_Create(self, line):
2889 """Handles a file open.
2890
2891 All FileIo events are described at
2892 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2893 for some value of 'description'.
2894
2895 " (..) process and thread id values of the IO events (..) are not valid "
2896 http://msdn.microsoft.com/magazine/ee358703.aspx
2897
2898 The FileIo.Create event doesn't return if the CreateFile() call
2899 succeeded, so keep track of the file_object and check that it is
2900 eventually closed with FileIo_Cleanup.
2901 """
2902 #IRP = self.USER_DATA
2903 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2904 FILE_OBJECT = self.USER_DATA + 2
2905 #CREATE_OPTIONS = self.USER_DATA + 3
2906 #FILE_ATTRIBUTES = self.USER_DATA + 4
maruel@chromium.org538141b2013-06-03 20:57:17 +00002907 #SHARE_ACCESS = self.USER_DATA + 5
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002908 OPEN_PATH = self.USER_DATA + 6
2909
2910 proc = self._thread_to_process(line[TTID])
2911 if not proc:
2912 # Not a process we care about.
2913 return
2914
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002915 raw_path = line[OPEN_PATH]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002916 # Ignore directories and bare drive right away.
2917 if raw_path.endswith(os.path.sep):
2918 return
2919 filepath = self._drive_map.to_win32(raw_path)
2920 # Ignore bare drive right away. Some may still fall through with format
2921 # like '\\?\X:'
2922 if len(filepath) == 2:
2923 return
2924 file_object = line[FILE_OBJECT]
2925 if os.path.isdir(filepath):
2926 # There is no O_DIRECTORY equivalent on Windows. The closed is
2927 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2928 # simply discard directories are they are found.
2929 return
maruel@chromium.org538141b2013-06-03 20:57:17 +00002930 # Override any stale file object.
2931 # TODO(maruel): Figure out a way to detect if the file was opened for
2932 # reading or writting. Sadly CREATE_OPTIONS doesn't seem to be of any help
2933 # here. For now mark as None to make it clear we have no idea what it is
2934 # about.
2935 proc.file_objects[file_object] = (filepath, None)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002936
2937 def handle_FileIo_Rename(self, line):
2938 # TODO(maruel): Handle?
2939 pass
2940
2941 def handle_Process_End(self, line):
2942 pid = line[self.PID]
2943 if self._process_lookup.get(pid):
2944 logging.info('Terminated: %d' % pid)
2945 self._process_lookup[pid] = None
2946 else:
2947 logging.debug('Terminated: %d' % pid)
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002948 if self._thunk_process and self._thunk_process.pid == pid:
2949 self._thunk_process = None
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002950
2951 def handle_Process_Start(self, line):
2952 """Handles a new child process started by PID."""
2953 #UNIQUE_PROCESS_KEY = self.USER_DATA
2954 PROCESS_ID = self.USER_DATA + 1
2955 #PARENT_PID = self.USER_DATA + 2
2956 #SESSION_ID = self.USER_DATA + 3
2957 #EXIT_STATUS = self.USER_DATA + 4
2958 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2959 #USER_SID = self.USER_DATA + 6
2960 IMAGE_FILE_NAME = self.USER_DATA + 7
2961 COMMAND_LINE = self.USER_DATA + 8
2962
2963 ppid = line[self.PID]
2964 pid = int(line[PROCESS_ID], 16)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002965 command_line = CommandLineToArgvW(line[COMMAND_LINE])
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002966 logging.debug(
2967 'New process %d->%d (%s) %s' %
maruel@chromium.org306e0e72012-11-02 18:22:03 +00002968 (ppid, pid, line[IMAGE_FILE_NAME], command_line))
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002969
2970 if pid == self._thunk_pid:
2971 # Need to ignore processes we don't know about because the log is
2972 # system-wide. self._thunk_pid shall start only one process.
2973 # This is tricky though because Windows *loves* to reuse process id and
2974 # it happens often that the process ID of the thunk script created by
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00002975 # create_subprocess_thunk() is reused. So just detecting the pid here is
2976 # not sufficient, we must confirm the command line.
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00002977 if command_line[:len(self._thunk_cmd)] != self._thunk_cmd:
2978 logging.info(
2979 'Ignoring duplicate pid %d for %s: %s while searching for %s',
2980 pid, self._trace_name, command_line, self._thunk_cmd)
2981 return
2982
2983 # TODO(maruel): The check is quite weak. Add the thunk path.
2984 if self._thunk_process:
2985 raise TracingFailure(
2986 ( 'Parent process is _thunk_pid(%d) but thunk_process(%d) is '
2987 'already set') % (self._thunk_pid, self._thunk_process.pid),
2988 None, None, None)
2989 proc = self.Process(self.blacklist, pid, None)
2990 self._thunk_process = proc
2991 return
2992 elif ppid == self._thunk_pid and self._thunk_process:
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00002993 proc = self.Process(self.blacklist, pid, None)
2994 self.root_process = proc
2995 ppid = None
2996 elif self._process_lookup.get(ppid):
2997 proc = self.Process(self.blacklist, pid, None)
2998 self._process_lookup[ppid].children.append(proc)
2999 else:
3000 # Ignore
3001 return
3002 self._process_lookup[pid] = proc
3003
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003004 proc.command = command_line
3005 proc.executable = line[IMAGE_FILE_NAME]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003006 # proc.command[0] may be the absolute path of 'executable' but it may be
3007 # anything else too. If it happens that command[0] ends with executable,
3008 # use it, otherwise defaults to the base name.
3009 cmd0 = proc.command[0].lower()
3010 if not cmd0.endswith('.exe'):
3011 # TODO(maruel): That's not strictly true either.
3012 cmd0 += '.exe'
3013 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
3014 # Fix the path.
3015 cmd0 = cmd0.replace('/', os.path.sep)
3016 cmd0 = os.path.normpath(cmd0)
3017 proc.executable = get_native_path_case(cmd0)
3018 logging.info(
3019 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
3020
3021 def handle_Thread_End(self, line):
3022 """Has the same parameters as Thread_Start."""
3023 tid = int(line[self.TID], 16)
3024 self._threads_active.pop(tid, None)
3025
3026 def handle_Thread_Start(self, line):
3027 """Handles a new thread created.
3028
3029 Do not use self.PID here since a process' initial thread is created by
3030 the parent process.
3031 """
3032 PROCESS_ID = self.USER_DATA
3033 TTHREAD_ID = self.USER_DATA + 1
3034 #STACK_BASE = self.USER_DATA + 2
3035 #STACK_LIMIT = self.USER_DATA + 3
3036 #USER_STACK_BASE = self.USER_DATA + 4
3037 #USER_STACK_LIMIT = self.USER_DATA + 5
3038 #AFFINITY = self.USER_DATA + 6
3039 #WIN32_START_ADDR = self.USER_DATA + 7
3040 #TEB_BASE = self.USER_DATA + 8
3041 #SUB_PROCESS_TAG = self.USER_DATA + 9
3042 #BASE_PRIORITY = self.USER_DATA + 10
3043 #PAGE_PRIORITY = self.USER_DATA + 11
3044 #IO_PRIORITY = self.USER_DATA + 12
3045 #THREAD_FLAGS = self.USER_DATA + 13
3046 # Do not use self.PID here since a process' initial thread is created by
3047 # the parent process.
3048 pid = int(line[PROCESS_ID], 16)
3049 tid = int(line[TTHREAD_ID], 16)
3050 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
3051 self._threads_active[tid] = pid
3052
3053 @classmethod
3054 def supported_events(cls):
3055 """Returns all the procesed events."""
3056 out = []
3057 for member in dir(cls):
3058 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
3059 if match:
3060 out.append(match.groups())
3061 return out
3062
3063 class Tracer(ApiBase.Tracer):
3064 # The basic headers.
3065 EXPECTED_HEADER = [
3066 u'Event Name',
3067 u'Type',
3068 u'Event ID',
3069 u'Version',
3070 u'Channel',
3071 u'Level', # 5
3072 u'Opcode',
3073 u'Task',
3074 u'Keyword',
3075 u'PID',
3076 u'TID', # 10
3077 u'Processor Number',
3078 u'Instance ID',
3079 u'Parent Instance ID',
3080 u'Activity ID',
3081 u'Related Activity ID', # 15
3082 u'Clock-Time',
3083 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
3084 u'User(ms)', # pretty much useless.
3085 u'User Data', # Extra arguments that are event-specific.
3086 ]
3087 # Only the useful headers common to all entries are listed there. Any column
3088 # at 19 or higher is dependent on the specific event.
3089 EVENT_NAME = 0
3090 TYPE = 1
3091 PID = 9
3092 TID = 10
3093 PROCESSOR_ID = 11
3094 TIMESTAMP = 16
3095 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
3096 USER_DATA = 19
3097
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003098 class CsvReader(object):
3099 """CSV reader that reads files generated by tracerpt.exe.
3100
3101 csv.reader() fails to read them properly, it mangles file names quoted
3102 with "" with a comma in it.
3103 """
3104 # 0. Had a ',' or one of the following ' ' after a comma, next should
3105 # be ' ', '"' or string or ',' for an empty field.
3106 ( HAD_DELIMITER,
3107 # 1. Processing an unquoted field up to ','.
3108 IN_STR,
3109 # 2. Processing a new field starting with '"'.
3110 STARTING_STR_QUOTED,
3111 # 3. Second quote in a row at the start of a field. It could be either
3112 # '""foo""' or '""'. Who the hell thought it was a great idea to use
3113 # the same character for delimiting and escaping?
3114 STARTING_SECOND_QUOTE,
3115 # 4. A quote inside a quoted string where the previous character was
3116 # not a quote, so the string is not empty. Can be either: end of a
3117 # quoted string (a delimiter) or a quote escape. The next char must be
3118 # either '"' or ','.
3119 HAD_QUOTE_IN_QUOTED,
3120 # 5. Second quote inside a quoted string.
3121 HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED,
3122 # 6. Processing a field that started with '"'.
3123 IN_STR_QUOTED) = range(7)
3124
3125 def __init__(self, f):
3126 self.f = f
3127
3128 def __iter__(self):
3129 return self
3130
3131 def next(self):
3132 """Splits the line in fields."""
3133 line = self.f.readline()
3134 if not line:
3135 raise StopIteration()
3136 line = line.strip()
3137 fields = []
3138 state = self.HAD_DELIMITER
3139 for i, c in enumerate(line):
3140 if state == self.HAD_DELIMITER:
3141 if c == ',':
3142 # Empty field.
3143 fields.append('')
3144 elif c == ' ':
3145 # Ignore initial whitespaces
3146 pass
3147 elif c == '"':
3148 state = self.STARTING_STR_QUOTED
3149 fields.append('')
3150 else:
3151 # Start of a new field.
3152 state = self.IN_STR
3153 fields.append(c)
3154
3155 elif state == self.IN_STR:
3156 # Do not accept quote inside unquoted field.
3157 assert c != '"', (i, c, line, fields)
3158 if c == ',':
3159 fields[-1] = fields[-1].strip()
3160 state = self.HAD_DELIMITER
3161 else:
3162 fields[-1] = fields[-1] + c
3163
3164 elif state == self.STARTING_STR_QUOTED:
3165 if c == '"':
3166 # Do not store the character yet.
3167 state = self.STARTING_SECOND_QUOTE
3168 else:
3169 state = self.IN_STR_QUOTED
3170 fields[-1] = fields[-1] + c
3171
3172 elif state == self.STARTING_SECOND_QUOTE:
3173 if c == ',':
3174 # It was an empty field. '""' == ''.
3175 state = self.HAD_DELIMITER
3176 else:
3177 fields[-1] = fields[-1] + '"' + c
3178 state = self.IN_STR_QUOTED
3179
3180 elif state == self.HAD_QUOTE_IN_QUOTED:
3181 if c == ',':
3182 # End of the string.
3183 state = self.HAD_DELIMITER
3184 elif c == '"':
3185 state = self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED
3186 else:
3187 # The previous double-quote was just an unescaped quote.
3188 fields[-1] = fields[-1] + '"' + c
3189 state = self.IN_STR_QUOTED
3190
3191 elif state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
3192 if c == ',':
3193 # End of the string.
3194 state = self.HAD_DELIMITER
3195 fields[-1] = fields[-1] + '"'
3196 else:
maruel@chromium.org261d1352013-02-25 19:23:20 +00003197 # That's just how the logger rolls. Revert back to appending the
3198 # char and "guess" it was a quote in a double-quoted string.
3199 state = self.IN_STR_QUOTED
3200 fields[-1] = fields[-1] + '"' + c
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003201
3202 elif state == self.IN_STR_QUOTED:
3203 if c == '"':
3204 # Could be a delimiter or an escape.
3205 state = self.HAD_QUOTE_IN_QUOTED
3206 else:
3207 fields[-1] = fields[-1] + c
3208
3209 if state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
3210 fields[-1] = fields[-1] + '"'
3211 else:
3212 assert state in (
3213 # Terminated with a normal field.
3214 self.IN_STR,
3215 # Terminated with an empty field.
3216 self.STARTING_SECOND_QUOTE,
3217 # Terminated with a normal quoted field.
3218 self.HAD_QUOTE_IN_QUOTED), (
3219 line, state, fields)
3220 return fields
3221
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003222 def __init__(self, logname):
3223 """Starts the log collection.
3224
3225 Requires administrative access. logman.exe is synchronous so no need for a
3226 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
3227 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
3228
3229 One can get the list of potentially interesting providers with:
3230 "logman query providers | findstr /i file"
3231 """
3232 super(LogmanTrace.Tracer, self).__init__(logname)
maruel@chromium.org4c9b4fa2013-05-29 19:22:14 +00003233 self._signal_script = create_subprocess_thunk()
3234 self._scripts_to_cleanup.append(self._signal_script)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003235 cmd_start = [
3236 'logman.exe',
3237 'start',
3238 'NT Kernel Logger',
3239 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
3240 # splitio,fileiocompletion,syscall,file,cswitch,img
3241 '(process,fileio,thread)',
3242 '-o', self._logname + '.etl',
3243 '-ets', # Send directly to kernel
3244 # Values extracted out of thin air.
3245 # Event Trace Session buffer size in kb.
3246 '-bs', '10240',
3247 # Number of Event Trace Session buffers.
3248 '-nb', '16', '256',
3249 ]
3250 logging.debug('Running: %s' % cmd_start)
3251 try:
3252 subprocess.check_call(
3253 cmd_start,
3254 stdin=subprocess.PIPE,
3255 stdout=subprocess.PIPE,
3256 stderr=subprocess.STDOUT)
3257 except subprocess.CalledProcessError, e:
3258 if e.returncode == -2147024891:
3259 print >> sys.stderr, 'Please restart with an elevated admin prompt'
3260 elif e.returncode == -2144337737:
3261 print >> sys.stderr, (
3262 'A kernel trace was already running, stop it and try again')
3263 raise
3264
3265 def trace(self, cmd, cwd, tracename, output):
3266 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
3267 assert os.path.isabs(cmd[0]), cmd[0]
3268 assert os.path.isabs(cwd), cwd
3269 assert os.path.normpath(cwd) == cwd, cwd
3270 with self._lock:
3271 if not self._initialized:
3272 raise TracingFailure(
3273 'Called Tracer.trace() on an unitialized object',
3274 None, None, None, tracename)
3275 assert tracename not in (i['trace'] for i in self._traces)
3276
3277 # Use "logman -?" for help.
3278
3279 stdout = stderr = None
3280 if output:
3281 stdout = subprocess.PIPE
3282 stderr = subprocess.STDOUT
3283
3284 # Run the child process.
3285 logging.debug('Running: %s' % cmd)
maruel@chromium.org2909b7c2013-05-29 18:52:59 +00003286 # Use the temporary script generated with create_subprocess_thunk() so we
3287 # have a clear pid owner. Since trace_inputs.py can be used as a library
3288 # and could trace multiple processes simultaneously, it makes it more
3289 # complex if the executable to be traced is executed directly here. It
3290 # also solves issues related to logman.exe that needs to be executed to
3291 # control the kernel trace.
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003292 child_cmd = [
3293 sys.executable,
maruel@chromium.org4c9b4fa2013-05-29 19:22:14 +00003294 self._signal_script,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003295 tracename,
3296 ]
3297 child = subprocess.Popen(
3298 child_cmd + fix_python_path(cmd),
3299 cwd=cwd,
3300 stdin=subprocess.PIPE,
3301 stdout=stdout,
3302 stderr=stderr)
3303 logging.debug('Started child pid: %d' % child.pid)
3304 out = child.communicate()[0]
3305 # This doesn't mean all the grand-children are done. Sadly, we don't have
3306 # a good way to determine that.
3307
3308 with self._lock:
3309 assert tracename not in (i['trace'] for i in self._traces)
3310 self._traces.append({
maruel@chromium.orgf43e68b2012-10-15 20:23:10 +00003311 'cmd': cmd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003312 'cwd': cwd,
maruel@chromium.org964ddde2012-10-29 18:13:46 +00003313 'output': out,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003314 'pid': child.pid,
maruel@chromium.org19e9fcd2012-10-29 18:24:02 +00003315 # Used to figure out the real process when process ids are reused.
3316 'thunk_cmd': child_cmd,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003317 'trace': tracename,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003318 })
3319
3320 return child.returncode, out
3321
3322 def close(self, _timeout=None):
3323 """Stops the kernel log collection and converts the traces to text
3324 representation.
3325 """
3326 with self._lock:
maruel@chromium.org808e3f22013-05-16 16:42:54 +00003327 try:
3328 super(LogmanTrace.Tracer, self).close()
3329 finally:
3330 cmd_stop = [
3331 'logman.exe',
3332 'stop',
3333 'NT Kernel Logger',
3334 '-ets', # Sends the command directly to the kernel.
3335 ]
3336 logging.debug('Running: %s' % cmd_stop)
3337 subprocess.check_call(
3338 cmd_stop,
3339 stdin=subprocess.PIPE,
3340 stdout=subprocess.PIPE,
3341 stderr=subprocess.STDOUT)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003342
3343 def post_process_log(self):
3344 """Converts the .etl file into .csv then into .json."""
3345 super(LogmanTrace.Tracer, self).post_process_log()
3346 logformat = 'csv'
3347 self._convert_log(logformat)
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003348 self._trim_log(logformat)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003349
maruel@chromium.org808e3f22013-05-16 16:42:54 +00003350 def _gen_logdata(self):
3351 return {
3352 'format': 'csv',
3353 'traces': self._traces,
3354 }
3355
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003356 def _trim_log(self, logformat):
3357 """Reduces the amount of data in original log by generating a 'reduced'
3358 log.
3359 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003360 if logformat == 'csv_utf16':
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003361 file_handle = codecs.open(
3362 self._logname + '.' + logformat, 'r', encoding='utf-16')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003363
3364 elif logformat == 'csv':
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003365 assert sys.getfilesystemencoding() == 'mbcs'
3366 file_handle = codecs.open(
3367 self._logname + '.' + logformat, 'r',
3368 encoding=get_current_encoding())
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003369
3370 supported_events = LogmanTrace.Context.supported_events()
3371
3372 def trim(generator):
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003373 """Loads items from the generator and returns the interesting data.
3374
3375 It filters out any uninteresting line and reduce the amount of data in
3376 the trace.
3377 """
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003378 for index, line in enumerate(generator):
3379 if not index:
3380 if line != self.EXPECTED_HEADER:
3381 raise TracingFailure(
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003382 'Found malformed header: %s' % line,
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003383 None, None, None)
3384 continue
3385 # As you can see, the CSV is full of useful non-redundant information:
3386 if (line[2] != '0' or # Event ID
3387 line[3] not in ('2', '3') or # Version
3388 line[4] != '0' or # Channel
3389 line[5] != '0' or # Level
3390 line[7] != '0' or # Task
3391 line[8] != '0x0000000000000000' or # Keyword
3392 line[12] != '' or # Instance ID
3393 line[13] != '' or # Parent Instance ID
3394 line[14] != self.NULL_GUID or # Activity ID
3395 line[15] != ''): # Related Activity ID
3396 raise TracingFailure(
3397 'Found unexpected values in line: %s' % ' '.join(line),
3398 None, None, None)
3399
3400 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
3401 continue
3402
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003403 yield [
3404 line[self.EVENT_NAME],
3405 line[self.TYPE],
3406 line[self.PID],
3407 line[self.TID],
3408 line[self.PROCESSOR_ID],
3409 line[self.TIMESTAMP],
3410 ] + line[self.USER_DATA:]
3411
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003412 # must not convert the trim() call into a list, since it will use too much
3413 # memory for large trace. use a csv file as a workaround since the json
3414 # parser requires a complete in-memory file.
3415 with open('%s.preprocessed' % self._logname, 'wb') as f:
3416 # $ and * can't be used in file name on windows, reducing the likelihood
3417 # of having to escape a string.
3418 out = csv.writer(
3419 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003420 for line in trim(self.CsvReader(file_handle)):
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003421 out.writerow([s.encode('utf-8') for s in line])
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003422
3423 def _convert_log(self, logformat):
3424 """Converts the ETL trace to text representation.
3425
3426 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
3427 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
3428
3429 Arguments:
3430 - logformat: Text format to be generated, csv, csv_utf16 or xml.
3431
3432 Use "tracerpt -?" for help.
3433 """
3434 LOCALE_INVARIANT = 0x7F
3435 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
3436 cmd_convert = [
3437 'tracerpt.exe',
3438 '-l', self._logname + '.etl',
3439 '-o', self._logname + '.' + logformat,
3440 '-gmt', # Use UTC
3441 '-y', # No prompt
3442 # Use -of XML to get the header of each items after column 19, e.g. all
3443 # the actual headers of 'User Data'.
3444 ]
3445
3446 if logformat == 'csv':
3447 # tracerpt localizes the 'Type' column, for major brainfuck
3448 # entertainment. I can't imagine any sane reason to do that.
3449 cmd_convert.extend(['-of', 'CSV'])
3450 elif logformat == 'csv_utf16':
3451 # This causes it to use UTF-16, which doubles the log size but ensures
3452 # the log is readable for non-ASCII characters.
3453 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
3454 elif logformat == 'xml':
3455 cmd_convert.extend(['-of', 'XML'])
3456 else:
3457 raise ValueError('Unexpected log format \'%s\'' % logformat)
3458 logging.debug('Running: %s' % cmd_convert)
3459 # This can takes tens of minutes for large logs.
3460 # Redirects all output to stderr.
3461 subprocess.check_call(
3462 cmd_convert,
3463 stdin=subprocess.PIPE,
3464 stdout=sys.stderr,
3465 stderr=sys.stderr)
3466
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003467 def __init__(self, use_sudo=False): # pylint: disable=W0613
3468 super(LogmanTrace, self).__init__()
3469 # Ignore use_sudo. It's irrelevant on Windows but kept to simplify the API.
3470
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003471 @staticmethod
3472 def clean_trace(logname):
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003473 for ext in ('', '.csv', '.etl', '.json', '.xml', '.preprocessed'):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003474 if os.path.isfile(logname + ext):
3475 os.remove(logname + ext)
3476
3477 @classmethod
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003478 def parse_log(cls, logname, blacklist, trace_name):
3479 logging.info('parse_log(%s, ..., %s)', logname, trace_name)
maruel@chromium.org8b056ba2012-10-16 14:04:49 +00003480 assert os.path.isabs(logname)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003481
3482 def blacklist_more(filepath):
3483 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
3484 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
3485
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003486 # Create a list of (Context, result_dict) tuples. This is necessary because
3487 # the csv file may be larger than the amount of available memory.
3488 contexes = [
3489 (
3490 cls.Context(
3491 blacklist_more, item['pid'], item['trace'], item['thunk_cmd']),
3492 {
3493 'output': item['output'],
3494 'trace': item['trace'],
3495 },
3496 )
3497 for item in read_json(logname)['traces']
3498 if not trace_name or item['trace'] == trace_name
3499 ]
3500
3501 # The log may be too large to fit in memory and it is not efficient to read
3502 # it multiple times, so multiplex the contexes instead, which is slightly
3503 # more awkward.
3504 with open('%s.preprocessed' % logname, 'rb') as f:
3505 lines = csv.reader(
3506 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
3507 for encoded in lines:
3508 line = [s.decode('utf-8') for s in encoded]
3509 # Convert the PID in-place from hex.
3510 line[cls.Context.PID] = int(line[cls.Context.PID], 16)
3511 for context in contexes:
3512 if 'exception' in context[1]:
3513 continue
3514 try:
3515 context[0].on_line(line)
3516 except TracingFailure:
3517 context[1]['exception'] = sys.exc_info()
3518
3519 for context in contexes:
3520 if 'exception' in context[1]:
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003521 continue
maruel@chromium.orgeaedeb42012-10-31 14:38:22 +00003522 context[1]['results'] = context[0].to_results()
3523
3524 return [context[1] for context in contexes]
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003525
3526
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003527def get_api(**kwargs):
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003528 """Returns the correct implementation for the current OS."""
3529 if sys.platform == 'cygwin':
3530 raise NotImplementedError(
3531 'Not implemented for cygwin, start the script from Win32 python')
3532 flavors = {
3533 'win32': LogmanTrace,
3534 'darwin': Dtrace,
3535 'sunos5': Dtrace,
3536 'freebsd7': Dtrace,
3537 'freebsd8': Dtrace,
3538 }
3539 # Defaults to strace.
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003540 return flavors.get(sys.platform, Strace)(**kwargs)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003541
3542
3543def extract_directories(root_dir, files, blacklist):
3544 """Detects if all the files in a directory are in |files| and if so, replace
3545 the individual files by a Results.Directory instance.
3546
3547 Takes a list of Results.File instances and returns a shorter list of
3548 Results.File and Results.Directory instances.
3549
3550 Arguments:
3551 - root_dir: Optional base directory that shouldn't be search further.
3552 - files: list of Results.File instances.
3553 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
3554 """
3555 logging.info(
3556 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
3557 assert not (root_dir or '').endswith(os.path.sep), root_dir
maruel@chromium.org306e0e72012-11-02 18:22:03 +00003558 # It is important for root_dir to not be a symlinked path, make sure to call
3559 # os.path.realpath() as needed.
3560 assert not root_dir or (
3561 os.path.realpath(get_native_path_case(root_dir)) == root_dir)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003562 assert not any(isinstance(f, Results.Directory) for f in files)
3563 # Remove non existent files.
3564 files = [f for f in files if f.existent]
3565 if not files:
3566 return files
3567 # All files must share the same root, which can be None.
3568 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
3569
3570 # Creates a {directory: {filename: File}} mapping, up to root.
3571 buckets = {}
3572 if root_dir:
3573 buckets[root_dir] = {}
3574 for fileobj in files:
3575 path = fileobj.full_path
3576 directory = os.path.dirname(path)
3577 assert directory
3578 # Do not use os.path.basename() so trailing os.path.sep is kept.
3579 basename = path[len(directory)+1:]
3580 files_in_directory = buckets.setdefault(directory, {})
3581 files_in_directory[basename] = fileobj
3582 # Add all the directories recursively up to root.
3583 while True:
3584 old_d = directory
3585 directory = os.path.dirname(directory)
3586 if directory + os.path.sep == root_dir or directory == old_d:
3587 break
3588 buckets.setdefault(directory, {})
3589
3590 root_prefix = len(root_dir) + 1 if root_dir else 0
3591 for directory in sorted(buckets, reverse=True):
maruel@chromium.orgb1678e62013-03-19 15:55:17 +00003592 if not os.path.isdir(directory):
3593 logging.debug(
3594 '%s was a directory but doesn\'t exist anymore; ignoring', directory)
3595 continue
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003596 actual = set(f for f in os.listdir(directory) if not blacklist(f))
3597 expected = set(buckets[directory])
3598 if not (actual - expected):
3599 parent = os.path.dirname(directory)
3600 buckets[parent][os.path.basename(directory)] = Results.Directory(
3601 root_dir,
3602 directory[root_prefix:],
3603 False,
3604 sum(f.size for f in buckets[directory].itervalues()),
3605 sum(f.nb_files for f in buckets[directory].itervalues()))
3606 # Remove the whole bucket.
3607 del buckets[directory]
3608
3609 # Reverse the mapping with what remains. The original instances are returned,
3610 # so the cached meta data is kept.
3611 files = sum((x.values() for x in buckets.itervalues()), [])
3612 return sorted(files, key=lambda x: x.path)
3613
3614
3615def trace(logfile, cmd, cwd, api, output):
3616 """Traces an executable. Returns (returncode, output) from api.
3617
3618 Arguments:
3619 - logfile: file to write to.
3620 - cmd: command to run.
3621 - cwd: current directory to start the process in.
3622 - api: a tracing api instance.
3623 - output: if True, returns output, otherwise prints it at the console.
3624 """
3625 cmd = fix_python_path(cmd)
3626 api.clean_trace(logfile)
3627 with api.get_tracer(logfile) as tracer:
3628 return tracer.trace(cmd, cwd, 'default', output)
3629
3630
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003631def CMDclean(args):
3632 """Cleans up traces."""
3633 parser = OptionParserTraceInputs(command='clean')
3634 options, args = parser.parse_args(args)
3635 api = get_api()
3636 api.clean_trace(options.log)
3637 return 0
3638
3639
3640def CMDtrace(args):
3641 """Traces an executable."""
3642 parser = OptionParserTraceInputs(command='trace')
3643 parser.allow_interspersed_args = False
3644 parser.add_option(
3645 '-q', '--quiet', action='store_true',
3646 help='Redirects traced executable output to /dev/null')
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003647 parser.add_option(
3648 '-s', '--sudo', action='store_true',
3649 help='Use sudo when shelling out the tracer tool (ignored on Windows)')
3650 parser.add_option(
3651 '-n', '--no-sudo', action='store_false',
3652 help='Don\'t use sudo')
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003653 options, args = parser.parse_args(args)
3654
3655 if not args:
3656 parser.error('Please provide a command to run')
3657
3658 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3659 args[0] = os.path.abspath(args[0])
3660
maruel@chromium.orgf14633b2013-01-25 15:12:52 +00003661 # options.sudo default value is None, which is to do whatever tracer defaults
3662 # do.
3663 api = get_api(use_sudo=options.sudo)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003664 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3665
3666
3667def CMDread(args):
3668 """Reads the logs and prints the result."""
3669 parser = OptionParserTraceInputs(command='read')
3670 parser.add_option(
3671 '-V', '--variable',
3672 nargs=2,
3673 action='append',
3674 dest='variables',
3675 metavar='VAR_NAME directory',
3676 default=[],
3677 help=('Variables to replace relative directories against. Example: '
3678 '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
3679 'home dir with $HOME') % getpass.getuser())
3680 parser.add_option(
3681 '--root-dir',
3682 help='Root directory to base everything off it. Anything outside of this '
3683 'this directory will not be reported')
3684 parser.add_option(
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003685 '--trace-name',
3686 help='Only reads one of the trace. Defaults to reading all traces')
3687 parser.add_option(
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003688 '-j', '--json', action='store_true',
3689 help='Outputs raw result data as json')
3690 parser.add_option(
3691 '-b', '--blacklist', action='append', default=[],
3692 help='List of regexp to use as blacklist filter')
3693 options, args = parser.parse_args(args)
3694
3695 if options.root_dir:
maruel@chromium.org7dc17b32013-04-02 22:27:05 +00003696 options.root_dir = get_native_path_case(
3697 unicode(os.path.abspath(options.root_dir)))
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003698
3699 variables = dict(options.variables)
3700 api = get_api()
3701 def blacklist(f):
3702 return any(re.match(b, f) for b in options.blacklist)
maruel@chromium.orgec74ff82012-10-29 18:14:47 +00003703 data = api.parse_log(options.log, blacklist, options.trace_name)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003704 # Process each trace.
3705 output_as_json = []
maruel@chromium.org9e56b592013-05-21 16:56:05 +00003706 try:
3707 for item in data:
3708 if 'exception' in item:
3709 # Do not abort the other traces.
3710 print >> sys.stderr, (
3711 'Trace %s: Got an exception: %s' % (
3712 item['trace'], item['exception'][1]))
3713 continue
3714 results = item['results']
3715 if options.root_dir:
3716 results = results.strip_root(options.root_dir)
3717
3718 if options.json:
3719 output_as_json.append(results.flatten())
3720 else:
3721 simplified = extract_directories(
3722 options.root_dir, results.files, blacklist)
3723 simplified = [f.replace_variables(variables) for f in simplified]
3724 if len(data) > 1:
3725 print('Trace: %s' % item['trace'])
3726 print('Total: %d' % len(results.files))
3727 print('Non existent: %d' % len(results.non_existent))
3728 for f in results.non_existent:
3729 print(' %s' % f.path)
3730 print(
3731 'Interesting: %d reduced to %d' % (
3732 len(results.existent), len(simplified)))
3733 for f in simplified:
3734 print(' %s' % f.path)
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003735
3736 if options.json:
maruel@chromium.org9e56b592013-05-21 16:56:05 +00003737 write_json(sys.stdout, output_as_json, False)
3738 except KeyboardInterrupt:
3739 return 1
3740 except IOError as e:
3741 if e.errno == errno.EPIPE:
3742 # Do not print a stack trace when the output is piped to less and the user
3743 # quits before the whole output was written.
3744 return 1
3745 raise
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003746 return 0
3747
3748
3749class OptionParserWithLogging(optparse.OptionParser):
3750 """Adds --verbose option."""
3751 def __init__(self, verbose=0, **kwargs):
3752 optparse.OptionParser.__init__(self, **kwargs)
3753 self.add_option(
3754 '-v', '--verbose',
3755 action='count',
3756 default=verbose,
3757 help='Use multiple times to increase verbosity')
3758
3759 def parse_args(self, *args, **kwargs):
3760 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3761 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3762 logging.basicConfig(
3763 level=levels[min(len(levels)-1, options.verbose)],
3764 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3765 return options, args
3766
3767
3768class OptionParserWithNiceDescription(OptionParserWithLogging):
3769 """Generates the description with the command's docstring."""
3770 def __init__(self, **kwargs):
3771 """Sets 'description' and 'usage' if not already specified."""
3772 command = kwargs.pop('command', 'help')
3773 kwargs.setdefault(
3774 'description',
3775 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3776 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3777 OptionParserWithLogging.__init__(self, **kwargs)
3778
3779
3780class OptionParserTraceInputs(OptionParserWithNiceDescription):
3781 """Adds automatic --log handling."""
3782 def __init__(self, **kwargs):
3783 OptionParserWithNiceDescription.__init__(self, **kwargs)
3784 self.add_option(
3785 '-l', '--log', help='Log file to generate or read, required')
3786
3787 def parse_args(self, *args, **kwargs):
3788 """Makes sure the paths make sense.
3789
3790 On Windows, / and \ are often mixed together in a path.
3791 """
3792 options, args = OptionParserWithNiceDescription.parse_args(
3793 self, *args, **kwargs)
3794 if not options.log:
3795 self.error('Must supply a log file with -l')
3796 options.log = os.path.abspath(options.log)
3797 return options, args
3798
3799
3800def extract_documentation():
3801 """Returns a dict {command: description} for each of documented command."""
3802 commands = (
3803 fn[3:]
3804 for fn in dir(sys.modules['__main__'])
3805 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3806 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3807
3808
3809def CMDhelp(args):
3810 """Prints list of commands or help for a specific command."""
3811 doc = extract_documentation()
3812 # Calculates the optimal offset.
3813 offset = max(len(cmd) for cmd in doc)
3814 format_str = ' %-' + str(offset + 2) + 's %s'
3815 # Generate a one-liner documentation of each commands.
3816 commands_description = '\n'.join(
3817 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3818
3819 parser = OptionParserWithNiceDescription(
3820 usage='%prog <command> [options]',
3821 description='Commands are:\n%s\n' % commands_description)
3822 parser.format_description = lambda _: parser.description
3823
3824 # Strip out any -h or --help argument.
3825 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3826 if len(args) == 1:
3827 if not get_command_handler(args[0]):
3828 parser.error('Unknown command %s' % args[0])
3829 # The command was "%prog help command", replaces ourself with
3830 # "%prog command --help" so help is correctly printed out.
3831 return main(args + ['--help'])
3832 elif args:
3833 parser.error('Unknown argument "%s"' % ' '.join(args))
3834 parser.print_help()
3835 return 0
3836
3837
3838def get_command_handler(name):
3839 """Returns the command handler or CMDhelp if it doesn't exist."""
3840 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3841
3842
3843def main_impl(argv):
3844 command = get_command_handler(argv[0] if argv else 'help')
3845 if not command:
3846 return CMDhelp(argv)
3847 return command(argv[1:])
3848
3849def main(argv):
maruel@chromium.orgac36fb72013-05-21 14:50:53 +00003850 disable_buffering()
maruel@chromium.org8fb47fe2012-10-03 20:13:15 +00003851 try:
3852 main_impl(argv)
3853 except TracingFailure, e:
3854 sys.stderr.write('\nError: ')
3855 sys.stderr.write(str(e))
3856 sys.stderr.write('\n')
3857 return 1
3858
3859
3860if __name__ == '__main__':
3861 sys.exit(main(sys.argv[1:]))