Blame - trace_inputs.py - chromium.googlesource.com/infra/luci/client-py

blob: 4e8df22757b15ef1672c50b6711df71ff068531c [file] [log] [blame]

maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1	#!/usr/bin/env python
				2	# coding=utf-8
				3	# Copyright (c) 2012 The Chromium Authors. All rights reserved.
				4	# Use of this source code is governed by a BSD-style license that can be
				5	# found in the LICENSE file.
				6
				7	"""Traces an executable and its child processes and extract the files accessed
				8	by them.
				9
				10	The implementation uses OS-specific API. The native Kernel logger and the ETL
				11	interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
				12	The OS-specific implementation is hidden in an 'API' interface.
				13
				14	The results are embedded in a Results instance. The tracing is done in two
				15	phases, the first is to do the actual trace and generate an
				16	implementation-specific log file. Then the log file is parsed to extract the
				17	information, including the individual child processes and the files accessed
				18	from the log.
				19	"""
				20
				21	import codecs
				22	import csv
				23	import getpass
				24	import glob
				25	import json
				26	import logging
				27	import optparse
				28	import os
				29	import re
				30	import subprocess
				31	import sys
				32	import tempfile
				33	import threading
				34	import time
				35	import weakref
				36
				37	## OS-specific imports
				38
				39	if sys.platform == 'win32':
				40	from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
				41	from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
				42	from ctypes.wintypes import GetLastError # pylint: disable=E0611
				43	elif sys.platform == 'darwin':
				44	import Carbon.File # pylint: disable=F0401
				45	import MacOS # pylint: disable=F0401
				46
				47
				48	BASE_DIR = os.path.dirname(os.path.abspath(__file__))
				49	ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
				50
				51
				52	class TracingFailure(Exception):
				53	"""An exception occured during tracing."""
				54	def __init__(self, description, pid, line_number, line, *args):
				55	super(TracingFailure, self).__init__(
				56	description, pid, line_number, line, *args)
				57	self.description = description
				58	self.pid = pid
				59	self.line_number = line_number
				60	self.line = line
				61	self.extra = args
				62
				63	def __str__(self):
				64	out = self.description
				65	if self.pid:
				66	out += '\npid: %d' % self.pid
				67	if self.line_number:
				68	out += '\nline: %d' % self.line_number
				69	if self.line:
				70	out += '\n%s' % self.line
				71	if self.extra:
				72	out += '\n' + ', '.join(map(str, filter(None, self.extra)))
				73	return out
				74
				75
				76	## OS-specific functions
				77
				78	if sys.platform == 'win32':
				79	def QueryDosDevice(drive_letter):
				80	"""Returns the Windows 'native' path for a DOS drive letter."""
				81	assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
				82	# Guesswork. QueryDosDeviceW never returns the required number of bytes.
				83	chars = 1024
				84	drive_letter = unicode(drive_letter)
				85	p = create_unicode_buffer(chars)
				86	if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
				87	err = GetLastError()
				88	if err:
				89	# pylint: disable=E0602
				90	raise WindowsError(
				91	err,
				92	'QueryDosDevice(%s): %s (%d)' % (
				93	str(drive_letter), FormatError(err), err))
				94	return p.value
				95
				96
				97	def GetShortPathName(long_path):
				98	"""Returns the Windows short path equivalent for a 'long' path."""
				99	long_path = unicode(long_path)
				100	# Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
				101	# not enforced.
				102	if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
				103	long_path = '\\\\?\\' + long_path
				104	chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
				105	if chars:
				106	p = create_unicode_buffer(chars)
				107	if windll.kernel32.GetShortPathNameW(long_path, p, chars):
				108	return p.value
				109
				110	err = GetLastError()
				111	if err:
				112	# pylint: disable=E0602
				113	raise WindowsError(
				114	err,
				115	'GetShortPathName(%s): %s (%d)' % (
				116	str(long_path), FormatError(err), err))
				117
				118
				119	def GetLongPathName(short_path):
				120	"""Returns the Windows long path equivalent for a 'short' path."""
				121	short_path = unicode(short_path)
				122	# Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
				123	# not enforced.
				124	if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
				125	short_path = '\\\\?\\' + short_path
				126	chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
				127	if chars:
				128	p = create_unicode_buffer(chars)
				129	if windll.kernel32.GetLongPathNameW(short_path, p, chars):
				130	return p.value
				131
				132	err = GetLastError()
				133	if err:
				134	# pylint: disable=E0602
				135	raise WindowsError(
				136	err,
				137	'GetLongPathName(%s): %s (%d)' % (
				138	str(short_path), FormatError(err), err))
				139
				140
				141	def get_current_encoding():
				142	"""Returns the 'ANSI' code page associated to the process."""
				143	return 'cp%d' % int(windll.kernel32.GetACP())
				144
				145
				146	class DosDriveMap(object):
				147	"""Maps \Device\HarddiskVolumeN to N: on Windows."""
				148	# Keep one global cache.
				149	_MAPPING = {}
				150
				151	def __init__(self):
				152	"""Lazy loads the cache."""
				153	if not self._MAPPING:
				154	# This is related to UNC resolver on windows. Ignore that.
				155	self._MAPPING['\\Device\\Mup'] = None
				156	self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot']
				157
				158	for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
				159	try:
				160	letter = '%s:' % letter
				161	mapped = QueryDosDevice(letter)
				162	if mapped in self._MAPPING:
				163	logging.warn(
				164	('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
				165	'. Drive letters are a user-mode concept and the kernel '
				166	'traces only have NT path, so all accesses will be '
				167	'associated with the first drive letter, independent of the '
				168	'actual letter used by the code') % (
				169	self._MAPPING[mapped], letter))
				170	else:
				171	self._MAPPING[mapped] = letter
				172	except WindowsError: # pylint: disable=E0602
				173	pass
				174
				175	def to_win32(self, path):
				176	"""Converts a native NT path to Win32/DOS compatible path."""
				177	match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
				178	if not match:
				179	raise ValueError(
				180	'Can\'t convert %s into a Win32 compatible path' % path,
				181	path)
				182	if not match.group(1) in self._MAPPING:
				183	# Unmapped partitions may be accessed by windows for the
				184	# fun of it while the test is running. Discard these.
				185	return None
				186	drive = self._MAPPING[match.group(1)]
				187	if not drive or not match.group(2):
				188	return drive
				189	return drive + match.group(2)
				190
				191
				192	def isabs(path):
				193	"""Accepts X: as an absolute path, unlike python's os.path.isabs()."""
				194	return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
				195
				196
				197	def get_native_path_case(p):
				198	"""Returns the native path case for an existing file.
				199
				200	On Windows, removes any leading '\\?\'.
				201	"""
				202	if not isabs(p):
				203	raise ValueError(
				204	'Can\'t get native path case for a non-absolute path: %s' % p,
				205	p)
				206	# Windows used to have an option to turn on case sensitivity on non Win32
				207	# subsystem but that's out of scope here and isn't supported anymore.
				208	# Go figure why GetShortPathName() is needed.
				209	try:
				210	out = GetLongPathName(GetShortPathName(p))
				211	except OSError, e:
				212	if e.args[0] in (2, 3, 5):
				213	# The path does not exist. Try to recurse and reconstruct the path.
				214	base = os.path.dirname(p)
				215	rest = os.path.basename(p)
				216	return os.path.join(get_native_path_case(base), rest)
				217	raise
				218	if out.startswith('\\\\?\\'):
				219	out = out[4:]
				220	# Always upper case the first letter since GetLongPathName() will return the
				221	# drive letter in the case it was given.
				222	return out[0].upper() + out[1:]
				223
				224
				225	def CommandLineToArgvW(command_line):
				226	"""Splits a commandline into argv using CommandLineToArgvW()."""
				227	# http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
				228	size = c_int()
				229	ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size))
				230	try:
				231	return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
				232	finally:
				233	windll.kernel32.LocalFree(ptr)
				234
				235
				236	elif sys.platform == 'darwin':
				237
				238
				239	# On non-windows, keep the stdlib behavior.
				240	isabs = os.path.isabs
				241
				242
				243	def _find_item_native_case(root_path, item):
				244	"""Gets the native path case of a single item based at root_path.
				245
				246	There is no API to get the native path case of symlinks on OSX. So it
				247	needs to be done the slow way.
				248	"""
				249	item = item.lower()
				250	for element in os.listdir(root_path):
				251	if element.lower() == item:
				252	return element
				253
				254
				255	def _native_case(p):
				256	"""Gets the native path case. Warning: this function resolves symlinks."""
				257	logging.debug('native_case(%s)' % p)
				258	try:
				259	rel_ref, _ = Carbon.File.FSPathMakeRef(p)
				260	out = rel_ref.FSRefMakePath()
				261	if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
				262	return out + os.path.sep
				263	return out
				264	except MacOS.Error, e:
				265	if e.args[0] in (-43, -120):
				266	# The path does not exist. Try to recurse and reconstruct the path.
				267	# -43 means file not found.
				268	# -120 means directory not found.
				269	base = os.path.dirname(p)
				270	rest = os.path.basename(p)
				271	return os.path.join(_native_case(base), rest)
				272	raise OSError(
				273	e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
				274
				275
				276	def _split_at_symlink_native(base_path, rest):
				277	"""Returns the native path for a symlink."""
				278	base, symlink, rest = split_at_symlink(base_path, rest)
				279	if symlink:
				280	if not base_path:
				281	base_path = base
				282	else:
				283	base_path = safe_join(base_path, base)
				284	symlink = _find_item_native_case(base_path, symlink)
				285	return base, symlink, rest
				286
				287
				288	def get_native_path_case(path):
				289	"""Returns the native path case for an existing file.
				290
				291	Technically, it's only HFS+ on OSX that is case preserving and
				292	insensitive. It's the default setting on HFS+ but can be changed.
				293	"""
				294	if not isabs(path):
				295	raise ValueError(
				296	'Can\'t get native path case for a non-absolute path: %s' % path,
				297	path)
				298	if path.startswith('/dev'):
				299	# /dev is not visible from Carbon, causing an exception.
				300	return path
				301
				302	# Starts assuming there is no symlink along the path.
				303	resolved = _native_case(path)
				304	if resolved.lower() == path.lower():
				305	# This code path is incredibly faster.
				306	return resolved
				307
				308	# There was a symlink, process it.
				309	base, symlink, rest = _split_at_symlink_native(None, path)
				310	assert symlink, (path, base, symlink, rest, resolved)
				311	prev = base
				312	base = safe_join(_native_case(base), symlink)
				313	assert len(base) > len(prev)
				314	while rest:
				315	prev = base
				316	relbase, symlink, rest = _split_at_symlink_native(base, rest)
				317	base = safe_join(base, relbase)
				318	assert len(base) > len(prev), (prev, base, symlink)
				319	if symlink:
				320	base = safe_join(base, symlink)
				321	assert len(base) > len(prev), (prev, base, symlink)
				322	# Make sure no symlink was resolved.
				323	assert base.lower() == path.lower(), (base, path)
				324	return base
				325
				326
				327	else: # OSes other than Windows and OSX.
				328
				329
				330	# On non-windows, keep the stdlib behavior.
				331	isabs = os.path.isabs
				332
				333
				334	def get_native_path_case(path):
				335	"""Returns the native path case for an existing file.
				336
				337	On OSes other than OSX and Windows, assume the file system is
				338	case-sensitive.
				339
				340	TODO(maruel): This is not strictly true. Implement if necessary.
				341	"""
				342	if not isabs(path):
				343	raise ValueError(
				344	'Can\'t get native path case for a non-absolute path: %s' % path,
				345	path)
				346	# Give up on cygwin, as GetLongPathName() can't be called.
				347	# Linux traces tends to not be normalized so use this occasion to normalize
				348	# it. This function implementation already normalizes the path on the other
				349	# OS so this needs to be done here to be coherent between OSes.
				350	out = os.path.normpath(path)
				351	if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
				352	return out + os.path.sep
				353	return out
				354
				355
				356	if sys.platform != 'win32': # All non-Windows OSes.
				357
				358
				359	def safe_join(*args):
				360	"""Joins path elements like os.path.join() but doesn't abort on absolute
				361	path.
				362
				363	os.path.join('foo', '/bar') == '/bar'
				364	but safe_join('foo', '/bar') == 'foo/bar'.
				365	"""
				366	out = ''
				367	for element in args:
				368	if element.startswith(os.path.sep):
				369	if out.endswith(os.path.sep):
				370	out += element[1:]
				371	else:
				372	out += element
				373	else:
				374	if out.endswith(os.path.sep):
				375	out += element
				376	else:
				377	out += os.path.sep + element
				378	return out
				379
				380
				381	def split_at_symlink(base_dir, relfile):
				382	"""Scans each component of relfile and cut the string at the symlink if
				383	there is any.
				384
				385	Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
				386	not symlink was found.
				387	"""
				388	if base_dir:
				389	assert relfile
				390	assert os.path.isabs(base_dir)
				391	index = 0
				392	else:
				393	assert os.path.isabs(relfile)
				394	index = 1
				395
				396	def at_root(rest):
				397	if base_dir:
				398	return safe_join(base_dir, rest)
				399	return rest
				400
				401	while True:
				402	try:
				403	index = relfile.index(os.path.sep, index)
				404	except ValueError:
				405	index = len(relfile)
				406	full = at_root(relfile[:index])
				407	if os.path.islink(full):
				408	# A symlink!
				409	base = os.path.dirname(relfile[:index])
				410	symlink = os.path.basename(relfile[:index])
				411	rest = relfile[index:]
				412	logging.debug(
				413	'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
				414	(base_dir, relfile, base, symlink, rest))
				415	return base, symlink, rest
				416	if index == len(relfile):
				417	break
				418	index += 1
				419	return relfile, None, None
				420
				421
				422	def fix_python_path(cmd):
				423	"""Returns the fixed command line to call the right python executable."""
				424	out = cmd[:]
				425	if out[0] == 'python':
				426	out[0] = sys.executable
				427	elif out[0].endswith('.py'):
				428	out.insert(0, sys.executable)
				429	return out
				430
				431
				432	def create_thunk():
				433	handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
				434	os.write(
				435	handle,
				436	(
				437	'import subprocess\n'
				438	'import sys\n'
				439	'sys.exit(subprocess.call(sys.argv[2:]))\n'
				440	))
				441	os.close(handle)
				442	return name
				443
				444
				445	def strace_process_quoted_arguments(text):
				446	"""Extracts quoted arguments on a string and return the arguments as a list.
				447
				448	Implemented as an automaton. Supports incomplete strings in the form
				449	'"foo"...'.
				450
				451	Example:
				452	With text = '"foo", "bar"', the function will return ['foo', 'bar']
				453
				454	TODO(maruel): Implement escaping.
				455	"""
				456	# All the possible states of the DFA.
				457	( NEED_QUOTE, # Begining of a new arguments.
				458	INSIDE_STRING, # Inside an argument.
				459	ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
				460	NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
				461	# a serie of 3 dots or a comma.
				462	NEED_SPACE, # Right after a comma
				463	NEED_DOT_2, # Found a dot, need a second one.
				464	NEED_DOT_3, # Found second dot, need a third one.
				465	NEED_COMMA, # Found third dot, need a comma.
				466	) = range(8)
				467
				468	state = NEED_QUOTE
				469	out = []
				470	for index, char in enumerate(text):
				471	if char == '"':
				472	if state == NEED_QUOTE:
				473	state = INSIDE_STRING
				474	# A new argument was found.
				475	out.append('')
				476	elif state == INSIDE_STRING:
				477	# The argument is now closed.
				478	state = NEED_COMMA_OR_DOT
				479	elif state == ESCAPED:
				480	out[-1] += char
				481	state = INSIDE_STRING
				482	else:
				483	raise ValueError(
				484	'Can\'t process char at column %d for: %r' % (index, text),
				485	index,
				486	text)
				487	elif char == ',':
				488	if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
				489	state = NEED_SPACE
				490	elif state == INSIDE_STRING:
				491	out[-1] += char
				492	elif state == ESCAPED:
				493	out[-1] += char
				494	state = INSIDE_STRING
				495	else:
				496	raise ValueError(
				497	'Can\'t process char at column %d for: %r' % (index, text),
				498	index,
				499	text)
				500	elif char == ' ':
				501	if state == NEED_SPACE:
				502	state = NEED_QUOTE
				503	elif state == INSIDE_STRING:
				504	out[-1] += char
				505	elif state == ESCAPED:
				506	out[-1] += char
				507	state = INSIDE_STRING
				508	else:
				509	raise ValueError(
				510	'Can\'t process char at column %d for: %r' % (index, text),
				511	index,
				512	text)
				513	elif char == '.':
				514	if state == NEED_COMMA_OR_DOT:
				515	# The string is incomplete, this mean the strace -s flag should be
				516	# increased.
				517	state = NEED_DOT_2
				518	elif state == NEED_DOT_2:
				519	state = NEED_DOT_3
				520	elif state == NEED_DOT_3:
				521	state = NEED_COMMA
				522	elif state == INSIDE_STRING:
				523	out[-1] += char
				524	elif state == ESCAPED:
				525	out[-1] += char
				526	state = INSIDE_STRING
				527	else:
				528	raise ValueError(
				529	'Can\'t process char at column %d for: %r' % (index, text),
				530	index,
				531	text)
				532	elif char == '\\':
				533	if state == ESCAPED:
				534	out[-1] += char
				535	state = INSIDE_STRING
				536	elif state == INSIDE_STRING:
				537	state = ESCAPED
				538	else:
				539	raise ValueError(
				540	'Can\'t process char at column %d for: %r' % (index, text),
				541	index,
				542	text)
				543	else:
				544	if state == INSIDE_STRING:
				545	out[-1] += char
				546	else:
				547	raise ValueError(
				548	'Can\'t process char at column %d for: %r' % (index, text),
				549	index,
				550	text)
				551	if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
				552	raise ValueError(
				553	'String is incorrectly terminated: %r' % text,
				554	text)
				555	return out
				556
				557
				558	def read_json(filepath):
				559	with open(filepath, 'r') as f:
				560	return json.load(f)
				561
				562
				563	def write_json(filepath_or_handle, data, dense):
				564	"""Writes data into filepath or file handle encoded as json.
				565
				566	If dense is True, the json is packed. Otherwise, it is human readable.
				567	"""
				568	if hasattr(filepath_or_handle, 'write'):
				569	if dense:
				570	filepath_or_handle.write(
				571	json.dumps(data, sort_keys=True, separators=(',',':')))
				572	else:
				573	filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
				574	else:
				575	with open(filepath_or_handle, 'wb') as f:
				576	if dense:
				577	json.dump(data, f, sort_keys=True, separators=(',',':'))
				578	else:
				579	json.dump(data, f, sort_keys=True, indent=2)
				580
				581
				582	class Results(object):
				583	"""Results of a trace session."""
				584
				585	class _TouchedObject(object):
				586	"""Something, a file or a directory, that was accessed."""
				587	def __init__(self, root, path, tainted, size, nb_files):
				588	logging.debug(
				589	'%s(%s, %s, %s, %s, %s)' %
				590	(self.__class__.__name__, root, path, tainted, size, nb_files))
				591	self.root = root
				592	self.path = path
				593	self.tainted = tainted
				594	self.nb_files = nb_files
				595	# Can be used as a cache or a default value, depending on context.
				596	self._size = size
				597	# These are cache only.
				598	self._real_path = None
				599
				600	# Check internal consistency.
				601	assert path, path
				602	assert tainted or bool(root) != bool(isabs(path)), (root, path)
				603	assert tainted or (
				604	not os.path.exists(self.full_path) or
				605	(self.full_path == get_native_path_case(self.full_path))), (
				606	tainted, self.full_path, get_native_path_case(self.full_path))
				607
				608	@property
				609	def existent(self):
				610	return self.size != -1
				611
				612	@property
				613	def full_path(self):
				614	if self.root:
				615	return os.path.join(self.root, self.path)
				616	return self.path
				617
				618	@property
				619	def real_path(self):
				620	"""Returns the path with symlinks resolved."""
				621	if not self._real_path:
				622	self._real_path = os.path.realpath(self.full_path)
				623	return self._real_path
				624
				625	@property
				626	def size(self):
				627	"""File's size. -1 is not existent."""
				628	if self._size is None and not self.tainted:
				629	try:
				630	self._size = os.stat(self.full_path).st_size
				631	except OSError:
				632	self._size = -1
				633	return self._size
				634
				635	def flatten(self):
				636	"""Returns a dict representing this object.
				637
				638	A 'size' of 0 means the file was only touched and not read.
				639	"""
				640	return {
				641	'path': self.path,
				642	'size': self.size,
				643	}
				644
				645	def replace_variables(self, variables):
				646	"""Replaces the root of this File with one of the variables if it matches.
				647
				648	If a variable replacement occurs, the cloned object becomes tainted.
				649	"""
				650	for variable, root_path in variables.iteritems():
				651	if self.path.startswith(root_path):
				652	return self._clone(
				653	self.root, variable + self.path[len(root_path):], True)
				654	# No need to clone, returns ourself.
				655	return self
				656
				657	def strip_root(self, root):
				658	"""Returns a clone of itself with 'root' stripped off."""
				659	# Check internal consistency.
				660	assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
				661	if not self.full_path.startswith(root):
				662	# Now try to resolve the symlinks to see if it can be reached this way.
				663	# Only try after trying without resolving symlink.
				664	if not self.real_path.startswith(root):
				665	return None
				666	path = self.real_path
				667	else:
				668	path = self.full_path
				669	return self._clone(root, path[len(root):], self.tainted)
				670
				671	def _clone(self, new_root, new_path, tainted):
				672	raise NotImplementedError(self.__class__.__name__)
				673
				674	class File(_TouchedObject):
				675	"""A file that was accessed. May not be present anymore.
				676
				677	If tainted is true, it means it is not a real path anymore as a variable
				678	replacement occured.
				679
				680	If touched_only is True, this means the file was probed for existence, and
				681	it is existent, but was never _opened_. If touched_only is True, the file
				682	must have existed.
				683	"""
				684	def __init__(self, root, path, tainted, size):
				685	super(Results.File, self).__init__(root, path, tainted, size, 1)
				686
				687	def _clone(self, new_root, new_path, tainted):
				688	"""Clones itself keeping meta-data."""
				689	# Keep the self.size and self._real_path caches for performance reason. It
				690	# is also important when the file becomes tainted (with a variable instead
				691	# of the real path) since self.path is not an on-disk path anymore so
				692	# out._size cannot be updated.
				693	out = self.__class__(new_root, new_path, tainted, self.size)
				694	out._real_path = self._real_path
				695	return out
				696
				697	class Directory(_TouchedObject):
				698	"""A directory of files. Must exist."""
				699	def __init__(self, root, path, tainted, size, nb_files):
				700	"""path='.' is a valid value and must be handled appropriately."""
				701	assert not path.endswith(os.path.sep), path
				702	super(Results.Directory, self).__init__(
				703	root, path + os.path.sep, tainted, size, nb_files)
				704	# For a Directory instance, self.size is not a cache, it's an actual value
				705	# that is never modified and represents the total size of the files
				706	# contained in this directory. It is possible that the directory is empty
				707	# so that size == 0; this happens if there's only an invalid symlink in
				708	# it.
				709
				710	def flatten(self):
				711	out = super(Results.Directory, self).flatten()
				712	out['nb_files'] = self.nb_files
				713	return out
				714
				715	def _clone(self, new_root, new_path, tainted):
				716	"""Clones itself keeping meta-data."""
				717	out = self.__class__(
				718	new_root,
				719	new_path.rstrip(os.path.sep),
				720	tainted,
				721	self.size,
				722	self.nb_files)
				723	out._real_path = self._real_path
				724	return out
				725
				726	class Process(object):
				727	"""A process that was traced.
				728
				729	Contains references to the files accessed by this process and its children.
				730	"""
				731	def __init__(self, pid, files, executable, command, initial_cwd, children):
				732	logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
				733	self.pid = pid
				734	self.files = sorted(files, key=lambda x: x.path)
				735	self.children = children
				736	self.executable = executable
				737	self.command = command
				738	self.initial_cwd = initial_cwd
				739
				740	# Check internal consistency.
				741	assert len(set(f.path for f in self.files)) == len(self.files), sorted(
				742	f.path for f in self.files)
				743	assert isinstance(self.children, list)
				744	assert isinstance(self.files, list)
				745
				746	@property
				747	def all(self):
				748	for child in self.children:
				749	for i in child.all:
				750	yield i
				751	yield self
				752
				753	def flatten(self):
				754	return {
				755	'children': [c.flatten() for c in self.children],
				756	'command': self.command,
				757	'executable': self.executable,
				758	'files': [f.flatten() for f in self.files],
				759	'initial_cwd': self.initial_cwd,
				760	'pid': self.pid,
				761	}
				762
				763	def strip_root(self, root):
				764	assert isabs(root) and root.endswith(os.path.sep), root
				765	# Loads the files after since they are constructed as objects.
				766	out = self.__class__(
				767	self.pid,
				768	filter(None, (f.strip_root(root) for f in self.files)),
				769	self.executable,
				770	self.command,
				771	self.initial_cwd,
				772	[c.strip_root(root) for c in self.children])
				773	logging.debug(
				774	'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
				775	return out
				776
				777	def __init__(self, process):
				778	self.process = process
				779	# Cache.
				780	self._files = None
				781
				782	def flatten(self):
				783	return {
				784	'root': self.process.flatten(),
				785	}
				786
				787	@property
				788	def files(self):
				789	if self._files is None:
				790	self._files = sorted(
				791	sum((p.files for p in self.process.all), []),
				792	key=lambda x: x.path)
				793	return self._files
				794
				795	@property
				796	def existent(self):
				797	return [f for f in self.files if f.existent]
				798
				799	@property
				800	def non_existent(self):
				801	return [f for f in self.files if not f.existent]
				802
				803	def strip_root(self, root):
				804	"""Returns a clone with all the files outside the directory \|root\| removed
				805	and converts all the path to be relative paths.
				806	"""
				807	# Resolve any symlink
				808	root = os.path.realpath(root)
				809	root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
				810	logging.debug('strip_root(%s)' % root)
				811	return Results(self.process.strip_root(root))
				812
				813
				814	class ApiBase(object):
				815	"""OS-agnostic API to trace a process and its children."""
				816	class Context(object):
				817	"""Processes one log line at a time and keeps the list of traced processes.
				818
				819	The parsing is complicated by the fact that logs are traced out of order for
				820	strace but in-order for dtrace and logman. In addition, on Windows it is
				821	very frequent that processids are reused so a flat list cannot be used. But
				822	at the same time, it is impossible to faithfully construct a graph when the
				823	logs are processed out of order. So both a tree and a flat mapping are used,
				824	the tree is the real process tree, while the flat mapping stores the last
				825	valid process for the corresponding processid. For the strace case, the
				826	tree's head is guessed at the last moment.
				827	"""
				828	class Process(object):
				829	"""Keeps context for one traced child process.
				830
				831	Logs all the files this process touched. Ignores directories.
				832	"""
				833	def __init__(self, blacklist, pid, initial_cwd):
				834	# Check internal consistency.
				835	assert isinstance(pid, int), repr(pid)
				836	self.pid = pid
				837	# children are Process instances.
				838	self.children = []
				839	self.initial_cwd = initial_cwd
				840	self.cwd = None
				841	self.files = set()
				842	self.only_touched = set()
				843	self.executable = None
				844	self.command = None
				845	self._blacklist = blacklist
				846
				847	def to_results_process(self):
				848	"""Resolves file case sensitivity and or late-bound strings."""
				849	# When resolving files, it's normal to get dupe because a file could be
				850	# opened multiple times with different case. Resolve the deduplication
				851	# here.
				852	def fix_path(x):
				853	"""Returns the native file path case.
				854
				855	Converts late-bound strings.
				856	"""
				857	if not x:
				858	# Do not convert None instance to 'None'.
				859	return x
				860	# TODO(maruel): Do not upconvert to unicode here, on linux we don't
				861	# know the file path encoding so they must be treated as bytes.
				862	x = unicode(x)
				863	if os.path.isabs(x):
				864	# If the path is not absolute, which tends to happen occasionally on
				865	# Windows, it is not possible to get the native path case so ignore
				866	# that trace. It mostly happens for 'executable' value.
				867	x = get_native_path_case(x)
				868	return x
				869
				870	def fix_and_blacklist_path(x):
				871	x = fix_path(x)
				872	if not x:
				873	return
				874	# The blacklist needs to be reapplied, since path casing could
				875	# influence blacklisting.
				876	if self._blacklist(x):
				877	return
				878	return x
				879
				880	# Filters out directories. Some may have passed through.
				881	files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
				882	only_touched = set(
				883	f for f in map(fix_and_blacklist_path, self.only_touched) if f)
				884	only_touched -= files
				885
				886	files = [
				887	Results.File(None, f, False, None) for f in files
				888	if not os.path.isdir(f)
				889	]
				890	# Using 0 as size means the file's content is ignored since the file was
				891	# never opened for I/O.
				892	files.extend(
				893	Results.File(None, f, False, 0) for f in only_touched
				894	if not os.path.isdir(f)
				895	)
				896	return Results.Process(
				897	self.pid,
				898	files,
				899	fix_path(self.executable),
				900	self.command,
				901	fix_path(self.initial_cwd),
				902	[c.to_results_process() for c in self.children])
				903
				904	def add_file(self, filepath, touch_only):
				905	"""Adds a file if it passes the blacklist."""
				906	if self._blacklist(unicode(filepath)):
				907	return
				908	logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
				909	# Note that filepath and not unicode(filepath) is added. It is because
				910	# filepath could be something else than a string, like a RelativePath
				911	# instance for dtrace logs.
				912	if touch_only:
				913	self.only_touched.add(filepath)
				914	else:
				915	self.files.add(filepath)
				916
				917	def __init__(self, blacklist):
				918	self.blacklist = blacklist
				919	# Initial process.
				920	self.root_process = None
				921	# dict to accelerate process lookup, to not have to lookup the whole graph
				922	# each time.
				923	self._process_lookup = {}
				924
				925	class Tracer(object):
				926	"""During it's lifetime, the tracing subsystem is enabled."""
				927	def __init__(self, logname):
				928	self._logname = logname
				929	self._lock = threading.Lock()
				930	self._traces = []
				931	self._initialized = True
				932
				933	def trace(self, cmd, cwd, tracename, output):
				934	"""Runs the OS-specific trace program on an executable.
				935
				936	Arguments:
				937	- cmd: The command (a list) to run.
				938	- cwd: Current directory to start the child process in.
				939	- tracename: Name of the trace in the logname file.
				940	- output: If False, redirects output to PIPEs.
				941
				942	Returns a tuple (resultcode, output) and updates the internal trace
				943	entries.
				944	"""
				945	# The implementation adds an item to self._traces.
				946	raise NotImplementedError(self.__class__.__name__)
				947
				948	def close(self, _timeout=None):
				949	"""Saves the meta-data in the logname file.
				950
				951	For kernel-based tracing, stops the tracing subsystem.
				952
				953	Must not be used manually when using 'with' construct.
				954	"""
				955	with self._lock:
				956	assert self._initialized
				957	try:
				958	data = {
				959	'traces': self._traces,
				960	}
				961	write_json(self._logname, data, False)
				962	finally:
				963	self._initialized = False
				964
				965	def post_process_log(self):
				966	"""Post-processes the log so it becomes faster to load afterward.
				967
				968	Must not be used manually when using 'with' construct.
				969	"""
				970	assert not self._initialized, 'Must stop tracing first.'
				971
				972	def __enter__(self):
				973	"""Enables 'with' statement."""
				974	return self
				975
				976	def __exit__(self, exc_type, exc_value, traceback):
				977	"""Enables 'with' statement."""
				978	self.close()
				979	# If an exception was thrown, do not process logs.
				980	if not exc_type:
				981	self.post_process_log()
				982
				983	def get_tracer(self, logname):
				984	"""Returns an ApiBase.Tracer instance.
				985
				986	Initializes the tracing subsystem, which is a requirement for kernel-based
				987	tracers. Only one tracer instance should be live at a time!
				988
				989	logname is the filepath to the json file that will contain the meta-data
				990	about the logs.
				991	"""
				992	return self.Tracer(logname)
				993
				994	@staticmethod
				995	def clean_trace(logname):
				996	"""Deletes an old log."""
				997	raise NotImplementedError()
				998
				999	@classmethod
				1000	def parse_log(cls, logname, blacklist):
				1001	"""Processes trace logs and returns the files opened and the files that do
				1002	not exist.
				1003
				1004	It does not track directories.
				1005
				1006	Most of the time, files that do not exist are temporary test files that
				1007	should be put in /tmp instead. See http://crbug.com/116251.
				1008
				1009	Returns a list of dict with keys:
				1010	- results: A Results instance.
				1011	- trace: The corresponding tracename parameter provided to
				1012	get_tracer().trace().
				1013	- output: Output gathered during execution, if get_tracer().trace(...,
				1014	output=False) was used.
				1015	"""
				1016	raise NotImplementedError(cls.__class__.__name__)
				1017
				1018
				1019	class Strace(ApiBase):
				1020	"""strace implies linux."""
				1021	class Context(ApiBase.Context):
				1022	"""Processes a strace log line and keeps the list of existent and non
				1023	existent files accessed.
				1024
				1025	Ignores directories.
				1026
				1027	Uses late-binding to processes the cwd of each process. The problem is that
				1028	strace generates one log file per process it traced but doesn't give any
				1029	information about which process was started when and by who. So we don't
				1030	even know which process is the initial one. So process the logs out of
				1031	order and use late binding with RelativePath to be able to deduce the
				1032	initial directory of each process once all the logs are parsed.
				1033	"""
				1034	class Process(ApiBase.Context.Process):
				1035	"""Represents the state of a process.
				1036
				1037	Contains all the information retrieved from the pid-specific log.
				1038	"""
				1039	# Function names are using ([a-z_0-9]+)
				1040	# This is the most common format. function(args) = result
				1041	RE_HEADER = re.compile(r'^([a-z_0-9]+)$(.+?)$\s+= (.+)$')
				1042	# An interrupted function call, only grab the minimal header.
				1043	RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
				1044	# A resumed function call.
				1045	RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
				1046	# A process received a signal.
				1047	RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
				1048	# A process didn't handle a signal. Ignore any junk appearing before,
				1049	# because the process was forcibly killed so it won't open any new file.
				1050	RE_KILLED = re.compile(
				1051	r'^.*\+\+\+ killed by ([A-Z]+)( $core dumped$)? \+\+\+$')
				1052	# The process has exited.
				1053	RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
				1054	# A call was canceled. Ignore any prefix.
				1055	RE_UNAVAILABLE = re.compile(r'^.\)\s= \? <unavailable>$')
				1056	# Happens when strace fails to even get the function name.
				1057	UNNAMED_FUNCTION = '????'
				1058
				1059	# Corner-case in python, a class member function decorator must not be
				1060	# @staticmethod.
				1061	def parse_args(regexp, expect_zero): # pylint: disable=E0213
				1062	"""Automatically convert the str 'args' into a list of processed
				1063	arguments.
				1064
				1065	Arguments:
				1066	- regexp is used to parse args.
				1067	- expect_zero: one of True, False or None.
				1068	- True: will check for result.startswith('0') first and will ignore
				1069	the trace line completely otherwise. This is important because for
				1070	many functions, the regexp will not process if the call failed.
				1071	- False: will check for not result.startswith(('?', '-1')) for the
				1072	same reason than with True.
				1073	- None: ignore result.
				1074	"""
				1075	def meta_hook(function):
				1076	assert function.__name__.startswith('handle_')
				1077	def hook(self, args, result):
				1078	if expect_zero is True and not result.startswith('0'):
				1079	return
				1080	if expect_zero is False and result.startswith(('?', '-1')):
				1081	return
				1082	match = re.match(regexp, args)
				1083	if not match:
				1084	raise TracingFailure(
				1085	'Failed to parse %s(%s) = %s' %
				1086	(function.__name__[len('handle_'):], args, result),
				1087	None, None, None)
				1088	return function(self, match.groups(), result)
				1089	return hook
				1090	return meta_hook
				1091
				1092	class RelativePath(object):
				1093	"""A late-bound relative path."""
				1094	def __init__(self, parent, value):
				1095	self.parent = parent
				1096	self.value = value
				1097
				1098	def render(self):
				1099	"""Returns the current directory this instance is representing.
				1100
				1101	This function is used to return the late-bound value.
				1102	"""
				1103	if self.value and self.value.startswith(u'/'):
				1104	# An absolute path.
				1105	return self.value
				1106	parent = self.parent.render() if self.parent else u'<None>'
				1107	if self.value:
				1108	return os.path.normpath(os.path.join(parent, self.value))
				1109	return parent
				1110
				1111	def __unicode__(self):
				1112	"""Acts as a string whenever needed."""
				1113	return unicode(self.render())
				1114
				1115	def __str__(self):
				1116	"""Acts as a string whenever needed."""
				1117	return str(self.render())
				1118
				1119	def __init__(self, root, pid):
				1120	"""Keeps enough information to be able to guess the original process
				1121	root.
				1122
				1123	strace doesn't store which process was the initial process. So more
				1124	information needs to be kept so the graph can be reconstructed from the
				1125	flat map.
				1126	"""
				1127	logging.info('%s(%d)' % (self.__class__.__name__, pid))
				1128	super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
				1129	assert isinstance(root, ApiBase.Context)
				1130	self._root = weakref.ref(root)
				1131	# The dict key is the function name of the pending call, like 'open'
				1132	# or 'execve'.
				1133	self._pending_calls = {}
				1134	self._line_number = 0
				1135	# Current directory when the process started.
				1136	self.initial_cwd = self.RelativePath(self._root(), None)
				1137	self.parentid = None
				1138
				1139	def get_cwd(self):
				1140	"""Returns the best known value of cwd."""
				1141	return self.cwd or self.initial_cwd
				1142
				1143	def render(self):
				1144	"""Returns the string value of the RelativePath() object.
				1145
				1146	Used by RelativePath. Returns the initial directory and not the
				1147	current one since the current directory 'cwd' validity is time-limited.
				1148
				1149	The validity is only guaranteed once all the logs are processed.
				1150	"""
				1151	return self.initial_cwd.render()
				1152
				1153	def on_line(self, line):
				1154	self._line_number += 1
				1155	if self.RE_SIGNAL.match(line):
				1156	# Ignore signals.
				1157	return
				1158
				1159	try:
				1160	match = self.RE_KILLED.match(line)
				1161	if match:
				1162	# Converts a '+++ killed by Foo +++' trace into an exit_group().
				1163	self.handle_exit_group(match.group(1), None)
				1164	return
				1165
				1166	match = self.RE_PROCESS_EXITED.match(line)
				1167	if match:
				1168	# Converts a '+++ exited with 1 +++' trace into an exit_group()
				1169	self.handle_exit_group(match.group(1), None)
				1170	return
				1171
				1172	match = self.RE_UNFINISHED.match(line)
				1173	if match:
				1174	if match.group(1) in self._pending_calls:
				1175	raise TracingFailure(
				1176	'Found two unfinished calls for the same function',
				1177	None, None, None,
				1178	self._pending_calls)
				1179	self._pending_calls[match.group(1)] = (
				1180	match.group(1) + match.group(2))
				1181	return
				1182
				1183	match = self.RE_UNAVAILABLE.match(line)
				1184	if match:
				1185	# This usually means a process was killed and a pending call was
				1186	# canceled.
				1187	# TODO(maruel): Look up the last exit_group() trace just above and
				1188	# make sure any self._pending_calls[anything] is properly flushed.
				1189	return
				1190
				1191	match = self.RE_RESUMED.match(line)
				1192	if match:
				1193	if match.group(1) not in self._pending_calls:
				1194	raise TracingFailure(
				1195	'Found a resumed call that was not logged as unfinished',
				1196	None, None, None,
				1197	self._pending_calls)
				1198	pending = self._pending_calls.pop(match.group(1))
				1199	# Reconstruct the line.
				1200	line = pending + match.group(2)
				1201
				1202	match = self.RE_HEADER.match(line)
				1203	if not match:
				1204	raise TracingFailure(
				1205	'Found an invalid line: %s' % line,
				1206	None, None, None)
				1207	if match.group(1) == self.UNNAMED_FUNCTION:
				1208	return
				1209
				1210	# It's a valid line, handle it.
				1211	handler = getattr(self, 'handle_%s' % match.group(1), None)
				1212	if not handler:
				1213	self._handle_unknown(match.group(1), match.group(2), match.group(3))
				1214	return handler(match.group(2), match.group(3))
				1215	except TracingFailure, e:
				1216	# Hack in the values since the handler could be a static function.
				1217	e.pid = self.pid
				1218	e.line = line
				1219	e.line_number = self._line_number
				1220	# Re-raise the modified exception.
				1221	raise
				1222	except (KeyError, NotImplementedError, ValueError), e:
				1223	raise TracingFailure(
				1224	'Trace generated a %s exception: %s' % (
				1225	e.__class__.__name__, str(e)),
				1226	self.pid,
				1227	self._line_number,
				1228	line,
				1229	e)
				1230
				1231	@parse_args(r'^\"(.+?)\", [FKORWX_\|]+$', True)
				1232	def handle_access(self, args, _result):
				1233	self._handle_file(args[0], True)
				1234
				1235	@parse_args(r'^\"(.+?)\"$', True)
				1236	def handle_chdir(self, args, _result):
				1237	"""Updates cwd."""
				1238	self.cwd = self.RelativePath(self, args[0])
				1239	logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
				1240
				1241	def handle_clone(self, _args, result):
				1242	"""Transfers cwd."""
				1243	if result.startswith(('?', '-1')):
				1244	# The call failed.
				1245	return
				1246	# Update the other process right away.
				1247	childpid = int(result)
				1248	child = self._root().get_or_set_proc(childpid)
				1249	if child.parentid is not None or childpid in self.children:
				1250	raise TracingFailure(
				1251	'Found internal inconsitency in process lifetime detection '
				1252	'during a clone() call',
				1253	None, None, None)
				1254
				1255	# Copy the cwd object.
				1256	child.initial_cwd = self.get_cwd()
				1257	child.parentid = self.pid
				1258	# It is necessary because the logs are processed out of order.
				1259	self.children.append(child)
				1260
				1261	def handle_close(self, _args, _result):
				1262	pass
				1263
				1264	def handle_chmod(self, _args, _result):
				1265	pass
				1266
				1267	def handle_creat(self, _args, _result):
				1268	# Ignore files created, since they didn't need to exist.
				1269	pass
				1270
				1271	@parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
				1272	def handle_execve(self, args, _result):
				1273	# Even if in practice execve() doesn't returns when it succeeds, strace
				1274	# still prints '0' as the result.
				1275	filepath = args[0]
				1276	self._handle_file(filepath, False)
				1277	self.executable = self.RelativePath(self.get_cwd(), filepath)
				1278	self.command = strace_process_quoted_arguments(args[1])
				1279
				1280	def handle_exit_group(self, _args, _result):
				1281	"""Removes cwd."""
				1282	self.cwd = None
				1283
				1284	def handle_fork(self, args, result):
				1285	self._handle_unknown('fork', args, result)
				1286
				1287	def handle_getcwd(self, _args, _result):
				1288	pass
				1289
				1290	@parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
				1291	def handle_link(self, args, _result):
				1292	self._handle_file(args[0], False)
				1293	self._handle_file(args[1], False)
				1294
				1295	@parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
				1296	def handle_lstat(self, args, _result):
				1297	self._handle_file(args[0], True)
				1298
				1299	def handle_mkdir(self, _args, _result):
				1300	pass
				1301
				1302	@parse_args(r'^\"(.*?)\", ([A-Z\_\\|]+)(\|, \d+)$', False)
				1303	def handle_open(self, args, _result):
				1304	if 'O_DIRECTORY' in args[1]:
				1305	return
				1306	self._handle_file(args[0], False)
				1307
				1308	@parse_args(r'^(\d+\|AT_FDCWD), \"(.*?)\", ([A-Z\_\\|]+)(\|, \d+)$', False)
				1309	def handle_openat(self, args, _result):
				1310	if 'O_DIRECTORY' in args[2]:
				1311	return
				1312	if args[1] == 'AT_FDCWD':
				1313	self._handle_file(args[1], False)
				1314	else:
				1315	# TODO(maruel): Implement relative open if necessary instead of the
				1316	# AT_FDCWD flag, let's hope not since this means tracking all active
				1317	# directory handles.
				1318	raise Exception('Relative open via openat not implemented.')
				1319
				1320	@parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
				1321	def handle_readlink(self, args, _result):
				1322	self._handle_file(args[0], False)
				1323
				1324	@parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
				1325	def handle_rename(self, args, _result):
				1326	self._handle_file(args[0], False)
				1327	self._handle_file(args[1], False)
				1328
				1329	def handle_rmdir(self, _args, _result):
				1330	pass
				1331
				1332	def handle_setxattr(self, _args, _result):
				1333	pass
				1334
				1335	@parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
				1336	def handle_stat(self, args, _result):
				1337	self._handle_file(args[0], True)
				1338
				1339	def handle_symlink(self, _args, _result):
				1340	pass
				1341
				1342	def handle_unlink(self, _args, _result):
				1343	# In theory, the file had to be created anyway.
				1344	pass
				1345
				1346	def handle_statfs(self, _args, _result):
				1347	pass
				1348
				1349	def handle_vfork(self, args, result):
				1350	self._handle_unknown('vfork', args, result)
				1351
				1352	@staticmethod
				1353	def _handle_unknown(function, args, result):
				1354	raise TracingFailure(
				1355	'Unexpected/unimplemented trace %s(%s)= %s' %
				1356	(function, args, result),
				1357	None, None, None)
				1358
				1359	def _handle_file(self, filepath, touch_only):
				1360	filepath = self.RelativePath(self.get_cwd(), filepath)
				1361	#assert not touch_only, unicode(filepath)
				1362	self.add_file(filepath, touch_only)
				1363
				1364	def __init__(self, blacklist, initial_cwd):
				1365	super(Strace.Context, self).__init__(blacklist)
				1366	self.initial_cwd = initial_cwd
				1367
				1368	def render(self):
				1369	"""Returns the string value of the initial cwd of the root process.
				1370
				1371	Used by RelativePath.
				1372	"""
				1373	return self.initial_cwd
				1374
				1375	def on_line(self, pid, line):
				1376	"""Transfers control into the Process.on_line() function."""
				1377	self.get_or_set_proc(pid).on_line(line.strip())
				1378
				1379	def to_results(self):
				1380	"""Finds back the root process and verify consistency."""
				1381	# TODO(maruel): Absolutely unecessary, fix me.
				1382	root = [p for p in self._process_lookup.itervalues() if not p.parentid]
				1383	if len(root) != 1:
				1384	raise TracingFailure(
				1385	'Found internal inconsitency in process lifetime detection '
				1386	'while finding the root process',
				1387	None,
				1388	None,
				1389	None,
				1390	sorted(p.pid for p in root))
				1391	self.root_process = root[0]
				1392	process = self.root_process.to_results_process()
				1393	if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
				1394	raise TracingFailure(
				1395	'Found internal inconsitency in process lifetime detection '
				1396	'while looking for len(tree) == len(list)',
				1397	None,
				1398	None,
				1399	None,
				1400	sorted(self._process_lookup),
				1401	sorted(p.pid for p in process.all))
				1402	return Results(process)
				1403
				1404	def get_or_set_proc(self, pid):
				1405	"""Returns the Context.Process instance for this pid or creates a new one.
				1406	"""
				1407	if not pid or not isinstance(pid, int):
				1408	raise TracingFailure(
				1409	'Unpexpected value for pid: %r' % pid,
				1410	pid,
				1411	None,
				1412	None,
				1413	pid)
				1414	if pid not in self._process_lookup:
				1415	self._process_lookup[pid] = self.Process(self, pid)
				1416	return self._process_lookup[pid]
				1417
				1418	@classmethod
				1419	def traces(cls):
				1420	"""Returns the list of all handled traces to pass this as an argument to
				1421	strace.
				1422	"""
				1423	prefix = 'handle_'
				1424	return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
				1425
				1426	class Tracer(ApiBase.Tracer):
				1427	MAX_LEN = 256
				1428
				1429	def trace(self, cmd, cwd, tracename, output):
				1430	"""Runs strace on an executable."""
				1431	logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
				1432	assert os.path.isabs(cmd[0]), cmd[0]
				1433	assert os.path.isabs(cwd), cwd
				1434	assert os.path.normpath(cwd) == cwd, cwd
				1435	with self._lock:
				1436	if not self._initialized:
				1437	raise TracingFailure(
				1438	'Called Tracer.trace() on an unitialized object',
				1439	None, None, None, tracename)
				1440	assert tracename not in (i['trace'] for i in self._traces)
				1441	stdout = stderr = None
				1442	if output:
				1443	stdout = subprocess.PIPE
				1444	stderr = subprocess.STDOUT
				1445	# Ensure all file related APIs are hooked.
				1446	traces = ','.join(Strace.Context.traces() + ['file'])
				1447	trace_cmd = [
				1448	'strace',
				1449	'-ff',
				1450	'-s', '%d' % self.MAX_LEN,
				1451	'-e', 'trace=%s' % traces,
				1452	'-o', self._logname + '.' + tracename,
				1453	]
				1454	child = subprocess.Popen(
				1455	trace_cmd + cmd,
				1456	cwd=cwd,
				1457	stdin=subprocess.PIPE,
				1458	stdout=stdout,
				1459	stderr=stderr)
				1460	out = child.communicate()[0]
				1461	# TODO(maruel): Walk the logs and figure out the root process would
				1462	# simplify parsing the logs a lot.
				1463	with self._lock:
				1464	assert tracename not in (i['trace'] for i in self._traces)
				1465	self._traces.append(
				1466	{
				1467	'cmd': cmd,
				1468	'cwd': cwd,
				1469	# The pid of strace process, not very useful.
				1470	'pid': child.pid,
				1471	'trace': tracename,
				1472	'output': out,
				1473	})
				1474	return child.returncode, out
				1475
				1476	@staticmethod
				1477	def clean_trace(logname):
				1478	if os.path.isfile(logname):
				1479	os.remove(logname)
				1480	# Also delete any pid specific file from previous traces.
				1481	for i in glob.iglob(logname + '.*'):
				1482	if i.rsplit('.', 1)[1].isdigit():
				1483	os.remove(i)
				1484
				1485	@classmethod
				1486	def parse_log(cls, logname, blacklist):
				1487	logging.info('parse_log(%s, %s)' % (logname, blacklist))
				1488	data = read_json(logname)
				1489	out = []
				1490	for item in data['traces']:
				1491	result = {
				1492	'trace': item['trace'],
				1493	'output': item['output'],
				1494	}
				1495	try:
				1496	context = cls.Context(blacklist, item['cwd'])
				1497	for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
				1498	pid = pidfile.rsplit('.', 1)[1]
				1499	if pid.isdigit():
				1500	pid = int(pid)
				1501	# TODO(maruel): Load as utf-8
				1502	for line in open(pidfile, 'rb'):
				1503	context.on_line(pid, line)
				1504	result['results'] = context.to_results()
				1505	except TracingFailure, e:
				1506	result['exception'] = e
				1507	out.append(result)
				1508	return out
				1509
				1510
				1511	class Dtrace(ApiBase):
				1512	"""Uses DTrace framework through dtrace. Requires root access.
				1513
				1514	Implies Mac OSX.
				1515
				1516	dtruss can't be used because it has compatibility issues with python.
				1517
				1518	Also, the pid->cwd handling needs to be done manually since OSX has no way to
				1519	get the absolute path of the 'cwd' dtrace variable from the probe.
				1520
				1521	Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
				1522	https://discussions.apple.com/thread/1980539. So resort to handling execve()
				1523	manually.
				1524
				1525	errno is not printed in the log since this implementation currently only cares
				1526	about files that were successfully opened.
				1527	"""
				1528	class Context(ApiBase.Context):
				1529	# Format: index pid function(args)
				1530	RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)$(.*?)$$')
				1531
				1532	# Arguments parsing.
				1533	RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
				1534	RE_CHDIR = re.compile(r'^\"(.+?)\"$')
				1535	RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
				1536	RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
				1537	RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
				1538	RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
				1539
				1540	O_DIRECTORY = 0x100000
				1541
				1542	class Process(ApiBase.Context.Process):
				1543	def __init__(self, *args):
				1544	super(Dtrace.Context.Process, self).__init__(*args)
				1545	self.cwd = self.initial_cwd
				1546
				1547	def __init__(self, blacklist, tracer_pid, initial_cwd):
				1548	logging.info(
				1549	'%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd))
				1550	super(Dtrace.Context, self).__init__(blacklist)
				1551	# Process ID of the temporary script created by create_thunk().
				1552	self._tracer_pid = tracer_pid
				1553	self._initial_cwd = initial_cwd
				1554	self._line_number = 0
				1555
				1556	def on_line(self, line):
				1557	self._line_number += 1
				1558	match = self.RE_HEADER.match(line)
				1559	if not match:
				1560	raise TracingFailure(
				1561	'Found malformed line: %s' % line,
				1562	None,
				1563	self._line_number,
				1564	line)
				1565	fn = getattr(
				1566	self,
				1567	'handle_%s' % match.group(2).replace('-', '_'),
				1568	self._handle_ignored)
				1569	# It is guaranteed to succeed because of the regexp. Or at least I thought
				1570	# it would.
				1571	pid = int(match.group(1))
				1572	try:
				1573	return fn(pid, match.group(3))
				1574	except TracingFailure, e:
				1575	# Hack in the values since the handler could be a static function.
				1576	e.pid = pid
				1577	e.line = line
				1578	e.line_number = self._line_number
				1579	# Re-raise the modified exception.
				1580	raise
				1581	except (KeyError, NotImplementedError, ValueError), e:
				1582	raise TracingFailure(
				1583	'Trace generated a %s exception: %s' % (
				1584	e.__class__.__name__, str(e)),
				1585	pid,
				1586	self._line_number,
				1587	line,
				1588	e)
				1589
				1590	def to_results(self):
				1591	process = self.root_process.to_results_process()
				1592	# Internal concistency check.
				1593	if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
				1594	raise TracingFailure(
				1595	'Found internal inconsitency in process lifetime detection '
				1596	'while looking for len(tree) == len(list)',
				1597	None,
				1598	None,
				1599	None,
				1600	sorted(self._process_lookup),
				1601	sorted(p.pid for p in process.all))
				1602	return Results(process)
				1603
				1604	def handle_dtrace_BEGIN(self, _pid, args):
				1605	if not self.RE_DTRACE_BEGIN.match(args):
				1606	raise TracingFailure(
				1607	'Found internal inconsitency in dtrace_BEGIN log line',
				1608	None, None, None)
				1609
				1610	def handle_proc_start(self, pid, args):
				1611	"""Transfers cwd.
				1612
				1613	The dtrace script already takes care of only tracing the processes that
				1614	are child of the traced processes so there is no need to verify the
				1615	process hierarchy.
				1616	"""
				1617	if pid in self._process_lookup:
				1618	raise TracingFailure(
				1619	'Found internal inconsitency in proc_start: %d started two times' %
				1620	pid,
				1621	None, None, None)
				1622	match = self.RE_PROC_START.match(args)
				1623	if not match:
				1624	raise TracingFailure(
				1625	'Failed to parse arguments: %s' % args,
				1626	None, None, None)
				1627	ppid = int(match.group(1))
				1628	if ppid == self._tracer_pid and not self.root_process:
				1629	proc = self.root_process = self.Process(
				1630	self.blacklist, pid, self._initial_cwd)
				1631	elif ppid in self._process_lookup:
				1632	proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
				1633	self._process_lookup[ppid].children.append(proc)
				1634	else:
				1635	# Another process tree, ignore.
				1636	return
				1637	self._process_lookup[pid] = proc
				1638	logging.debug(
				1639	'New child: %s -> %d cwd:%s' %
				1640	(ppid, pid, unicode(proc.initial_cwd)))
				1641
				1642	def handle_proc_exit(self, pid, _args):
				1643	"""Removes cwd."""
				1644	if pid in self._process_lookup:
				1645	# self._tracer_pid is not traced itself and other traces run neither.
				1646	self._process_lookup[pid].cwd = None
				1647
				1648	def handle_execve(self, pid, args):
				1649	"""Sets the process' executable.
				1650
				1651	TODO(maruel): Read command line arguments. See
				1652	https://discussions.apple.com/thread/1980539 for an example.
				1653	https://gist.github.com/1242279
				1654
				1655	Will have to put the answer at http://stackoverflow.com/questions/7556249.
				1656	:)
				1657	"""
				1658	if not pid in self._process_lookup:
				1659	# Another process tree, ignore.
				1660	return
				1661	match = self.RE_EXECVE.match(args)
				1662	if not match:
				1663	raise TracingFailure(
				1664	'Failed to parse arguments: %r' % args,
				1665	None, None, None)
				1666	proc = self._process_lookup[pid]
				1667	proc.executable = match.group(1)
				1668	proc.command = self.process_escaped_arguments(match.group(3))
				1669	if int(match.group(2)) != len(proc.command):
				1670	raise TracingFailure(
				1671	'Failed to parse execve() arguments: %s' % args,
				1672	None, None, None)
				1673
				1674	def handle_chdir(self, pid, args):
				1675	"""Updates cwd."""
				1676	if pid not in self._process_lookup:
				1677	# Another process tree, ignore.
				1678	return
				1679	cwd = self.RE_CHDIR.match(args).group(1)
				1680	if not cwd.startswith('/'):
				1681	cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
				1682	logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
				1683	else:
				1684	logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
				1685	cwd2 = cwd
				1686	self._process_lookup[pid].cwd = cwd2
				1687
				1688	def handle_open_nocancel(self, pid, args):
				1689	"""Redirects to handle_open()."""
				1690	return self.handle_open(pid, args)
				1691
				1692	def handle_open(self, pid, args):
				1693	if pid not in self._process_lookup:
				1694	# Another process tree, ignore.
				1695	return
				1696	match = self.RE_OPEN.match(args)
				1697	if not match:
				1698	raise TracingFailure(
				1699	'Failed to parse arguments: %s' % args,
				1700	None, None, None)
				1701	flag = int(match.group(2), 16)
				1702	if self.O_DIRECTORY & flag == self.O_DIRECTORY:
				1703	# Ignore directories.
				1704	return
				1705	self._handle_file(pid, match.group(1))
				1706
				1707	def handle_rename(self, pid, args):
				1708	if pid not in self._process_lookup:
				1709	# Another process tree, ignore.
				1710	return
				1711	match = self.RE_RENAME.match(args)
				1712	if not match:
				1713	raise TracingFailure(
				1714	'Failed to parse arguments: %s' % args,
				1715	None, None, None)
				1716	self._handle_file(pid, match.group(1))
				1717	self._handle_file(pid, match.group(2))
				1718
				1719	def _handle_file(self, pid, filepath):
				1720	if not filepath.startswith('/'):
				1721	filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
				1722	# We can get '..' in the path.
				1723	filepath = os.path.normpath(filepath)
				1724	# Sadly, still need to filter out directories here;
				1725	# saw open_nocancel(".", 0, 0) = 0 lines.
				1726	if os.path.isdir(filepath):
				1727	return
				1728	self._process_lookup[pid].add_file(filepath, False)
				1729
				1730	def handle_ftruncate(self, pid, args):
				1731	"""Just used as a signal to kill dtrace, ignoring."""
				1732	pass
				1733
				1734	@staticmethod
				1735	def _handle_ignored(pid, args):
				1736	"""Is called for all the event traces that are not handled."""
				1737	raise NotImplementedError('Please implement me')
				1738
				1739	@staticmethod
				1740	def process_escaped_arguments(text):
				1741	"""Extracts escaped arguments on a string and return the arguments as a
				1742	list.
				1743
				1744	Implemented as an automaton.
				1745
				1746	Example:
				1747	With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
				1748	function will return ['python2.7', '-c', 'print("hi")]
				1749	"""
				1750	if not text.endswith('\\0'):
				1751	raise ValueError('String is not null terminated: %r' % text, text)
				1752	text = text[:-2]
				1753
				1754	def unescape(x):
				1755	"""Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
				1756	out = []
				1757	escaped = False
				1758	for i in x:
				1759	if i == '\\' and not escaped:
				1760	escaped = True
				1761	continue
				1762	escaped = False
				1763	out.append(i)
				1764	return ''.join(out)
				1765
				1766	return [unescape(i) for i in text.split('\\001')]
				1767
				1768	class Tracer(ApiBase.Tracer):
				1769	# pylint: disable=C0301
				1770	#
				1771	# To understand the following code, you'll want to take a look at:
				1772	# http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
				1773	# https://wikis.oracle.com/display/DTrace/Variables
				1774	# http://docs.oracle.com/cd/E19205-01/820-4221/
				1775	#
				1776	# 0. Dump all the valid probes into a text file. It is important, you
				1777	# want to redirect into a file and you don't want to constantly 'sudo'.
				1778	# $ sudo dtrace -l > probes.txt
				1779	#
				1780	# 1. Count the number of probes:
				1781	# $ wc -l probes.txt
				1782	# 81823 # On OSX 10.7, including 1 header line.
				1783	#
				1784	# 2. List providers, intentionally skipping all the 'syspolicy10925' and the
				1785	# likes and skipping the header with NR>1:
				1786	# $ awk 'NR>1 { print $2 }' probes.txt \| sort \| uniq \| grep -v '[[:digit:]]'
				1787	# dtrace
				1788	# fbt
				1789	# io
				1790	# ip
				1791	# lockstat
				1792	# mach_trap
				1793	# proc
				1794	# profile
				1795	# sched
				1796	# syscall
				1797	# tcp
				1798	# vminfo
				1799	#
				1800	# 3. List of valid probes:
				1801	# $ grep syscall probes.txt \| less
				1802	# or use dtrace directly:
				1803	# $ sudo dtrace -l -P syscall \| less
				1804	#
				1805	# trackedpid is an associative array where its value can be 0, 1 or 2.
				1806	# 0 is for untracked processes and is the default value for items not
				1807	# in the associative array.
				1808	# 1 is for tracked processes.
				1809	# 2 is for the script created by create_thunk() only. It is not tracked
				1810	# itself but all its decendants are.
				1811	#
				1812	# The script will kill itself only once waiting_to_die == 1 and
				1813	# current_processes == 0, so that both getlogin() was called and that
				1814	# all traced processes exited.
				1815	#
				1816	# TODO(maruel): Use cacheable predicates. See
				1817	# https://wikis.oracle.com/display/DTrace/Performance+Considerations
				1818	D_CODE = """
				1819	dtrace:::BEGIN {
				1820	waiting_to_die = 0;
				1821	current_processes = 0;
				1822	logindex = 0;
				1823	printf("%d %d %s_%s(\\"%s\\")\\n",
				1824	logindex, PID, probeprov, probename, SCRIPT);
				1825	logindex++;
				1826	}
				1827
				1828	proc:::start /trackedpid[ppid]/ {
				1829	trackedpid[pid] = 1;
				1830	current_processes += 1;
				1831	printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
				1832	logindex, pid, probeprov, probename,
				1833	ppid,
				1834	execname,
				1835	current_processes);
				1836	logindex++;
				1837	}
				1838	/* Should use SCRIPT but there is no access to this variable at that
				1839	* point. */
				1840	proc:::start /ppid == PID && execname == "Python"/ {
				1841	trackedpid[pid] = 2;
				1842	current_processes += 1;
				1843	printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
				1844	logindex, pid, probeprov, probename,
				1845	ppid,
				1846	execname,
				1847	current_processes);
				1848	logindex++;
				1849	}
				1850	proc:::exit /trackedpid[pid] &&
				1851	current_processes == 1 &&
				1852	waiting_to_die == 1/ {
				1853	trackedpid[pid] = 0;
				1854	current_processes -= 1;
				1855	printf("%d %d %s_%s(%d)\\n",
				1856	logindex, pid, probeprov, probename,
				1857	current_processes);
				1858	logindex++;
				1859	exit(0);
				1860	}
				1861	proc:::exit /trackedpid[pid]/ {
				1862	trackedpid[pid] = 0;
				1863	current_processes -= 1;
				1864	printf("%d %d %s_%s(%d)\\n",
				1865	logindex, pid, probeprov, probename,
				1866	current_processes);
				1867	logindex++;
				1868	}
				1869
				1870	/* Use an arcane function to detect when we need to die */
				1871	syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
				1872	waiting_to_die = 1;
				1873	printf("%d %d %s()\\n", logindex, pid, probefunc);
				1874	logindex++;
				1875	}
				1876	syscall::ftruncate:entry /
				1877	pid == PID && arg0 == FILE_ID && current_processes == 0/ {
				1878	exit(0);
				1879	}
				1880
				1881	syscall::open*:entry /trackedpid[pid] == 1/ {
				1882	self->open_arg0 = arg0;
				1883	self->open_arg1 = arg1;
				1884	self->open_arg2 = arg2;
				1885	}
				1886	syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
				1887	this->open_arg0 = copyinstr(self->open_arg0);
				1888	printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
				1889	logindex, pid, probefunc,
				1890	this->open_arg0,
				1891	self->open_arg1,
				1892	self->open_arg2);
				1893	logindex++;
				1894	this->open_arg0 = 0;
				1895	}
				1896	syscall::open*:return /trackedpid[pid] == 1/ {
				1897	self->open_arg0 = 0;
				1898	self->open_arg1 = 0;
				1899	self->open_arg2 = 0;
				1900	}
				1901
				1902	syscall::rename:entry /trackedpid[pid] == 1/ {
				1903	self->rename_arg0 = arg0;
				1904	self->rename_arg1 = arg1;
				1905	}
				1906	syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
				1907	this->rename_arg0 = copyinstr(self->rename_arg0);
				1908	this->rename_arg1 = copyinstr(self->rename_arg1);
				1909	printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
				1910	logindex, pid, probefunc,
				1911	this->rename_arg0,
				1912	this->rename_arg1);
				1913	logindex++;
				1914	this->rename_arg0 = 0;
				1915	this->rename_arg1 = 0;
				1916	}
				1917	syscall::rename:return /trackedpid[pid] == 1/ {
				1918	self->rename_arg0 = 0;
				1919	self->rename_arg1 = 0;
				1920	}
				1921
				1922	/* Track chdir, it's painful because it is only receiving relative path.
				1923	*/
				1924	syscall::chdir:entry /trackedpid[pid] == 1/ {
				1925	self->chdir_arg0 = arg0;
				1926	}
				1927	syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
				1928	this->chdir_arg0 = copyinstr(self->chdir_arg0);
				1929	printf("%d %d %s(\\"%s\\")\\n",
				1930	logindex, pid, probefunc,
				1931	this->chdir_arg0);
				1932	logindex++;
				1933	this->chdir_arg0 = 0;
				1934	}
				1935	syscall::chdir:return /trackedpid[pid] == 1/ {
				1936	self->chdir_arg0 = 0;
				1937	}
				1938	"""
				1939
				1940	# execve-specific code, tends to throw a lot of exceptions.
				1941	D_CODE_EXECVE = """
				1942	/* Finally what we care about! */
				1943	syscall::exec*:entry /trackedpid[pid]/ {
				1944	self->exec_arg0 = copyinstr(arg0);
				1945	/* Incrementally probe for a NULL in the argv parameter of execve() to
				1946	* figure out argc. */
				1947	/* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
				1948	* found. */
				1949	self->exec_argc = 0;
				1950	/* Probe for argc==1 */
				1951	this->exec_argv = (user_addr_t*)copyin(
				1952	arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
				1953	self->exec_argc = this->exec_argv[self->exec_argc] ?
				1954	(self->exec_argc + 1) : self->exec_argc;
				1955
				1956	/* Probe for argc==2 */
				1957	this->exec_argv = (user_addr_t*)copyin(
				1958	arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
				1959	self->exec_argc = this->exec_argv[self->exec_argc] ?
				1960	(self->exec_argc + 1) : self->exec_argc;
				1961
				1962	/* Probe for argc==3 */
				1963	this->exec_argv = (user_addr_t*)copyin(
				1964	arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
				1965	self->exec_argc = this->exec_argv[self->exec_argc] ?
				1966	(self->exec_argc + 1) : self->exec_argc;
				1967
				1968	/* Probe for argc==4 */
				1969	this->exec_argv = (user_addr_t*)copyin(
				1970	arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
				1971	self->exec_argc = this->exec_argv[self->exec_argc] ?
				1972	(self->exec_argc + 1) : self->exec_argc;
				1973
				1974	/* Copy the inputs strings since there is no guarantee they'll be
				1975	* present after the call completed. */
				1976	self->exec_argv0 = (self->exec_argc > 0) ?
				1977	copyinstr(this->exec_argv[0]) : "";
				1978	self->exec_argv1 = (self->exec_argc > 1) ?
				1979	copyinstr(this->exec_argv[1]) : "";
				1980	self->exec_argv2 = (self->exec_argc > 2) ?
				1981	copyinstr(this->exec_argv[2]) : "";
				1982	self->exec_argv3 = (self->exec_argc > 3) ?
				1983	copyinstr(this->exec_argv[3]) : "";
				1984	this->exec_argv = 0;
				1985	}
				1986	syscall::exec*:return /trackedpid[pid] && errno == 0/ {
				1987	/* We need to join strings here, as using multiple printf() would
				1988	* cause tearing when multiple threads/processes are traced.
				1989	* Since it is impossible to escape a string and join it to another one,
				1990	* like sprintf("%s%S", previous, more), use hackery.
				1991	* Each of the elements are split with a \\1. \\0 cannot be used because
				1992	* it is simply ignored. This will conflict with any program putting a
				1993	* \\1 in their execve() string but this should be "rare enough" */
				1994	this->args = "";
				1995	/* Process exec_argv[0] */
				1996	this->args = strjoin(
				1997	this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
				1998
				1999	/* Process exec_argv[1] */
				2000	this->args = strjoin(
				2001	this->args, (self->exec_argc > 1) ? "\\1" : "");
				2002	this->args = strjoin(
				2003	this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
				2004
				2005	/* Process exec_argv[2] */
				2006	this->args = strjoin(
				2007	this->args, (self->exec_argc > 2) ? "\\1" : "");
				2008	this->args = strjoin(
				2009	this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
				2010
				2011	/* Process exec_argv[3] */
				2012	this->args = strjoin(
				2013	this->args, (self->exec_argc > 3) ? "\\1" : "");
				2014	this->args = strjoin(
				2015	this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
				2016
				2017	/* Prints self->exec_argc to permits verifying the internal
				2018	* consistency since this code is quite fishy. */
				2019	printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
				2020	logindex, pid, probefunc,
				2021	self->exec_arg0,
				2022	self->exec_argc,
				2023	this->args);
				2024	logindex++;
				2025	this->args = 0;
				2026	}
				2027	syscall::exec*:return /trackedpid[pid]/ {
				2028	self->exec_arg0 = 0;
				2029	self->exec_argc = 0;
				2030	self->exec_argv0 = 0;
				2031	self->exec_argv1 = 0;
				2032	self->exec_argv2 = 0;
				2033	self->exec_argv3 = 0;
				2034	}
				2035	"""
				2036
				2037	# Code currently not used.
				2038	D_EXTRANEOUS = """
				2039	/* This is a good learning experience, since it traces a lot of things
				2040	* related to the process and child processes.
				2041	* Warning: it generates a gigantic log. For example, tracing
				2042	* "data/trace_inputs/child1.py --child" generates a 2mb log and takes
				2043	* several minutes to execute.
				2044	*/
				2045	/*
				2046	mach_trap::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2047	printf("%d %d %s_%s() = %d\\n",
				2048	logindex, pid, probeprov, probefunc, errno);
				2049	logindex++;
				2050	}
				2051	proc::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2052	printf("%d %d %s_%s() = %d\\n",
				2053	logindex, pid, probeprov, probefunc, errno);
				2054	logindex++;
				2055	}
				2056	sched::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2057	printf("%d %d %s_%s() = %d\\n",
				2058	logindex, pid, probeprov, probefunc, errno);
				2059	logindex++;
				2060	}
				2061	syscall::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2062	printf("%d %d %s_%s() = %d\\n",
				2063	logindex, pid, probeprov, probefunc, errno);
				2064	logindex++;
				2065	}
				2066	vminfo::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2067	printf("%d %d %s_%s() = %d\\n",
				2068	logindex, pid, probeprov, probefunc, errno);
				2069	logindex++;
				2070	}
				2071	*/
				2072	/* TODO(maruel): stat functions and friends
				2073	syscall::access:return,
				2074	syscall::chdir:return,
				2075	syscall::chflags:return,
				2076	syscall::chown:return,
				2077	syscall::chroot:return,
				2078	syscall::getattrlist:return,
				2079	syscall::getxattr:return,
				2080	syscall::lchown:return,
				2081	syscall::lstat64:return,
				2082	syscall::lstat:return,
				2083	syscall::mkdir:return,
				2084	syscall::pathconf:return,
				2085	syscall::readlink:return,
				2086	syscall::removexattr:return,
				2087	syscall::setxattr:return,
				2088	syscall::stat64:return,
				2089	syscall::stat:return,
				2090	syscall::truncate:return,
				2091	syscall::unlink:return,
				2092	syscall::utimes:return,
				2093	*/
				2094	"""
				2095
				2096	def __init__(self, logname):
				2097	"""Starts the log collection with dtrace.
				2098
				2099	Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
				2100	this needs to wait for dtrace to be "warmed up".
				2101	"""
				2102	super(Dtrace.Tracer, self).__init__(logname)
				2103	self._script = create_thunk()
				2104	# This unique dummy temp file is used to signal the dtrace script that it
				2105	# should stop as soon as all the child processes are done. A bit hackish
				2106	# but works fine enough.
				2107	self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
				2108	prefix='trace_signal_file')
				2109
				2110	# Note: do not use the -p flag. It's useless if the initial process quits
				2111	# too fast, resulting in missing traces from the grand-children. The D
				2112	# code manages the dtrace lifetime itself.
				2113	trace_cmd = [
				2114	'sudo',
				2115	'dtrace',
				2116	# Use a larger buffer if getting 'out of scratch space' errors.
				2117	# Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
				2118	'-b', '10m',
				2119	'-x', 'dynvarsize=10m',
				2120	#'-x', 'dtrace_global_maxsize=1m',
				2121	'-x', 'evaltime=exec',
				2122	'-o', '/dev/stderr',
				2123	'-q',
				2124	'-n', self._get_dtrace_code(),
				2125	]
				2126	with open(self._logname + '.log', 'wb') as logfile:
				2127	self._dtrace = subprocess.Popen(
				2128	trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
				2129	logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
				2130
				2131	# Reads until one line is printed, which signifies dtrace is up and ready.
				2132	with open(self._logname + '.log', 'rb') as logfile:
				2133	while 'dtrace_BEGIN' not in logfile.readline():
				2134	if self._dtrace.poll() is not None:
				2135	# Do a busy wait. :/
				2136	break
				2137	logging.debug('dtrace started')
				2138
				2139	def _get_dtrace_code(self):
				2140	"""Setups the D code to implement child process tracking.
				2141
				2142	Injects the cookie in the script so it knows when to stop.
				2143
				2144	The script will detect any instance of the script created with
				2145	create_thunk() and will start tracing it.
				2146	"""
				2147	return (
				2148	'inline int PID = %d;\n'
				2149	'inline string SCRIPT = "%s";\n'
				2150	'inline int FILE_ID = %d;\n'
				2151	'\n'
				2152	'%s') % (
				2153	os.getpid(),
				2154	self._script,
				2155	self._dummy_file_id,
				2156	self.D_CODE) + self.D_CODE_EXECVE
				2157
				2158	def trace(self, cmd, cwd, tracename, output):
				2159	"""Runs dtrace on an executable.
				2160
				2161	This dtruss is broken when it starts the process itself or when tracing
				2162	child processes, this code starts a wrapper process
				2163	generated with create_thunk() which starts the executable to trace.
				2164	"""
				2165	logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
				2166	assert os.path.isabs(cmd[0]), cmd[0]
				2167	assert os.path.isabs(cwd), cwd
				2168	assert os.path.normpath(cwd) == cwd, cwd
				2169	with self._lock:
				2170	if not self._initialized:
				2171	raise TracingFailure(
				2172	'Called Tracer.trace() on an unitialized object',
				2173	None, None, None, tracename)
				2174	assert tracename not in (i['trace'] for i in self._traces)
				2175
				2176	# Starts the script wrapper to start the child process. This signals the
				2177	# dtrace script that this process is to be traced.
				2178	stdout = stderr = None
				2179	if output:
				2180	stdout = subprocess.PIPE
				2181	stderr = subprocess.STDOUT
				2182	child_cmd = [
				2183	sys.executable,
				2184	self._script,
				2185	tracename,
				2186	]
				2187	# Call a dummy function so that dtrace knows I'm about to launch a process
				2188	# that needs to be traced.
				2189	# Yummy.
				2190	child = subprocess.Popen(
				2191	child_cmd + fix_python_path(cmd),
				2192	stdin=subprocess.PIPE,
				2193	stdout=stdout,
				2194	stderr=stderr,
				2195	cwd=cwd)
				2196	logging.debug('Started child pid: %d' % child.pid)
				2197
				2198	out = child.communicate()[0]
				2199	# This doesn't mean tracing is done, one of the grand-child process may
				2200	# still be alive. It will be tracked with the dtrace script.
				2201
				2202	with self._lock:
				2203	assert tracename not in (i['trace'] for i in self._traces)
				2204	self._traces.append(
				2205	{
				2206	'cmd': cmd,
				2207	'cwd': cwd,
				2208	# The pid of strace process, not very useful.
				2209	'pid': child.pid,
				2210	'trace': tracename,
				2211	'output': out,
				2212	})
				2213	return child.returncode, out
				2214
				2215	def close(self, timeout=None):
				2216	"""Terminates dtrace."""
				2217	logging.debug('close(%s)' % timeout)
				2218	try:
				2219	try:
				2220	super(Dtrace.Tracer, self).close(timeout)
				2221	# Signal dtrace that it should stop now.
				2222	os.ftruncate(self._dummy_file_id, 0)
				2223	if timeout:
				2224	start = time.time()
				2225	# Use polling. :/
				2226	while (self._dtrace.poll() is None and
				2227	(time.time() - start) < timeout):
				2228	time.sleep(0.1)
				2229	self._dtrace.kill()
				2230	self._dtrace.wait()
				2231	finally:
				2232	# Make sure to kill it in any case.
				2233	if self._dtrace.poll() is None:
				2234	try:
				2235	self._dtrace.kill()
				2236	self._dtrace.wait()
				2237	except OSError:
				2238	pass
				2239
				2240	if self._dtrace.returncode != 0:
				2241	# Warn about any dtrace failure but basically ignore it.
				2242	print 'dtrace failure: %s' % self._dtrace.returncode
				2243	finally:
				2244	os.close(self._dummy_file_id)
				2245	os.remove(self._dummy_file_name)
				2246	os.remove(self._script)
				2247
				2248	def post_process_log(self):
				2249	"""Sorts the log back in order when each call occured.
				2250
				2251	dtrace doesn't save the buffer in strict order since it keeps one buffer
				2252	per CPU.
				2253	"""
				2254	super(Dtrace.Tracer, self).post_process_log()
				2255	logname = self._logname + '.log'
				2256	with open(logname, 'rb') as logfile:
				2257	lines = [l for l in logfile if l.strip()]
				2258	errors = [l for l in lines if l.startswith('dtrace:')]
				2259	if errors:
				2260	raise TracingFailure(
				2261	'Found errors in the trace: %s' % '\n'.join(errors),
				2262	None, None, None, logname)
				2263	try:
				2264	lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
				2265	except ValueError:
				2266	raise TracingFailure(
				2267	'Found errors in the trace: %s' % '\n'.join(
				2268	l for l in lines if l.split(' ', 1)[0].isdigit()),
				2269	None, None, None, logname)
				2270	with open(logname, 'wb') as logfile:
				2271	logfile.write(''.join(lines))
				2272
				2273	@staticmethod
				2274	def clean_trace(logname):
				2275	for ext in ('', '.log'):
				2276	if os.path.isfile(logname + ext):
				2277	os.remove(logname + ext)
				2278
				2279	@classmethod
				2280	def parse_log(cls, logname, blacklist):
				2281	logging.info('parse_log(%s, ...)' % logname)
				2282
				2283	def blacklist_more(filepath):
				2284	# All the HFS metadata is in the form /.vol/...
				2285	return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
				2286
				2287	data = read_json(logname)
				2288	out = []
				2289	for item in data['traces']:
				2290	context = cls.Context(blacklist_more, item['pid'], item['cwd'])
				2291	for line in open(logname + '.log', 'rb'):
				2292	context.on_line(line)
				2293	out.append(
				2294	{
				2295	'results': context.to_results(),
				2296	'trace': item['trace'],
				2297	'output': item['output'],
				2298	})
				2299	return out
				2300
				2301
				2302	class LogmanTrace(ApiBase):
				2303	"""Uses the native Windows ETW based tracing functionality to trace a child
				2304	process.
				2305
				2306	Caveat: this implementations doesn't track cwd or initial_cwd. It is because
				2307	the Windows Kernel doesn't have a concept of 'current working directory' at
				2308	all. A Win32 process has a map of current directories, one per drive letter
				2309	and it is managed by the user mode kernel32.dll. In kernel, a file is always
				2310	opened relative to another file_object or as an absolute path. All the current
				2311	working directory logic is done in user mode.
				2312	"""
				2313	class Context(ApiBase.Context):
				2314	"""Processes a ETW log line and keeps the list of existent and non
				2315	existent files accessed.
				2316
				2317	Ignores directories.
				2318	"""
				2319	# These indexes are for the stripped version in json.
				2320	EVENT_NAME = 0
				2321	TYPE = 1
				2322	PID = 2
				2323	TID = 3
				2324	PROCESSOR_ID = 4
				2325	TIMESTAMP = 5
				2326	USER_DATA = 6
				2327
				2328	class Process(ApiBase.Context.Process):
				2329	def __init__(self, *args):
				2330	super(LogmanTrace.Context.Process, self).__init__(*args)
				2331	# Handle file objects that succeeded.
				2332	self.file_objects = {}
				2333
				2334	def __init__(self, blacklist, tracer_pid):
				2335	logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid))
				2336	super(LogmanTrace.Context, self).__init__(blacklist)
				2337	self._drive_map = DosDriveMap()
				2338	# Threads mapping to the corresponding process id.
				2339	self._threads_active = {}
				2340	# Process ID of the tracer, e.g. tracer_inputs.py
				2341	self._tracer_pid = tracer_pid
				2342	self._line_number = 0
				2343
				2344	def on_line(self, line):
				2345	"""Processes a json Event line."""
				2346	self._line_number += 1
				2347	try:
				2348	# By Opcode
				2349	handler = getattr(
				2350	self,
				2351	'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
				2352	None)
				2353	if not handler:
				2354	raise TracingFailure(
				2355	'Unexpected event %s_%s' % (
				2356	line[self.EVENT_NAME], line[self.TYPE]),
				2357	None, None, None)
				2358	handler(line)
				2359	except TracingFailure, e:
				2360	# Hack in the values since the handler could be a static function.
				2361	e.pid = line[self.PID]
				2362	e.line = line
				2363	e.line_number = self._line_number
				2364	# Re-raise the modified exception.
				2365	raise
				2366	except (KeyError, NotImplementedError, ValueError), e:
				2367	raise TracingFailure(
				2368	'Trace generated a %s exception: %s' % (
				2369	e.__class__.__name__, str(e)),
				2370	line[self.PID],
				2371	self._line_number,
				2372	line,
				2373	e)
				2374
				2375	def to_results(self):
				2376	if not self.root_process:
				2377	raise TracingFailure(
				2378	'Failed to detect the initial process',
				2379	None, None, None)
				2380	process = self.root_process.to_results_process()
				2381	return Results(process)
				2382
				2383	def _thread_to_process(self, tid):
				2384	"""Finds the process from the thread id."""
				2385	tid = int(tid, 16)
				2386	pid = self._threads_active.get(tid)
				2387	if not pid or not self._process_lookup.get(pid):
				2388	return
				2389	return self._process_lookup[pid]
				2390
				2391	@classmethod
				2392	def handle_EventTrace_Header(cls, line):
				2393	"""Verifies no event was dropped, e.g. no buffer overrun occured."""
				2394	BUFFER_SIZE = cls.USER_DATA
				2395	#VERSION = cls.USER_DATA + 1
				2396	#PROVIDER_VERSION = cls.USER_DATA + 2
				2397	#NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
				2398	#END_TIME = cls.USER_DATA + 4
				2399	#TIMER_RESOLUTION = cls.USER_DATA + 5
				2400	#MAX_FILE_SIZE = cls.USER_DATA + 6
				2401	#LOG_FILE_MODE = cls.USER_DATA + 7
				2402	#BUFFERS_WRITTEN = cls.USER_DATA + 8
				2403	#START_BUFFERS = cls.USER_DATA + 9
				2404	#POINTER_SIZE = cls.USER_DATA + 10
				2405	EVENTS_LOST = cls.USER_DATA + 11
				2406	#CPU_SPEED = cls.USER_DATA + 12
				2407	#LOGGER_NAME = cls.USER_DATA + 13
				2408	#LOG_FILE_NAME = cls.USER_DATA + 14
				2409	#BOOT_TIME = cls.USER_DATA + 15
				2410	#PERF_FREQ = cls.USER_DATA + 16
				2411	#START_TIME = cls.USER_DATA + 17
				2412	#RESERVED_FLAGS = cls.USER_DATA + 18
				2413	#BUFFERS_LOST = cls.USER_DATA + 19
				2414	#SESSION_NAME_STRING = cls.USER_DATA + 20
				2415	#LOG_FILE_NAME_STRING = cls.USER_DATA + 21
				2416	if line[EVENTS_LOST] != '0':
				2417	raise TracingFailure(
				2418	( '%s events were lost during trace, please increase the buffer '
				2419	'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
				2420	None, None, None)
				2421
				2422	def handle_FileIo_Cleanup(self, line):
				2423	"""General wisdom: if a file is closed, it's because it was opened.
				2424
				2425	Note that FileIo_Close is not used since if a file was opened properly but
				2426	not closed before the process exits, only Cleanup will be logged.
				2427	"""
				2428	#IRP = self.USER_DATA
				2429	TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
				2430	FILE_OBJECT = self.USER_DATA + 2
				2431	#FILE_KEY = self.USER_DATA + 3
				2432	proc = self._thread_to_process(line[TTID])
				2433	if not proc:
				2434	# Not a process we care about.
				2435	return
				2436	file_object = line[FILE_OBJECT]
				2437	if file_object in proc.file_objects:
				2438	proc.add_file(proc.file_objects.pop(file_object), False)
				2439
				2440	def handle_FileIo_Create(self, line):
				2441	"""Handles a file open.
				2442
				2443	All FileIo events are described at
				2444	http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
				2445	for some value of 'description'.
				2446
				2447	" (..) process and thread id values of the IO events (..) are not valid "
				2448	http://msdn.microsoft.com/magazine/ee358703.aspx
				2449
				2450	The FileIo.Create event doesn't return if the CreateFile() call
				2451	succeeded, so keep track of the file_object and check that it is
				2452	eventually closed with FileIo_Cleanup.
				2453	"""
				2454	#IRP = self.USER_DATA
				2455	TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
				2456	FILE_OBJECT = self.USER_DATA + 2
				2457	#CREATE_OPTIONS = self.USER_DATA + 3
				2458	#FILE_ATTRIBUTES = self.USER_DATA + 4
				2459	#self.USER_DATA + SHARE_ACCESS = 5
				2460	OPEN_PATH = self.USER_DATA + 6
				2461
				2462	proc = self._thread_to_process(line[TTID])
				2463	if not proc:
				2464	# Not a process we care about.
				2465	return
				2466
				2467	match = re.match(r'^\"(.+)\"$', line[OPEN_PATH])
				2468	raw_path = match.group(1)
				2469	# Ignore directories and bare drive right away.
				2470	if raw_path.endswith(os.path.sep):
				2471	return
				2472	filepath = self._drive_map.to_win32(raw_path)
				2473	# Ignore bare drive right away. Some may still fall through with format
				2474	# like '\\?\X:'
				2475	if len(filepath) == 2:
				2476	return
				2477	file_object = line[FILE_OBJECT]
				2478	if os.path.isdir(filepath):
				2479	# There is no O_DIRECTORY equivalent on Windows. The closed is
				2480	# FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
				2481	# simply discard directories are they are found.
				2482	return
				2483	# Override any stale file object
				2484	proc.file_objects[file_object] = filepath
				2485
				2486	def handle_FileIo_Rename(self, line):
				2487	# TODO(maruel): Handle?
				2488	pass
				2489
				2490	def handle_Process_End(self, line):
				2491	pid = line[self.PID]
				2492	if self._process_lookup.get(pid):
				2493	logging.info('Terminated: %d' % pid)
				2494	self._process_lookup[pid] = None
				2495	else:
				2496	logging.debug('Terminated: %d' % pid)
				2497
				2498	def handle_Process_Start(self, line):
				2499	"""Handles a new child process started by PID."""
				2500	#UNIQUE_PROCESS_KEY = self.USER_DATA
				2501	PROCESS_ID = self.USER_DATA + 1
				2502	#PARENT_PID = self.USER_DATA + 2
				2503	#SESSION_ID = self.USER_DATA + 3
				2504	#EXIT_STATUS = self.USER_DATA + 4
				2505	#DIRECTORY_TABLE_BASE = self.USER_DATA + 5
				2506	#USER_SID = self.USER_DATA + 6
				2507	IMAGE_FILE_NAME = self.USER_DATA + 7
				2508	COMMAND_LINE = self.USER_DATA + 8
				2509
				2510	ppid = line[self.PID]
				2511	pid = int(line[PROCESS_ID], 16)
				2512	logging.debug(
				2513	'New process %d->%d (%s) %s' %
				2514	(ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE]))
				2515
				2516	if ppid == self._tracer_pid:
				2517	# Need to ignore processes we don't know about because the log is
				2518	# system-wide. self._tracer_pid shall start only one process.
				2519	if self.root_process:
				2520	raise TracingFailure(
				2521	( 'Parent process is _tracer_pid(%d) but root_process(%d) is '
				2522	'already set') % (self._tracer_pid, self.root_process.pid),
				2523	None, None, None)
				2524	proc = self.Process(self.blacklist, pid, None)
				2525	self.root_process = proc
				2526	ppid = None
				2527	elif self._process_lookup.get(ppid):
				2528	proc = self.Process(self.blacklist, pid, None)
				2529	self._process_lookup[ppid].children.append(proc)
				2530	else:
				2531	# Ignore
				2532	return
				2533	self._process_lookup[pid] = proc
				2534
				2535	if (not line[IMAGE_FILE_NAME].startswith('"') or
				2536	not line[IMAGE_FILE_NAME].endswith('"')):
				2537	raise TracingFailure(
				2538	'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME],
				2539	None, None, None)
				2540
				2541	# TODO(maruel): Process escapes.
				2542	if (not line[COMMAND_LINE].startswith('"') or
				2543	not line[COMMAND_LINE].endswith('"')):
				2544	raise TracingFailure(
				2545	'Command line is not properly quoted: %s' % line[COMMAND_LINE],
				2546	None, None, None)
				2547	proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1])
				2548	proc.executable = line[IMAGE_FILE_NAME][1:-1]
				2549	# proc.command[0] may be the absolute path of 'executable' but it may be
				2550	# anything else too. If it happens that command[0] ends with executable,
				2551	# use it, otherwise defaults to the base name.
				2552	cmd0 = proc.command[0].lower()
				2553	if not cmd0.endswith('.exe'):
				2554	# TODO(maruel): That's not strictly true either.
				2555	cmd0 += '.exe'
				2556	if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
				2557	# Fix the path.
				2558	cmd0 = cmd0.replace('/', os.path.sep)
				2559	cmd0 = os.path.normpath(cmd0)
				2560	proc.executable = get_native_path_case(cmd0)
				2561	logging.info(
				2562	'New child: %s -> %d %s' % (ppid, pid, proc.executable))
				2563
				2564	def handle_Thread_End(self, line):
				2565	"""Has the same parameters as Thread_Start."""
				2566	tid = int(line[self.TID], 16)
				2567	self._threads_active.pop(tid, None)
				2568
				2569	def handle_Thread_Start(self, line):
				2570	"""Handles a new thread created.
				2571
				2572	Do not use self.PID here since a process' initial thread is created by
				2573	the parent process.
				2574	"""
				2575	PROCESS_ID = self.USER_DATA
				2576	TTHREAD_ID = self.USER_DATA + 1
				2577	#STACK_BASE = self.USER_DATA + 2
				2578	#STACK_LIMIT = self.USER_DATA + 3
				2579	#USER_STACK_BASE = self.USER_DATA + 4
				2580	#USER_STACK_LIMIT = self.USER_DATA + 5
				2581	#AFFINITY = self.USER_DATA + 6
				2582	#WIN32_START_ADDR = self.USER_DATA + 7
				2583	#TEB_BASE = self.USER_DATA + 8
				2584	#SUB_PROCESS_TAG = self.USER_DATA + 9
				2585	#BASE_PRIORITY = self.USER_DATA + 10
				2586	#PAGE_PRIORITY = self.USER_DATA + 11
				2587	#IO_PRIORITY = self.USER_DATA + 12
				2588	#THREAD_FLAGS = self.USER_DATA + 13
				2589	# Do not use self.PID here since a process' initial thread is created by
				2590	# the parent process.
				2591	pid = int(line[PROCESS_ID], 16)
				2592	tid = int(line[TTHREAD_ID], 16)
				2593	logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
				2594	self._threads_active[tid] = pid
				2595
				2596	@classmethod
				2597	def supported_events(cls):
				2598	"""Returns all the procesed events."""
				2599	out = []
				2600	for member in dir(cls):
				2601	match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
				2602	if match:
				2603	out.append(match.groups())
				2604	return out
				2605
				2606	class Tracer(ApiBase.Tracer):
				2607	# The basic headers.
				2608	EXPECTED_HEADER = [
				2609	u'Event Name',
				2610	u'Type',
				2611	u'Event ID',
				2612	u'Version',
				2613	u'Channel',
				2614	u'Level', # 5
				2615	u'Opcode',
				2616	u'Task',
				2617	u'Keyword',
				2618	u'PID',
				2619	u'TID', # 10
				2620	u'Processor Number',
				2621	u'Instance ID',
				2622	u'Parent Instance ID',
				2623	u'Activity ID',
				2624	u'Related Activity ID', # 15
				2625	u'Clock-Time',
				2626	u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
				2627	u'User(ms)', # pretty much useless.
				2628	u'User Data', # Extra arguments that are event-specific.
				2629	]
				2630	# Only the useful headers common to all entries are listed there. Any column
				2631	# at 19 or higher is dependent on the specific event.
				2632	EVENT_NAME = 0
				2633	TYPE = 1
				2634	PID = 9
				2635	TID = 10
				2636	PROCESSOR_ID = 11
				2637	TIMESTAMP = 16
				2638	NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
				2639	USER_DATA = 19
				2640
				2641	def __init__(self, logname):
				2642	"""Starts the log collection.
				2643
				2644	Requires administrative access. logman.exe is synchronous so no need for a
				2645	"warmup" call. 'Windows Kernel Trace' is localized so use its GUID
				2646	instead. The GUID constant name is SystemTraceControlGuid. Lovely.
				2647
				2648	One can get the list of potentially interesting providers with:
				2649	"logman query providers \| findstr /i file"
				2650	"""
				2651	super(LogmanTrace.Tracer, self).__init__(logname)
				2652	self._script = create_thunk()
				2653	cmd_start = [
				2654	'logman.exe',
				2655	'start',
				2656	'NT Kernel Logger',
				2657	'-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
				2658	# splitio,fileiocompletion,syscall,file,cswitch,img
				2659	'(process,fileio,thread)',
				2660	'-o', self._logname + '.etl',
				2661	'-ets', # Send directly to kernel
				2662	# Values extracted out of thin air.
				2663	# Event Trace Session buffer size in kb.
				2664	'-bs', '10240',
				2665	# Number of Event Trace Session buffers.
				2666	'-nb', '16', '256',
				2667	]
				2668	logging.debug('Running: %s' % cmd_start)
				2669	try:
				2670	subprocess.check_call(
				2671	cmd_start,
				2672	stdin=subprocess.PIPE,
				2673	stdout=subprocess.PIPE,
				2674	stderr=subprocess.STDOUT)
				2675	except subprocess.CalledProcessError, e:
				2676	if e.returncode == -2147024891:
				2677	print >> sys.stderr, 'Please restart with an elevated admin prompt'
				2678	elif e.returncode == -2144337737:
				2679	print >> sys.stderr, (
				2680	'A kernel trace was already running, stop it and try again')
				2681	raise
				2682
				2683	def trace(self, cmd, cwd, tracename, output):
				2684	logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
				2685	assert os.path.isabs(cmd[0]), cmd[0]
				2686	assert os.path.isabs(cwd), cwd
				2687	assert os.path.normpath(cwd) == cwd, cwd
				2688	with self._lock:
				2689	if not self._initialized:
				2690	raise TracingFailure(
				2691	'Called Tracer.trace() on an unitialized object',
				2692	None, None, None, tracename)
				2693	assert tracename not in (i['trace'] for i in self._traces)
				2694
				2695	# Use "logman -?" for help.
				2696
				2697	stdout = stderr = None
				2698	if output:
				2699	stdout = subprocess.PIPE
				2700	stderr = subprocess.STDOUT
				2701
				2702	# Run the child process.
				2703	logging.debug('Running: %s' % cmd)
				2704	# Use the temporary script generated with create_thunk() so we have a
				2705	# clear pid owner. Since trace_inputs.py can be used as a library and
				2706	# could trace multiple processes simultaneously, it makes it more complex
				2707	# if the executable to be traced is executed directly here. It also solves
				2708	# issues related to logman.exe that needs to be executed to control the
				2709	# kernel trace.
				2710	child_cmd = [
				2711	sys.executable,
				2712	self._script,
				2713	tracename,
				2714	]
				2715	child = subprocess.Popen(
				2716	child_cmd + fix_python_path(cmd),
				2717	cwd=cwd,
				2718	stdin=subprocess.PIPE,
				2719	stdout=stdout,
				2720	stderr=stderr)
				2721	logging.debug('Started child pid: %d' % child.pid)
				2722	out = child.communicate()[0]
				2723	# This doesn't mean all the grand-children are done. Sadly, we don't have
				2724	# a good way to determine that.
				2725
				2726	with self._lock:
				2727	assert tracename not in (i['trace'] for i in self._traces)
				2728	self._traces.append({
				2729	'command': cmd,
				2730	'cwd': cwd,
				2731	'pid': child.pid,
				2732	'trace': tracename,
				2733	'output': out,
				2734	})
				2735
				2736	return child.returncode, out
				2737
				2738	def close(self, _timeout=None):
				2739	"""Stops the kernel log collection and converts the traces to text
				2740	representation.
				2741	"""
				2742	with self._lock:
				2743	if not self._initialized:
				2744	raise TracingFailure(
				2745	'Called Tracer.close() on an unitialized object',
				2746	None, None, None)
				2747	os.remove(self._script)
				2748	# Save metadata, add 'format' key..
				2749	data = {
				2750	'format': 'csv',
				2751	'traces': self._traces,
				2752	}
				2753	write_json(self._logname, data, False)
				2754
				2755	cmd_stop = [
				2756	'logman.exe',
				2757	'stop',
				2758	'NT Kernel Logger',
				2759	'-ets', # Sends the command directly to the kernel.
				2760	]
				2761	logging.debug('Running: %s' % cmd_stop)
				2762	subprocess.check_call(
				2763	cmd_stop,
				2764	stdin=subprocess.PIPE,
				2765	stdout=subprocess.PIPE,
				2766	stderr=subprocess.STDOUT)
				2767	self._initialized = False
				2768
				2769	def post_process_log(self):
				2770	"""Converts the .etl file into .csv then into .json."""
				2771	super(LogmanTrace.Tracer, self).post_process_log()
				2772	logformat = 'csv'
				2773	self._convert_log(logformat)
				2774
				2775	if logformat == 'csv_utf16':
				2776	def load_file():
				2777	def utf_8_encoder(unicode_csv_data):
				2778	"""Encodes the unicode object as utf-8 encoded str instance"""
				2779	for line in unicode_csv_data:
				2780	yield line.encode('utf-8')
				2781
				2782	def unicode_csv_reader(unicode_csv_data, **kwargs):
				2783	"""Encodes temporarily as UTF-8 since csv module doesn't do unicode.
				2784	"""
				2785	csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
				2786	for row in csv_reader:
				2787	# Decode str utf-8 instances back to unicode instances, cell by
				2788	# cell:
				2789	yield [cell.decode('utf-8') for cell in row]
				2790
				2791	# The CSV file is UTF-16 so use codecs.open() to load the file into
				2792	# the python internal unicode format (utf-8). Then explicitly
				2793	# re-encode as utf8 as str instances so csv can parse it fine. Then
				2794	# decode the utf-8 str back into python unicode instances. This
				2795	# sounds about right.
				2796	for line in unicode_csv_reader(
				2797	codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
				2798	# line is a list of unicode objects
				2799	# So much white space!
				2800	yield [i.strip() for i in line]
				2801
				2802	elif logformat == 'csv':
				2803	def load_file():
				2804	def ansi_csv_reader(ansi_csv_data, **kwargs):
				2805	"""Loads an 'ANSI' code page and returns unicode() objects."""
				2806	assert sys.getfilesystemencoding() == 'mbcs'
				2807	encoding = get_current_encoding()
				2808	for row in csv.reader(ansi_csv_data, **kwargs):
				2809	# Decode str 'ansi' instances to unicode instances, cell by cell:
				2810	yield [cell.decode(encoding) for cell in row]
				2811
				2812	# The fastest and smallest format but only supports 'ANSI' file paths.
				2813	# E.g. the filenames are encoding in the 'current' encoding.
				2814	for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
				2815	# line is a list of unicode objects.
				2816	yield [i.strip() for i in line]
				2817
				2818	supported_events = LogmanTrace.Context.supported_events()
				2819
				2820	def trim(generator):
				2821	for index, line in enumerate(generator):
				2822	if not index:
				2823	if line != self.EXPECTED_HEADER:
				2824	raise TracingFailure(
				2825	'Found malformed header: %s' % ' '.join(line),
				2826	None, None, None)
				2827	continue
				2828	# As you can see, the CSV is full of useful non-redundant information:
				2829	if (line[2] != '0' or # Event ID
				2830	line[3] not in ('2', '3') or # Version
				2831	line[4] != '0' or # Channel
				2832	line[5] != '0' or # Level
				2833	line[7] != '0' or # Task
				2834	line[8] != '0x0000000000000000' or # Keyword
				2835	line[12] != '' or # Instance ID
				2836	line[13] != '' or # Parent Instance ID
				2837	line[14] != self.NULL_GUID or # Activity ID
				2838	line[15] != ''): # Related Activity ID
				2839	raise TracingFailure(
				2840	'Found unexpected values in line: %s' % ' '.join(line),
				2841	None, None, None)
				2842
				2843	if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
				2844	continue
				2845
				2846	# Convert the PID in-place from hex.
				2847	line[self.PID] = int(line[self.PID], 16)
				2848
				2849	yield [
				2850	line[self.EVENT_NAME],
				2851	line[self.TYPE],
				2852	line[self.PID],
				2853	line[self.TID],
				2854	line[self.PROCESSOR_ID],
				2855	line[self.TIMESTAMP],
				2856	] + line[self.USER_DATA:]
				2857
				2858	write_json('%s.json' % self._logname, list(trim(load_file())), True)
				2859
				2860	def _convert_log(self, logformat):
				2861	"""Converts the ETL trace to text representation.
				2862
				2863	Normally, 'csv' is sufficient. If complex scripts are used (like eastern
				2864	languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
				2865
				2866	Arguments:
				2867	- logformat: Text format to be generated, csv, csv_utf16 or xml.
				2868
				2869	Use "tracerpt -?" for help.
				2870	"""
				2871	LOCALE_INVARIANT = 0x7F
				2872	windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
				2873	cmd_convert = [
				2874	'tracerpt.exe',
				2875	'-l', self._logname + '.etl',
				2876	'-o', self._logname + '.' + logformat,
				2877	'-gmt', # Use UTC
				2878	'-y', # No prompt
				2879	# Use -of XML to get the header of each items after column 19, e.g. all
				2880	# the actual headers of 'User Data'.
				2881	]
				2882
				2883	if logformat == 'csv':
				2884	# tracerpt localizes the 'Type' column, for major brainfuck
				2885	# entertainment. I can't imagine any sane reason to do that.
				2886	cmd_convert.extend(['-of', 'CSV'])
				2887	elif logformat == 'csv_utf16':
				2888	# This causes it to use UTF-16, which doubles the log size but ensures
				2889	# the log is readable for non-ASCII characters.
				2890	cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
				2891	elif logformat == 'xml':
				2892	cmd_convert.extend(['-of', 'XML'])
				2893	else:
				2894	raise ValueError('Unexpected log format \'%s\'' % logformat)
				2895	logging.debug('Running: %s' % cmd_convert)
				2896	# This can takes tens of minutes for large logs.
				2897	# Redirects all output to stderr.
				2898	subprocess.check_call(
				2899	cmd_convert,
				2900	stdin=subprocess.PIPE,
				2901	stdout=sys.stderr,
				2902	stderr=sys.stderr)
				2903
				2904	@staticmethod
				2905	def clean_trace(logname):
				2906	for ext in ('', '.csv', '.etl', '.json', '.xml'):
				2907	if os.path.isfile(logname + ext):
				2908	os.remove(logname + ext)
				2909
				2910	@classmethod
				2911	def parse_log(cls, logname, blacklist):
				2912	logging.info('parse_log(%s, %s)' % (logname, blacklist))
				2913
				2914	def blacklist_more(filepath):
				2915	# All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
				2916	return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
				2917
				2918	data = read_json(logname)
				2919	lines = read_json(logname + '.json')
				2920	out = []
				2921	for item in data['traces']:
				2922	context = cls.Context(blacklist_more, item['pid'])
				2923	for line in lines:
				2924	context.on_line(line)
				2925	out.append(
				2926	{
				2927	'results': context.to_results(),
				2928	'trace': item['trace'],
				2929	'output': item['output'],
				2930	})
				2931	return out
				2932
				2933
				2934	def get_api():
				2935	"""Returns the correct implementation for the current OS."""
				2936	if sys.platform == 'cygwin':
				2937	raise NotImplementedError(
				2938	'Not implemented for cygwin, start the script from Win32 python')
				2939	flavors = {
				2940	'win32': LogmanTrace,
				2941	'darwin': Dtrace,
				2942	'sunos5': Dtrace,
				2943	'freebsd7': Dtrace,
				2944	'freebsd8': Dtrace,
				2945	}
				2946	# Defaults to strace.
				2947	return flavors.get(sys.platform, Strace)()
				2948
				2949
				2950	def extract_directories(root_dir, files, blacklist):
				2951	"""Detects if all the files in a directory are in \|files\| and if so, replace
				2952	the individual files by a Results.Directory instance.
				2953
				2954	Takes a list of Results.File instances and returns a shorter list of
				2955	Results.File and Results.Directory instances.
				2956
				2957	Arguments:
				2958	- root_dir: Optional base directory that shouldn't be search further.
				2959	- files: list of Results.File instances.
				2960	- blacklist: regexp of files to ignore, for example r'.+\.pyc'.
				2961	"""
				2962	logging.info(
				2963	'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
				2964	assert not (root_dir or '').endswith(os.path.sep), root_dir
				2965	assert not root_dir or (get_native_path_case(root_dir) == root_dir)
				2966	assert not any(isinstance(f, Results.Directory) for f in files)
				2967	# Remove non existent files.
				2968	files = [f for f in files if f.existent]
				2969	if not files:
				2970	return files
				2971	# All files must share the same root, which can be None.
				2972	assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
				2973
				2974	# Creates a {directory: {filename: File}} mapping, up to root.
				2975	buckets = {}
				2976	if root_dir:
				2977	buckets[root_dir] = {}
				2978	for fileobj in files:
				2979	path = fileobj.full_path
				2980	directory = os.path.dirname(path)
				2981	assert directory
				2982	# Do not use os.path.basename() so trailing os.path.sep is kept.
				2983	basename = path[len(directory)+1:]
				2984	files_in_directory = buckets.setdefault(directory, {})
				2985	files_in_directory[basename] = fileobj
				2986	# Add all the directories recursively up to root.
				2987	while True:
				2988	old_d = directory
				2989	directory = os.path.dirname(directory)
				2990	if directory + os.path.sep == root_dir or directory == old_d:
				2991	break
				2992	buckets.setdefault(directory, {})
				2993
				2994	root_prefix = len(root_dir) + 1 if root_dir else 0
				2995	for directory in sorted(buckets, reverse=True):
				2996	actual = set(f for f in os.listdir(directory) if not blacklist(f))
				2997	expected = set(buckets[directory])
				2998	if not (actual - expected):
				2999	parent = os.path.dirname(directory)
				3000	buckets[parent][os.path.basename(directory)] = Results.Directory(
				3001	root_dir,
				3002	directory[root_prefix:],
				3003	False,
				3004	sum(f.size for f in buckets[directory].itervalues()),
				3005	sum(f.nb_files for f in buckets[directory].itervalues()))
				3006	# Remove the whole bucket.
				3007	del buckets[directory]
				3008
				3009	# Reverse the mapping with what remains. The original instances are returned,
				3010	# so the cached meta data is kept.
				3011	files = sum((x.values() for x in buckets.itervalues()), [])
				3012	return sorted(files, key=lambda x: x.path)
				3013
				3014
				3015	def trace(logfile, cmd, cwd, api, output):
				3016	"""Traces an executable. Returns (returncode, output) from api.
				3017
				3018	Arguments:
				3019	- logfile: file to write to.
				3020	- cmd: command to run.
				3021	- cwd: current directory to start the process in.
				3022	- api: a tracing api instance.
				3023	- output: if True, returns output, otherwise prints it at the console.
				3024	"""
				3025	cmd = fix_python_path(cmd)
				3026	api.clean_trace(logfile)
				3027	with api.get_tracer(logfile) as tracer:
				3028	return tracer.trace(cmd, cwd, 'default', output)
				3029
				3030
				3031	def load_trace(logfile, root_dir, api, blacklist):
				3032	"""Loads a trace file and returns the Results instance.
				3033
				3034	Arguments:
				3035	- logfile: File to load.
				3036	- root_dir: Root directory to use to determine if a file is relevant to the
				3037	trace or not.
				3038	- api: A tracing api instance.
				3039	- blacklist: Optional blacklist function to filter out unimportant files.
				3040	"""
				3041	data = api.parse_log(logfile, (blacklist or (lambda _: False)))
				3042	assert len(data) == 1, 'More than one trace was detected!'
				3043	if 'exception' in data[0]:
				3044	# It got an exception, raise it.
				3045	raise data[0]['exception']
				3046	results = data[0]['results']
				3047	if root_dir:
				3048	results = results.strip_root(root_dir)
				3049	return results
				3050
				3051
				3052	def CMDclean(args):
				3053	"""Cleans up traces."""
				3054	parser = OptionParserTraceInputs(command='clean')
				3055	options, args = parser.parse_args(args)
				3056	api = get_api()
				3057	api.clean_trace(options.log)
				3058	return 0
				3059
				3060
				3061	def CMDtrace(args):
				3062	"""Traces an executable."""
				3063	parser = OptionParserTraceInputs(command='trace')
				3064	parser.allow_interspersed_args = False
				3065	parser.add_option(
				3066	'-q', '--quiet', action='store_true',
				3067	help='Redirects traced executable output to /dev/null')
				3068	options, args = parser.parse_args(args)
				3069
				3070	if not args:
				3071	parser.error('Please provide a command to run')
				3072
				3073	if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
				3074	args[0] = os.path.abspath(args[0])
				3075
				3076	api = get_api()
				3077	return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
				3078
				3079
				3080	def CMDread(args):
				3081	"""Reads the logs and prints the result."""
				3082	parser = OptionParserTraceInputs(command='read')
				3083	parser.add_option(
				3084	'-V', '--variable',
				3085	nargs=2,
				3086	action='append',
				3087	dest='variables',
				3088	metavar='VAR_NAME directory',
				3089	default=[],
				3090	help=('Variables to replace relative directories against. Example: '
				3091	'"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
				3092	'home dir with $HOME') % getpass.getuser())
				3093	parser.add_option(
				3094	'--root-dir',
				3095	help='Root directory to base everything off it. Anything outside of this '
				3096	'this directory will not be reported')
				3097	parser.add_option(
				3098	'-j', '--json', action='store_true',
				3099	help='Outputs raw result data as json')
				3100	parser.add_option(
				3101	'-b', '--blacklist', action='append', default=[],
				3102	help='List of regexp to use as blacklist filter')
				3103	options, args = parser.parse_args(args)
				3104
				3105	if options.root_dir:
				3106	options.root_dir = os.path.abspath(options.root_dir)
				3107
				3108	variables = dict(options.variables)
				3109	api = get_api()
				3110	def blacklist(f):
				3111	return any(re.match(b, f) for b in options.blacklist)
				3112	data = api.parse_log(options.log, blacklist)
				3113	# Process each trace.
				3114	output_as_json = []
				3115	for item in data:
				3116	if 'exception' in item:
				3117	print >> sys.stderr, (
				3118	'Trace %s: Got an exception: %s' % (item['trace'], item['exception']))
				3119	continue
				3120	results = item['results']
				3121	if options.root_dir:
				3122	results = results.strip_root(options.root_dir)
				3123
				3124	if options.json:
				3125	output_as_json.append(results.flatten())
				3126	else:
				3127	simplified = extract_directories(
				3128	options.root_dir, results.files, blacklist)
				3129	simplified = [f.replace_variables(variables) for f in simplified]
				3130	if len(data) > 1:
				3131	print('Trace: %s' % item['trace'])
				3132	print('Total: %d' % len(results.files))
				3133	print('Non existent: %d' % len(results.non_existent))
				3134	for f in results.non_existent:
				3135	print(' %s' % f.path)
				3136	print(
				3137	'Interesting: %d reduced to %d' % (
				3138	len(results.existent), len(simplified)))
				3139	for f in simplified:
				3140	print(' %s' % f.path)
				3141
				3142	if options.json:
				3143	write_json(sys.stdout, output_as_json, False)
				3144	return 0
				3145
				3146
				3147	class OptionParserWithLogging(optparse.OptionParser):
				3148	"""Adds --verbose option."""
				3149	def __init__(self, verbose=0, **kwargs):
				3150	optparse.OptionParser.__init__(self, **kwargs)
				3151	self.add_option(
				3152	'-v', '--verbose',
				3153	action='count',
				3154	default=verbose,
				3155	help='Use multiple times to increase verbosity')
				3156
				3157	def parse_args(self, args, *kwargs):
				3158	options, args = optparse.OptionParser.parse_args(self, args, *kwargs)
				3159	levels = [logging.ERROR, logging.INFO, logging.DEBUG]
				3160	logging.basicConfig(
				3161	level=levels[min(len(levels)-1, options.verbose)],
				3162	format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
				3163	return options, args
				3164
				3165
				3166	class OptionParserWithNiceDescription(OptionParserWithLogging):
				3167	"""Generates the description with the command's docstring."""
				3168	def __init__(self, **kwargs):
				3169	"""Sets 'description' and 'usage' if not already specified."""
				3170	command = kwargs.pop('command', 'help')
				3171	kwargs.setdefault(
				3172	'description',
				3173	re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
				3174	kwargs.setdefault('usage', '%%prog %s [options]' % command)
				3175	OptionParserWithLogging.__init__(self, **kwargs)
				3176
				3177
				3178	class OptionParserTraceInputs(OptionParserWithNiceDescription):
				3179	"""Adds automatic --log handling."""
				3180	def __init__(self, **kwargs):
				3181	OptionParserWithNiceDescription.__init__(self, **kwargs)
				3182	self.add_option(
				3183	'-l', '--log', help='Log file to generate or read, required')
				3184
				3185	def parse_args(self, args, *kwargs):
				3186	"""Makes sure the paths make sense.
				3187
				3188	On Windows, / and \ are often mixed together in a path.
				3189	"""
				3190	options, args = OptionParserWithNiceDescription.parse_args(
				3191	self, args, *kwargs)
				3192	if not options.log:
				3193	self.error('Must supply a log file with -l')
				3194	options.log = os.path.abspath(options.log)
				3195	return options, args
				3196
				3197
				3198	def extract_documentation():
				3199	"""Returns a dict {command: description} for each of documented command."""
				3200	commands = (
				3201	fn[3:]
				3202	for fn in dir(sys.modules['__main__'])
				3203	if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
				3204	return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
				3205
				3206
				3207	def CMDhelp(args):
				3208	"""Prints list of commands or help for a specific command."""
				3209	doc = extract_documentation()
				3210	# Calculates the optimal offset.
				3211	offset = max(len(cmd) for cmd in doc)
				3212	format_str = ' %-' + str(offset + 2) + 's %s'
				3213	# Generate a one-liner documentation of each commands.
				3214	commands_description = '\n'.join(
				3215	format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
				3216
				3217	parser = OptionParserWithNiceDescription(
				3218	usage='%prog <command> [options]',
				3219	description='Commands are:\n%s\n' % commands_description)
				3220	parser.format_description = lambda _: parser.description
				3221
				3222	# Strip out any -h or --help argument.
				3223	_, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
				3224	if len(args) == 1:
				3225	if not get_command_handler(args[0]):
				3226	parser.error('Unknown command %s' % args[0])
				3227	# The command was "%prog help command", replaces ourself with
				3228	# "%prog command --help" so help is correctly printed out.
				3229	return main(args + ['--help'])
				3230	elif args:
				3231	parser.error('Unknown argument "%s"' % ' '.join(args))
				3232	parser.print_help()
				3233	return 0
				3234
				3235
				3236	def get_command_handler(name):
				3237	"""Returns the command handler or CMDhelp if it doesn't exist."""
				3238	return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
				3239
				3240
				3241	def main_impl(argv):
				3242	command = get_command_handler(argv[0] if argv else 'help')
				3243	if not command:
				3244	return CMDhelp(argv)
				3245	return command(argv[1:])
				3246
				3247	def main(argv):
				3248	try:
				3249	main_impl(argv)
				3250	except TracingFailure, e:
				3251	sys.stderr.write('\nError: ')
				3252	sys.stderr.write(str(e))
				3253	sys.stderr.write('\n')
				3254	return 1
				3255
				3256
				3257	if __name__ == '__main__':
				3258	sys.exit(main(sys.argv[1:]))