1#!/usr/bin/env python
2# coding=utf-8
3# Copyright (c) 2012 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Traces an executable and its child processes and extract the files accessed
8by them.
9
10The implementation uses OS-specific APIs. The native kernel logger and the ETL
11interface are used on Windows. Dtrace is used on OSX. Strace is used otherwise.
12The OS-specific implementation is hidden in an 'API' interface.
13
14The results are embedded in a Results instance. The tracing is done in two
15phases: the first runs the actual trace and generates an
16implementation-specific log file; the second parses the log file to extract
17the information, including the individual child processes and the files they
18accessed.
19"""
20
21import codecs
22import csv
23import getpass
24import glob
25import json
26import logging
27import optparse
28import os
29import re
30import subprocess
31import sys
32import tempfile
33import threading
34import time
35import weakref
36
37## OS-specific imports
38
39if sys.platform == 'win32':
40 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
41 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
42 from ctypes.wintypes import GetLastError # pylint: disable=E0611
43elif sys.platform == 'darwin':
44 import Carbon.File # pylint: disable=F0401
45 import MacOS # pylint: disable=F0401
46
47
48BASE_DIR = os.path.dirname(os.path.abspath(__file__))
49ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
50
51
52class TracingFailure(Exception):
53 """An exception occured during tracing."""
54 def __init__(self, description, pid, line_number, line, *args):
55 super(TracingFailure, self).__init__(
56 description, pid, line_number, line, *args)
57 self.description = description
58 self.pid = pid
59 self.line_number = line_number
60 self.line = line
61 self.extra = args
62
63 def __str__(self):
64 out = self.description
65 if self.pid:
66 out += '\npid: %d' % self.pid
67 if self.line_number:
68 out += '\nline: %d' % self.line_number
69 if self.line:
70 out += '\n%s' % self.line
71 if self.extra:
72 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
73 return out
74
75
76## OS-specific functions
77
78if sys.platform == 'win32':
79 def QueryDosDevice(drive_letter):
80 """Returns the Windows 'native' path for a DOS drive letter."""
81 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
82 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
83 chars = 1024
84 drive_letter = unicode(drive_letter)
85 p = create_unicode_buffer(chars)
86 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
87 err = GetLastError()
88 if err:
89 # pylint: disable=E0602
90 raise WindowsError(
91 err,
92 'QueryDosDevice(%s): %s (%d)' % (
93 str(drive_letter), FormatError(err), err))
94 return p.value
95
96
97 def GetShortPathName(long_path):
98 """Returns the Windows short path equivalent for a 'long' path."""
99 long_path = unicode(long_path)
100 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
101 # not enforced.
102 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
103 long_path = '\\\\?\\' + long_path
104 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
105 if chars:
106 p = create_unicode_buffer(chars)
107 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
108 return p.value
109
110 err = GetLastError()
111 if err:
112 # pylint: disable=E0602
113 raise WindowsError(
114 err,
115 'GetShortPathName(%s): %s (%d)' % (
116 str(long_path), FormatError(err), err))
117
118
119 def GetLongPathName(short_path):
120 """Returns the Windows long path equivalent for a 'short' path."""
121 short_path = unicode(short_path)
122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
123 # not enforced.
124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
125 short_path = '\\\\?\\' + short_path
126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
127 if chars:
128 p = create_unicode_buffer(chars)
129 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
130 return p.value
131
132 err = GetLastError()
133 if err:
134 # pylint: disable=E0602
135 raise WindowsError(
136 err,
137 'GetLongPathName(%s): %s (%d)' % (
138 str(short_path), FormatError(err), err))
139
140
141 def get_current_encoding():
142 """Returns the 'ANSI' code page associated to the process."""
143 return 'cp%d' % int(windll.kernel32.GetACP())
144
145
146 class DosDriveMap(object):
147 """Maps \Device\HarddiskVolumeN to N: on Windows."""
148 # Keep one global cache.
149 _MAPPING = {}
150
151 def __init__(self):
152 """Lazy loads the cache."""
153 if not self._MAPPING:
154 # This is related to UNC resolver on windows. Ignore that.
155 self._MAPPING['\\Device\\Mup'] = None
156 self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot']
157
158 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
159 try:
160 letter = '%s:' % letter
161 mapped = QueryDosDevice(letter)
162 if mapped in self._MAPPING:
163 logging.warn(
164 ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
165 '. Drive letters are a user-mode concept and the kernel '
166                 'traces only have NT paths, so all accesses will be '
167 'associated with the first drive letter, independent of the '
168 'actual letter used by the code') % (
169 self._MAPPING[mapped], letter))
170 else:
171 self._MAPPING[mapped] = letter
172 except WindowsError: # pylint: disable=E0602
173 pass
174
175 def to_win32(self, path):
176 """Converts a native NT path to Win32/DOS compatible path."""
177 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
178 if not match:
179 raise ValueError(
180 'Can\'t convert %s into a Win32 compatible path' % path,
181 path)
182 if not match.group(1) in self._MAPPING:
183 # Unmapped partitions may be accessed by windows for the
184 # fun of it while the test is running. Discard these.
185 return None
186 drive = self._MAPPING[match.group(1)]
187 if not drive or not match.group(2):
188 return drive
189 return drive + match.group(2)
190
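  # e.g. DosDriveMap().to_win32('\\Device\\HarddiskVolume1\\Windows') returns
  # 'C:\\Windows' when HarddiskVolume1 is mapped to C: (illustrative only).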
191
192 def isabs(path):
193 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
194 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
195
196
197 def get_native_path_case(p):
198 """Returns the native path case for an existing file.
199
200 On Windows, removes any leading '\\?\'.
201 """
202 if not isabs(p):
203 raise ValueError(
204 'Can\'t get native path case for a non-absolute path: %s' % p,
205 p)
206    # Windows used to have an option to turn on case sensitivity on non-Win32
207    # subsystems but that's out of scope here and isn't supported anymore.
208 # Go figure why GetShortPathName() is needed.
209 try:
210 out = GetLongPathName(GetShortPathName(p))
211 except OSError, e:
212 if e.args[0] in (2, 3, 5):
213 # The path does not exist. Try to recurse and reconstruct the path.
214 base = os.path.dirname(p)
215 rest = os.path.basename(p)
216 return os.path.join(get_native_path_case(base), rest)
217 raise
218 if out.startswith('\\\\?\\'):
219 out = out[4:]
220 # Always upper case the first letter since GetLongPathName() will return the
221 # drive letter in the case it was given.
222 return out[0].upper() + out[1:]
223
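  # e.g. get_native_path_case(u'c:\\windows\\notepad.exe') typically returns
  # u'C:\\Windows\\notepad.exe'; the exact casing depends on the file system.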
224
225 def CommandLineToArgvW(command_line):
226 """Splits a commandline into argv using CommandLineToArgvW()."""
227 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
228 size = c_int()
229 ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size))
230 try:
231 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
232 finally:
233 windll.kernel32.LocalFree(ptr)
234
235
236elif sys.platform == 'darwin':
237
238
239 # On non-windows, keep the stdlib behavior.
240 isabs = os.path.isabs
241
242
243 def _find_item_native_case(root_path, item):
244 """Gets the native path case of a single item based at root_path.
245
246 There is no API to get the native path case of symlinks on OSX. So it
247 needs to be done the slow way.
248 """
249 item = item.lower()
250 for element in os.listdir(root_path):
251 if element.lower() == item:
252 return element
253
254
255 def _native_case(p):
256 """Gets the native path case. Warning: this function resolves symlinks."""
257 logging.debug('native_case(%s)' % p)
258 try:
259 rel_ref, _ = Carbon.File.FSPathMakeRef(p)
260 out = rel_ref.FSRefMakePath()
261 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
262 return out + os.path.sep
263 return out
264 except MacOS.Error, e:
265 if e.args[0] in (-43, -120):
266 # The path does not exist. Try to recurse and reconstruct the path.
267 # -43 means file not found.
268 # -120 means directory not found.
269 base = os.path.dirname(p)
270 rest = os.path.basename(p)
271 return os.path.join(_native_case(base), rest)
272 raise OSError(
273 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
274
275
276 def _split_at_symlink_native(base_path, rest):
277 """Returns the native path for a symlink."""
278 base, symlink, rest = split_at_symlink(base_path, rest)
279 if symlink:
280 if not base_path:
281 base_path = base
282 else:
283 base_path = safe_join(base_path, base)
284 symlink = _find_item_native_case(base_path, symlink)
285 return base, symlink, rest
286
287
288 def get_native_path_case(path):
289 """Returns the native path case for an existing file.
290
291 Technically, it's only HFS+ on OSX that is case preserving and
292 insensitive. It's the default setting on HFS+ but can be changed.
293 """
294 if not isabs(path):
295 raise ValueError(
296 'Can\'t get native path case for a non-absolute path: %s' % path,
297 path)
298 if path.startswith('/dev'):
299 # /dev is not visible from Carbon, causing an exception.
300 return path
301
302 # Starts assuming there is no symlink along the path.
303 resolved = _native_case(path)
304 if resolved.lower() == path.lower():
305      # This code path is much faster.
306 return resolved
307
308 # There was a symlink, process it.
309 base, symlink, rest = _split_at_symlink_native(None, path)
310 assert symlink, (path, base, symlink, rest, resolved)
311 prev = base
312 base = safe_join(_native_case(base), symlink)
313 assert len(base) > len(prev)
314 while rest:
315 prev = base
316 relbase, symlink, rest = _split_at_symlink_native(base, rest)
317 base = safe_join(base, relbase)
318 assert len(base) > len(prev), (prev, base, symlink)
319 if symlink:
320 base = safe_join(base, symlink)
321 assert len(base) > len(prev), (prev, base, symlink)
322 # Make sure no symlink was resolved.
323 assert base.lower() == path.lower(), (base, path)
324 return base
325
326
327else: # OSes other than Windows and OSX.
328
329
330 # On non-windows, keep the stdlib behavior.
331 isabs = os.path.isabs
332
333
334 def get_native_path_case(path):
335 """Returns the native path case for an existing file.
336
337 On OSes other than OSX and Windows, assume the file system is
338 case-sensitive.
339
340 TODO(maruel): This is not strictly true. Implement if necessary.
341 """
342 if not isabs(path):
343 raise ValueError(
344 'Can\'t get native path case for a non-absolute path: %s' % path,
345 path)
346 # Give up on cygwin, as GetLongPathName() can't be called.
347    # Linux traces tend not to be normalized, so take this opportunity to
348    # normalize them. This function already normalizes the path on the other
349    # OSes, so it needs to be done here too to stay consistent across OSes.
350 out = os.path.normpath(path)
351 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
352 return out + os.path.sep
353 return out
354
355
356if sys.platform != 'win32': # All non-Windows OSes.
357
358
359 def safe_join(*args):
360 """Joins path elements like os.path.join() but doesn't abort on absolute
361 path.
362
363 os.path.join('foo', '/bar') == '/bar'
364 but safe_join('foo', '/bar') == 'foo/bar'.
365 """
366 out = ''
367 for element in args:
368 if element.startswith(os.path.sep):
369 if out.endswith(os.path.sep):
370 out += element[1:]
371 else:
372 out += element
373 else:
374 if out.endswith(os.path.sep):
375 out += element
376 else:
377 out += os.path.sep + element
378 return out
379
380
381 def split_at_symlink(base_dir, relfile):
382 """Scans each component of relfile and cut the string at the symlink if
383 there is any.
384
385 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
386 not symlink was found.
387 """
388 if base_dir:
389 assert relfile
390 assert os.path.isabs(base_dir)
391 index = 0
392 else:
393 assert os.path.isabs(relfile)
394 index = 1
395
396 def at_root(rest):
397 if base_dir:
398 return safe_join(base_dir, rest)
399 return rest
400
401 while True:
402 try:
403 index = relfile.index(os.path.sep, index)
404 except ValueError:
405 index = len(relfile)
406 full = at_root(relfile[:index])
407 if os.path.islink(full):
408 # A symlink!
409 base = os.path.dirname(relfile[:index])
410 symlink = os.path.basename(relfile[:index])
411 rest = relfile[index:]
412 logging.debug(
413 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
414 (base_dir, relfile, base, symlink, rest))
415 return base, symlink, rest
416 if index == len(relfile):
417 break
418 index += 1
419 return relfile, None, None
420
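  # Illustrative example (assuming '/base/a/link' is a symlink on disk):
  #   split_at_symlink('/base', 'a/link/b') -> ('a', 'link', '/b')
  #   split_at_symlink('/base', 'a/b')      -> ('a/b', None, None)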
421
422def fix_python_path(cmd):
423 """Returns the fixed command line to call the right python executable."""
424 out = cmd[:]
425 if out[0] == 'python':
426 out[0] = sys.executable
427 elif out[0].endswith('.py'):
428 out.insert(0, sys.executable)
429 return out
430
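# e.g. fix_python_path(['python', 'foo.py']) -> [sys.executable, 'foo.py'] and
# fix_python_path(['foo.py']) -> [sys.executable, 'foo.py'].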
431
432def create_thunk():
433 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
434 os.write(
435 handle,
436 (
437 'import subprocess\n'
438 'import sys\n'
439 'sys.exit(subprocess.call(sys.argv[2:]))\n'
440 ))
441 os.close(handle)
442 return name
443
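# The generated thunk ignores sys.argv[1] and runs sys.argv[2:]; OS-specific
# tracers can wrap the traced command with it to get a known child process to
# follow (see the trackedpid handling in the dtrace code below).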
444
445def strace_process_quoted_arguments(text):
446 """Extracts quoted arguments on a string and return the arguments as a list.
447
448 Implemented as an automaton. Supports incomplete strings in the form
449 '"foo"...'.
450
451 Example:
452 With text = '"foo", "bar"', the function will return ['foo', 'bar']
453
454 TODO(maruel): Implement escaping.
455 """
456 # All the possible states of the DFA.
457  ( NEED_QUOTE,         # Beginning of a new argument.
458 INSIDE_STRING, # Inside an argument.
459 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
460 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
461                        # a series of 3 dots or a comma.
462 NEED_SPACE, # Right after a comma
463 NEED_DOT_2, # Found a dot, need a second one.
464 NEED_DOT_3, # Found second dot, need a third one.
465 NEED_COMMA, # Found third dot, need a comma.
466 ) = range(8)
467
468 state = NEED_QUOTE
469 out = []
470 for index, char in enumerate(text):
471 if char == '"':
472 if state == NEED_QUOTE:
473 state = INSIDE_STRING
474 # A new argument was found.
475 out.append('')
476 elif state == INSIDE_STRING:
477 # The argument is now closed.
478 state = NEED_COMMA_OR_DOT
479 elif state == ESCAPED:
480 out[-1] += char
481 state = INSIDE_STRING
482 else:
483 raise ValueError(
484 'Can\'t process char at column %d for: %r' % (index, text),
485 index,
486 text)
487 elif char == ',':
488 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
489 state = NEED_SPACE
490 elif state == INSIDE_STRING:
491 out[-1] += char
492 elif state == ESCAPED:
493 out[-1] += char
494 state = INSIDE_STRING
495 else:
496 raise ValueError(
497 'Can\'t process char at column %d for: %r' % (index, text),
498 index,
499 text)
500 elif char == ' ':
501 if state == NEED_SPACE:
502 state = NEED_QUOTE
503 elif state == INSIDE_STRING:
504 out[-1] += char
505 elif state == ESCAPED:
506 out[-1] += char
507 state = INSIDE_STRING
508 else:
509 raise ValueError(
510 'Can\'t process char at column %d for: %r' % (index, text),
511 index,
512 text)
513 elif char == '.':
514 if state == NEED_COMMA_OR_DOT:
515        # The string is incomplete; this means the strace -s flag should be
516 # increased.
517 state = NEED_DOT_2
518 elif state == NEED_DOT_2:
519 state = NEED_DOT_3
520 elif state == NEED_DOT_3:
521 state = NEED_COMMA
522 elif state == INSIDE_STRING:
523 out[-1] += char
524 elif state == ESCAPED:
525 out[-1] += char
526 state = INSIDE_STRING
527 else:
528 raise ValueError(
529 'Can\'t process char at column %d for: %r' % (index, text),
530 index,
531 text)
532 elif char == '\\':
533 if state == ESCAPED:
534 out[-1] += char
535 state = INSIDE_STRING
536 elif state == INSIDE_STRING:
537 state = ESCAPED
538 else:
539 raise ValueError(
540 'Can\'t process char at column %d for: %r' % (index, text),
541 index,
542 text)
543 else:
544 if state == INSIDE_STRING:
545 out[-1] += char
546 else:
547 raise ValueError(
548 'Can\'t process char at column %d for: %r' % (index, text),
549 index,
550 text)
551 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
552 raise ValueError(
553 'String is incorrectly terminated: %r' % text,
554 text)
555 return out
556
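# Illustrative truncation example: '"/usr/bin/pyt"...' -> ['/usr/bin/pyt']; the
# trailing '...' marks an argument strace cut short because -s was too small.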
557
558def read_json(filepath):
559 with open(filepath, 'r') as f:
560 return json.load(f)
561
562
563def write_json(filepath_or_handle, data, dense):
564 """Writes data into filepath or file handle encoded as json.
565
566 If dense is True, the json is packed. Otherwise, it is human readable.
567 """
568 if hasattr(filepath_or_handle, 'write'):
569 if dense:
570 filepath_or_handle.write(
571 json.dumps(data, sort_keys=True, separators=(',',':')))
572 else:
573 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
574 else:
575 with open(filepath_or_handle, 'wb') as f:
576 if dense:
577 json.dump(data, f, sort_keys=True, separators=(',',':'))
578 else:
579 json.dump(data, f, sort_keys=True, indent=2)
580
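# e.g. write_json('out.json', {'a': 1}, True) writes '{"a":1}', while passing
# dense=False writes an indented, human-readable form.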
581
582class Results(object):
583 """Results of a trace session."""
584
585 class _TouchedObject(object):
586 """Something, a file or a directory, that was accessed."""
587 def __init__(self, root, path, tainted, size, nb_files):
588 logging.debug(
589 '%s(%s, %s, %s, %s, %s)' %
590 (self.__class__.__name__, root, path, tainted, size, nb_files))
591 self.root = root
592 self.path = path
593 self.tainted = tainted
594 self.nb_files = nb_files
595 # Can be used as a cache or a default value, depending on context.
596 self._size = size
597 # These are cache only.
598 self._real_path = None
599
600 # Check internal consistency.
601 assert path, path
602 assert tainted or bool(root) != bool(isabs(path)), (root, path)
603 assert tainted or (
604 not os.path.exists(self.full_path) or
605 (self.full_path == get_native_path_case(self.full_path))), (
606 tainted, self.full_path, get_native_path_case(self.full_path))
607
608 @property
609 def existent(self):
610 return self.size != -1
611
612 @property
613 def full_path(self):
614 if self.root:
615 return os.path.join(self.root, self.path)
616 return self.path
617
618 @property
619 def real_path(self):
620 """Returns the path with symlinks resolved."""
621 if not self._real_path:
622 self._real_path = os.path.realpath(self.full_path)
623 return self._real_path
624
625 @property
626 def size(self):
627 """File's size. -1 is not existent."""
628 if self._size is None and not self.tainted:
629 try:
630 self._size = os.stat(self.full_path).st_size
631 except OSError:
632 self._size = -1
633 return self._size
634
635 def flatten(self):
636 """Returns a dict representing this object.
637
638 A 'size' of 0 means the file was only touched and not read.
639 """
640 return {
641 'path': self.path,
642 'size': self.size,
643 }
644
645 def replace_variables(self, variables):
646 """Replaces the root of this File with one of the variables if it matches.
647
648 If a variable replacement occurs, the cloned object becomes tainted.
649 """
650 for variable, root_path in variables.iteritems():
651 if self.path.startswith(root_path):
652 return self._clone(
653 self.root, variable + self.path[len(root_path):], True)
654 # No need to clone, returns ourself.
655 return self
656
657 def strip_root(self, root):
658 """Returns a clone of itself with 'root' stripped off."""
659 # Check internal consistency.
660 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
661 if not self.full_path.startswith(root):
662 # Now try to resolve the symlinks to see if it can be reached this way.
663 # Only try *after* trying without resolving symlink.
664 if not self.real_path.startswith(root):
665 return None
666 path = self.real_path
667 else:
668 path = self.full_path
669 return self._clone(root, path[len(root):], self.tainted)
670
671 def _clone(self, new_root, new_path, tainted):
672 raise NotImplementedError(self.__class__.__name__)
673
674 class File(_TouchedObject):
675 """A file that was accessed. May not be present anymore.
676
677    If tainted is true, it means it is not a real path anymore because a
678    variable replacement occurred.
679
680    If touched_only is True, the file was only probed for existence and was
681    never _opened_; in that case the file must have existed.
683 """
684 def __init__(self, root, path, tainted, size):
685 super(Results.File, self).__init__(root, path, tainted, size, 1)
686
687 def _clone(self, new_root, new_path, tainted):
688 """Clones itself keeping meta-data."""
689      # Keep the self.size and self._real_path caches for performance. It
690 # is also important when the file becomes tainted (with a variable instead
691 # of the real path) since self.path is not an on-disk path anymore so
692 # out._size cannot be updated.
693 out = self.__class__(new_root, new_path, tainted, self.size)
694 out._real_path = self._real_path
695 return out
696
697 class Directory(_TouchedObject):
698 """A directory of files. Must exist."""
699 def __init__(self, root, path, tainted, size, nb_files):
700 """path='.' is a valid value and must be handled appropriately."""
701 assert not path.endswith(os.path.sep), path
702 super(Results.Directory, self).__init__(
703 root, path + os.path.sep, tainted, size, nb_files)
704 # For a Directory instance, self.size is not a cache, it's an actual value
705 # that is never modified and represents the total size of the files
706 # contained in this directory. It is possible that the directory is empty
707 # so that size == 0; this happens if there's only an invalid symlink in
708 # it.
709
710 def flatten(self):
711 out = super(Results.Directory, self).flatten()
712 out['nb_files'] = self.nb_files
713 return out
714
715 def _clone(self, new_root, new_path, tainted):
716 """Clones itself keeping meta-data."""
717 out = self.__class__(
718 new_root,
719 new_path.rstrip(os.path.sep),
720 tainted,
721 self.size,
722 self.nb_files)
723 out._real_path = self._real_path
724 return out
725
726 class Process(object):
727 """A process that was traced.
728
729 Contains references to the files accessed by this process and its children.
730 """
731 def __init__(self, pid, files, executable, command, initial_cwd, children):
732 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
733 self.pid = pid
734 self.files = sorted(files, key=lambda x: x.path)
735 self.children = children
736 self.executable = executable
737 self.command = command
738 self.initial_cwd = initial_cwd
739
740 # Check internal consistency.
741 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
742 f.path for f in self.files)
743 assert isinstance(self.children, list)
744 assert isinstance(self.files, list)
745
746 @property
747 def all(self):
748 for child in self.children:
749 for i in child.all:
750 yield i
751 yield self
752
753 def flatten(self):
754 return {
755 'children': [c.flatten() for c in self.children],
756 'command': self.command,
757 'executable': self.executable,
758 'files': [f.flatten() for f in self.files],
759 'initial_cwd': self.initial_cwd,
760 'pid': self.pid,
761 }
762
763 def strip_root(self, root):
764 assert isabs(root) and root.endswith(os.path.sep), root
765 # Loads the files after since they are constructed as objects.
766 out = self.__class__(
767 self.pid,
768 filter(None, (f.strip_root(root) for f in self.files)),
769 self.executable,
770 self.command,
771 self.initial_cwd,
772 [c.strip_root(root) for c in self.children])
773 logging.debug(
774 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
775 return out
776
777 def __init__(self, process):
778 self.process = process
779 # Cache.
780 self._files = None
781
782 def flatten(self):
783 return {
784 'root': self.process.flatten(),
785 }
786
787 @property
788 def files(self):
789 if self._files is None:
790 self._files = sorted(
791 sum((p.files for p in self.process.all), []),
792 key=lambda x: x.path)
793 return self._files
794
795 @property
796 def existent(self):
797 return [f for f in self.files if f.existent]
798
799 @property
800 def non_existent(self):
801 return [f for f in self.files if not f.existent]
802
803 def strip_root(self, root):
804 """Returns a clone with all the files outside the directory |root| removed
805    and all the paths converted to relative paths.
806 """
807 # Resolve any symlink
808 root = os.path.realpath(root)
809 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
810 logging.debug('strip_root(%s)' % root)
811 return Results(self.process.strip_root(root))
812
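# The flattened structure produced by Results.flatten() looks roughly like this
# (sketch; the values are illustrative):
#   {'root': {'pid': 123, 'executable': '/usr/bin/python', 'command': [...],
#             'initial_cwd': '/src', 'files': [{'path': ..., 'size': ...}],
#             'children': [...]}}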
813
814class ApiBase(object):
815 """OS-agnostic API to trace a process and its children."""
816 class Context(object):
817 """Processes one log line at a time and keeps the list of traced processes.
818
819    The parsing is complicated by the fact that logs are traced out of order for
820    strace but in-order for dtrace and logman. In addition, on Windows it is
821    very common for process IDs to be reused so a flat list cannot be used. But
822    at the same time, it is impossible to faithfully construct a graph when the
823    logs are processed out of order. So both a tree and a flat mapping are used:
824    the tree is the real process tree, while the flat mapping stores the last
825    valid process for the corresponding process ID. For the strace case, the
826    tree's head is guessed at the last moment.
827 """
828 class Process(object):
829 """Keeps context for one traced child process.
830
831 Logs all the files this process touched. Ignores directories.
832 """
833 def __init__(self, blacklist, pid, initial_cwd):
834 # Check internal consistency.
835 assert isinstance(pid, int), repr(pid)
836 self.pid = pid
837 # children are Process instances.
838 self.children = []
839 self.initial_cwd = initial_cwd
840 self.cwd = None
841 self.files = set()
842 self.only_touched = set()
843 self.executable = None
844 self.command = None
845 self._blacklist = blacklist
846
847 def to_results_process(self):
848 """Resolves file case sensitivity and or late-bound strings."""
849        # When resolving files, it's normal to get duplicates because a file
850        # could be opened multiple times with a different case. Perform the
851        # deduplication here.
852 def fix_path(x):
853 """Returns the native file path case.
854
855 Converts late-bound strings.
856 """
857 if not x:
858 # Do not convert None instance to 'None'.
859 return x
860 # TODO(maruel): Do not upconvert to unicode here, on linux we don't
861 # know the file path encoding so they must be treated as bytes.
862 x = unicode(x)
863 if os.path.isabs(x):
864 # If the path is not absolute, which tends to happen occasionally on
865 # Windows, it is not possible to get the native path case so ignore
866 # that trace. It mostly happens for 'executable' value.
867 x = get_native_path_case(x)
868 return x
869
870 def fix_and_blacklist_path(x):
871 x = fix_path(x)
872 if not x:
873 return
874 # The blacklist needs to be reapplied, since path casing could
875 # influence blacklisting.
876 if self._blacklist(x):
877 return
878 return x
879
880 # Filters out directories. Some may have passed through.
881 files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
882 only_touched = set(
883 f for f in map(fix_and_blacklist_path, self.only_touched) if f)
884 only_touched -= files
885
886 files = [
887 Results.File(None, f, False, None) for f in files
888 if not os.path.isdir(f)
889 ]
890 # Using 0 as size means the file's content is ignored since the file was
891 # never opened for I/O.
892 files.extend(
893 Results.File(None, f, False, 0) for f in only_touched
894 if not os.path.isdir(f)
895 )
896 return Results.Process(
897 self.pid,
898 files,
899 fix_path(self.executable),
900 self.command,
901 fix_path(self.initial_cwd),
902 [c.to_results_process() for c in self.children])
903
904 def add_file(self, filepath, touch_only):
905 """Adds a file if it passes the blacklist."""
906 if self._blacklist(unicode(filepath)):
907 return
908 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
909 # Note that filepath and not unicode(filepath) is added. It is because
910        # filepath could be something other than a string, like a RelativePath
911 # instance for dtrace logs.
912 if touch_only:
913 self.only_touched.add(filepath)
914 else:
915 self.files.add(filepath)
916
917 def __init__(self, blacklist):
918 self.blacklist = blacklist
919 # Initial process.
920 self.root_process = None
921      # dict to accelerate process lookup, so the whole graph doesn't have to
922      # be walked each time.
923 self._process_lookup = {}
924
925 class Tracer(object):
926 """During it's lifetime, the tracing subsystem is enabled."""
927 def __init__(self, logname):
928 self._logname = logname
929 self._lock = threading.Lock()
930 self._traces = []
931 self._initialized = True
932
933 def trace(self, cmd, cwd, tracename, output):
934 """Runs the OS-specific trace program on an executable.
935
936 Arguments:
937 - cmd: The command (a list) to run.
938 - cwd: Current directory to start the child process in.
939 - tracename: Name of the trace in the logname file.
940      - output: If True, the child's output is captured and returned.
941
942 Returns a tuple (resultcode, output) and updates the internal trace
943 entries.
944 """
945 # The implementation adds an item to self._traces.
946 raise NotImplementedError(self.__class__.__name__)
947
948 def close(self, _timeout=None):
949 """Saves the meta-data in the logname file.
950
951 For kernel-based tracing, stops the tracing subsystem.
952
953 Must not be used manually when using 'with' construct.
954 """
955 with self._lock:
956 assert self._initialized
957 try:
958 data = {
959 'traces': self._traces,
960 }
961 write_json(self._logname, data, False)
962 finally:
963 self._initialized = False
964
965 def post_process_log(self):
966 """Post-processes the log so it becomes faster to load afterward.
967
968 Must not be used manually when using 'with' construct.
969 """
970 assert not self._initialized, 'Must stop tracing first.'
971
972 def __enter__(self):
973 """Enables 'with' statement."""
974 return self
975
976 def __exit__(self, exc_type, exc_value, traceback):
977 """Enables 'with' statement."""
978 self.close()
979 # If an exception was thrown, do not process logs.
980 if not exc_type:
981 self.post_process_log()
982
983 def get_tracer(self, logname):
984 """Returns an ApiBase.Tracer instance.
985
986 Initializes the tracing subsystem, which is a requirement for kernel-based
987 tracers. Only one tracer instance should be live at a time!
988
989 logname is the filepath to the json file that will contain the meta-data
990 about the logs.
991 """
992 return self.Tracer(logname)
993
994 @staticmethod
995 def clean_trace(logname):
996 """Deletes an old log."""
997 raise NotImplementedError()
998
999 @classmethod
1000 def parse_log(cls, logname, blacklist):
1001 """Processes trace logs and returns the files opened and the files that do
1002 not exist.
1003
1004 It does not track directories.
1005
1006 Most of the time, files that do not exist are temporary test files that
1007 should be put in /tmp instead. See http://crbug.com/116251.
1008
1009 Returns a list of dict with keys:
1010 - results: A Results instance.
1011 - trace: The corresponding tracename parameter provided to
1012 get_tracer().trace().
1013    - output: Output gathered during execution, if get_tracer().trace(...,
1014      output=True) was used.
1015 """
1016    raise NotImplementedError(cls.__name__)
1017
1018
1019class Strace(ApiBase):
1020 """strace implies linux."""
1021 class Context(ApiBase.Context):
1022 """Processes a strace log line and keeps the list of existent and non
1023 existent files accessed.
1024
1025 Ignores directories.
1026
1027    Uses late binding to process the cwd of each process. The problem is that
1028    strace generates one log file per process it traced but doesn't give any
1029    information about which process was started when and by whom, so we don't
1030    even know which process is the initial one. So the logs are processed out
1031    of order and late binding with RelativePath is used to deduce the
1032 initial directory of each process once all the logs are parsed.
1033 """
1034 class Process(ApiBase.Context.Process):
1035 """Represents the state of a process.
1036
1037 Contains all the information retrieved from the pid-specific log.
1038 """
1039 # Function names are using ([a-z_0-9]+)
1040 # This is the most common format. function(args) = result
1041 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$')
1042 # An interrupted function call, only grab the minimal header.
1043 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
1044 # A resumed function call.
1045 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1046 # A process received a signal.
1047 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1048 # A process didn't handle a signal. Ignore any junk appearing before,
1049 # because the process was forcibly killed so it won't open any new file.
1050 RE_KILLED = re.compile(
1051 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1052 # The process has exited.
1053 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1054 # A call was canceled. Ignore any prefix.
1055 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1056 # Happens when strace fails to even get the function name.
1057 UNNAMED_FUNCTION = '????'
1058
1059 # Corner-case in python, a class member function decorator must not be
1060 # @staticmethod.
1061 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1062 """Automatically convert the str 'args' into a list of processed
1063 arguments.
1064
1065 Arguments:
1066 - regexp is used to parse args.
1067 - expect_zero: one of True, False or None.
1068 - True: will check for result.startswith('0') first and will ignore
1069 the trace line completely otherwise. This is important because for
1070            many functions, the regexp will not match if the call failed.
1071          - False: will check for not result.startswith(('?', '-1')) for the
1072            same reason as with True.
1073 - None: ignore result.
1074 """
1075 def meta_hook(function):
1076 assert function.__name__.startswith('handle_')
1077 def hook(self, args, result):
1078 if expect_zero is True and not result.startswith('0'):
1079 return
1080 if expect_zero is False and result.startswith(('?', '-1')):
1081 return
1082 match = re.match(regexp, args)
1083 if not match:
1084 raise TracingFailure(
1085 'Failed to parse %s(%s) = %s' %
1086 (function.__name__[len('handle_'):], args, result),
1087 None, None, None)
1088 return function(self, match.groups(), result)
1089 return hook
1090 return meta_hook
1091
1092 class RelativePath(object):
1093 """A late-bound relative path."""
1094 def __init__(self, parent, value):
1095 self.parent = parent
1096 self.value = value
1097
1098 def render(self):
1099 """Returns the current directory this instance is representing.
1100
1101 This function is used to return the late-bound value.
1102 """
1103 if self.value and self.value.startswith(u'/'):
1104 # An absolute path.
1105 return self.value
1106 parent = self.parent.render() if self.parent else u'<None>'
1107 if self.value:
1108 return os.path.normpath(os.path.join(parent, self.value))
1109 return parent
1110
1111 def __unicode__(self):
1112 """Acts as a string whenever needed."""
1113 return unicode(self.render())
1114
1115 def __str__(self):
1116 """Acts as a string whenever needed."""
1117 return str(self.render())
1118
1119 def __init__(self, root, pid):
1120 """Keeps enough information to be able to guess the original process
1121 root.
1122
1123 strace doesn't store which process was the initial process. So more
1124 information needs to be kept so the graph can be reconstructed from the
1125 flat map.
1126 """
1127 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1128 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1129 assert isinstance(root, ApiBase.Context)
1130 self._root = weakref.ref(root)
1131 # The dict key is the function name of the pending call, like 'open'
1132 # or 'execve'.
1133 self._pending_calls = {}
1134 self._line_number = 0
1135 # Current directory when the process started.
1136 self.initial_cwd = self.RelativePath(self._root(), None)
1137 self.parentid = None
1138
1139 def get_cwd(self):
1140 """Returns the best known value of cwd."""
1141 return self.cwd or self.initial_cwd
1142
1143 def render(self):
1144 """Returns the string value of the RelativePath() object.
1145
1146 Used by RelativePath. Returns the initial directory and not the
1147 current one since the current directory 'cwd' validity is time-limited.
1148
1149 The validity is only guaranteed once all the logs are processed.
1150 """
1151 return self.initial_cwd.render()
1152
1153 def on_line(self, line):
1154 self._line_number += 1
1155 if self.RE_SIGNAL.match(line):
1156 # Ignore signals.
1157 return
1158
1159 try:
1160 match = self.RE_KILLED.match(line)
1161 if match:
1162 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1163 self.handle_exit_group(match.group(1), None)
1164 return
1165
1166 match = self.RE_PROCESS_EXITED.match(line)
1167 if match:
1168 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1169 self.handle_exit_group(match.group(1), None)
1170 return
1171
1172 match = self.RE_UNFINISHED.match(line)
1173 if match:
1174 if match.group(1) in self._pending_calls:
1175 raise TracingFailure(
1176 'Found two unfinished calls for the same function',
1177 None, None, None,
1178 self._pending_calls)
1179 self._pending_calls[match.group(1)] = (
1180 match.group(1) + match.group(2))
1181 return
1182
1183 match = self.RE_UNAVAILABLE.match(line)
1184 if match:
1185 # This usually means a process was killed and a pending call was
1186 # canceled.
1187 # TODO(maruel): Look up the last exit_group() trace just above and
1188 # make sure any self._pending_calls[anything] is properly flushed.
1189 return
1190
1191 match = self.RE_RESUMED.match(line)
1192 if match:
1193 if match.group(1) not in self._pending_calls:
1194 raise TracingFailure(
1195 'Found a resumed call that was not logged as unfinished',
1196 None, None, None,
1197 self._pending_calls)
1198 pending = self._pending_calls.pop(match.group(1))
1199 # Reconstruct the line.
1200 line = pending + match.group(2)
1201
1202 match = self.RE_HEADER.match(line)
1203 if not match:
1204 raise TracingFailure(
1205 'Found an invalid line: %s' % line,
1206 None, None, None)
1207 if match.group(1) == self.UNNAMED_FUNCTION:
1208 return
1209
1210 # It's a valid line, handle it.
1211 handler = getattr(self, 'handle_%s' % match.group(1), None)
1212 if not handler:
1213 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1214 return handler(match.group(2), match.group(3))
1215 except TracingFailure, e:
1216 # Hack in the values since the handler could be a static function.
1217 e.pid = self.pid
1218 e.line = line
1219 e.line_number = self._line_number
1220 # Re-raise the modified exception.
1221 raise
1222 except (KeyError, NotImplementedError, ValueError), e:
1223 raise TracingFailure(
1224 'Trace generated a %s exception: %s' % (
1225 e.__class__.__name__, str(e)),
1226 self.pid,
1227 self._line_number,
1228 line,
1229 e)
1230
1231 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1232 def handle_access(self, args, _result):
1233 self._handle_file(args[0], True)
1234
1235 @parse_args(r'^\"(.+?)\"$', True)
1236 def handle_chdir(self, args, _result):
1237 """Updates cwd."""
1238 self.cwd = self.RelativePath(self, args[0])
1239 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1240
1241 def handle_clone(self, _args, result):
1242 """Transfers cwd."""
1243 if result.startswith(('?', '-1')):
1244 # The call failed.
1245 return
1246 # Update the other process right away.
1247 childpid = int(result)
1248 child = self._root().get_or_set_proc(childpid)
1249 if child.parentid is not None or childpid in self.children:
1250 raise TracingFailure(
1251              'Found internal inconsistency in process lifetime detection '
1252 'during a clone() call',
1253 None, None, None)
1254
1255 # Copy the cwd object.
1256 child.initial_cwd = self.get_cwd()
1257 child.parentid = self.pid
1258 # It is necessary because the logs are processed out of order.
1259 self.children.append(child)
1260
1261 def handle_close(self, _args, _result):
1262 pass
1263
1264 def handle_chmod(self, _args, _result):
1265 pass
1266
1267 def handle_creat(self, _args, _result):
1268 # Ignore files created, since they didn't need to exist.
1269 pass
1270
1271 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1272 def handle_execve(self, args, _result):
1273        # Even if in practice execve() doesn't return when it succeeds, strace
1274 # still prints '0' as the result.
1275 filepath = args[0]
1276 self._handle_file(filepath, False)
1277 self.executable = self.RelativePath(self.get_cwd(), filepath)
1278 self.command = strace_process_quoted_arguments(args[1])
1279
1280 def handle_exit_group(self, _args, _result):
1281 """Removes cwd."""
1282 self.cwd = None
1283
1284 def handle_fork(self, args, result):
1285 self._handle_unknown('fork', args, result)
1286
1287 def handle_getcwd(self, _args, _result):
1288 pass
1289
1290 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1291 def handle_link(self, args, _result):
1292 self._handle_file(args[0], False)
1293 self._handle_file(args[1], False)
1294
1295 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1296 def handle_lstat(self, args, _result):
1297 self._handle_file(args[0], True)
1298
1299 def handle_mkdir(self, _args, _result):
1300 pass
1301
1302 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1303 def handle_open(self, args, _result):
1304 if 'O_DIRECTORY' in args[1]:
1305 return
1306 self._handle_file(args[0], False)
1307
1308 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1309 def handle_openat(self, args, _result):
1310 if 'O_DIRECTORY' in args[2]:
1311 return
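        # args: (dirfd or AT_FDCWD, path, flags, optional mode), per the regexp.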
1312        if args[0] == 'AT_FDCWD':
1313          self._handle_file(args[1], False)
1314 else:
1315 # TODO(maruel): Implement relative open if necessary instead of the
1316 # AT_FDCWD flag, let's hope not since this means tracking all active
1317 # directory handles.
1318 raise Exception('Relative open via openat not implemented.')
1319
1320 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1321 def handle_readlink(self, args, _result):
1322 self._handle_file(args[0], False)
1323
1324 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1325 def handle_rename(self, args, _result):
1326 self._handle_file(args[0], False)
1327 self._handle_file(args[1], False)
1328
1329 def handle_rmdir(self, _args, _result):
1330 pass
1331
1332 def handle_setxattr(self, _args, _result):
1333 pass
1334
1335 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1336 def handle_stat(self, args, _result):
1337 self._handle_file(args[0], True)
1338
1339 def handle_symlink(self, _args, _result):
1340 pass
1341
1342 def handle_unlink(self, _args, _result):
1343 # In theory, the file had to be created anyway.
1344 pass
1345
1346 def handle_statfs(self, _args, _result):
1347 pass
1348
1349 def handle_vfork(self, args, result):
1350 self._handle_unknown('vfork', args, result)
1351
1352 @staticmethod
1353 def _handle_unknown(function, args, result):
1354 raise TracingFailure(
1355 'Unexpected/unimplemented trace %s(%s)= %s' %
1356 (function, args, result),
1357 None, None, None)
1358
1359 def _handle_file(self, filepath, touch_only):
1360 filepath = self.RelativePath(self.get_cwd(), filepath)
1361 #assert not touch_only, unicode(filepath)
1362 self.add_file(filepath, touch_only)
1363
1364 def __init__(self, blacklist, initial_cwd):
1365 super(Strace.Context, self).__init__(blacklist)
1366 self.initial_cwd = initial_cwd
1367
1368 def render(self):
1369 """Returns the string value of the initial cwd of the root process.
1370
1371 Used by RelativePath.
1372 """
1373 return self.initial_cwd
1374
1375 def on_line(self, pid, line):
1376 """Transfers control into the Process.on_line() function."""
1377 self.get_or_set_proc(pid).on_line(line.strip())
1378
1379 def to_results(self):
1380 """Finds back the root process and verify consistency."""
1381 # TODO(maruel): Absolutely unecessary, fix me.
1382 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1383 if len(root) != 1:
1384 raise TracingFailure(
1385            'Found internal inconsistency in process lifetime detection '
1386 'while finding the root process',
1387 None,
1388 None,
1389 None,
1390 sorted(p.pid for p in root))
1391 self.root_process = root[0]
1392 process = self.root_process.to_results_process()
1393 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1394 raise TracingFailure(
1395            'Found internal inconsistency in process lifetime detection '
1396 'while looking for len(tree) == len(list)',
1397 None,
1398 None,
1399 None,
1400 sorted(self._process_lookup),
1401 sorted(p.pid for p in process.all))
1402 return Results(process)
1403
1404 def get_or_set_proc(self, pid):
1405 """Returns the Context.Process instance for this pid or creates a new one.
1406 """
1407 if not pid or not isinstance(pid, int):
1408 raise TracingFailure(
1409            'Unexpected value for pid: %r' % pid,
1410 pid,
1411 None,
1412 None,
1413 pid)
1414 if pid not in self._process_lookup:
1415 self._process_lookup[pid] = self.Process(self, pid)
1416 return self._process_lookup[pid]
1417
1418 @classmethod
1419 def traces(cls):
1420 """Returns the list of all handled traces to pass this as an argument to
1421 strace.
1422 """
1423 prefix = 'handle_'
1424 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1425
1426 class Tracer(ApiBase.Tracer):
1427 MAX_LEN = 256
1428
1429 def trace(self, cmd, cwd, tracename, output):
1430 """Runs strace on an executable."""
1431 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1432 assert os.path.isabs(cmd[0]), cmd[0]
1433 assert os.path.isabs(cwd), cwd
1434 assert os.path.normpath(cwd) == cwd, cwd
1435 with self._lock:
1436 if not self._initialized:
1437 raise TracingFailure(
1438            'Called Tracer.trace() on an uninitialized object',
1439 None, None, None, tracename)
1440 assert tracename not in (i['trace'] for i in self._traces)
1441 stdout = stderr = None
1442 if output:
1443 stdout = subprocess.PIPE
1444 stderr = subprocess.STDOUT
1445 # Ensure all file related APIs are hooked.
1446 traces = ','.join(Strace.Context.traces() + ['file'])
1447 trace_cmd = [
1448 'strace',
1449 '-ff',
1450 '-s', '%d' % self.MAX_LEN,
1451 '-e', 'trace=%s' % traces,
1452 '-o', self._logname + '.' + tracename,
1453 ]
1454 child = subprocess.Popen(
1455 trace_cmd + cmd,
1456 cwd=cwd,
1457 stdin=subprocess.PIPE,
1458 stdout=stdout,
1459 stderr=stderr)
1460 out = child.communicate()[0]
1461      # TODO(maruel): Walking the logs to figure out the root process would
1462      # simplify parsing the logs a *lot*.
1463 with self._lock:
1464 assert tracename not in (i['trace'] for i in self._traces)
1465 self._traces.append(
1466 {
1467 'cmd': cmd,
1468 'cwd': cwd,
1469 # The pid of strace process, not very useful.
1470 'pid': child.pid,
1471 'trace': tracename,
1472 'output': out,
1473 })
1474 return child.returncode, out
1475
1476 @staticmethod
1477 def clean_trace(logname):
1478 if os.path.isfile(logname):
1479 os.remove(logname)
1480 # Also delete any pid specific file from previous traces.
1481 for i in glob.iglob(logname + '.*'):
1482 if i.rsplit('.', 1)[1].isdigit():
1483 os.remove(i)
1484
1485 @classmethod
1486 def parse_log(cls, logname, blacklist):
1487 logging.info('parse_log(%s, %s)' % (logname, blacklist))
1488 data = read_json(logname)
1489 out = []
1490 for item in data['traces']:
1491 result = {
1492 'trace': item['trace'],
1493 'output': item['output'],
1494 }
1495 try:
1496 context = cls.Context(blacklist, item['cwd'])
1497 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1498 pid = pidfile.rsplit('.', 1)[1]
1499 if pid.isdigit():
1500 pid = int(pid)
1501 # TODO(maruel): Load as utf-8
1502 for line in open(pidfile, 'rb'):
1503 context.on_line(pid, line)
1504 result['results'] = context.to_results()
1505      except TracingFailure:
1506 result['exception'] = sys.exc_info()
1507      out.append(result)
1508 return out
1509
1510
1511class Dtrace(ApiBase):
1512 """Uses DTrace framework through dtrace. Requires root access.
1513
1514 Implies Mac OSX.
1515
1516 dtruss can't be used because it has compatibility issues with python.
1517
1518 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1519 get the absolute path of the 'cwd' dtrace variable from the probe.
1520
1521 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1522 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1523 manually.
1524
1525 errno is not printed in the log since this implementation currently only cares
1526 about files that were successfully opened.
1527 """
1528 class Context(ApiBase.Context):
1529 # Format: index pid function(args)
1530 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1531
1532 # Arguments parsing.
1533 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1534 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1535 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1536 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1537 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1538 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1539
1540 O_DIRECTORY = 0x100000
1541
1542 class Process(ApiBase.Context.Process):
1543 def __init__(self, *args):
1544 super(Dtrace.Context.Process, self).__init__(*args)
1545 self.cwd = self.initial_cwd
1546
1547 def __init__(self, blacklist, tracer_pid, initial_cwd):
1548 logging.info(
1549 '%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd))
1550 super(Dtrace.Context, self).__init__(blacklist)
1551 # Process ID of the temporary script created by create_thunk().
1552 self._tracer_pid = tracer_pid
1553 self._initial_cwd = initial_cwd
1554 self._line_number = 0
1555
1556 def on_line(self, line):
1557 self._line_number += 1
1558 match = self.RE_HEADER.match(line)
1559 if not match:
1560 raise TracingFailure(
1561 'Found malformed line: %s' % line,
1562 None,
1563 self._line_number,
1564 line)
1565 fn = getattr(
1566 self,
1567 'handle_%s' % match.group(2).replace('-', '_'),
1568 self._handle_ignored)
1569 # It is guaranteed to succeed because of the regexp. Or at least I thought
1570 # it would.
1571 pid = int(match.group(1))
1572 try:
1573 return fn(pid, match.group(3))
1574 except TracingFailure, e:
1575 # Hack in the values since the handler could be a static function.
1576 e.pid = pid
1577 e.line = line
1578 e.line_number = self._line_number
1579 # Re-raise the modified exception.
1580 raise
1581 except (KeyError, NotImplementedError, ValueError), e:
1582 raise TracingFailure(
1583 'Trace generated a %s exception: %s' % (
1584 e.__class__.__name__, str(e)),
1585 pid,
1586 self._line_number,
1587 line,
1588 e)
1589
1590 def to_results(self):
1591 process = self.root_process.to_results_process()
1592      # Internal consistency check.
1593 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1594 raise TracingFailure(
1595            'Found internal inconsistency in process lifetime detection '
1596 'while looking for len(tree) == len(list)',
1597 None,
1598 None,
1599 None,
1600 sorted(self._process_lookup),
1601 sorted(p.pid for p in process.all))
1602 return Results(process)
1603
1604 def handle_dtrace_BEGIN(self, _pid, args):
1605 if not self.RE_DTRACE_BEGIN.match(args):
1606 raise TracingFailure(
1607            'Found internal inconsistency in dtrace_BEGIN log line',
1608 None, None, None)
1609
1610 def handle_proc_start(self, pid, args):
1611 """Transfers cwd.
1612
1613 The dtrace script already takes care of only tracing the processes that
1614      are children of the traced processes, so there is no need to verify the
1615 process hierarchy.
1616 """
1617 if pid in self._process_lookup:
1618 raise TracingFailure(
1619            'Found internal inconsistency in proc_start: %d started two times' %
1620 pid,
1621 None, None, None)
1622 match = self.RE_PROC_START.match(args)
1623 if not match:
1624 raise TracingFailure(
1625 'Failed to parse arguments: %s' % args,
1626 None, None, None)
1627 ppid = int(match.group(1))
1628 if ppid == self._tracer_pid and not self.root_process:
1629 proc = self.root_process = self.Process(
1630 self.blacklist, pid, self._initial_cwd)
1631 elif ppid in self._process_lookup:
1632 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1633 self._process_lookup[ppid].children.append(proc)
1634 else:
1635 # Another process tree, ignore.
1636 return
1637 self._process_lookup[pid] = proc
1638 logging.debug(
1639 'New child: %s -> %d cwd:%s' %
1640 (ppid, pid, unicode(proc.initial_cwd)))
1641
1642 def handle_proc_exit(self, pid, _args):
1643 """Removes cwd."""
1644 if pid in self._process_lookup:
1645        # self._tracer_pid is not traced itself, nor are other trace runs.
1646 self._process_lookup[pid].cwd = None
1647
1648 def handle_execve(self, pid, args):
1649 """Sets the process' executable.
1650
1651 TODO(maruel): Read command line arguments. See
1652 https://discussions.apple.com/thread/1980539 for an example.
1653 https://gist.github.com/1242279
1654
1655 Will have to put the answer at http://stackoverflow.com/questions/7556249.
1656 :)
1657 """
1658      if pid not in self._process_lookup:
1659 # Another process tree, ignore.
1660 return
1661 match = self.RE_EXECVE.match(args)
1662 if not match:
1663 raise TracingFailure(
1664 'Failed to parse arguments: %r' % args,
1665 None, None, None)
1666 proc = self._process_lookup[pid]
1667 proc.executable = match.group(1)
1668 proc.command = self.process_escaped_arguments(match.group(3))
1669 if int(match.group(2)) != len(proc.command):
1670 raise TracingFailure(
1671 'Failed to parse execve() arguments: %s' % args,
1672 None, None, None)
1673
1674 def handle_chdir(self, pid, args):
1675 """Updates cwd."""
1676 if pid not in self._process_lookup:
1677 # Another process tree, ignore.
1678 return
1679 cwd = self.RE_CHDIR.match(args).group(1)
1680 if not cwd.startswith('/'):
1681 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1682 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1683 else:
1684 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1685 cwd2 = cwd
1686 self._process_lookup[pid].cwd = cwd2
1687
1688 def handle_open_nocancel(self, pid, args):
1689 """Redirects to handle_open()."""
1690 return self.handle_open(pid, args)
1691
1692 def handle_open(self, pid, args):
1693 if pid not in self._process_lookup:
1694 # Another process tree, ignore.
1695 return
1696 match = self.RE_OPEN.match(args)
1697 if not match:
1698 raise TracingFailure(
1699 'Failed to parse arguments: %s' % args,
1700 None, None, None)
1701 flag = int(match.group(2), 16)
1702 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1703 # Ignore directories.
1704 return
1705 self._handle_file(pid, match.group(1))
1706
1707 def handle_rename(self, pid, args):
1708 if pid not in self._process_lookup:
1709 # Another process tree, ignore.
1710 return
1711 match = self.RE_RENAME.match(args)
1712 if not match:
1713 raise TracingFailure(
1714 'Failed to parse arguments: %s' % args,
1715 None, None, None)
1716 self._handle_file(pid, match.group(1))
1717 self._handle_file(pid, match.group(2))
1718
1719 def _handle_file(self, pid, filepath):
1720 if not filepath.startswith('/'):
1721 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1722 # We can get '..' in the path.
1723 filepath = os.path.normpath(filepath)
1724 # Sadly, still need to filter out directories here;
1725 # saw open_nocancel(".", 0, 0) = 0 lines.
1726 if os.path.isdir(filepath):
1727 return
1728 self._process_lookup[pid].add_file(filepath, False)
1729
1730 def handle_ftruncate(self, pid, args):
1731 """Just used as a signal to kill dtrace, ignoring."""
1732 pass
1733
1734 @staticmethod
1735 def _handle_ignored(pid, args):
1736 """Is called for all the event traces that are not handled."""
1737 raise NotImplementedError('Please implement me')
1738
1739 @staticmethod
1740 def process_escaped_arguments(text):
1741 """Extracts escaped arguments on a string and return the arguments as a
1742 list.
1743
1744 Implemented as an automaton.
1745
1746 Example:
1747 With text = 'python2.7\\001-c\\001print(\\"hi\\")\\0', the
1748 function will return ['python2.7', '-c', 'print("hi")']
1749 """
1750 if not text.endswith('\\0'):
1751 raise ValueError('String is not null terminated: %r' % text, text)
1752 text = text[:-2]
1753
1754 def unescape(x):
1755 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
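        # e.g. unescape(r'print(\"hi\")') == 'print("hi")' and
        # unescape(r'a\\b') == r'a\b' (illustrative).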
1756 out = []
1757 escaped = False
1758 for i in x:
1759 if i == '\\' and not escaped:
1760 escaped = True
1761 continue
1762 escaped = False
1763 out.append(i)
1764 return ''.join(out)
1765
1766 return [unescape(i) for i in text.split('\\001')]
1767
1768 class Tracer(ApiBase.Tracer):
1769 # pylint: disable=C0301
1770 #
1771 # To understand the following code, you'll want to take a look at:
1772 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1773 # https://wikis.oracle.com/display/DTrace/Variables
1774 # http://docs.oracle.com/cd/E19205-01/820-4221/
1775 #
1776 # 0. Dump all the valid probes into a text file. It is important to
1777 # redirect into a file so you don't have to constantly 'sudo'.
1778 # $ sudo dtrace -l > probes.txt
1779 #
1780 # 1. Count the number of probes:
1781 # $ wc -l probes.txt
1782 # 81823 # On OSX 10.7, including 1 header line.
1783 #
1784 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1785 # likes and skipping the header with NR>1:
1786 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1787 # dtrace
1788 # fbt
1789 # io
1790 # ip
1791 # lockstat
1792 # mach_trap
1793 # proc
1794 # profile
1795 # sched
1796 # syscall
1797 # tcp
1798 # vminfo
1799 #
1800 # 3. List of valid probes:
1801 # $ grep syscall probes.txt | less
1802 # or use dtrace directly:
1803 # $ sudo dtrace -l -P syscall | less
1804 #
1805 # trackedpid is an associative array where its value can be 0, 1 or 2.
1806 # 0 is for untracked processes and is the default value for items not
1807 # in the associative array.
1808 # 1 is for tracked processes.
1809 # 2 is for the script created by create_thunk() only. It is not tracked
1810 # itself but all its descendants are.
1811 #
1812 # The script will kill itself only once waiting_to_die == 1 and
1813 # current_processes == 0, i.e. once the stop signal (ftruncate) was
1814 # received and all traced processes have exited.
1815 #
1816 # TODO(maruel): Use cacheable predicates. See
1817 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1818 D_CODE = """
1819 dtrace:::BEGIN {
1820 waiting_to_die = 0;
1821 current_processes = 0;
1822 logindex = 0;
1823 printf("%d %d %s_%s(\\"%s\\")\\n",
1824 logindex, PID, probeprov, probename, SCRIPT);
1825 logindex++;
1826 }
1827
1828 proc:::start /trackedpid[ppid]/ {
1829 trackedpid[pid] = 1;
1830 current_processes += 1;
1831 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1832 logindex, pid, probeprov, probename,
1833 ppid,
1834 execname,
1835 current_processes);
1836 logindex++;
1837 }
1838 /* Should use SCRIPT but there is no access to this variable at that
1839 * point. */
1840 proc:::start /ppid == PID && execname == "Python"/ {
1841 trackedpid[pid] = 2;
1842 current_processes += 1;
1843 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1844 logindex, pid, probeprov, probename,
1845 ppid,
1846 execname,
1847 current_processes);
1848 logindex++;
1849 }
1850 proc:::exit /trackedpid[pid] &&
1851 current_processes == 1 &&
1852 waiting_to_die == 1/ {
1853 trackedpid[pid] = 0;
1854 current_processes -= 1;
1855 printf("%d %d %s_%s(%d)\\n",
1856 logindex, pid, probeprov, probename,
1857 current_processes);
1858 logindex++;
1859 exit(0);
1860 }
1861 proc:::exit /trackedpid[pid]/ {
1862 trackedpid[pid] = 0;
1863 current_processes -= 1;
1864 printf("%d %d %s_%s(%d)\\n",
1865 logindex, pid, probeprov, probename,
1866 current_processes);
1867 logindex++;
1868 }
1869
1870 /* Use an arcane function to detect when we need to die */
1871 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
1872 waiting_to_die = 1;
1873 printf("%d %d %s()\\n", logindex, pid, probefunc);
1874 logindex++;
1875 }
1876 syscall::ftruncate:entry /
1877 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
1878 exit(0);
1879 }
1880
1881 syscall::open*:entry /trackedpid[pid] == 1/ {
1882 self->open_arg0 = arg0;
1883 self->open_arg1 = arg1;
1884 self->open_arg2 = arg2;
1885 }
1886 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
1887 this->open_arg0 = copyinstr(self->open_arg0);
1888 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
1889 logindex, pid, probefunc,
1890 this->open_arg0,
1891 self->open_arg1,
1892 self->open_arg2);
1893 logindex++;
1894 this->open_arg0 = 0;
1895 }
1896 syscall::open*:return /trackedpid[pid] == 1/ {
1897 self->open_arg0 = 0;
1898 self->open_arg1 = 0;
1899 self->open_arg2 = 0;
1900 }
1901
1902 syscall::rename:entry /trackedpid[pid] == 1/ {
1903 self->rename_arg0 = arg0;
1904 self->rename_arg1 = arg1;
1905 }
1906 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
1907 this->rename_arg0 = copyinstr(self->rename_arg0);
1908 this->rename_arg1 = copyinstr(self->rename_arg1);
1909 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
1910 logindex, pid, probefunc,
1911 this->rename_arg0,
1912 this->rename_arg1);
1913 logindex++;
1914 this->rename_arg0 = 0;
1915 this->rename_arg1 = 0;
1916 }
1917 syscall::rename:return /trackedpid[pid] == 1/ {
1918 self->rename_arg0 = 0;
1919 self->rename_arg1 = 0;
1920 }
1921
1922 /* Track chdir; it is painful because it may only receive a relative path.
1923 */
1924 syscall::chdir:entry /trackedpid[pid] == 1/ {
1925 self->chdir_arg0 = arg0;
1926 }
1927 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
1928 this->chdir_arg0 = copyinstr(self->chdir_arg0);
1929 printf("%d %d %s(\\"%s\\")\\n",
1930 logindex, pid, probefunc,
1931 this->chdir_arg0);
1932 logindex++;
1933 this->chdir_arg0 = 0;
1934 }
1935 syscall::chdir:return /trackedpid[pid] == 1/ {
1936 self->chdir_arg0 = 0;
1937 }
1938 """
1939
1940 # execve-specific code, tends to throw a lot of exceptions.
1941 D_CODE_EXECVE = """
1942 /* Finally what we care about! */
1943 syscall::exec*:entry /trackedpid[pid]/ {
1944 self->exec_arg0 = copyinstr(arg0);
1945 /* Incrementally probe for a NULL in the argv parameter of execve() to
1946 * figure out argc. */
1947 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
1948 * found. */
1949 self->exec_argc = 0;
1950 /* Probe for argc==1 */
1951 this->exec_argv = (user_addr_t*)copyin(
1952 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1953 self->exec_argc = this->exec_argv[self->exec_argc] ?
1954 (self->exec_argc + 1) : self->exec_argc;
1955
1956 /* Probe for argc==2 */
1957 this->exec_argv = (user_addr_t*)copyin(
1958 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1959 self->exec_argc = this->exec_argv[self->exec_argc] ?
1960 (self->exec_argc + 1) : self->exec_argc;
1961
1962 /* Probe for argc==3 */
1963 this->exec_argv = (user_addr_t*)copyin(
1964 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1965 self->exec_argc = this->exec_argv[self->exec_argc] ?
1966 (self->exec_argc + 1) : self->exec_argc;
1967
1968 /* Probe for argc==4 */
1969 this->exec_argv = (user_addr_t*)copyin(
1970 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1971 self->exec_argc = this->exec_argv[self->exec_argc] ?
1972 (self->exec_argc + 1) : self->exec_argc;
1973
1974 /* Copy the inputs strings since there is no guarantee they'll be
1975 * present after the call completed. */
1976 self->exec_argv0 = (self->exec_argc > 0) ?
1977 copyinstr(this->exec_argv[0]) : "";
1978 self->exec_argv1 = (self->exec_argc > 1) ?
1979 copyinstr(this->exec_argv[1]) : "";
1980 self->exec_argv2 = (self->exec_argc > 2) ?
1981 copyinstr(this->exec_argv[2]) : "";
1982 self->exec_argv3 = (self->exec_argc > 3) ?
1983 copyinstr(this->exec_argv[3]) : "";
1984 this->exec_argv = 0;
1985 }
1986 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
1987 /* We need to join strings here, as using multiple printf() would
1988 * cause tearing when multiple threads/processes are traced.
1989 * Since it is impossible to escape a string and join it to another one,
1990 * like sprintf("%s%S", previous, more), use hackery.
1991 * Each element is separated with a \\1. \\0 cannot be used because
1992 * it is simply ignored. This will conflict with any program putting a
1993 * \\1 in their execve() string but this should be "rare enough" */
1994 this->args = "";
1995 /* Process exec_argv[0] */
1996 this->args = strjoin(
1997 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
1998
1999 /* Process exec_argv[1] */
2000 this->args = strjoin(
2001 this->args, (self->exec_argc > 1) ? "\\1" : "");
2002 this->args = strjoin(
2003 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2004
2005 /* Process exec_argv[2] */
2006 this->args = strjoin(
2007 this->args, (self->exec_argc > 2) ? "\\1" : "");
2008 this->args = strjoin(
2009 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2010
2011 /* Process exec_argv[3] */
2012 this->args = strjoin(
2013 this->args, (self->exec_argc > 3) ? "\\1" : "");
2014 this->args = strjoin(
2015 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2016
2017 /* Prints self->exec_argc to permit verifying the internal
2018 * consistency since this code is quite fishy. */
2019 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2020 logindex, pid, probefunc,
2021 self->exec_arg0,
2022 self->exec_argc,
2023 this->args);
2024 logindex++;
2025 this->args = 0;
2026 }
2027 syscall::exec*:return /trackedpid[pid]/ {
2028 self->exec_arg0 = 0;
2029 self->exec_argc = 0;
2030 self->exec_argv0 = 0;
2031 self->exec_argv1 = 0;
2032 self->exec_argv2 = 0;
2033 self->exec_argv3 = 0;
2034 }
2035 """
2036
2037 # Code currently not used.
2038 D_EXTRANEOUS = """
2039 /* This is a good learning experience, since it traces a lot of things
2040 * related to the process and child processes.
2041 * Warning: it generates a gigantic log. For example, tracing
2042 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2043 * several minutes to execute.
2044 */
2045 /*
2046 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2047 printf("%d %d %s_%s() = %d\\n",
2048 logindex, pid, probeprov, probefunc, errno);
2049 logindex++;
2050 }
2051 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2052 printf("%d %d %s_%s() = %d\\n",
2053 logindex, pid, probeprov, probefunc, errno);
2054 logindex++;
2055 }
2056 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2057 printf("%d %d %s_%s() = %d\\n",
2058 logindex, pid, probeprov, probefunc, errno);
2059 logindex++;
2060 }
2061 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2062 printf("%d %d %s_%s() = %d\\n",
2063 logindex, pid, probeprov, probefunc, errno);
2064 logindex++;
2065 }
2066 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2067 printf("%d %d %s_%s() = %d\\n",
2068 logindex, pid, probeprov, probefunc, errno);
2069 logindex++;
2070 }
2071 */
2072 /* TODO(maruel): *stat* functions and friends
2073 syscall::access:return,
2074 syscall::chdir:return,
2075 syscall::chflags:return,
2076 syscall::chown:return,
2077 syscall::chroot:return,
2078 syscall::getattrlist:return,
2079 syscall::getxattr:return,
2080 syscall::lchown:return,
2081 syscall::lstat64:return,
2082 syscall::lstat:return,
2083 syscall::mkdir:return,
2084 syscall::pathconf:return,
2085 syscall::readlink:return,
2086 syscall::removexattr:return,
2087 syscall::setxattr:return,
2088 syscall::stat64:return,
2089 syscall::stat:return,
2090 syscall::truncate:return,
2091 syscall::unlink:return,
2092 syscall::utimes:return,
2093 */
2094 """
2095
2096 def __init__(self, logname):
2097 """Starts the log collection with dtrace.
2098
2099 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2100 this needs to wait for dtrace to be "warmed up".
2101 """
2102 super(Dtrace.Tracer, self).__init__(logname)
2103 self._script = create_thunk()
2104 # This unique dummy temp file is used to signal the dtrace script that it
2105 # should stop as soon as all the child processes are done. A bit hackish
2106 # but works fine enough.
2107 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2108 prefix='trace_signal_file')
2109
2110 # Note: do not use the -p flag. It's useless if the initial process quits
2111 # too fast, resulting in missing traces from the grand-children. The D
2112 # code manages the dtrace lifetime itself.
2113 trace_cmd = [
2114 'sudo',
2115 'dtrace',
2116 # Use a larger buffer if getting 'out of scratch space' errors.
2117 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2118 '-b', '10m',
2119 '-x', 'dynvarsize=10m',
2120 #'-x', 'dtrace_global_maxsize=1m',
2121 '-x', 'evaltime=exec',
2122 '-o', '/dev/stderr',
2123 '-q',
2124 '-n', self._get_dtrace_code(),
2125 ]
2126 with open(self._logname + '.log', 'wb') as logfile:
2127 self._dtrace = subprocess.Popen(
2128 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2129 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2130
2131 # Reads until one line is printed, which signifies dtrace is up and ready.
2132 with open(self._logname + '.log', 'rb') as logfile:
2133 while 'dtrace_BEGIN' not in logfile.readline():
2134 if self._dtrace.poll() is not None:
2135 # Do a busy wait. :/
2136 break
2137 logging.debug('dtrace started')
2138
2139 def _get_dtrace_code(self):
2140 """Setups the D code to implement child process tracking.
2141
2142 Injects the cookie in the script so it knows when to stop.
2143
2144 The script will detect any instance of the script created with
2145 create_thunk() and will start tracing it.
2146 """
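      # The generated prolog looks roughly like this (values illustrative):
      #   inline int PID = 12345;
      #   inline string SCRIPT = "/tmp/tmpXYZ_thunk.py";
      #   inline int FILE_ID = 4;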
2147 return (
2148 'inline int PID = %d;\n'
2149 'inline string SCRIPT = "%s";\n'
2150 'inline int FILE_ID = %d;\n'
2151 '\n'
2152 '%s') % (
2153 os.getpid(),
2154 self._script,
2155 self._dummy_file_id,
2156 self.D_CODE) + self.D_CODE_EXECVE
2157
2158 def trace(self, cmd, cwd, tracename, output):
2159 """Runs dtrace on an executable.
2160
2161 Since dtruss is broken when it starts the process itself or when tracing
2162 child processes, this code starts a wrapper process generated with
2163 create_thunk(), which in turn starts the executable to trace.
2164 """
2165 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2166 assert os.path.isabs(cmd[0]), cmd[0]
2167 assert os.path.isabs(cwd), cwd
2168 assert os.path.normpath(cwd) == cwd, cwd
2169 with self._lock:
2170 if not self._initialized:
2171 raise TracingFailure(
2172 'Called Tracer.trace() on an uninitialized object',
2173 None, None, None, tracename)
2174 assert tracename not in (i['trace'] for i in self._traces)
2175
2176 # Starts the script wrapper to start the child process. This signals the
2177 # dtrace script that this process is to be traced.
2178 stdout = stderr = None
2179 if output:
2180 stdout = subprocess.PIPE
2181 stderr = subprocess.STDOUT
2182 child_cmd = [
2183 sys.executable,
2184 self._script,
2185 tracename,
2186 ]
2187 # Call a dummy function so that dtrace knows I'm about to launch a process
2188 # that needs to be traced.
2189 # Yummy.
2190 child = subprocess.Popen(
2191 child_cmd + fix_python_path(cmd),
2192 stdin=subprocess.PIPE,
2193 stdout=stdout,
2194 stderr=stderr,
2195 cwd=cwd)
2196 logging.debug('Started child pid: %d' % child.pid)
2197
2198 out = child.communicate()[0]
2199 # This doesn't mean tracing is done; one of the grand-child processes may
2200 # still be alive. It will be tracked by the dtrace script.
2201
2202 with self._lock:
2203 assert tracename not in (i['trace'] for i in self._traces)
2204 self._traces.append(
2205 {
2206 'cmd': cmd,
2207 'cwd': cwd,
2208 # The pid of the wrapper script, not very useful.
2209 'pid': child.pid,
2210 'trace': tracename,
2211 'output': out,
2212 })
2213 return child.returncode, out
2214
2215 def close(self, timeout=None):
2216 """Terminates dtrace."""
2217 logging.debug('close(%s)' % timeout)
2218 try:
2219 try:
2220 super(Dtrace.Tracer, self).close(timeout)
2221 # Signal dtrace that it should stop now.
2222 # ftruncate doesn't exist on Windows.
2223 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
2224 if timeout:
2225 start = time.time()
2226 # Use polling. :/
2227 while (self._dtrace.poll() is None and
2228 (time.time() - start) < timeout):
2229 time.sleep(0.1)
2230 self._dtrace.kill()
2231 self._dtrace.wait()
2232 finally:
2233 # Make sure to kill it in any case.
2234 if self._dtrace.poll() is None:
2235 try:
2236 self._dtrace.kill()
2237 self._dtrace.wait()
2238 except OSError:
2239 pass
2240
2241 if self._dtrace.returncode != 0:
2242 # Warn about any dtrace failure but basically ignore it.
2243 print 'dtrace failure: %s' % self._dtrace.returncode
2244 finally:
2245 os.close(self._dummy_file_id)
2246 os.remove(self._dummy_file_name)
2247 os.remove(self._script)
2248
2249 def post_process_log(self):
2250 """Sorts the log back in order when each call occured.
2251
2252 dtrace doesn't save the buffer in strict order since it keeps one buffer
2253 per CPU.
2254 """
2255 super(Dtrace.Tracer, self).post_process_log()
2256 logname = self._logname + '.log'
2257 with open(logname, 'rb') as logfile:
2258 lines = [l for l in logfile if l.strip()]
2259 errors = [l for l in lines if l.startswith('dtrace:')]
2260 if errors:
2261 raise TracingFailure(
2262 'Found errors in the trace: %s' % '\n'.join(errors),
2263 None, None, None, logname)
2264 try:
2265 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2266 except ValueError:
2267 raise TracingFailure(
2268 'Found errors in the trace: %s' % '\n'.join(
2269 l for l in lines if l.split(' ', 1)[0].isdigit()),
2270 None, None, None, logname)
2271 with open(logname, 'wb') as logfile:
2272 logfile.write(''.join(lines))
2273
2274 @staticmethod
2275 def clean_trace(logname):
2276 for ext in ('', '.log'):
2277 if os.path.isfile(logname + ext):
2278 os.remove(logname + ext)
2279
2280 @classmethod
2281 def parse_log(cls, logname, blacklist):
2282 logging.info('parse_log(%s, ...)' % logname)
2283
2284 def blacklist_more(filepath):
2285 # All the HFS metadata is in the form /.vol/...
2286 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2287
2288 data = read_json(logname)
2289 out = []
2290 for item in data['traces']:
2291 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
2292 for line in open(logname + '.log', 'rb'):
2293 context.on_line(line)
2294 out.append(
2295 {
2296 'results': context.to_results(),
2297 'trace': item['trace'],
2298 'output': item['output'],
2299 })
2300 return out
2301
2302
2303class LogmanTrace(ApiBase):
2304 """Uses the native Windows ETW based tracing functionality to trace a child
2305 process.
2306
2307 Caveat: this implementation doesn't track cwd or initial_cwd. That is because
2308 the Windows Kernel doesn't have a concept of 'current working directory' at
2309 all. A Win32 process has a map of current directories, one per drive letter
2310 and it is managed by the user mode kernel32.dll. In kernel, a file is always
2311 opened relative to another file_object or as an absolute path. All the current
2312 working directory logic is done in user mode.
2313 """
2314 class Context(ApiBase.Context):
2315 """Processes a ETW log line and keeps the list of existent and non
2316 existent files accessed.
2317
2318 Ignores directories.
2319 """
2320 # These indexes are for the stripped version in json.
2321 EVENT_NAME = 0
2322 TYPE = 1
2323 PID = 2
2324 TID = 3
2325 PROCESSOR_ID = 4
2326 TIMESTAMP = 5
2327 USER_DATA = 6
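    # After trimming in Tracer.post_process_log(), a stripped event line
    # therefore looks roughly like (values illustrative):
    #   ['Process', 'Start', 1234, '0x0d68', '2', '129876...', <user data...>]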
2328
2329 class Process(ApiBase.Context.Process):
2330 def __init__(self, *args):
2331 super(LogmanTrace.Context.Process, self).__init__(*args)
2332 # Handle file objects that succeeded.
2333 self.file_objects = {}
2334
2335 def __init__(self, blacklist, tracer_pid):
2336 logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid))
2337 super(LogmanTrace.Context, self).__init__(blacklist)
2338 self._drive_map = DosDriveMap()
2339 # Threads mapping to the corresponding process id.
2340 self._threads_active = {}
2341 # Process ID of the tracer, e.g. tracer_inputs.py
2342 self._tracer_pid = tracer_pid
2343 self._line_number = 0
2344
2345 def on_line(self, line):
2346 """Processes a json Event line."""
2347 self._line_number += 1
2348 try:
2349 # By Opcode
2350 handler = getattr(
2351 self,
2352 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2353 None)
2354 if not handler:
2355 raise TracingFailure(
2356 'Unexpected event %s_%s' % (
2357 line[self.EVENT_NAME], line[self.TYPE]),
2358 None, None, None)
2359 handler(line)
2360 except TracingFailure, e:
2361 # Hack in the values since the handler could be a static function.
2362 e.pid = line[self.PID]
2363 e.line = line
2364 e.line_number = self._line_number
2365 # Re-raise the modified exception.
2366 raise
2367 except (KeyError, NotImplementedError, ValueError), e:
2368 raise TracingFailure(
2369 'Trace generated a %s exception: %s' % (
2370 e.__class__.__name__, str(e)),
2371 line[self.PID],
2372 self._line_number,
2373 line,
2374 e)
2375
2376 def to_results(self):
2377 if not self.root_process:
2378 raise TracingFailure(
2379 'Failed to detect the initial process',
2380 None, None, None)
2381 process = self.root_process.to_results_process()
2382 return Results(process)
2383
2384 def _thread_to_process(self, tid):
2385 """Finds the process from the thread id."""
2386 tid = int(tid, 16)
2387 pid = self._threads_active.get(tid)
2388 if not pid or not self._process_lookup.get(pid):
2389 return
2390 return self._process_lookup[pid]
2391
2392 @classmethod
2393 def handle_EventTrace_Header(cls, line):
2394 """Verifies no event was dropped, e.g. no buffer overrun occured."""
2395 BUFFER_SIZE = cls.USER_DATA
2396 #VERSION = cls.USER_DATA + 1
2397 #PROVIDER_VERSION = cls.USER_DATA + 2
2398 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2399 #END_TIME = cls.USER_DATA + 4
2400 #TIMER_RESOLUTION = cls.USER_DATA + 5
2401 #MAX_FILE_SIZE = cls.USER_DATA + 6
2402 #LOG_FILE_MODE = cls.USER_DATA + 7
2403 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2404 #START_BUFFERS = cls.USER_DATA + 9
2405 #POINTER_SIZE = cls.USER_DATA + 10
2406 EVENTS_LOST = cls.USER_DATA + 11
2407 #CPU_SPEED = cls.USER_DATA + 12
2408 #LOGGER_NAME = cls.USER_DATA + 13
2409 #LOG_FILE_NAME = cls.USER_DATA + 14
2410 #BOOT_TIME = cls.USER_DATA + 15
2411 #PERF_FREQ = cls.USER_DATA + 16
2412 #START_TIME = cls.USER_DATA + 17
2413 #RESERVED_FLAGS = cls.USER_DATA + 18
2414 #BUFFERS_LOST = cls.USER_DATA + 19
2415 #SESSION_NAME_STRING = cls.USER_DATA + 20
2416 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2417 if line[EVENTS_LOST] != '0':
2418 raise TracingFailure(
2419 ( '%s events were lost during trace, please increase the buffer '
2420 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2421 None, None, None)
2422
2423 def handle_FileIo_Cleanup(self, line):
2424 """General wisdom: if a file is closed, it's because it was opened.
2425
2426 Note that FileIo_Close is not used since if a file was opened properly but
2427 not closed before the process exits, only Cleanup will be logged.
2428 """
2429 #IRP = self.USER_DATA
2430 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2431 FILE_OBJECT = self.USER_DATA + 2
2432 #FILE_KEY = self.USER_DATA + 3
2433 proc = self._thread_to_process(line[TTID])
2434 if not proc:
2435 # Not a process we care about.
2436 return
2437 file_object = line[FILE_OBJECT]
2438 if file_object in proc.file_objects:
2439 proc.add_file(proc.file_objects.pop(file_object), False)
2440
2441 def handle_FileIo_Create(self, line):
2442 """Handles a file open.
2443
2444 All FileIo events are described at
2445 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2446 for some value of 'description'.
2447
2448 " (..) process and thread id values of the IO events (..) are not valid "
2449 http://msdn.microsoft.com/magazine/ee358703.aspx
2450
2451 The FileIo.Create event doesn't tell whether the CreateFile() call
2452 succeeded, so keep track of the file_object and check that it is
2453 eventually closed with FileIo_Cleanup.
2454 """
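      # Flow (sketch): Create records {file_object: filepath} in
      # proc.file_objects; the matching FileIo_Cleanup later pops the entry
      # and marks the path as an accessed file.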
2455 #IRP = self.USER_DATA
2456 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2457 FILE_OBJECT = self.USER_DATA + 2
2458 #CREATE_OPTIONS = self.USER_DATA + 3
2459 #FILE_ATTRIBUTES = self.USER_DATA + 4
2460 #SHARE_ACCESS = self.USER_DATA + 5
2461 OPEN_PATH = self.USER_DATA + 6
2462
2463 proc = self._thread_to_process(line[TTID])
2464 if not proc:
2465 # Not a process we care about.
2466 return
2467
2468 match = re.match(r'^\"(.+)\"$', line[OPEN_PATH])
2469 raw_path = match.group(1)
2470 # Ignore directories and bare drive right away.
2471 if raw_path.endswith(os.path.sep):
2472 return
2473 filepath = self._drive_map.to_win32(raw_path)
2474 # Ignore bare drive right away. Some may still fall through with format
2475 # like '\\?\X:'
2476 if len(filepath) == 2:
2477 return
2478 file_object = line[FILE_OBJECT]
2479 if os.path.isdir(filepath):
2480 # There is no O_DIRECTORY equivalent on Windows. The closest is
2481 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2482 # simply discard directories as they are found.
2483 return
2484 # Override any stale file object
2485 proc.file_objects[file_object] = filepath
2486
2487 def handle_FileIo_Rename(self, line):
2488 # TODO(maruel): Handle?
2489 pass
2490
2491 def handle_Process_End(self, line):
2492 pid = line[self.PID]
2493 if self._process_lookup.get(pid):
2494 logging.info('Terminated: %d' % pid)
2495 self._process_lookup[pid] = None
2496 else:
2497 logging.debug('Terminated: %d' % pid)
2498
2499 def handle_Process_Start(self, line):
2500 """Handles a new child process started by PID."""
2501 #UNIQUE_PROCESS_KEY = self.USER_DATA
2502 PROCESS_ID = self.USER_DATA + 1
2503 #PARENT_PID = self.USER_DATA + 2
2504 #SESSION_ID = self.USER_DATA + 3
2505 #EXIT_STATUS = self.USER_DATA + 4
2506 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2507 #USER_SID = self.USER_DATA + 6
2508 IMAGE_FILE_NAME = self.USER_DATA + 7
2509 COMMAND_LINE = self.USER_DATA + 8
2510
2511 ppid = line[self.PID]
2512 pid = int(line[PROCESS_ID], 16)
2513 logging.debug(
2514 'New process %d->%d (%s) %s' %
2515 (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE]))
2516
2517 if ppid == self._tracer_pid:
2518 # Need to ignore processes we don't know about because the log is
2519 # system-wide. self._tracer_pid shall start only one process.
2520 if self.root_process:
2521 raise TracingFailure(
2522 ( 'Parent process is _tracer_pid(%d) but root_process(%d) is '
2523 'already set') % (self._tracer_pid, self.root_process.pid),
2524 None, None, None)
2525 proc = self.Process(self.blacklist, pid, None)
2526 self.root_process = proc
2527 ppid = None
2528 elif self._process_lookup.get(ppid):
2529 proc = self.Process(self.blacklist, pid, None)
2530 self._process_lookup[ppid].children.append(proc)
2531 else:
2532 # Ignore
2533 return
2534 self._process_lookup[pid] = proc
2535
2536 if (not line[IMAGE_FILE_NAME].startswith('"') or
2537 not line[IMAGE_FILE_NAME].endswith('"')):
2538 raise TracingFailure(
2539 'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME],
2540 None, None, None)
2541
2542 # TODO(maruel): Process escapes.
2543 if (not line[COMMAND_LINE].startswith('"') or
2544 not line[COMMAND_LINE].endswith('"')):
2545 raise TracingFailure(
2546 'Command line is not properly quoted: %s' % line[COMMAND_LINE],
2547 None, None, None)
2548 proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1])
2549 proc.executable = line[IMAGE_FILE_NAME][1:-1]
2550 # proc.command[0] may be the absolute path of 'executable' but it may be
2551 # anything else too. If it happens that command[0] ends with executable,
2552 # use it, otherwise default to the base name.
2553 cmd0 = proc.command[0].lower()
2554 if not cmd0.endswith('.exe'):
2555 # TODO(maruel): That's not strictly true either.
2556 cmd0 += '.exe'
2557 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2558 # Fix the path.
2559 cmd0 = cmd0.replace('/', os.path.sep)
2560 cmd0 = os.path.normpath(cmd0)
2561 proc.executable = get_native_path_case(cmd0)
2562 logging.info(
2563 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
2564
2565 def handle_Thread_End(self, line):
2566 """Has the same parameters as Thread_Start."""
2567 tid = int(line[self.TID], 16)
2568 self._threads_active.pop(tid, None)
2569
2570 def handle_Thread_Start(self, line):
2571 """Handles a new thread created.
2572
2573 Do not use self.PID here since a process' initial thread is created by
2574 the parent process.
2575 """
2576 PROCESS_ID = self.USER_DATA
2577 TTHREAD_ID = self.USER_DATA + 1
2578 #STACK_BASE = self.USER_DATA + 2
2579 #STACK_LIMIT = self.USER_DATA + 3
2580 #USER_STACK_BASE = self.USER_DATA + 4
2581 #USER_STACK_LIMIT = self.USER_DATA + 5
2582 #AFFINITY = self.USER_DATA + 6
2583 #WIN32_START_ADDR = self.USER_DATA + 7
2584 #TEB_BASE = self.USER_DATA + 8
2585 #SUB_PROCESS_TAG = self.USER_DATA + 9
2586 #BASE_PRIORITY = self.USER_DATA + 10
2587 #PAGE_PRIORITY = self.USER_DATA + 11
2588 #IO_PRIORITY = self.USER_DATA + 12
2589 #THREAD_FLAGS = self.USER_DATA + 13
2590 # Do not use self.PID here since a process' initial thread is created by
2591 # the parent process.
2592 pid = int(line[PROCESS_ID], 16)
2593 tid = int(line[TTHREAD_ID], 16)
2594 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2595 self._threads_active[tid] = pid
2596
2597 @classmethod
2598 def supported_events(cls):
2599 """Returns all the procesed events."""
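      # e.g. [('EventTrace', 'Header'), ('FileIo', 'Cleanup'),
      #       ('FileIo', 'Create'), ('Process', 'Start'), ...]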
2600 out = []
2601 for member in dir(cls):
2602 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2603 if match:
2604 out.append(match.groups())
2605 return out
2606
2607 class Tracer(ApiBase.Tracer):
2608 # The basic headers.
2609 EXPECTED_HEADER = [
2610 u'Event Name',
2611 u'Type',
2612 u'Event ID',
2613 u'Version',
2614 u'Channel',
2615 u'Level', # 5
2616 u'Opcode',
2617 u'Task',
2618 u'Keyword',
2619 u'PID',
2620 u'TID', # 10
2621 u'Processor Number',
2622 u'Instance ID',
2623 u'Parent Instance ID',
2624 u'Activity ID',
2625 u'Related Activity ID', # 15
2626 u'Clock-Time',
2627 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2628 u'User(ms)', # pretty much useless.
2629 u'User Data', # Extra arguments that are event-specific.
2630 ]
2631 # Only the useful headers common to all entries are listed there. Any column
2632 # at 19 or higher is dependent on the specific event.
2633 EVENT_NAME = 0
2634 TYPE = 1
2635 PID = 9
2636 TID = 10
2637 PROCESSOR_ID = 11
2638 TIMESTAMP = 16
2639 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2640 USER_DATA = 19
2641
2642 def __init__(self, logname):
2643 """Starts the log collection.
2644
2645 Requires administrative access. logman.exe is synchronous so no need for a
2646 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
2647 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
2648
2649 One can get the list of potentially interesting providers with:
2650 "logman query providers | findstr /i file"
2651 """
2652 super(LogmanTrace.Tracer, self).__init__(logname)
2653 self._script = create_thunk()
2654 cmd_start = [
2655 'logman.exe',
2656 'start',
2657 'NT Kernel Logger',
2658 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2659 # splitio,fileiocompletion,syscall,file,cswitch,img
2660 '(process,fileio,thread)',
2661 '-o', self._logname + '.etl',
2662 '-ets', # Send directly to kernel
2663 # Values extracted out of thin air.
2664 # Event Trace Session buffer size in kb.
2665 '-bs', '10240',
2666 # Number of Event Trace Session buffers.
2667 '-nb', '16', '256',
2668 ]
2669 logging.debug('Running: %s' % cmd_start)
2670 try:
2671 subprocess.check_call(
2672 cmd_start,
2673 stdin=subprocess.PIPE,
2674 stdout=subprocess.PIPE,
2675 stderr=subprocess.STDOUT)
2676 except subprocess.CalledProcessError, e:
2677 if e.returncode == -2147024891:
2678 print >> sys.stderr, 'Please restart with an elevated admin prompt'
2679 elif e.returncode == -2144337737:
2680 print >> sys.stderr, (
2681 'A kernel trace was already running, stop it and try again')
2682 raise
2683
2684 def trace(self, cmd, cwd, tracename, output):
2685 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2686 assert os.path.isabs(cmd[0]), cmd[0]
2687 assert os.path.isabs(cwd), cwd
2688 assert os.path.normpath(cwd) == cwd, cwd
2689 with self._lock:
2690 if not self._initialized:
2691 raise TracingFailure(
2692 'Called Tracer.trace() on an uninitialized object',
2693 None, None, None, tracename)
2694 assert tracename not in (i['trace'] for i in self._traces)
2695
2696 # Use "logman -?" for help.
2697
2698 stdout = stderr = None
2699 if output:
2700 stdout = subprocess.PIPE
2701 stderr = subprocess.STDOUT
2702
2703 # Run the child process.
2704 logging.debug('Running: %s' % cmd)
2705 # Use the temporary script generated with create_thunk() so we have a
2706 # clear pid owner. Since trace_inputs.py can be used as a library and
2707 # could trace multiple processes simultaneously, it makes it more complex
2708 # if the executable to be traced is executed directly here. It also solves
2709 # issues related to logman.exe that needs to be executed to control the
2710 # kernel trace.
2711 child_cmd = [
2712 sys.executable,
2713 self._script,
2714 tracename,
2715 ]
2716 child = subprocess.Popen(
2717 child_cmd + fix_python_path(cmd),
2718 cwd=cwd,
2719 stdin=subprocess.PIPE,
2720 stdout=stdout,
2721 stderr=stderr)
2722 logging.debug('Started child pid: %d' % child.pid)
2723 out = child.communicate()[0]
2724 # This doesn't mean all the grand-children are done. Sadly, we don't have
2725 # a good way to determine that.
2726
2727 with self._lock:
2728 assert tracename not in (i['trace'] for i in self._traces)
2729 self._traces.append({
2730 'cmd': cmd,
2731 'cwd': cwd,
2732 'pid': child.pid,
2733 'trace': tracename,
2734 'output': out,
2735 })
2736
2737 return child.returncode, out
2738
2739 def close(self, _timeout=None):
2740 """Stops the kernel log collection and converts the traces to text
2741 representation.
2742 """
2743 with self._lock:
2744 if not self._initialized:
2745 raise TracingFailure(
2746 'Called Tracer.close() on an uninitialized object',
2747 None, None, None)
2748 os.remove(self._script)
2749 # Save metadata and add the 'format' key.
2750 data = {
2751 'format': 'csv',
2752 'traces': self._traces,
2753 }
2754 write_json(self._logname, data, False)
2755
2756 cmd_stop = [
2757 'logman.exe',
2758 'stop',
2759 'NT Kernel Logger',
2760 '-ets', # Sends the command directly to the kernel.
2761 ]
2762 logging.debug('Running: %s' % cmd_stop)
2763 subprocess.check_call(
2764 cmd_stop,
2765 stdin=subprocess.PIPE,
2766 stdout=subprocess.PIPE,
2767 stderr=subprocess.STDOUT)
2768 self._initialized = False
2769
2770 def post_process_log(self):
2771 """Converts the .etl file into .csv then into .json."""
2772 super(LogmanTrace.Tracer, self).post_process_log()
2773 logformat = 'csv'
2774 self._convert_log(logformat)
2775
2776 if logformat == 'csv_utf16':
2777 def load_file():
2778 def utf_8_encoder(unicode_csv_data):
2779 """Encodes the unicode object as utf-8 encoded str instance"""
2780 for line in unicode_csv_data:
2781 yield line.encode('utf-8')
2782
2783 def unicode_csv_reader(unicode_csv_data, **kwargs):
2784 """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
2785 """
2786 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
2787 for row in csv_reader:
2788 # Decode str utf-8 instances back to unicode instances, cell by
2789 # cell:
2790 yield [cell.decode('utf-8') for cell in row]
2791
2792 # The CSV file is UTF-16 so use codecs.open() to load the file into
2793 # the python internal unicode format (utf-8). Then explicitly
2794 # re-encode as utf8 as str instances so csv can parse it fine. Then
2795 # decode the utf-8 str back into python unicode instances. This
2796 # sounds about right.
2797 for line in unicode_csv_reader(
2798 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
2799 # line is a list of unicode objects
2800 # So much white space!
2801 yield [i.strip() for i in line]
2802
2803 elif logformat == 'csv':
2804 def load_file():
2805 def ansi_csv_reader(ansi_csv_data, **kwargs):
2806 """Loads an 'ANSI' code page and returns unicode() objects."""
2807 assert sys.getfilesystemencoding() == 'mbcs'
2808 encoding = get_current_encoding()
2809 for row in csv.reader(ansi_csv_data, **kwargs):
2810 # Decode str 'ansi' instances to unicode instances, cell by cell:
2811 yield [cell.decode(encoding) for cell in row]
2812
2813 # The fastest and smallest format but only supports 'ANSI' file paths.
2814 # I.e. the filenames are encoded in the 'current' encoding.
2815 for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
2816 # line is a list of unicode objects.
2817 yield [i.strip() for i in line]
2818
2819 supported_events = LogmanTrace.Context.supported_events()
2820
2821 def trim(generator):
2822 for index, line in enumerate(generator):
2823 if not index:
2824 if line != self.EXPECTED_HEADER:
2825 raise TracingFailure(
2826 'Found malformed header: %s' % ' '.join(line),
2827 None, None, None)
2828 continue
2829 # As you can see, the CSV is full of useful non-redundant information:
2830 if (line[2] != '0' or # Event ID
2831 line[3] not in ('2', '3') or # Version
2832 line[4] != '0' or # Channel
2833 line[5] != '0' or # Level
2834 line[7] != '0' or # Task
2835 line[8] != '0x0000000000000000' or # Keyword
2836 line[12] != '' or # Instance ID
2837 line[13] != '' or # Parent Instance ID
2838 line[14] != self.NULL_GUID or # Activity ID
2839 line[15] != ''): # Related Activity ID
2840 raise TracingFailure(
2841 'Found unexpected values in line: %s' % ' '.join(line),
2842 None, None, None)
2843
2844 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
2845 continue
2846
2847 # Convert the PID in-place from hex.
2848 line[self.PID] = int(line[self.PID], 16)
2849
2850 yield [
2851 line[self.EVENT_NAME],
2852 line[self.TYPE],
2853 line[self.PID],
2854 line[self.TID],
2855 line[self.PROCESSOR_ID],
2856 line[self.TIMESTAMP],
2857 ] + line[self.USER_DATA:]
2858
2859 write_json('%s.json' % self._logname, list(trim(load_file())), True)
2860
2861 def _convert_log(self, logformat):
2862 """Converts the ETL trace to text representation.
2863
2864 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
2865 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
2866
2867 Arguments:
2868 - logformat: Text format to be generated, csv, csv_utf16 or xml.
2869
2870 Use "tracerpt -?" for help.
2871 """
2872 LOCALE_INVARIANT = 0x7F
2873 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
2874 cmd_convert = [
2875 'tracerpt.exe',
2876 '-l', self._logname + '.etl',
2877 '-o', self._logname + '.' + logformat,
2878 '-gmt', # Use UTC
2879 '-y', # No prompt
2880 # Use -of XML to get the header of each items after column 19, e.g. all
2881 # the actual headers of 'User Data'.
2882 ]
2883
2884 if logformat == 'csv':
2885 # tracerpt localizes the 'Type' column, for major brainfuck
2886 # entertainment. I can't imagine any sane reason to do that.
2887 cmd_convert.extend(['-of', 'CSV'])
2888 elif logformat == 'csv_utf16':
2889 # This causes it to use UTF-16, which doubles the log size but ensures
2890 # the log is readable for non-ASCII characters.
2891 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
2892 elif logformat == 'xml':
2893 cmd_convert.extend(['-of', 'XML'])
2894 else:
2895 raise ValueError('Unexpected log format \'%s\'' % logformat)
2896 logging.debug('Running: %s' % cmd_convert)
2897 # This can take tens of minutes for large logs.
2898 # Redirects all output to stderr.
2899 subprocess.check_call(
2900 cmd_convert,
2901 stdin=subprocess.PIPE,
2902 stdout=sys.stderr,
2903 stderr=sys.stderr)
2904
2905 @staticmethod
2906 def clean_trace(logname):
2907 for ext in ('', '.csv', '.etl', '.json', '.xml'):
2908 if os.path.isfile(logname + ext):
2909 os.remove(logname + ext)
2910
2911 @classmethod
2912 def parse_log(cls, logname, blacklist):
2913 logging.info('parse_log(%s, %s)' % (logname, blacklist))
2914
2915 def blacklist_more(filepath):
2916 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
2917 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
2918
2919 data = read_json(logname)
2920 lines = read_json(logname + '.json')
2921 out = []
2922 for item in data['traces']:
2923 context = cls.Context(blacklist_more, item['pid'])
2924 for line in lines:
2925 context.on_line(line)
2926 out.append(
2927 {
2928 'results': context.to_results(),
2929 'trace': item['trace'],
2930 'output': item['output'],
2931 })
2932 return out
2933
2934
2935def get_api():
2936 """Returns the correct implementation for the current OS."""
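  # e.g. this returns a LogmanTrace instance on Windows, a Dtrace instance on
  # OSX and a Strace instance on Linux (the default).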
2937 if sys.platform == 'cygwin':
2938 raise NotImplementedError(
2939 'Not implemented for cygwin, start the script from Win32 python')
2940 flavors = {
2941 'win32': LogmanTrace,
2942 'darwin': Dtrace,
2943 'sunos5': Dtrace,
2944 'freebsd7': Dtrace,
2945 'freebsd8': Dtrace,
2946 }
2947 # Defaults to strace.
2948 return flavors.get(sys.platform, Strace)()
2949
2950
2951def extract_directories(root_dir, files, blacklist):
2952 """Detects if all the files in a directory are in |files| and if so, replace
2953 the individual files by a Results.Directory instance.
2954
2955 Takes a list of Results.File instances and returns a shorter list of
2956 Results.File and Results.Directory instances.
2957
2958 Arguments:
2959 - root_dir: Optional base directory that shouldn't be searched further.
2960 - files: list of Results.File instances.
2961 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
2962 """
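  # Worked example (illustrative): if files contains a/b/1.txt and a/b/2.txt
  # and the directory a/b holds nothing else that isn't blacklisted, both
  # entries are replaced by a single Results.Directory for 'a/b'; the
  # collapsing is then retried one level up.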
2963 logging.info(
2964 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
2965 assert not (root_dir or '').endswith(os.path.sep), root_dir
2966 assert not root_dir or (get_native_path_case(root_dir) == root_dir)
2967 assert not any(isinstance(f, Results.Directory) for f in files)
2968 # Remove non existent files.
2969 files = [f for f in files if f.existent]
2970 if not files:
2971 return files
2972 # All files must share the same root, which can be None.
2973 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
2974
2975 # Creates a {directory: {filename: File}} mapping, up to root.
2976 buckets = {}
2977 if root_dir:
2978 buckets[root_dir] = {}
2979 for fileobj in files:
2980 path = fileobj.full_path
2981 directory = os.path.dirname(path)
2982 assert directory
2983 # Do not use os.path.basename() so trailing os.path.sep is kept.
2984 basename = path[len(directory)+1:]
2985 files_in_directory = buckets.setdefault(directory, {})
2986 files_in_directory[basename] = fileobj
2987 # Add all the directories recursively up to root.
2988 while True:
2989 old_d = directory
2990 directory = os.path.dirname(directory)
2991 if directory + os.path.sep == root_dir or directory == old_d:
2992 break
2993 buckets.setdefault(directory, {})
2994
2995 root_prefix = len(root_dir) + 1 if root_dir else 0
2996 for directory in sorted(buckets, reverse=True):
2997 actual = set(f for f in os.listdir(directory) if not blacklist(f))
2998 expected = set(buckets[directory])
2999 if not (actual - expected):
3000 parent = os.path.dirname(directory)
3001 buckets[parent][os.path.basename(directory)] = Results.Directory(
3002 root_dir,
3003 directory[root_prefix:],
3004 False,
3005 sum(f.size for f in buckets[directory].itervalues()),
3006 sum(f.nb_files for f in buckets[directory].itervalues()))
3007 # Remove the whole bucket.
3008 del buckets[directory]
3009
3010 # Reverse the mapping with what remains. The original instances are returned,
3011 # so the cached meta data is kept.
3012 files = sum((x.values() for x in buckets.itervalues()), [])
3013 return sorted(files, key=lambda x: x.path)
3014
3015
3016def trace(logfile, cmd, cwd, api, output):
3017 """Traces an executable. Returns (returncode, output) from api.
3018
3019 Arguments:
3020 - logfile: file to write to.
3021 - cmd: command to run.
3022 - cwd: current directory to start the process in.
3023 - api: a tracing api instance.
3024 - output: if True, returns output, otherwise prints it at the console.
3025 """
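  # Typical library usage, a sketch with an illustrative log path:
  #   api = get_api()
  #   returncode, out = trace('/tmp/app.log', ['/usr/bin/python', 'foo.py'],
  #                           os.getcwd(), api, True)
  #   results = load_trace('/tmp/app.log', None, api, None)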
3026 cmd = fix_python_path(cmd)
3027 api.clean_trace(logfile)
3028 with api.get_tracer(logfile) as tracer:
3029 return tracer.trace(cmd, cwd, 'default', output)
3030
3031
3032def load_trace(logfile, root_dir, api, blacklist):
3033 """Loads a trace file and returns the Results instance.
3034
3035 Arguments:
3036 - logfile: File to load.
3037 - root_dir: Root directory to use to determine if a file is relevant to the
3038 trace or not.
3039 - api: A tracing api instance.
3040 - blacklist: Optional blacklist function to filter out unimportant files.
3041 """
3042 data = api.parse_log(logfile, (blacklist or (lambda _: False)))
3043 assert len(data) == 1, 'More than one trace was detected!'
3044 if 'exception' in data[0]:
3045 # It got an exception, raise it.
3046 raise \
3047 data[0]['exception'][0], \
3048 data[0]['exception'][1], \
3049 data[0]['exception'][2]
3050 results = data[0]['results']
3051 if root_dir:
3052 results = results.strip_root(root_dir)
3053 return results
3054
3055
3056def CMDclean(args):
3057 """Cleans up traces."""
3058 parser = OptionParserTraceInputs(command='clean')
3059 options, args = parser.parse_args(args)
3060 api = get_api()
3061 api.clean_trace(options.log)
3062 return 0
3063
3064
3065def CMDtrace(args):
3066 """Traces an executable."""
3067 parser = OptionParserTraceInputs(command='trace')
3068 parser.allow_interspersed_args = False
3069 parser.add_option(
3070 '-q', '--quiet', action='store_true',
3071 help='Redirects traced executable output to /dev/null')
3072 options, args = parser.parse_args(args)
3073
3074 if not args:
3075 parser.error('Please provide a command to run')
3076
3077 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3078 args[0] = os.path.abspath(args[0])
3079
3080 api = get_api()
3081 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3082
3083
3084def CMDread(args):
3085 """Reads the logs and prints the result."""
3086 parser = OptionParserTraceInputs(command='read')
3087 parser.add_option(
3088 '-V', '--variable',
3089 nargs=2,
3090 action='append',
3091 dest='variables',
3092 metavar='VAR_NAME directory',
3093 default=[],
3094 help=('Variables to replace relative directories against. Example: '
3095 '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
3096 'home dir with $HOME') % getpass.getuser())
3097 parser.add_option(
3098 '--root-dir',
3099 help='Root directory to base everything off of. Anything outside of '
3100 'this directory will not be reported')
3101 parser.add_option(
3102 '-j', '--json', action='store_true',
3103 help='Outputs raw result data as json')
3104 parser.add_option(
3105 '-b', '--blacklist', action='append', default=[],
3106 help='List of regexp to use as blacklist filter')
3107 options, args = parser.parse_args(args)
3108
3109 if options.root_dir:
3110 options.root_dir = os.path.abspath(options.root_dir)
3111
3112 variables = dict(options.variables)
3113 api = get_api()
3114 def blacklist(f):
3115 return any(re.match(b, f) for b in options.blacklist)
3116 data = api.parse_log(options.log, blacklist)
3117 # Process each trace.
3118 output_as_json = []
3119 for item in data:
3120 if 'exception' in item:
3121 # Do not abort the other traces.
3122 print >> sys.stderr, (
3123 'Trace %s: Got an exception: %s' % (
3124 item['trace'], item['exception'][1]))
3125 continue
3126 results = item['results']
3127 if options.root_dir:
3128 results = results.strip_root(options.root_dir)
3129
3130 if options.json:
3131 output_as_json.append(results.flatten())
3132 else:
3133 simplified = extract_directories(
3134 options.root_dir, results.files, blacklist)
3135 simplified = [f.replace_variables(variables) for f in simplified]
3136 if len(data) > 1:
3137 print('Trace: %s' % item['trace'])
3138 print('Total: %d' % len(results.files))
3139 print('Non existent: %d' % len(results.non_existent))
3140 for f in results.non_existent:
3141 print(' %s' % f.path)
3142 print(
3143 'Interesting: %d reduced to %d' % (
3144 len(results.existent), len(simplified)))
3145 for f in simplified:
3146 print(' %s' % f.path)
3147
3148 if options.json:
3149 write_json(sys.stdout, output_as_json, False)
3150 return 0
3151
3152
3153class OptionParserWithLogging(optparse.OptionParser):
3154 """Adds --verbose option."""
3155 def __init__(self, verbose=0, **kwargs):
3156 optparse.OptionParser.__init__(self, **kwargs)
3157 self.add_option(
3158 '-v', '--verbose',
3159 action='count',
3160 default=verbose,
3161 help='Use multiple times to increase verbosity')
3162
3163 def parse_args(self, *args, **kwargs):
3164 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3165 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3166 logging.basicConfig(
3167 level=levels[min(len(levels)-1, options.verbose)],
3168 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3169 return options, args
3170
3171
3172class OptionParserWithNiceDescription(OptionParserWithLogging):
3173 """Generates the description with the command's docstring."""
3174 def __init__(self, **kwargs):
3175 """Sets 'description' and 'usage' if not already specified."""
3176 command = kwargs.pop('command', 'help')
3177 kwargs.setdefault(
3178 'description',
3179 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3180 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3181 OptionParserWithLogging.__init__(self, **kwargs)
3182
3183
3184class OptionParserTraceInputs(OptionParserWithNiceDescription):
3185 """Adds automatic --log handling."""
3186 def __init__(self, **kwargs):
3187 OptionParserWithNiceDescription.__init__(self, **kwargs)
3188 self.add_option(
3189 '-l', '--log', help='Log file to generate or read, required')
3190
3191 def parse_args(self, *args, **kwargs):
3192 """Makes sure the paths make sense.
3193
3194 On Windows, / and \ are often mixed together in a path.
3195 """
3196 options, args = OptionParserWithNiceDescription.parse_args(
3197 self, *args, **kwargs)
3198 if not options.log:
3199 self.error('Must supply a log file with -l')
3200 options.log = os.path.abspath(options.log)
3201 return options, args
3202
3203
3204def extract_documentation():
3205 """Returns a dict {command: description} for each of documented command."""
3206 commands = (
3207 fn[3:]
3208 for fn in dir(sys.modules['__main__'])
3209 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3210 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3211
3212
3213def CMDhelp(args):
3214 """Prints list of commands or help for a specific command."""
3215 doc = extract_documentation()
3216 # Calculates the optimal offset.
3217 offset = max(len(cmd) for cmd in doc)
3218 format_str = ' %-' + str(offset + 2) + 's %s'
3219 # Generate a one-line documentation string for each command.
3220 commands_description = '\n'.join(
3221 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3222
3223 parser = OptionParserWithNiceDescription(
3224 usage='%prog <command> [options]',
3225 description='Commands are:\n%s\n' % commands_description)
3226 parser.format_description = lambda _: parser.description
3227
3228 # Strip out any -h or --help argument.
3229 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3230 if len(args) == 1:
3231 if not get_command_handler(args[0]):
3232 parser.error('Unknown command %s' % args[0])
3233 # The command was "%prog help command"; replace ourselves with
3234 # "%prog command --help" so help is correctly printed out.
3235 return main(args + ['--help'])
3236 elif args:
3237 parser.error('Unknown argument "%s"' % ' '.join(args))
3238 parser.print_help()
3239 return 0
3240
3241
3242def get_command_handler(name):
3243 """Returns the command handler or CMDhelp if it doesn't exist."""
3244 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3245
3246
3247def main_impl(argv):
3248 command = get_command_handler(argv[0] if argv else 'help')
3249 if not command:
3250 return CMDhelp(argv)
3251 return command(argv[1:])
3252
3253def main(argv):
3254 try:
3255 return main_impl(argv)
3256 except TracingFailure, e:
3257 sys.stderr.write('\nError: ')
3258 sys.stderr.write(str(e))
3259 sys.stderr.write('\n')
3260 return 1
3261
3262
3263if __name__ == '__main__':
3264 sys.exit(main(sys.argv[1:]))