#!/usr/bin/env python
# coding=utf-8
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Traces an executable and its child processes and extracts the files
accessed by them.

The implementation uses OS-specific APIs. The native kernel logger and the
ETL interface are used on Windows. Dtrace is used on OSX. Strace is used
otherwise. The OS-specific implementation is hidden behind an 'API' interface.

The results are embedded in a Results instance. The tracing is done in two
phases: the first runs the actual trace and generates an
implementation-specific log file. The log file is then parsed to extract the
information, including the individual child processes and the files accessed,
from the log.
"""
20
21import codecs
22import csv
23import getpass
24import glob
25import json
26import logging
27import optparse
28import os
29import re
30import subprocess
31import sys
32import tempfile
33import threading
34import time
35import weakref
36
37## OS-specific imports
38
39if sys.platform == 'win32':
40 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
41 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
42 from ctypes.wintypes import GetLastError # pylint: disable=E0611
43elif sys.platform == 'darwin':
44 import Carbon.File # pylint: disable=F0401
45 import MacOS # pylint: disable=F0401
46
47
48BASE_DIR = os.path.dirname(os.path.abspath(__file__))
49ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
50
51
52class TracingFailure(Exception):
53 """An exception occured during tracing."""
54 def __init__(self, description, pid, line_number, line, *args):
55 super(TracingFailure, self).__init__(
56 description, pid, line_number, line, *args)
57 self.description = description
58 self.pid = pid
59 self.line_number = line_number
60 self.line = line
61 self.extra = args
62
63 def __str__(self):
64 out = self.description
65 if self.pid:
66 out += '\npid: %d' % self.pid
67 if self.line_number:
68 out += '\nline: %d' % self.line_number
69 if self.line:
70 out += '\n%s' % self.line
71 if self.extra:
72 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
73 return out
74
75
76## OS-specific functions
77
78if sys.platform == 'win32':
79 def QueryDosDevice(drive_letter):
80 """Returns the Windows 'native' path for a DOS drive letter."""
81 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
82 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
83 chars = 1024
84 drive_letter = unicode(drive_letter)
85 p = create_unicode_buffer(chars)
86 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
87 err = GetLastError()
88 if err:
89 # pylint: disable=E0602
90 raise WindowsError(
91 err,
92 'QueryDosDevice(%s): %s (%d)' % (
93 str(drive_letter), FormatError(err), err))
94 return p.value
95
96
97 def GetShortPathName(long_path):
98 """Returns the Windows short path equivalent for a 'long' path."""
99 long_path = unicode(long_path)
100 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
101 # not enforced.
102 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
103 long_path = '\\\\?\\' + long_path
104 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
105 if chars:
106 p = create_unicode_buffer(chars)
107 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
108 return p.value
109
110 err = GetLastError()
111 if err:
112 # pylint: disable=E0602
113 raise WindowsError(
114 err,
115 'GetShortPathName(%s): %s (%d)' % (
116 str(long_path), FormatError(err), err))
117
118
119 def GetLongPathName(short_path):
120 """Returns the Windows long path equivalent for a 'short' path."""
121 short_path = unicode(short_path)
122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
123 # not enforced.
124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
125 short_path = '\\\\?\\' + short_path
126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
127 if chars:
128 p = create_unicode_buffer(chars)
129 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
130 return p.value
131
132 err = GetLastError()
133 if err:
134 # pylint: disable=E0602
135 raise WindowsError(
136 err,
137 'GetLongPathName(%s): %s (%d)' % (
138 str(short_path), FormatError(err), err))
139
140
141 def get_current_encoding():
142 """Returns the 'ANSI' code page associated to the process."""
143 return 'cp%d' % int(windll.kernel32.GetACP())
144
145
146 class DosDriveMap(object):
147 """Maps \Device\HarddiskVolumeN to N: on Windows."""
148 # Keep one global cache.
149 _MAPPING = {}
150
151 def __init__(self):
152 """Lazy loads the cache."""
153 if not self._MAPPING:
154 # This is related to UNC resolver on windows. Ignore that.
155 self._MAPPING['\\Device\\Mup'] = None
156 self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot']
157
158 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
159 try:
160 letter = '%s:' % letter
161 mapped = QueryDosDevice(letter)
162 if mapped in self._MAPPING:
              logging.warn(
                  ('Two drives, \'%s\' and \'%s\', are mapped to the same '
                   'disk. Drive letters are a user-mode concept and the '
                   'kernel traces only have NT paths, so all accesses will '
                   'be associated with the first drive letter, independent '
                   'of the actual letter used by the code') % (
                      self._MAPPING[mapped], letter))
170 else:
171 self._MAPPING[mapped] = letter
172 except WindowsError: # pylint: disable=E0602
173 pass
174
175 def to_win32(self, path):
176 """Converts a native NT path to Win32/DOS compatible path."""
177 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
178 if not match:
179 raise ValueError(
180 'Can\'t convert %s into a Win32 compatible path' % path,
181 path)
182 if not match.group(1) in self._MAPPING:
183 # Unmapped partitions may be accessed by windows for the
184 # fun of it while the test is running. Discard these.
185 return None
186 drive = self._MAPPING[match.group(1)]
187 if not drive or not match.group(2):
188 return drive
189 return drive + match.group(2)
190
191
192 def isabs(path):
193 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
194 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
195
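  # Illustrative sketch, not part of the original module: isabs() above also
  # accepts a bare drive letter such as 'X:', which os.path.isabs() rejects.
  # The helper name is hypothetical; it is never called by this script.
  def _example_isabs():
    """Hypothetical helper; safe to delete."""
    assert isabs('X:')
    assert isabs('X:\\foo')
    assert not isabs('foo')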
196
197 def get_native_path_case(p):
198 """Returns the native path case for an existing file.
199
200 On Windows, removes any leading '\\?\'.
201 """
202 if not isabs(p):
203 raise ValueError(
204 'Can\'t get native path case for a non-absolute path: %s' % p,
205 p)
206 # Windows used to have an option to turn on case sensitivity on non Win32
207 # subsystem but that's out of scope here and isn't supported anymore.
208 # Go figure why GetShortPathName() is needed.
209 try:
210 out = GetLongPathName(GetShortPathName(p))
211 except OSError, e:
212 if e.args[0] in (2, 3, 5):
213 # The path does not exist. Try to recurse and reconstruct the path.
214 base = os.path.dirname(p)
215 rest = os.path.basename(p)
216 return os.path.join(get_native_path_case(base), rest)
217 raise
218 if out.startswith('\\\\?\\'):
219 out = out[4:]
220 # Always upper case the first letter since GetLongPathName() will return the
221 # drive letter in the case it was given.
222 return out[0].upper() + out[1:]
223
224
225 def CommandLineToArgvW(command_line):
226 """Splits a commandline into argv using CommandLineToArgvW()."""
227 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
228 size = c_int()
229 ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size))
230 try:
231 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
232 finally:
233 windll.kernel32.LocalFree(ptr)
234
235
236elif sys.platform == 'darwin':
237
238
239 # On non-windows, keep the stdlib behavior.
240 isabs = os.path.isabs
241
242
243 def _find_item_native_case(root_path, item):
244 """Gets the native path case of a single item based at root_path.
245
246 There is no API to get the native path case of symlinks on OSX. So it
247 needs to be done the slow way.
248 """
249 item = item.lower()
250 for element in os.listdir(root_path):
251 if element.lower() == item:
252 return element
253
254
255 def _native_case(p):
256 """Gets the native path case. Warning: this function resolves symlinks."""
257 logging.debug('native_case(%s)' % p)
258 try:
259 rel_ref, _ = Carbon.File.FSPathMakeRef(p)
260 out = rel_ref.FSRefMakePath()
261 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
262 return out + os.path.sep
263 return out
264 except MacOS.Error, e:
265 if e.args[0] in (-43, -120):
266 # The path does not exist. Try to recurse and reconstruct the path.
267 # -43 means file not found.
268 # -120 means directory not found.
269 base = os.path.dirname(p)
270 rest = os.path.basename(p)
271 return os.path.join(_native_case(base), rest)
272 raise OSError(
273 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
274
275
276 def _split_at_symlink_native(base_path, rest):
277 """Returns the native path for a symlink."""
278 base, symlink, rest = split_at_symlink(base_path, rest)
279 if symlink:
280 if not base_path:
281 base_path = base
282 else:
283 base_path = safe_join(base_path, base)
284 symlink = _find_item_native_case(base_path, symlink)
285 return base, symlink, rest
286
287
288 def get_native_path_case(path):
289 """Returns the native path case for an existing file.
290
291 Technically, it's only HFS+ on OSX that is case preserving and
292 insensitive. It's the default setting on HFS+ but can be changed.
293 """
294 if not isabs(path):
295 raise ValueError(
296 'Can\'t get native path case for a non-absolute path: %s' % path,
297 path)
298 if path.startswith('/dev'):
299 # /dev is not visible from Carbon, causing an exception.
300 return path
301
302 # Starts assuming there is no symlink along the path.
303 resolved = _native_case(path)
304 if resolved.lower() == path.lower():
      # This code path is much faster.
306 return resolved
307
308 # There was a symlink, process it.
309 base, symlink, rest = _split_at_symlink_native(None, path)
310 assert symlink, (path, base, symlink, rest, resolved)
311 prev = base
312 base = safe_join(_native_case(base), symlink)
313 assert len(base) > len(prev)
314 while rest:
315 prev = base
316 relbase, symlink, rest = _split_at_symlink_native(base, rest)
317 base = safe_join(base, relbase)
318 assert len(base) > len(prev), (prev, base, symlink)
319 if symlink:
320 base = safe_join(base, symlink)
321 assert len(base) > len(prev), (prev, base, symlink)
322 # Make sure no symlink was resolved.
323 assert base.lower() == path.lower(), (base, path)
324 return base
325
326
327else: # OSes other than Windows and OSX.
328
329
330 # On non-windows, keep the stdlib behavior.
331 isabs = os.path.isabs
332
333
334 def get_native_path_case(path):
335 """Returns the native path case for an existing file.
336
337 On OSes other than OSX and Windows, assume the file system is
338 case-sensitive.
339
340 TODO(maruel): This is not strictly true. Implement if necessary.
341 """
342 if not isabs(path):
343 raise ValueError(
344 'Can\'t get native path case for a non-absolute path: %s' % path,
345 path)
346 # Give up on cygwin, as GetLongPathName() can't be called.
    # Linux traces tend not to be normalized, so take this occasion to
    # normalize them. This function already normalizes the path on the other
    # OSes, so it needs to be done here as well to be coherent between OSes.
350 out = os.path.normpath(path)
351 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
352 return out + os.path.sep
353 return out
354
355
356if sys.platform != 'win32': # All non-Windows OSes.
357
358
359 def safe_join(*args):
360 """Joins path elements like os.path.join() but doesn't abort on absolute
361 path.
362
363 os.path.join('foo', '/bar') == '/bar'
364 but safe_join('foo', '/bar') == 'foo/bar'.
365 """
366 out = ''
367 for element in args:
368 if element.startswith(os.path.sep):
369 if out.endswith(os.path.sep):
370 out += element[1:]
371 else:
372 out += element
373 else:
374 if out.endswith(os.path.sep):
375 out += element
376 else:
377 out += os.path.sep + element
378 return out
379
380
381 def split_at_symlink(base_dir, relfile):
382 """Scans each component of relfile and cut the string at the symlink if
383 there is any.
384
385 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
386 not symlink was found.
387 """
388 if base_dir:
389 assert relfile
390 assert os.path.isabs(base_dir)
391 index = 0
392 else:
393 assert os.path.isabs(relfile)
394 index = 1
395
396 def at_root(rest):
397 if base_dir:
398 return safe_join(base_dir, rest)
399 return rest
400
401 while True:
402 try:
403 index = relfile.index(os.path.sep, index)
404 except ValueError:
405 index = len(relfile)
406 full = at_root(relfile[:index])
407 if os.path.islink(full):
408 # A symlink!
409 base = os.path.dirname(relfile[:index])
410 symlink = os.path.basename(relfile[:index])
411 rest = relfile[index:]
412 logging.debug(
413 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
414 (base_dir, relfile, base, symlink, rest))
415 return base, symlink, rest
416 if index == len(relfile):
417 break
418 index += 1
419 return relfile, None, None
420
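  # Illustrative sketch, not part of the original module: exercises
  # safe_join() and split_at_symlink() against a throw-away directory
  # containing a single symlink. The helper name is hypothetical; it is never
  # called by this script.
  def _example_split_at_symlink():
    """Hypothetical helper; safe to delete."""
    assert safe_join('foo', '/bar') == 'foo/bar'
    tmp = tempfile.mkdtemp(prefix='trace_inputs_example')
    try:
      os.mkdir(os.path.join(tmp, 'real'))
      os.symlink('real', os.path.join(tmp, 'link'))
      # The scan stops at the first symlink component and returns the rest
      # untouched.
      assert split_at_symlink(tmp, 'link/child') == ('', 'link', '/child')
    finally:
      os.remove(os.path.join(tmp, 'link'))
      os.rmdir(os.path.join(tmp, 'real'))
      os.rmdir(tmp)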
421
422def fix_python_path(cmd):
423 """Returns the fixed command line to call the right python executable."""
424 out = cmd[:]
425 if out[0] == 'python':
426 out[0] = sys.executable
427 elif out[0].endswith('.py'):
428 out.insert(0, sys.executable)
429 return out
430
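# Illustrative sketch, not part of the original module: fix_python_path()
# rewrites a leading 'python' to sys.executable and prefixes bare .py scripts
# with it. The helper name is hypothetical; it is never called by this script.
def _example_fix_python_path():
  """Hypothetical helper; safe to delete."""
  assert fix_python_path(['python', '-c', 'pass'])[0] == sys.executable
  assert fix_python_path(['foo.py'])[0] == sys.executable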
431
432def create_thunk():
433 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
434 os.write(
435 handle,
436 (
437 'import subprocess\n'
438 'import sys\n'
439 'sys.exit(subprocess.call(sys.argv[2:]))\n'
440 ))
441 os.close(handle)
442 return name
443
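# Illustrative sketch, not part of the original module: the thunk written by
# create_thunk() re-executes sys.argv[2:], so argument 1 is left free for the
# caller (presumably to carry a marker visible in the trace). The helper name
# is hypothetical; it is never called by this script.
def _example_create_thunk():
  """Hypothetical helper; safe to delete."""
  thunk = create_thunk()
  try:
    returncode = subprocess.call(
        [sys.executable, thunk, 'marker', sys.executable, '-c', 'pass'])
    assert returncode == 0
  finally:
    os.remove(thunk)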
444
445def strace_process_quoted_arguments(text):
446 """Extracts quoted arguments on a string and return the arguments as a list.
447
448 Implemented as an automaton. Supports incomplete strings in the form
449 '"foo"...'.
450
451 Example:
452 With text = '"foo", "bar"', the function will return ['foo', 'bar']
453
454 TODO(maruel): Implement escaping.
455 """
456 # All the possible states of the DFA.
  ( NEED_QUOTE, # Beginning of a new argument.
    INSIDE_STRING, # Inside an argument.
    ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
    NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
                       # a series of 3 dots or a comma.
    NEED_SPACE, # Right after a comma.
463 NEED_DOT_2, # Found a dot, need a second one.
464 NEED_DOT_3, # Found second dot, need a third one.
465 NEED_COMMA, # Found third dot, need a comma.
466 ) = range(8)
467
468 state = NEED_QUOTE
469 out = []
470 for index, char in enumerate(text):
471 if char == '"':
472 if state == NEED_QUOTE:
473 state = INSIDE_STRING
474 # A new argument was found.
475 out.append('')
476 elif state == INSIDE_STRING:
477 # The argument is now closed.
478 state = NEED_COMMA_OR_DOT
479 elif state == ESCAPED:
480 out[-1] += char
481 state = INSIDE_STRING
482 else:
483 raise ValueError(
484 'Can\'t process char at column %d for: %r' % (index, text),
485 index,
486 text)
487 elif char == ',':
488 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
489 state = NEED_SPACE
490 elif state == INSIDE_STRING:
491 out[-1] += char
492 elif state == ESCAPED:
493 out[-1] += char
494 state = INSIDE_STRING
495 else:
496 raise ValueError(
497 'Can\'t process char at column %d for: %r' % (index, text),
498 index,
499 text)
500 elif char == ' ':
501 if state == NEED_SPACE:
502 state = NEED_QUOTE
503 elif state == INSIDE_STRING:
504 out[-1] += char
505 elif state == ESCAPED:
506 out[-1] += char
507 state = INSIDE_STRING
508 else:
509 raise ValueError(
510 'Can\'t process char at column %d for: %r' % (index, text),
511 index,
512 text)
513 elif char == '.':
514 if state == NEED_COMMA_OR_DOT:
        # The string is incomplete; this means the strace -s flag should be
        # increased.
517 state = NEED_DOT_2
518 elif state == NEED_DOT_2:
519 state = NEED_DOT_3
520 elif state == NEED_DOT_3:
521 state = NEED_COMMA
522 elif state == INSIDE_STRING:
523 out[-1] += char
524 elif state == ESCAPED:
525 out[-1] += char
526 state = INSIDE_STRING
527 else:
528 raise ValueError(
529 'Can\'t process char at column %d for: %r' % (index, text),
530 index,
531 text)
532 elif char == '\\':
533 if state == ESCAPED:
534 out[-1] += char
535 state = INSIDE_STRING
536 elif state == INSIDE_STRING:
537 state = ESCAPED
538 else:
539 raise ValueError(
540 'Can\'t process char at column %d for: %r' % (index, text),
541 index,
542 text)
543 else:
544 if state == INSIDE_STRING:
545 out[-1] += char
546 else:
547 raise ValueError(
548 'Can\'t process char at column %d for: %r' % (index, text),
549 index,
550 text)
551 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
552 raise ValueError(
553 'String is incorrectly terminated: %r' % text,
554 text)
555 return out
556
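# Illustrative sketch, not part of the original module: shows how the DFA in
# strace_process_quoted_arguments() splits a quoted strace argument string,
# including the '"foo"...' truncation produced by a too-small strace -s value.
# The helper name is hypothetical; it is never called by this script.
def _example_strace_quoted_arguments():
  """Hypothetical helper; safe to delete."""
  assert strace_process_quoted_arguments('"foo", "bar"') == ['foo', 'bar']
  # A truncated argument keeps only the characters strace managed to print.
  assert strace_process_quoted_arguments('"/bin/ls"...') == ['/bin/ls']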
557
558def read_json(filepath):
559 with open(filepath, 'r') as f:
560 return json.load(f)
561
562
563def write_json(filepath_or_handle, data, dense):
564 """Writes data into filepath or file handle encoded as json.
565
566 If dense is True, the json is packed. Otherwise, it is human readable.
567 """
568 if hasattr(filepath_or_handle, 'write'):
569 if dense:
570 filepath_or_handle.write(
571 json.dumps(data, sort_keys=True, separators=(',',':')))
572 else:
573 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
574 else:
575 with open(filepath_or_handle, 'wb') as f:
576 if dense:
577 json.dump(data, f, sort_keys=True, separators=(',',':'))
578 else:
579 json.dump(data, f, sort_keys=True, indent=2)
580
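# Illustrative sketch, not part of the original module: write_json() accepts
# either a file path or a file-like object; dense=True uses compact separators
# while dense=False produces an indented, human readable form. The helper name
# is hypothetical; it is never called by this script.
def _example_write_json():
  """Hypothetical helper; safe to delete."""
  import StringIO
  buf = StringIO.StringIO()
  write_json(buf, {'a': 1, 'b': [2, 3]}, True)
  assert buf.getvalue() == '{"a":1,"b":[2,3]}'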
581
582class Results(object):
583 """Results of a trace session."""
584
585 class _TouchedObject(object):
586 """Something, a file or a directory, that was accessed."""
587 def __init__(self, root, path, tainted, size, nb_files):
588 logging.debug(
589 '%s(%s, %s, %s, %s, %s)' %
590 (self.__class__.__name__, root, path, tainted, size, nb_files))
591 self.root = root
592 self.path = path
593 self.tainted = tainted
594 self.nb_files = nb_files
595 # Can be used as a cache or a default value, depending on context.
596 self._size = size
597 # These are cache only.
598 self._real_path = None
599
600 # Check internal consistency.
601 assert path, path
602 assert tainted or bool(root) != bool(isabs(path)), (root, path)
603 assert tainted or (
604 not os.path.exists(self.full_path) or
605 (self.full_path == get_native_path_case(self.full_path))), (
606 tainted, self.full_path, get_native_path_case(self.full_path))
607
608 @property
609 def existent(self):
610 return self.size != -1
611
612 @property
613 def full_path(self):
614 if self.root:
615 return os.path.join(self.root, self.path)
616 return self.path
617
618 @property
619 def real_path(self):
620 """Returns the path with symlinks resolved."""
621 if not self._real_path:
622 self._real_path = os.path.realpath(self.full_path)
623 return self._real_path
624
625 @property
626 def size(self):
627 """File's size. -1 is not existent."""
628 if self._size is None and not self.tainted:
629 try:
630 self._size = os.stat(self.full_path).st_size
631 except OSError:
632 self._size = -1
633 return self._size
634
635 def flatten(self):
636 """Returns a dict representing this object.
637
638 A 'size' of 0 means the file was only touched and not read.
639 """
640 return {
641 'path': self.path,
642 'size': self.size,
643 }
644
645 def replace_variables(self, variables):
646 """Replaces the root of this File with one of the variables if it matches.
647
648 If a variable replacement occurs, the cloned object becomes tainted.
649 """
650 for variable, root_path in variables.iteritems():
651 if self.path.startswith(root_path):
652 return self._clone(
653 self.root, variable + self.path[len(root_path):], True)
654 # No need to clone, returns ourself.
655 return self
656
657 def strip_root(self, root):
658 """Returns a clone of itself with 'root' stripped off."""
659 # Check internal consistency.
660 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
661 if not self.full_path.startswith(root):
662 # Now try to resolve the symlinks to see if it can be reached this way.
663 # Only try *after* trying without resolving symlink.
664 if not self.real_path.startswith(root):
665 return None
666 path = self.real_path
667 else:
668 path = self.full_path
669 return self._clone(root, path[len(root):], self.tainted)
670
671 def _clone(self, new_root, new_path, tainted):
672 raise NotImplementedError(self.__class__.__name__)
673
674 class File(_TouchedObject):
675 """A file that was accessed. May not be present anymore.
676
677 If tainted is true, it means it is not a real path anymore as a variable
    replacement occurred.
679
680 If touched_only is True, this means the file was probed for existence, and
681 it is existent, but was never _opened_. If touched_only is True, the file
682 must have existed.
683 """
684 def __init__(self, root, path, tainted, size):
685 super(Results.File, self).__init__(root, path, tainted, size, 1)
686
687 def _clone(self, new_root, new_path, tainted):
688 """Clones itself keeping meta-data."""
      # Keep the self.size and self._real_path caches for performance reasons.
      # It is also important when the file becomes tainted (with a variable
      # instead of the real path) since self.path is not an on-disk path
      # anymore so out._size cannot be updated.
693 out = self.__class__(new_root, new_path, tainted, self.size)
694 out._real_path = self._real_path
695 return out
696
697 class Directory(_TouchedObject):
698 """A directory of files. Must exist."""
699 def __init__(self, root, path, tainted, size, nb_files):
700 """path='.' is a valid value and must be handled appropriately."""
701 assert not path.endswith(os.path.sep), path
702 super(Results.Directory, self).__init__(
703 root, path + os.path.sep, tainted, size, nb_files)
704 # For a Directory instance, self.size is not a cache, it's an actual value
705 # that is never modified and represents the total size of the files
706 # contained in this directory. It is possible that the directory is empty
707 # so that size == 0; this happens if there's only an invalid symlink in
708 # it.
709
710 def flatten(self):
711 out = super(Results.Directory, self).flatten()
712 out['nb_files'] = self.nb_files
713 return out
714
715 def _clone(self, new_root, new_path, tainted):
716 """Clones itself keeping meta-data."""
717 out = self.__class__(
718 new_root,
719 new_path.rstrip(os.path.sep),
720 tainted,
721 self.size,
722 self.nb_files)
723 out._real_path = self._real_path
724 return out
725
726 class Process(object):
727 """A process that was traced.
728
729 Contains references to the files accessed by this process and its children.
730 """
731 def __init__(self, pid, files, executable, command, initial_cwd, children):
732 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
733 self.pid = pid
734 self.files = sorted(files, key=lambda x: x.path)
735 self.children = children
736 self.executable = executable
737 self.command = command
738 self.initial_cwd = initial_cwd
739
740 # Check internal consistency.
741 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
742 f.path for f in self.files)
743 assert isinstance(self.children, list)
744 assert isinstance(self.files, list)
745
746 @property
747 def all(self):
748 for child in self.children:
749 for i in child.all:
750 yield i
751 yield self
752
753 def flatten(self):
754 return {
755 'children': [c.flatten() for c in self.children],
756 'command': self.command,
757 'executable': self.executable,
758 'files': [f.flatten() for f in self.files],
759 'initial_cwd': self.initial_cwd,
760 'pid': self.pid,
761 }
762
763 def strip_root(self, root):
764 assert isabs(root) and root.endswith(os.path.sep), root
765 # Loads the files after since they are constructed as objects.
766 out = self.__class__(
767 self.pid,
768 filter(None, (f.strip_root(root) for f in self.files)),
769 self.executable,
770 self.command,
771 self.initial_cwd,
772 [c.strip_root(root) for c in self.children])
773 logging.debug(
774 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
775 return out
776
777 def __init__(self, process):
778 self.process = process
779 # Cache.
780 self._files = None
781
782 def flatten(self):
783 return {
784 'root': self.process.flatten(),
785 }
786
787 @property
788 def files(self):
789 if self._files is None:
790 self._files = sorted(
791 sum((p.files for p in self.process.all), []),
792 key=lambda x: x.path)
793 return self._files
794
795 @property
796 def existent(self):
797 return [f for f in self.files if f.existent]
798
799 @property
800 def non_existent(self):
801 return [f for f in self.files if not f.existent]
802
803 def strip_root(self, root):
804 """Returns a clone with all the files outside the directory |root| removed
805 and converts all the path to be relative paths.
806 """
807 # Resolve any symlink
808 root = os.path.realpath(root)
809 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
810 logging.debug('strip_root(%s)' % root)
811 return Results(self.process.strip_root(root))
812
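# Illustrative sketch, not part of the original module: builds a minimal
# Results tree by hand and flattens it, showing how Process and File relate.
# The pid, command and file path below are made up; the path is intentionally
# non-existent so File.size resolves to -1. The helper name is hypothetical;
# it is never called by this script.
def _example_results_flatten():
  """Hypothetical helper; safe to delete."""
  missing = os.path.join(ROOT_DIR, 'no_such_file_example')
  proc = Results.Process(
      123,
      [Results.File(None, missing, False, None)],
      sys.executable,
      [sys.executable],
      ROOT_DIR,
      [])
  data = Results(proc).flatten()
  assert data['root']['pid'] == 123
  assert data['root']['files'][0]['size'] == -1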
813
814class ApiBase(object):
815 """OS-agnostic API to trace a process and its children."""
816 class Context(object):
817 """Processes one log line at a time and keeps the list of traced processes.
818
    The parsing is complicated by the fact that logs are traced out of order
    for strace but in-order for dtrace and logman. In addition, on Windows it
    is very frequent that process ids are reused, so a flat list cannot be
    used. But at the same time, it is impossible to faithfully construct a
    graph when the logs are processed out of order. So both a tree and a flat
    mapping are used: the tree is the real process tree, while the flat
    mapping stores the last valid process for the corresponding process id.
    For the strace case, the tree's head is guessed at the last moment.
827 """
828 class Process(object):
829 """Keeps context for one traced child process.
830
831 Logs all the files this process touched. Ignores directories.
832 """
833 def __init__(self, blacklist, pid, initial_cwd):
834 # Check internal consistency.
835 assert isinstance(pid, int), repr(pid)
836 self.pid = pid
837 # children are Process instances.
838 self.children = []
839 self.initial_cwd = initial_cwd
840 self.cwd = None
841 self.files = set()
842 self.only_touched = set()
843 self.executable = None
844 self.command = None
845 self._blacklist = blacklist
846
847 def to_results_process(self):
848 """Resolves file case sensitivity and or late-bound strings."""
849 # When resolving files, it's normal to get dupe because a file could be
850 # opened multiple times with different case. Resolve the deduplication
851 # here.
852 def fix_path(x):
853 """Returns the native file path case.
854
855 Converts late-bound strings.
856 """
857 if not x:
858 # Do not convert None instance to 'None'.
859 return x
860 # TODO(maruel): Do not upconvert to unicode here, on linux we don't
861 # know the file path encoding so they must be treated as bytes.
862 x = unicode(x)
863 if os.path.isabs(x):
864 # If the path is not absolute, which tends to happen occasionally on
865 # Windows, it is not possible to get the native path case so ignore
866 # that trace. It mostly happens for 'executable' value.
867 x = get_native_path_case(x)
868 return x
869
870 def fix_and_blacklist_path(x):
871 x = fix_path(x)
872 if not x:
873 return
874 # The blacklist needs to be reapplied, since path casing could
875 # influence blacklisting.
876 if self._blacklist(x):
877 return
878 return x
879
880 # Filters out directories. Some may have passed through.
881 files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
882 only_touched = set(
883 f for f in map(fix_and_blacklist_path, self.only_touched) if f)
884 only_touched -= files
885
886 files = [
887 Results.File(None, f, False, None) for f in files
888 if not os.path.isdir(f)
889 ]
890 # Using 0 as size means the file's content is ignored since the file was
891 # never opened for I/O.
892 files.extend(
893 Results.File(None, f, False, 0) for f in only_touched
894 if not os.path.isdir(f)
895 )
896 return Results.Process(
897 self.pid,
898 files,
899 fix_path(self.executable),
900 self.command,
901 fix_path(self.initial_cwd),
902 [c.to_results_process() for c in self.children])
903
904 def add_file(self, filepath, touch_only):
905 """Adds a file if it passes the blacklist."""
906 if self._blacklist(unicode(filepath)):
907 return
908 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
        # Note that filepath and not unicode(filepath) is added. It is because
        # filepath could be something other than a string, like a RelativePath
        # instance for dtrace logs.
912 if touch_only:
913 self.only_touched.add(filepath)
914 else:
915 self.files.add(filepath)
916
917 def __init__(self, blacklist):
918 self.blacklist = blacklist
919 # Initial process.
920 self.root_process = None
      # dict to accelerate process lookup, so we don't have to walk the whole
      # graph each time.
923 self._process_lookup = {}
924
925 class Tracer(object):
926 """During it's lifetime, the tracing subsystem is enabled."""
927 def __init__(self, logname):
928 self._logname = logname
929 self._lock = threading.Lock()
930 self._traces = []
931 self._initialized = True
932
933 def trace(self, cmd, cwd, tracename, output):
934 """Runs the OS-specific trace program on an executable.
935
936 Arguments:
937 - cmd: The command (a list) to run.
938 - cwd: Current directory to start the child process in.
939 - tracename: Name of the trace in the logname file.
940 - output: If False, redirects output to PIPEs.
941
942 Returns a tuple (resultcode, output) and updates the internal trace
943 entries.
944 """
945 # The implementation adds an item to self._traces.
946 raise NotImplementedError(self.__class__.__name__)
947
948 def close(self, _timeout=None):
949 """Saves the meta-data in the logname file.
950
951 For kernel-based tracing, stops the tracing subsystem.
952
953 Must not be used manually when using 'with' construct.
954 """
955 with self._lock:
956 assert self._initialized
957 try:
958 data = {
959 'traces': self._traces,
960 }
961 write_json(self._logname, data, False)
962 finally:
963 self._initialized = False
964
965 def post_process_log(self):
966 """Post-processes the log so it becomes faster to load afterward.
967
968 Must not be used manually when using 'with' construct.
969 """
970 assert not self._initialized, 'Must stop tracing first.'
971
972 def __enter__(self):
973 """Enables 'with' statement."""
974 return self
975
976 def __exit__(self, exc_type, exc_value, traceback):
977 """Enables 'with' statement."""
978 self.close()
979 # If an exception was thrown, do not process logs.
980 if not exc_type:
981 self.post_process_log()
982
983 def get_tracer(self, logname):
984 """Returns an ApiBase.Tracer instance.
985
986 Initializes the tracing subsystem, which is a requirement for kernel-based
987 tracers. Only one tracer instance should be live at a time!
988
989 logname is the filepath to the json file that will contain the meta-data
990 about the logs.
991 """
992 return self.Tracer(logname)
993
994 @staticmethod
995 def clean_trace(logname):
996 """Deletes an old log."""
997 raise NotImplementedError()
998
999 @classmethod
1000 def parse_log(cls, logname, blacklist):
1001 """Processes trace logs and returns the files opened and the files that do
1002 not exist.
1003
1004 It does not track directories.
1005
    |logname| must be an absolute path.

    Most of the time, files that do not exist are temporary test files that
1009 should be put in /tmp instead. See http://crbug.com/116251.
1010
1011 Returns a list of dict with keys:
1012 - results: A Results instance.
1013 - trace: The corresponding tracename parameter provided to
1014 get_tracer().trace().
1015 - output: Output gathered during execution, if get_tracer().trace(...,
1016 output=False) was used.
1017 """
    raise NotImplementedError(cls.__name__)
1019
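# Illustrative sketch, not part of the original module: the two phases from
# the module docstring, using the Strace backend. It assumes a Linux host with
# an strace binary on PATH; the log name, trace name and helper name are
# hypothetical, and the helper is never called by this script.
def _example_two_phase_trace():
  """Hypothetical helper; safe to delete."""
  api = Strace()
  logname = os.path.join(BASE_DIR, 'example_trace.json')
  with api.get_tracer(logname) as tracer:
    # Phase 1: run the command under strace; per-pid logs are written next to
    # |logname| and the meta-data is saved when the tracer is closed.
    tracer.trace([sys.executable, '-c', 'print(1)'], BASE_DIR, 'example', True)
  # Phase 2: parse the logs back into Results instances.
  for item in api.parse_log(logname, lambda _path: False):
    print item['results'].flatten()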
1020
1021class Strace(ApiBase):
1022 """strace implies linux."""
1023 class Context(ApiBase.Context):
1024 """Processes a strace log line and keeps the list of existent and non
1025 existent files accessed.
1026
1027 Ignores directories.
1028
1029 Uses late-binding to processes the cwd of each process. The problem is that
1030 strace generates one log file per process it traced but doesn't give any
1031 information about which process was started when and by who. So we don't
1032 even know which process is the initial one. So process the logs out of
1033 order and use late binding with RelativePath to be able to deduce the
1034 initial directory of each process once all the logs are parsed.
1035 """
1036 class Process(ApiBase.Context.Process):
1037 """Represents the state of a process.
1038
1039 Contains all the information retrieved from the pid-specific log.
1040 """
1041 # Function names are using ([a-z_0-9]+)
1042 # This is the most common format. function(args) = result
1043 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$')
1044 # An interrupted function call, only grab the minimal header.
1045 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
      # An interrupted function call, with the process exiting. It must be the
1047 # last line in the log.
1048 RE_UNFINISHED_EXIT = re.compile(
1049 r'^([^\(]+)(.*) \<unfinished \.\.\.\ exit status \d+>$')
      # A resumed function call.
1051 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1052 # A process received a signal.
1053 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1054 # A process didn't handle a signal. Ignore any junk appearing before,
1055 # because the process was forcibly killed so it won't open any new file.
1056 RE_KILLED = re.compile(
1057 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1058 # The process has exited.
1059 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1060 # A call was canceled. Ignore any prefix.
1061 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1062 # Happens when strace fails to even get the function name.
1063 UNNAMED_FUNCTION = '????'
1064
1065 # Corner-case in python, a class member function decorator must not be
1066 # @staticmethod.
1067 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1068 """Automatically convert the str 'args' into a list of processed
1069 arguments.
1070
1071 Arguments:
1072 - regexp is used to parse args.
1073 - expect_zero: one of True, False or None.
1074 - True: will check for result.startswith('0') first and will ignore
1075 the trace line completely otherwise. This is important because for
1076 many functions, the regexp will not process if the call failed.
          - False: will check for not result.startswith(('?', '-1')) for the
            same reason as with True.
1079 - None: ignore result.
1080 """
1081 def meta_hook(function):
1082 assert function.__name__.startswith('handle_')
1083 def hook(self, args, result):
1084 if expect_zero is True and not result.startswith('0'):
1085 return
1086 if expect_zero is False and result.startswith(('?', '-1')):
1087 return
1088 match = re.match(regexp, args)
1089 if not match:
1090 raise TracingFailure(
1091 'Failed to parse %s(%s) = %s' %
1092 (function.__name__[len('handle_'):], args, result),
1093 None, None, None)
1094 return function(self, match.groups(), result)
1095 return hook
1096 return meta_hook
1097
1098 class RelativePath(object):
1099 """A late-bound relative path."""
1100 def __init__(self, parent, value):
1101 self.parent = parent
1102 self.value = value
1103
1104 def render(self):
1105 """Returns the current directory this instance is representing.
1106
1107 This function is used to return the late-bound value.
1108 """
1109 if self.value and self.value.startswith(u'/'):
1110 # An absolute path.
1111 return self.value
1112 parent = self.parent.render() if self.parent else u'<None>'
1113 if self.value:
1114 return os.path.normpath(os.path.join(parent, self.value))
1115 return parent
1116
1117 def __unicode__(self):
1118 """Acts as a string whenever needed."""
1119 return unicode(self.render())
1120
1121 def __str__(self):
1122 """Acts as a string whenever needed."""
1123 return str(self.render())
1124
1125 def __init__(self, root, pid):
1126 """Keeps enough information to be able to guess the original process
1127 root.
1128
1129 strace doesn't store which process was the initial process. So more
1130 information needs to be kept so the graph can be reconstructed from the
1131 flat map.
1132 """
1133 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1134 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1135 assert isinstance(root, ApiBase.Context)
1136 self._root = weakref.ref(root)
1137 # The dict key is the function name of the pending call, like 'open'
1138 # or 'execve'.
1139 self._pending_calls = {}
1140 self._line_number = 0
1141 # Current directory when the process started.
1142 self.initial_cwd = self.RelativePath(self._root(), None)
1143 self.parentid = None
        self._done = False

      def get_cwd(self):
1147 """Returns the best known value of cwd."""
1148 return self.cwd or self.initial_cwd
1149
1150 def render(self):
1151 """Returns the string value of the RelativePath() object.
1152
1153 Used by RelativePath. Returns the initial directory and not the
1154 current one since the current directory 'cwd' validity is time-limited.
1155
1156 The validity is only guaranteed once all the logs are processed.
1157 """
1158 return self.initial_cwd.render()
1159
1160 def on_line(self, line):
1161 self._line_number += 1
        if self._done:
1163 raise TracingFailure(
1164 'Found a trace for a terminated process',
1165 None, None, None)
1166
        if self.RE_SIGNAL.match(line):
1168 # Ignore signals.
1169 return
1170
1171 try:
1172 match = self.RE_KILLED.match(line)
1173 if match:
1174 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1175 self.handle_exit_group(match.group(1), None)
1176 return
1177
1178 match = self.RE_PROCESS_EXITED.match(line)
1179 if match:
1180 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1181 self.handle_exit_group(match.group(1), None)
1182 return
1183
1184 match = self.RE_UNFINISHED.match(line)
1185 if match:
1186 if match.group(1) in self._pending_calls:
1187 raise TracingFailure(
1188 'Found two unfinished calls for the same function',
1189 None, None, None,
1190 self._pending_calls)
1191 self._pending_calls[match.group(1)] = (
1192 match.group(1) + match.group(2))
1193 return
1194
          match = self.RE_UNFINISHED_EXIT.match(line)
1196 if match:
1197 # The process died. No other line can be processed afterward.
1198 self._done = True
1199 return
1200
          match = self.RE_UNAVAILABLE.match(line)
1202 if match:
1203 # This usually means a process was killed and a pending call was
1204 # canceled.
1205 # TODO(maruel): Look up the last exit_group() trace just above and
1206 # make sure any self._pending_calls[anything] is properly flushed.
1207 return
1208
1209 match = self.RE_RESUMED.match(line)
1210 if match:
1211 if match.group(1) not in self._pending_calls:
1212 raise TracingFailure(
1213 'Found a resumed call that was not logged as unfinished',
1214 None, None, None,
1215 self._pending_calls)
1216 pending = self._pending_calls.pop(match.group(1))
1217 # Reconstruct the line.
1218 line = pending + match.group(2)
1219
1220 match = self.RE_HEADER.match(line)
1221 if not match:
1222 raise TracingFailure(
1223 'Found an invalid line: %s' % line,
1224 None, None, None)
1225 if match.group(1) == self.UNNAMED_FUNCTION:
1226 return
1227
1228 # It's a valid line, handle it.
1229 handler = getattr(self, 'handle_%s' % match.group(1), None)
1230 if not handler:
1231 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1232 return handler(match.group(2), match.group(3))
1233 except TracingFailure, e:
1234 # Hack in the values since the handler could be a static function.
1235 e.pid = self.pid
1236 e.line = line
1237 e.line_number = self._line_number
1238 # Re-raise the modified exception.
1239 raise
1240 except (KeyError, NotImplementedError, ValueError), e:
1241 raise TracingFailure(
1242 'Trace generated a %s exception: %s' % (
1243 e.__class__.__name__, str(e)),
1244 self.pid,
1245 self._line_number,
1246 line,
1247 e)
1248
1249 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1250 def handle_access(self, args, _result):
1251 self._handle_file(args[0], True)
1252
1253 @parse_args(r'^\"(.+?)\"$', True)
1254 def handle_chdir(self, args, _result):
1255 """Updates cwd."""
1256 self.cwd = self.RelativePath(self, args[0])
1257 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1258
1259 def handle_clone(self, _args, result):
1260 """Transfers cwd."""
1261 if result.startswith(('?', '-1')):
1262 # The call failed.
1263 return
1264 # Update the other process right away.
1265 childpid = int(result)
1266 child = self._root().get_or_set_proc(childpid)
1267 if child.parentid is not None or childpid in self.children:
1268 raise TracingFailure(
            'Found internal inconsistency in process lifetime detection '
1270 'during a clone() call',
1271 None, None, None)
1272
1273 # Copy the cwd object.
1274 child.initial_cwd = self.get_cwd()
1275 child.parentid = self.pid
1276 # It is necessary because the logs are processed out of order.
1277 self.children.append(child)
1278
1279 def handle_close(self, _args, _result):
1280 pass
1281
1282 def handle_chmod(self, _args, _result):
1283 pass
1284
1285 def handle_creat(self, _args, _result):
1286 # Ignore files created, since they didn't need to exist.
1287 pass
1288
1289 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1290 def handle_execve(self, args, _result):
        # Even if in practice execve() doesn't return when it succeeds, strace
1292 # still prints '0' as the result.
1293 filepath = args[0]
1294 self._handle_file(filepath, False)
1295 self.executable = self.RelativePath(self.get_cwd(), filepath)
1296 self.command = strace_process_quoted_arguments(args[1])
1297
1298 def handle_exit_group(self, _args, _result):
1299 """Removes cwd."""
1300 self.cwd = None
1301
1302 def handle_fork(self, args, result):
1303 self._handle_unknown('fork', args, result)
1304
      def handle_futex(self, _args, _result):
1306 pass
1307
      def handle_getcwd(self, _args, _result):
1309 pass
1310
1311 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1312 def handle_link(self, args, _result):
1313 self._handle_file(args[0], False)
1314 self._handle_file(args[1], False)
1315
1316 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1317 def handle_lstat(self, args, _result):
1318 self._handle_file(args[0], True)
1319
1320 def handle_mkdir(self, _args, _result):
1321 pass
1322
1323 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1324 def handle_open(self, args, _result):
1325 if 'O_DIRECTORY' in args[1]:
1326 return
1327 self._handle_file(args[0], False)
1328
1329 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1330 def handle_openat(self, args, _result):
1331 if 'O_DIRECTORY' in args[2]:
1332 return
        if args[0] == 'AT_FDCWD':
1334 self._handle_file(args[1], False)
1335 else:
1336 # TODO(maruel): Implement relative open if necessary instead of the
1337 # AT_FDCWD flag, let's hope not since this means tracking all active
1338 # directory handles.
1339 raise Exception('Relative open via openat not implemented.')
1340
1341 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1342 def handle_readlink(self, args, _result):
1343 self._handle_file(args[0], False)
1344
1345 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1346 def handle_rename(self, args, _result):
1347 self._handle_file(args[0], False)
1348 self._handle_file(args[1], False)
1349
1350 def handle_rmdir(self, _args, _result):
1351 pass
1352
1353 def handle_setxattr(self, _args, _result):
1354 pass
1355
1356 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1357 def handle_stat(self, args, _result):
1358 self._handle_file(args[0], True)
1359
1360 def handle_symlink(self, _args, _result):
1361 pass
1362
1363 def handle_unlink(self, _args, _result):
1364 # In theory, the file had to be created anyway.
1365 pass
1366
1367 def handle_statfs(self, _args, _result):
1368 pass
1369
1370 def handle_vfork(self, args, result):
1371 self._handle_unknown('vfork', args, result)
1372
1373 @staticmethod
1374 def _handle_unknown(function, args, result):
1375 raise TracingFailure(
1376 'Unexpected/unimplemented trace %s(%s)= %s' %
1377 (function, args, result),
1378 None, None, None)
1379
1380 def _handle_file(self, filepath, touch_only):
1381 filepath = self.RelativePath(self.get_cwd(), filepath)
1382 #assert not touch_only, unicode(filepath)
1383 self.add_file(filepath, touch_only)
1384
1385 def __init__(self, blacklist, initial_cwd):
1386 super(Strace.Context, self).__init__(blacklist)
1387 self.initial_cwd = initial_cwd
1388
1389 def render(self):
1390 """Returns the string value of the initial cwd of the root process.
1391
1392 Used by RelativePath.
1393 """
1394 return self.initial_cwd
1395
1396 def on_line(self, pid, line):
1397 """Transfers control into the Process.on_line() function."""
1398 self.get_or_set_proc(pid).on_line(line.strip())
1399
1400 def to_results(self):
1401 """Finds back the root process and verify consistency."""
1402 # TODO(maruel): Absolutely unecessary, fix me.
1403 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1404 if len(root) != 1:
1405 raise TracingFailure(
          'Found internal inconsistency in process lifetime detection '
1407 'while finding the root process',
1408 None,
1409 None,
1410 None,
1411 sorted(p.pid for p in root))
1412 self.root_process = root[0]
1413 process = self.root_process.to_results_process()
1414 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1415 raise TracingFailure(
          'Found internal inconsistency in process lifetime detection '
1417 'while looking for len(tree) == len(list)',
1418 None,
1419 None,
1420 None,
1421 sorted(self._process_lookup),
1422 sorted(p.pid for p in process.all))
1423 return Results(process)
1424
1425 def get_or_set_proc(self, pid):
1426 """Returns the Context.Process instance for this pid or creates a new one.
1427 """
1428 if not pid or not isinstance(pid, int):
1429 raise TracingFailure(
          'Unexpected value for pid: %r' % pid,
1431 pid,
1432 None,
1433 None,
1434 pid)
1435 if pid not in self._process_lookup:
1436 self._process_lookup[pid] = self.Process(self, pid)
1437 return self._process_lookup[pid]
1438
1439 @classmethod
1440 def traces(cls):
1441 """Returns the list of all handled traces to pass this as an argument to
1442 strace.
1443 """
1444 prefix = 'handle_'
1445 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1446
1447 class Tracer(ApiBase.Tracer):
1448 MAX_LEN = 256
1449
1450 def trace(self, cmd, cwd, tracename, output):
1451 """Runs strace on an executable."""
1452 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1453 assert os.path.isabs(cmd[0]), cmd[0]
1454 assert os.path.isabs(cwd), cwd
1455 assert os.path.normpath(cwd) == cwd, cwd
1456 with self._lock:
1457 if not self._initialized:
1458 raise TracingFailure(
            'Called Tracer.trace() on an uninitialized object',
1460 None, None, None, tracename)
1461 assert tracename not in (i['trace'] for i in self._traces)
1462 stdout = stderr = None
1463 if output:
1464 stdout = subprocess.PIPE
1465 stderr = subprocess.STDOUT
1466 # Ensure all file related APIs are hooked.
1467 traces = ','.join(Strace.Context.traces() + ['file'])
1468 trace_cmd = [
1469 'strace',
1470 '-ff',
1471 '-s', '%d' % self.MAX_LEN,
1472 '-e', 'trace=%s' % traces,
1473 '-o', self._logname + '.' + tracename,
1474 ]
1475 child = subprocess.Popen(
1476 trace_cmd + cmd,
1477 cwd=cwd,
1478 stdin=subprocess.PIPE,
1479 stdout=stdout,
1480 stderr=stderr)
1481 out = child.communicate()[0]
      # TODO(maruel): Walking the logs to figure out the root process would
      # simplify parsing the logs a *lot*.
1484 with self._lock:
1485 assert tracename not in (i['trace'] for i in self._traces)
1486 self._traces.append(
1487 {
1488 'cmd': cmd,
1489 'cwd': cwd,
1490 # The pid of strace process, not very useful.
1491 'pid': child.pid,
1492 'trace': tracename,
1493 'output': out,
1494 })
1495 return child.returncode, out
1496
1497 @staticmethod
1498 def clean_trace(logname):
1499 if os.path.isfile(logname):
1500 os.remove(logname)
1501 # Also delete any pid specific file from previous traces.
1502 for i in glob.iglob(logname + '.*'):
1503 if i.rsplit('.', 1)[1].isdigit():
1504 os.remove(i)
1505
1506 @classmethod
1507 def parse_log(cls, logname, blacklist):
1508 logging.info('parse_log(%s, %s)' % (logname, blacklist))
    assert os.path.isabs(logname)
    data = read_json(logname)
1511 out = []
1512 for item in data['traces']:
1513 result = {
1514 'trace': item['trace'],
1515 'output': item['output'],
1516 }
1517 try:
1518 context = cls.Context(blacklist, item['cwd'])
1519 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1520 pid = pidfile.rsplit('.', 1)[1]
1521 if pid.isdigit():
1522 pid = int(pid)
1523 # TODO(maruel): Load as utf-8
1524 for line in open(pidfile, 'rb'):
1525 context.on_line(pid, line)
1526 result['results'] = context.to_results()
      except TracingFailure:
1528 result['exception'] = sys.exc_info()
      out.append(result)
1530 return out
1531
1532
1533class Dtrace(ApiBase):
1534 """Uses DTrace framework through dtrace. Requires root access.
1535
1536 Implies Mac OSX.
1537
1538 dtruss can't be used because it has compatibility issues with python.
1539
1540 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1541 get the absolute path of the 'cwd' dtrace variable from the probe.
1542
1543 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1544 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1545 manually.
1546
1547 errno is not printed in the log since this implementation currently only cares
1548 about files that were successfully opened.
1549 """
1550 class Context(ApiBase.Context):
1551 # Format: index pid function(args)
1552 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1553
1554 # Arguments parsing.
1555 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1556 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1557 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1558 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1559 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1560 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1561
1562 O_DIRECTORY = 0x100000
1563
1564 class Process(ApiBase.Context.Process):
1565 def __init__(self, *args):
1566 super(Dtrace.Context.Process, self).__init__(*args)
1567 self.cwd = self.initial_cwd
1568
1569 def __init__(self, blacklist, tracer_pid, initial_cwd):
1570 logging.info(
1571 '%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd))
1572 super(Dtrace.Context, self).__init__(blacklist)
1573 # Process ID of the temporary script created by create_thunk().
1574 self._tracer_pid = tracer_pid
1575 self._initial_cwd = initial_cwd
1576 self._line_number = 0
1577
1578 def on_line(self, line):
1579 self._line_number += 1
1580 match = self.RE_HEADER.match(line)
1581 if not match:
1582 raise TracingFailure(
1583 'Found malformed line: %s' % line,
1584 None,
1585 self._line_number,
1586 line)
1587 fn = getattr(
1588 self,
1589 'handle_%s' % match.group(2).replace('-', '_'),
1590 self._handle_ignored)
1591 # It is guaranteed to succeed because of the regexp. Or at least I thought
1592 # it would.
1593 pid = int(match.group(1))
1594 try:
1595 return fn(pid, match.group(3))
1596 except TracingFailure, e:
1597 # Hack in the values since the handler could be a static function.
1598 e.pid = pid
1599 e.line = line
1600 e.line_number = self._line_number
1601 # Re-raise the modified exception.
1602 raise
1603 except (KeyError, NotImplementedError, ValueError), e:
1604 raise TracingFailure(
1605 'Trace generated a %s exception: %s' % (
1606 e.__class__.__name__, str(e)),
1607 pid,
1608 self._line_number,
1609 line,
1610 e)
1611
1612 def to_results(self):
1613 process = self.root_process.to_results_process()
      # Internal consistency check.
1615 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1616 raise TracingFailure(
          'Found internal inconsistency in process lifetime detection '
1618 'while looking for len(tree) == len(list)',
1619 None,
1620 None,
1621 None,
1622 sorted(self._process_lookup),
1623 sorted(p.pid for p in process.all))
1624 return Results(process)
1625
1626 def handle_dtrace_BEGIN(self, _pid, args):
1627 if not self.RE_DTRACE_BEGIN.match(args):
1628 raise TracingFailure(
          'Found internal inconsistency in dtrace_BEGIN log line',
1630 None, None, None)
1631
1632 def handle_proc_start(self, pid, args):
1633 """Transfers cwd.
1634
      The dtrace script already takes care of only tracing the processes that
      are children of the traced processes, so there is no need to verify the
      process hierarchy.
1638 """
1639 if pid in self._process_lookup:
1640 raise TracingFailure(
          'Found internal inconsistency in proc_start: %d started two times' %
1642 pid,
1643 None, None, None)
1644 match = self.RE_PROC_START.match(args)
1645 if not match:
1646 raise TracingFailure(
1647 'Failed to parse arguments: %s' % args,
1648 None, None, None)
1649 ppid = int(match.group(1))
1650 if ppid == self._tracer_pid and not self.root_process:
1651 proc = self.root_process = self.Process(
1652 self.blacklist, pid, self._initial_cwd)
1653 elif ppid in self._process_lookup:
1654 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1655 self._process_lookup[ppid].children.append(proc)
1656 else:
1657 # Another process tree, ignore.
1658 return
1659 self._process_lookup[pid] = proc
1660 logging.debug(
1661 'New child: %s -> %d cwd:%s' %
1662 (ppid, pid, unicode(proc.initial_cwd)))
1663
1664 def handle_proc_exit(self, pid, _args):
1665 """Removes cwd."""
1666 if pid in self._process_lookup:
        # self._tracer_pid itself is not traced, and neither are other trace
        # runs.
1668 self._process_lookup[pid].cwd = None
1669
1670 def handle_execve(self, pid, args):
1671 """Sets the process' executable.
1672
1673 TODO(maruel): Read command line arguments. See
1674 https://discussions.apple.com/thread/1980539 for an example.
1675 https://gist.github.com/1242279
1676
1677 Will have to put the answer at http://stackoverflow.com/questions/7556249.
1678 :)
1679 """
1680 if not pid in self._process_lookup:
1681 # Another process tree, ignore.
1682 return
1683 match = self.RE_EXECVE.match(args)
1684 if not match:
1685 raise TracingFailure(
1686 'Failed to parse arguments: %r' % args,
1687 None, None, None)
1688 proc = self._process_lookup[pid]
1689 proc.executable = match.group(1)
1690 proc.command = self.process_escaped_arguments(match.group(3))
1691 if int(match.group(2)) != len(proc.command):
1692 raise TracingFailure(
1693 'Failed to parse execve() arguments: %s' % args,
1694 None, None, None)
1695
1696 def handle_chdir(self, pid, args):
1697 """Updates cwd."""
1698 if pid not in self._process_lookup:
1699 # Another process tree, ignore.
1700 return
1701 cwd = self.RE_CHDIR.match(args).group(1)
1702 if not cwd.startswith('/'):
1703 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1704 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1705 else:
1706 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1707 cwd2 = cwd
1708 self._process_lookup[pid].cwd = cwd2
1709
1710 def handle_open_nocancel(self, pid, args):
1711 """Redirects to handle_open()."""
1712 return self.handle_open(pid, args)
1713
1714 def handle_open(self, pid, args):
1715 if pid not in self._process_lookup:
1716 # Another process tree, ignore.
1717 return
1718 match = self.RE_OPEN.match(args)
1719 if not match:
1720 raise TracingFailure(
1721 'Failed to parse arguments: %s' % args,
1722 None, None, None)
1723 flag = int(match.group(2), 16)
1724 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1725 # Ignore directories.
1726 return
1727 self._handle_file(pid, match.group(1))
1728
1729 def handle_rename(self, pid, args):
1730 if pid not in self._process_lookup:
1731 # Another process tree, ignore.
1732 return
1733 match = self.RE_RENAME.match(args)
1734 if not match:
1735 raise TracingFailure(
1736 'Failed to parse arguments: %s' % args,
1737 None, None, None)
1738 self._handle_file(pid, match.group(1))
1739 self._handle_file(pid, match.group(2))
1740
1741 def _handle_file(self, pid, filepath):
1742 if not filepath.startswith('/'):
1743 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1744 # We can get '..' in the path.
1745 filepath = os.path.normpath(filepath)
1746 # Sadly, still need to filter out directories here;
1747 # saw open_nocancel(".", 0, 0) = 0 lines.
1748 if os.path.isdir(filepath):
1749 return
1750 self._process_lookup[pid].add_file(filepath, False)
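# Example of the normalization above (hypothetical values):
#   cwd '/build/src' and filepath '../include/foo.h'
#   -> os.path.join() gives '/build/src/../include/foo.h'
#   -> os.path.normpath() gives '/build/include/foo.h'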
1751
1752 def handle_ftruncate(self, pid, args):
1753 """Just used as a signal to kill dtrace, ignoring."""
1754 pass
1755
1756 @staticmethod
1757 def _handle_ignored(pid, args):
1758 """Called for all the event traces that are not handled."""
1759 raise NotImplementedError('Please implement me')
1760
1761 @staticmethod
1762 """Extracts escaped arguments from a string and returns the arguments as a
1763 """Extracts escaped arguments on a string and return the arguments as a
1764 list.
1765
1766 Implemented as an automaton.
1767
1768 Example:
1769 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
1770 function will return ['python2.7', '-c', 'print("hi")'].
1771 """
1772 if not text.endswith('\\0'):
1773 raise ValueError('String is not null terminated: %r' % text, text)
1774 text = text[:-2]
1775
1776 def unescape(x):
1777 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
1778 out = []
1779 escaped = False
1780 for i in x:
1781 if i == '\\' and not escaped:
1782 escaped = True
1783 continue
1784 escaped = False
1785 out.append(i)
1786 return ''.join(out)
1787
1788 return [unescape(i) for i in text.split('\\001')]
1789
1790 class Tracer(ApiBase.Tracer):
1791 # pylint: disable=C0301
1792 #
1793 # To understand the following code, you'll want to take a look at:
1794 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1795 # https://wikis.oracle.com/display/DTrace/Variables
1796 # http://docs.oracle.com/cd/E19205-01/820-4221/
1797 #
1798 # 0. Dump all the valid probes into a text file. It is important, you
1799 # want to redirect into a file and you don't want to constantly 'sudo'.
1800 # $ sudo dtrace -l > probes.txt
1801 #
1802 # 1. Count the number of probes:
1803 # $ wc -l probes.txt
1804 # 81823 # On OSX 10.7, including 1 header line.
1805 #
1806 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1807 # likes and skipping the header with NR>1:
1808 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1809 # dtrace
1810 # fbt
1811 # io
1812 # ip
1813 # lockstat
1814 # mach_trap
1815 # proc
1816 # profile
1817 # sched
1818 # syscall
1819 # tcp
1820 # vminfo
1821 #
1822 # 3. List of valid probes:
1823 # $ grep syscall probes.txt | less
1824 # or use dtrace directly:
1825 # $ sudo dtrace -l -P syscall | less
1826 #
1827 # trackedpid is an associative array where its value can be 0, 1 or 2.
1828 # 0 is for untracked processes and is the default value for items not
1829 # in the associative array.
1830 # 1 is for tracked processes.
1831 # 2 is for the script created by create_thunk() only. It is not tracked
1832 # itself but all its descendants are.
1833 #
1834 # The script will kill itself only once waiting_to_die == 1 and
1835 # current_processes == 0, i.e. both getlogin() was called and all the
1836 # traced processes have exited.
1837 #
1838 # TODO(maruel): Use cacheable predicates. See
1839 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1840 D_CODE = """
1841 dtrace:::BEGIN {
1842 waiting_to_die = 0;
1843 current_processes = 0;
1844 logindex = 0;
1845 printf("%d %d %s_%s(\\"%s\\")\\n",
1846 logindex, PID, probeprov, probename, SCRIPT);
1847 logindex++;
1848 }
1849
1850 proc:::start /trackedpid[ppid]/ {
1851 trackedpid[pid] = 1;
1852 current_processes += 1;
1853 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1854 logindex, pid, probeprov, probename,
1855 ppid,
1856 execname,
1857 current_processes);
1858 logindex++;
1859 }
1860 /* Should use SCRIPT but there is no access to this variable at that
1861 * point. */
1862 proc:::start /ppid == PID && execname == "Python"/ {
1863 trackedpid[pid] = 2;
1864 current_processes += 1;
1865 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1866 logindex, pid, probeprov, probename,
1867 ppid,
1868 execname,
1869 current_processes);
1870 logindex++;
1871 }
1872 proc:::exit /trackedpid[pid] &&
1873 current_processes == 1 &&
1874 waiting_to_die == 1/ {
1875 trackedpid[pid] = 0;
1876 current_processes -= 1;
1877 printf("%d %d %s_%s(%d)\\n",
1878 logindex, pid, probeprov, probename,
1879 current_processes);
1880 logindex++;
1881 exit(0);
1882 }
1883 proc:::exit /trackedpid[pid]/ {
1884 trackedpid[pid] = 0;
1885 current_processes -= 1;
1886 printf("%d %d %s_%s(%d)\\n",
1887 logindex, pid, probeprov, probename,
1888 current_processes);
1889 logindex++;
1890 }
1891
1892 /* Use an arcane function to detect when we need to die */
1893 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
1894 waiting_to_die = 1;
1895 printf("%d %d %s()\\n", logindex, pid, probefunc);
1896 logindex++;
1897 }
1898 syscall::ftruncate:entry /
1899 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
1900 exit(0);
1901 }
1902
1903 syscall::open*:entry /trackedpid[pid] == 1/ {
1904 self->open_arg0 = arg0;
1905 self->open_arg1 = arg1;
1906 self->open_arg2 = arg2;
1907 }
1908 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
1909 this->open_arg0 = copyinstr(self->open_arg0);
1910 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
1911 logindex, pid, probefunc,
1912 this->open_arg0,
1913 self->open_arg1,
1914 self->open_arg2);
1915 logindex++;
1916 this->open_arg0 = 0;
1917 }
1918 syscall::open*:return /trackedpid[pid] == 1/ {
1919 self->open_arg0 = 0;
1920 self->open_arg1 = 0;
1921 self->open_arg2 = 0;
1922 }
1923
1924 syscall::rename:entry /trackedpid[pid] == 1/ {
1925 self->rename_arg0 = arg0;
1926 self->rename_arg1 = arg1;
1927 }
1928 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
1929 this->rename_arg0 = copyinstr(self->rename_arg0);
1930 this->rename_arg1 = copyinstr(self->rename_arg1);
1931 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
1932 logindex, pid, probefunc,
1933 this->rename_arg0,
1934 this->rename_arg1);
1935 logindex++;
1936 this->rename_arg0 = 0;
1937 this->rename_arg1 = 0;
1938 }
1939 syscall::rename:return /trackedpid[pid] == 1/ {
1940 self->rename_arg0 = 0;
1941 self->rename_arg1 = 0;
1942 }
1943
1944 /* Track chdir; it's painful because it only receives relative paths.
1945 */
1946 syscall::chdir:entry /trackedpid[pid] == 1/ {
1947 self->chdir_arg0 = arg0;
1948 }
1949 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
1950 this->chdir_arg0 = copyinstr(self->chdir_arg0);
1951 printf("%d %d %s(\\"%s\\")\\n",
1952 logindex, pid, probefunc,
1953 this->chdir_arg0);
1954 logindex++;
1955 this->chdir_arg0 = 0;
1956 }
1957 syscall::chdir:return /trackedpid[pid] == 1/ {
1958 self->chdir_arg0 = 0;
1959 }
1960 """
1961
1962 # execve-specific code, tends to throw a lot of exceptions.
1963 D_CODE_EXECVE = """
1964 /* Finally what we care about! */
1965 syscall::exec*:entry /trackedpid[pid]/ {
1966 self->exec_arg0 = copyinstr(arg0);
1967 /* Incrementally probe for a NULL in the argv parameter of execve() to
1968 * figure out argc. */
1969 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
1970 * found. */
1971 self->exec_argc = 0;
1972 /* Probe for argc==1 */
1973 this->exec_argv = (user_addr_t*)copyin(
1974 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1975 self->exec_argc = this->exec_argv[self->exec_argc] ?
1976 (self->exec_argc + 1) : self->exec_argc;
1977
1978 /* Probe for argc==2 */
1979 this->exec_argv = (user_addr_t*)copyin(
1980 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1981 self->exec_argc = this->exec_argv[self->exec_argc] ?
1982 (self->exec_argc + 1) : self->exec_argc;
1983
1984 /* Probe for argc==3 */
1985 this->exec_argv = (user_addr_t*)copyin(
1986 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1987 self->exec_argc = this->exec_argv[self->exec_argc] ?
1988 (self->exec_argc + 1) : self->exec_argc;
1989
1990 /* Probe for argc==4 */
1991 this->exec_argv = (user_addr_t*)copyin(
1992 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1993 self->exec_argc = this->exec_argv[self->exec_argc] ?
1994 (self->exec_argc + 1) : self->exec_argc;
1995
1996 /* Copy the input strings since there is no guarantee they'll be
1997 * present after the call completes. */
1998 self->exec_argv0 = (self->exec_argc > 0) ?
1999 copyinstr(this->exec_argv[0]) : "";
2000 self->exec_argv1 = (self->exec_argc > 1) ?
2001 copyinstr(this->exec_argv[1]) : "";
2002 self->exec_argv2 = (self->exec_argc > 2) ?
2003 copyinstr(this->exec_argv[2]) : "";
2004 self->exec_argv3 = (self->exec_argc > 3) ?
2005 copyinstr(this->exec_argv[3]) : "";
2006 this->exec_argv = 0;
2007 }
2008 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2009 /* We need to join strings here, as using multiple printf() would
2010 * cause tearing when multiple threads/processes are traced.
2011 * Since it is impossible to escape a string and join it to another one,
2012 * like sprintf("%s%S", previous, more), use hackery.
2013 * The elements are separated with a \\1. \\0 cannot be used because
2014 * it is simply ignored. This will conflict with any program putting a
2015 * \\1 in their execve() string, but this should be "rare enough". */
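      /* Illustrative only: with an argv of ["python", "-c", "print(1)"],
       * this->args ends up as "python\\1-c\\1print(1)", which the python-side
       * parser splits back apart on the separator. */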
2016 this->args = "";
2017 /* Process exec_argv[0] */
2018 this->args = strjoin(
2019 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2020
2021 /* Process exec_argv[1] */
2022 this->args = strjoin(
2023 this->args, (self->exec_argc > 1) ? "\\1" : "");
2024 this->args = strjoin(
2025 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2026
2027 /* Process exec_argv[2] */
2028 this->args = strjoin(
2029 this->args, (self->exec_argc > 2) ? "\\1" : "");
2030 this->args = strjoin(
2031 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2032
2033 /* Process exec_argv[3] */
2034 this->args = strjoin(
2035 this->args, (self->exec_argc > 3) ? "\\1" : "");
2036 this->args = strjoin(
2037 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2038
2039 /* Prints self->exec_argc to permit verifying the internal
2040 * consistency since this code is quite fishy. */
2041 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2042 logindex, pid, probefunc,
2043 self->exec_arg0,
2044 self->exec_argc,
2045 this->args);
2046 logindex++;
2047 this->args = 0;
2048 }
2049 syscall::exec*:return /trackedpid[pid]/ {
2050 self->exec_arg0 = 0;
2051 self->exec_argc = 0;
2052 self->exec_argv0 = 0;
2053 self->exec_argv1 = 0;
2054 self->exec_argv2 = 0;
2055 self->exec_argv3 = 0;
2056 }
2057 """
2058
2059 # Code currently not used.
2060 D_EXTRANEOUS = """
2061 /* This is a good learning experience, since it traces a lot of things
2062 * related to the process and child processes.
2063 * Warning: it generates a gigantic log. For example, tracing
2064 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2065 * several minutes to execute.
2066 */
2067 /*
2068 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2069 printf("%d %d %s_%s() = %d\\n",
2070 logindex, pid, probeprov, probefunc, errno);
2071 logindex++;
2072 }
2073 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2074 printf("%d %d %s_%s() = %d\\n",
2075 logindex, pid, probeprov, probefunc, errno);
2076 logindex++;
2077 }
2078 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2079 printf("%d %d %s_%s() = %d\\n",
2080 logindex, pid, probeprov, probefunc, errno);
2081 logindex++;
2082 }
2083 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2084 printf("%d %d %s_%s() = %d\\n",
2085 logindex, pid, probeprov, probefunc, errno);
2086 logindex++;
2087 }
2088 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2089 printf("%d %d %s_%s() = %d\\n",
2090 logindex, pid, probeprov, probefunc, errno);
2091 logindex++;
2092 }
2093 */
2094 /* TODO(maruel): *stat* functions and friends
2095 syscall::access:return,
2096 syscall::chdir:return,
2097 syscall::chflags:return,
2098 syscall::chown:return,
2099 syscall::chroot:return,
2100 syscall::getattrlist:return,
2101 syscall::getxattr:return,
2102 syscall::lchown:return,
2103 syscall::lstat64:return,
2104 syscall::lstat:return,
2105 syscall::mkdir:return,
2106 syscall::pathconf:return,
2107 syscall::readlink:return,
2108 syscall::removexattr:return,
2109 syscall::setxattr:return,
2110 syscall::stat64:return,
2111 syscall::stat:return,
2112 syscall::truncate:return,
2113 syscall::unlink:return,
2114 syscall::utimes:return,
2115 */
2116 """
2117
2118 def __init__(self, logname):
2119 """Starts the log collection with dtrace.
2120
2121 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2122 this needs to wait for dtrace to be "warmed up".
2123 """
2124 super(Dtrace.Tracer, self).__init__(logname)
2125 self._script = create_thunk()
2126 # This unique dummy temp file is used to signal the dtrace script that it
2127 # should stop as soon as all the child processes are done. A bit hackish
2128 # but it works well enough.
2129 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2130 prefix='trace_signal_file')
2131
2132 # Note: do not use the -p flag. It's useless if the initial process quits
2133 # too fast, resulting in missing traces from the grand-children. The D
2134 # code manages the dtrace lifetime itself.
2135 trace_cmd = [
2136 'sudo',
2137 'dtrace',
2138 # Use a larger buffer if getting 'out of scratch space' errors.
2139 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2140 '-b', '10m',
2141 '-x', 'dynvarsize=10m',
2142 #'-x', 'dtrace_global_maxsize=1m',
2143 '-x', 'evaltime=exec',
2144 '-o', '/dev/stderr',
2145 '-q',
2146 '-n', self._get_dtrace_code(),
2147 ]
2148 with open(self._logname + '.log', 'wb') as logfile:
2149 self._dtrace = subprocess.Popen(
2150 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2151 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2152
2153 # Reads until one line is printed, which signifies dtrace is up and ready.
2154 with open(self._logname + '.log', 'rb') as logfile:
2155 while 'dtrace_BEGIN' not in logfile.readline():
2156 if self._dtrace.poll() is not None:
2157 # Do a busy wait. :/
2158 break
2159 logging.debug('dtrace started')
2160
2161 def _get_dtrace_code(self):
2162 """Sets up the D code to implement child process tracking.
2163
2164 Injects the cookie in the script so it knows when to stop.
2165
2166 The script will detect any instance of the script created with
2167 create_thunk() and will start tracing it.
2168 """
2169 return (
2170 'inline int PID = %d;\n'
2171 'inline string SCRIPT = "%s";\n'
2172 'inline int FILE_ID = %d;\n'
2173 '\n'
2174 '%s') % (
2175 os.getpid(),
2176 self._script,
2177 self._dummy_file_id,
2178 self.D_CODE) + self.D_CODE_EXECVE
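# For example (hypothetical values), with os.getpid() == 1234, a thunk script
# at '/tmp/trace_thunk.py' and a dummy file descriptor of 5, the header
# prepended to D_CODE + D_CODE_EXECVE would be:
#   inline int PID = 1234;
#   inline string SCRIPT = "/tmp/trace_thunk.py";
#   inline int FILE_ID = 5;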
2179
2180 def trace(self, cmd, cwd, tracename, output):
2181 """Runs dtrace on an executable.
2182
2183 dtruss is broken when it starts the process itself or when tracing
2184 child processes, so this code starts a wrapper process
2185 generated with create_thunk(), which starts the executable to trace.
2186 """
2187 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2188 assert os.path.isabs(cmd[0]), cmd[0]
2189 assert os.path.isabs(cwd), cwd
2190 assert os.path.normpath(cwd) == cwd, cwd
2191 with self._lock:
2192 if not self._initialized:
2193 raise TracingFailure(
2194 'Called Tracer.trace() on an uninitialized object',
2195 None, None, None, tracename)
2196 assert tracename not in (i['trace'] for i in self._traces)
2197
2198 # Starts the script wrapper to start the child process. This signals the
2199 # dtrace script that this process is to be traced.
2200 stdout = stderr = None
2201 if output:
2202 stdout = subprocess.PIPE
2203 stderr = subprocess.STDOUT
2204 child_cmd = [
2205 sys.executable,
2206 self._script,
2207 tracename,
2208 ]
2209 # Call a dummy function so that dtrace knows I'm about to launch a process
2210 # that needs to be traced.
2211 # Yummy.
2212 child = subprocess.Popen(
2213 child_cmd + fix_python_path(cmd),
2214 stdin=subprocess.PIPE,
2215 stdout=stdout,
2216 stderr=stderr,
2217 cwd=cwd)
2218 logging.debug('Started child pid: %d' % child.pid)
2219
2220 out = child.communicate()[0]
2221 # This doesn't mean tracing is done; one of the grand-child processes may
2222 # still be alive. It will be tracked by the dtrace script.
2223
2224 with self._lock:
2225 assert tracename not in (i['trace'] for i in self._traces)
2226 self._traces.append(
2227 {
2228 'cmd': cmd,
2229 'cwd': cwd,
2230 # The pid of the wrapper script process, not very useful.
2231 'pid': child.pid,
2232 'trace': tracename,
2233 'output': out,
2234 })
2235 return child.returncode, out
2236
2237 def close(self, timeout=None):
2238 """Terminates dtrace."""
2239 logging.debug('close(%s)' % timeout)
2240 try:
2241 try:
2242 super(Dtrace.Tracer, self).close(timeout)
2243 # Signal dtrace that it should stop now.
2244 # ftruncate doesn't exist on Windows.
2245 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
2246 if timeout:
2247 start = time.time()
2248 # Use polling. :/
2249 while (self._dtrace.poll() is None and
2250 (time.time() - start) < timeout):
2251 time.sleep(0.1)
2252 self._dtrace.kill()
2253 self._dtrace.wait()
2254 finally:
2255 # Make sure to kill it in any case.
2256 if self._dtrace.poll() is None:
2257 try:
2258 self._dtrace.kill()
2259 self._dtrace.wait()
2260 except OSError:
2261 pass
2262
2263 if self._dtrace.returncode != 0:
2264 # Warn about any dtrace failure but basically ignore it.
2265 print 'dtrace failure: %s' % self._dtrace.returncode
2266 finally:
2267 os.close(self._dummy_file_id)
2268 os.remove(self._dummy_file_name)
2269 os.remove(self._script)
2270
2271 def post_process_log(self):
2272 """Sorts the log back into the order in which each call occurred.
2273
2274 dtrace doesn't save the buffer in strict order since it keeps one buffer
2275 per CPU.
2276 """
2277 super(Dtrace.Tracer, self).post_process_log()
2278 logname = self._logname + '.log'
2279 with open(logname, 'rb') as logfile:
2280 lines = [l for l in logfile if l.strip()]
2281 errors = [l for l in lines if l.startswith('dtrace:')]
2282 if errors:
2283 raise TracingFailure(
2284 'Found errors in the trace: %s' % '\n'.join(errors),
2285 None, None, None, logname)
2286 try:
2287 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2288 except ValueError:
2289 raise TracingFailure(
2290 'Found errors in the trace: %s' % '\n'.join(
2291 l for l in lines if not l.split(' ', 1)[0].isdigit()),
2292 None, None, None, logname)
2293 with open(logname, 'wb') as logfile:
2294 logfile.write(''.join(lines))
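# Sketch of the reordering, assuming three lines came out of dtrace's per-CPU
# buffers out of order:
#   ['2 10 open(...)', '0 10 dtrace_BEGIN(...)', '1 10 proc_start(...)']
# sorting on the leading logindex restores:
#   ['0 10 dtrace_BEGIN(...)', '1 10 proc_start(...)', '2 10 open(...)']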
2295
2296 @staticmethod
2297 def clean_trace(logname):
2298 for ext in ('', '.log'):
2299 if os.path.isfile(logname + ext):
2300 os.remove(logname + ext)
2301
2302 @classmethod
2303 def parse_log(cls, logname, blacklist):
2304 logging.info('parse_log(%s, ...)' % logname)
2305 assert os.path.isabs(logname)
2306
2307 def blacklist_more(filepath):
2308 # All the HFS metadata is in the form /.vol/...
2309 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2310
2311 data = read_json(logname)
2312 out = []
2313 for item in data['traces']:
2314 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
2315 for line in open(logname + '.log', 'rb'):
2316 context.on_line(line)
2317 out.append(
2318 {
2319 'results': context.to_results(),
2320 'trace': item['trace'],
2321 'output': item['output'],
2322 })
2323 return out
2324
2325
2326class LogmanTrace(ApiBase):
2327 """Uses the native Windows ETW based tracing functionality to trace a child
2328 process.
2329
2330 Caveat: this implementation doesn't track cwd or initial_cwd. This is because
2331 the Windows Kernel doesn't have a concept of 'current working directory' at
2332 all. A Win32 process has a map of current directories, one per drive letter
2333 and it is managed by the user mode kernel32.dll. In kernel, a file is always
2334 opened relative to another file_object or as an absolute path. All the current
2335 working directory logic is done in user mode.
2336 """
2337 class Context(ApiBase.Context):
2338 """Processes an ETW log line and keeps the list of existent and
2339 non-existent files accessed.
2340
2341 Ignores directories.
2342 """
2343 # These indexes are for the stripped version in json.
2344 EVENT_NAME = 0
2345 TYPE = 1
2346 PID = 2
2347 TID = 3
2348 PROCESSOR_ID = 4
2349 TIMESTAMP = 5
2350 USER_DATA = 6
2351
2352 class Process(ApiBase.Context.Process):
2353 def __init__(self, *args):
2354 super(LogmanTrace.Context.Process, self).__init__(*args)
2355 # Handle file objects that succeeded.
2356 self.file_objects = {}
2357
2358 def __init__(self, blacklist, tracer_pid):
2359 logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid))
2360 super(LogmanTrace.Context, self).__init__(blacklist)
2361 self._drive_map = DosDriveMap()
2362 # Threads mapping to the corresponding process id.
2363 self._threads_active = {}
2364 # Process ID of the tracer, e.g. trace_inputs.py
2365 self._tracer_pid = tracer_pid
2366 self._line_number = 0
2367
2368 def on_line(self, line):
2369 """Processes a json Event line."""
2370 self._line_number += 1
2371 try:
2372 # By Opcode
2373 handler = getattr(
2374 self,
2375 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2376 None)
2377 if not handler:
2378 raise TracingFailure(
2379 'Unexpected event %s_%s' % (
2380 line[self.EVENT_NAME], line[self.TYPE]),
2381 None, None, None)
2382 handler(line)
2383 except TracingFailure, e:
2384 # Hack in the values since the handler could be a static function.
2385 e.pid = line[self.PID]
2386 e.line = line
2387 e.line_number = self._line_number
2388 # Re-raise the modified exception.
2389 raise
2390 except (KeyError, NotImplementedError, ValueError), e:
2391 raise TracingFailure(
2392 'Trace generated a %s exception: %s' % (
2393 e.__class__.__name__, str(e)),
2394 line[self.PID],
2395 self._line_number,
2396 line,
2397 e)
2398
2399 def to_results(self):
2400 if not self.root_process:
2401 raise TracingFailure(
2402 'Failed to detect the initial process',
2403 None, None, None)
2404 process = self.root_process.to_results_process()
2405 return Results(process)
2406
2407 def _thread_to_process(self, tid):
2408 """Finds the process from the thread id."""
2409 tid = int(tid, 16)
2410 pid = self._threads_active.get(tid)
2411 if not pid or not self._process_lookup.get(pid):
2412 return
2413 return self._process_lookup[pid]
2414
2415 @classmethod
2416 def handle_EventTrace_Header(cls, line):
2417 """Verifies no event was dropped, e.g. no buffer overrun occurred."""
2418 BUFFER_SIZE = cls.USER_DATA
2419 #VERSION = cls.USER_DATA + 1
2420 #PROVIDER_VERSION = cls.USER_DATA + 2
2421 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2422 #END_TIME = cls.USER_DATA + 4
2423 #TIMER_RESOLUTION = cls.USER_DATA + 5
2424 #MAX_FILE_SIZE = cls.USER_DATA + 6
2425 #LOG_FILE_MODE = cls.USER_DATA + 7
2426 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2427 #START_BUFFERS = cls.USER_DATA + 9
2428 #POINTER_SIZE = cls.USER_DATA + 10
2429 EVENTS_LOST = cls.USER_DATA + 11
2430 #CPU_SPEED = cls.USER_DATA + 12
2431 #LOGGER_NAME = cls.USER_DATA + 13
2432 #LOG_FILE_NAME = cls.USER_DATA + 14
2433 #BOOT_TIME = cls.USER_DATA + 15
2434 #PERF_FREQ = cls.USER_DATA + 16
2435 #START_TIME = cls.USER_DATA + 17
2436 #RESERVED_FLAGS = cls.USER_DATA + 18
2437 #BUFFERS_LOST = cls.USER_DATA + 19
2438 #SESSION_NAME_STRING = cls.USER_DATA + 20
2439 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2440 if line[EVENTS_LOST] != '0':
2441 raise TracingFailure(
2442 ( '%s events were lost during trace, please increase the buffer '
2443 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2444 None, None, None)
2445
2446 def handle_FileIo_Cleanup(self, line):
2447 """General wisdom: if a file is closed, it's because it was opened.
2448
2449 Note that FileIo_Close is not used since if a file was opened properly but
2450 not closed before the process exits, only Cleanup will be logged.
2451 """
2452 #IRP = self.USER_DATA
2453 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2454 FILE_OBJECT = self.USER_DATA + 2
2455 #FILE_KEY = self.USER_DATA + 3
2456 proc = self._thread_to_process(line[TTID])
2457 if not proc:
2458 # Not a process we care about.
2459 return
2460 file_object = line[FILE_OBJECT]
2461 if file_object in proc.file_objects:
2462 proc.add_file(proc.file_objects.pop(file_object), False)
2463
2464 def handle_FileIo_Create(self, line):
2465 """Handles a file open.
2466
2467 All FileIo events are described at
2468 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2469 for some value of 'description'.
2470
2471 " (..) process and thread id values of the IO events (..) are not valid "
2472 http://msdn.microsoft.com/magazine/ee358703.aspx
2473
2474 The FileIo.Create event doesn't return if the CreateFile() call
2475 succeeded, so keep track of the file_object and check that it is
2476 eventually closed with FileIo_Cleanup.
2477 """
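# A hedged sketch of the bookkeeping, glossing over the exact DosDriveMap
# translation (values are hypothetical):
#   FileIo_Create with OPEN_PATH '"\Device\HarddiskVolume1\src\foo.h"' and
#   FILE_OBJECT '0xABC' stores proc.file_objects['0xABC'] = 'c:\src\foo.h';
#   a later FileIo_Cleanup with the same FILE_OBJECT pops the entry and
#   reports it through proc.add_file().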
2478 #IRP = self.USER_DATA
2479 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2480 FILE_OBJECT = self.USER_DATA + 2
2481 #CREATE_OPTIONS = self.USER_DATA + 3
2482 #FILE_ATTRIBUTES = self.USER_DATA + 4
2483 #self.USER_DATA + SHARE_ACCESS = 5
2484 OPEN_PATH = self.USER_DATA + 6
2485
2486 proc = self._thread_to_process(line[TTID])
2487 if not proc:
2488 # Not a process we care about.
2489 return
2490
2491 match = re.match(r'^\"(.+)\"$', line[OPEN_PATH])
2492 raw_path = match.group(1)
2493 # Ignore directories and bare drive right away.
2494 if raw_path.endswith(os.path.sep):
2495 return
2496 filepath = self._drive_map.to_win32(raw_path)
2497 # Ignore bare drive right away. Some may still fall through with format
2498 # like '\\?\X:'
2499 if len(filepath) == 2:
2500 return
2501 file_object = line[FILE_OBJECT]
2502 if os.path.isdir(filepath):
2503 # There is no O_DIRECTORY equivalent on Windows. The closest is
2504 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2505 # simply discard directories as they are found.
2506 return
2507 # Override any stale file object
2508 proc.file_objects[file_object] = filepath
2509
2510 def handle_FileIo_Rename(self, line):
2511 # TODO(maruel): Handle?
2512 pass
2513
2514 def handle_Process_End(self, line):
2515 pid = line[self.PID]
2516 if self._process_lookup.get(pid):
2517 logging.info('Terminated: %d' % pid)
2518 self._process_lookup[pid] = None
2519 else:
2520 logging.debug('Terminated: %d' % pid)
2521
2522 def handle_Process_Start(self, line):
2523 """Handles a new child process started by PID."""
2524 #UNIQUE_PROCESS_KEY = self.USER_DATA
2525 PROCESS_ID = self.USER_DATA + 1
2526 #PARENT_PID = self.USER_DATA + 2
2527 #SESSION_ID = self.USER_DATA + 3
2528 #EXIT_STATUS = self.USER_DATA + 4
2529 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2530 #USER_SID = self.USER_DATA + 6
2531 IMAGE_FILE_NAME = self.USER_DATA + 7
2532 COMMAND_LINE = self.USER_DATA + 8
2533
2534 ppid = line[self.PID]
2535 pid = int(line[PROCESS_ID], 16)
2536 logging.debug(
2537 'New process %d->%d (%s) %s' %
2538 (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE]))
2539
2540 if ppid == self._tracer_pid:
2541 # Need to ignore processes we don't know about because the log is
2542 # system-wide. self._tracer_pid shall start only one process.
2543 if self.root_process:
2544 raise TracingFailure(
2545 ( 'Parent process is _tracer_pid(%d) but root_process(%d) is '
2546 'already set') % (self._tracer_pid, self.root_process.pid),
2547 None, None, None)
2548 proc = self.Process(self.blacklist, pid, None)
2549 self.root_process = proc
2550 ppid = None
2551 elif self._process_lookup.get(ppid):
2552 proc = self.Process(self.blacklist, pid, None)
2553 self._process_lookup[ppid].children.append(proc)
2554 else:
2555 # Ignore
2556 return
2557 self._process_lookup[pid] = proc
2558
2559 if (not line[IMAGE_FILE_NAME].startswith('"') or
2560 not line[IMAGE_FILE_NAME].endswith('"')):
2561 raise TracingFailure(
2562 'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME],
2563 None, None, None)
2564
2565 # TODO(maruel): Process escapes.
2566 if (not line[COMMAND_LINE].startswith('"') or
2567 not line[COMMAND_LINE].endswith('"')):
2568 raise TracingFailure(
2569 'Command line is not properly quoted: %s' % line[COMMAND_LINE],
2570 None, None, None)
2571 proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1])
2572 proc.executable = line[IMAGE_FILE_NAME][1:-1]
2573 # proc.command[0] may be the absolute path of 'executable' but it may be
2574 # anything else too. If it happens that command[0] ends with executable,
2575 # use it, otherwise default to the base name.
2576 cmd0 = proc.command[0].lower()
2577 if not cmd0.endswith('.exe'):
2578 # TODO(maruel): That's not strictly true either.
2579 cmd0 += '.exe'
2580 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2581 # Fix the path.
2582 cmd0 = cmd0.replace('/', os.path.sep)
2583 cmd0 = os.path.normpath(cmd0)
2584 proc.executable = get_native_path_case(cmd0)
2585 logging.info(
2586 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
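# Hedged sketch of the executable fix-up above (hypothetical values):
#   IMAGE_FILE_NAME '"python.exe"', COMMAND_LINE '"d:\python\python foo.py"'
#   -> proc.command = ['d:\python\python', 'foo.py'], cmd0 becomes
#      'd:\python\python.exe'; if that file exists and ends with 'python.exe',
#      proc.executable is upgraded to its native path case.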
2587
2588 def handle_Thread_End(self, line):
2589 """Has the same parameters as Thread_Start."""
2590 tid = int(line[self.TID], 16)
2591 self._threads_active.pop(tid, None)
2592
2593 def handle_Thread_Start(self, line):
2594 """Handles a new thread created.
2595
2596 Do not use self.PID here since a process' initial thread is created by
2597 the parent process.
2598 """
2599 PROCESS_ID = self.USER_DATA
2600 TTHREAD_ID = self.USER_DATA + 1
2601 #STACK_BASE = self.USER_DATA + 2
2602 #STACK_LIMIT = self.USER_DATA + 3
2603 #USER_STACK_BASE = self.USER_DATA + 4
2604 #USER_STACK_LIMIT = self.USER_DATA + 5
2605 #AFFINITY = self.USER_DATA + 6
2606 #WIN32_START_ADDR = self.USER_DATA + 7
2607 #TEB_BASE = self.USER_DATA + 8
2608 #SUB_PROCESS_TAG = self.USER_DATA + 9
2609 #BASE_PRIORITY = self.USER_DATA + 10
2610 #PAGE_PRIORITY = self.USER_DATA + 11
2611 #IO_PRIORITY = self.USER_DATA + 12
2612 #THREAD_FLAGS = self.USER_DATA + 13
2613 # Do not use self.PID here since a process' initial thread is created by
2614 # the parent process.
2615 pid = int(line[PROCESS_ID], 16)
2616 tid = int(line[TTHREAD_ID], 16)
2617 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2618 self._threads_active[tid] = pid
2619
2620 @classmethod
2621 """Returns all the processed events."""
2622 """Returns all the procesed events."""
2623 out = []
2624 for member in dir(cls):
2625 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2626 if match:
2627 out.append(match.groups())
2628 return out
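# For example, the handler name 'handle_FileIo_Create' yields the tuple
# ('FileIo', 'Create'); trim() in Tracer.post_process_log() uses this list to
# keep only the (event, type) pairs that have a handler.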
2629
2630 class Tracer(ApiBase.Tracer):
2631 # The basic headers.
2632 EXPECTED_HEADER = [
2633 u'Event Name',
2634 u'Type',
2635 u'Event ID',
2636 u'Version',
2637 u'Channel',
2638 u'Level', # 5
2639 u'Opcode',
2640 u'Task',
2641 u'Keyword',
2642 u'PID',
2643 u'TID', # 10
2644 u'Processor Number',
2645 u'Instance ID',
2646 u'Parent Instance ID',
2647 u'Activity ID',
2648 u'Related Activity ID', # 15
2649 u'Clock-Time',
2650 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2651 u'User(ms)', # pretty much useless.
2652 u'User Data', # Extra arguments that are event-specific.
2653 ]
2654 # Only the useful headers common to all entries are listed there. Any column
2655 # at 19 or higher is dependent on the specific event.
2656 EVENT_NAME = 0
2657 TYPE = 1
2658 PID = 9
2659 TID = 10
2660 PROCESSOR_ID = 11
2661 TIMESTAMP = 16
2662 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2663 USER_DATA = 19
2664
2665 def __init__(self, logname):
2666 """Starts the log collection.
2667
2668 Requires administrative access. logman.exe is synchronous so no need for a
2669 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
2670 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
2671
2672 One can get the list of potentially interesting providers with:
2673 "logman query providers | findstr /i file"
2674 """
2675 super(LogmanTrace.Tracer, self).__init__(logname)
2676 self._script = create_thunk()
2677 cmd_start = [
2678 'logman.exe',
2679 'start',
2680 'NT Kernel Logger',
2681 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2682 # splitio,fileiocompletion,syscall,file,cswitch,img
2683 '(process,fileio,thread)',
2684 '-o', self._logname + '.etl',
2685 '-ets', # Send directly to kernel
2686 # Values extracted out of thin air.
2687 # Event Trace Session buffer size in kb.
2688 '-bs', '10240',
2689 # Number of Event Trace Session buffers.
2690 '-nb', '16', '256',
2691 ]
2692 logging.debug('Running: %s' % cmd_start)
2693 try:
2694 subprocess.check_call(
2695 cmd_start,
2696 stdin=subprocess.PIPE,
2697 stdout=subprocess.PIPE,
2698 stderr=subprocess.STDOUT)
2699 except subprocess.CalledProcessError, e:
2700 if e.returncode == -2147024891:
2701 print >> sys.stderr, 'Please restart with an elevated admin prompt'
2702 elif e.returncode == -2144337737:
2703 print >> sys.stderr, (
2704 'A kernel trace was already running, stop it and try again')
2705 raise
2706
2707 def trace(self, cmd, cwd, tracename, output):
2708 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2709 assert os.path.isabs(cmd[0]), cmd[0]
2710 assert os.path.isabs(cwd), cwd
2711 assert os.path.normpath(cwd) == cwd, cwd
2712 with self._lock:
2713 if not self._initialized:
2714 raise TracingFailure(
2715 'Called Tracer.trace() on an uninitialized object',
2716 None, None, None, tracename)
2717 assert tracename not in (i['trace'] for i in self._traces)
2718
2719 # Use "logman -?" for help.
2720
2721 stdout = stderr = None
2722 if output:
2723 stdout = subprocess.PIPE
2724 stderr = subprocess.STDOUT
2725
2726 # Run the child process.
2727 logging.debug('Running: %s' % cmd)
2728 # Use the temporary script generated with create_thunk() so we have a
2729 # clear pid owner. Since trace_inputs.py can be used as a library and
2730 # could trace multiple processes simultaneously, it would be more complex
2731 # if the executable to be traced were executed directly here. It also solves
2732 # issues related to logman.exe, which needs to be executed to control the
2733 # kernel trace.
2734 child_cmd = [
2735 sys.executable,
2736 self._script,
2737 tracename,
2738 ]
2739 child = subprocess.Popen(
2740 child_cmd + fix_python_path(cmd),
2741 cwd=cwd,
2742 stdin=subprocess.PIPE,
2743 stdout=stdout,
2744 stderr=stderr)
2745 logging.debug('Started child pid: %d' % child.pid)
2746 out = child.communicate()[0]
2747 # This doesn't mean all the grand-children are done. Sadly, we don't have
2748 # a good way to determine that.
2749
2750 with self._lock:
2751 assert tracename not in (i['trace'] for i in self._traces)
2752 self._traces.append({
2753 'cmd': cmd,
2754 'cwd': cwd,
2755 'pid': child.pid,
2756 'trace': tracename,
2757 'output': out,
2758 })
2759
2760 return child.returncode, out
2761
2762 def close(self, _timeout=None):
2763 """Stops the kernel log collection and converts the traces to text
2764 representation.
2765 """
2766 with self._lock:
2767 if not self._initialized:
2768 raise TracingFailure(
2769 'Called Tracer.close() on an uninitialized object',
2770 None, None, None)
2771 os.remove(self._script)
2772 # Save metadata, add 'format' key.
2773 data = {
2774 'format': 'csv',
2775 'traces': self._traces,
2776 }
2777 write_json(self._logname, data, False)
2778
2779 cmd_stop = [
2780 'logman.exe',
2781 'stop',
2782 'NT Kernel Logger',
2783 '-ets', # Sends the command directly to the kernel.
2784 ]
2785 logging.debug('Running: %s' % cmd_stop)
2786 subprocess.check_call(
2787 cmd_stop,
2788 stdin=subprocess.PIPE,
2789 stdout=subprocess.PIPE,
2790 stderr=subprocess.STDOUT)
2791 self._initialized = False
2792
2793 def post_process_log(self):
2794 """Converts the .etl file into .csv then into .json."""
2795 super(LogmanTrace.Tracer, self).post_process_log()
2796 logformat = 'csv'
2797 self._convert_log(logformat)
2798
2799 if logformat == 'csv_utf16':
2800 def load_file():
2801 def utf_8_encoder(unicode_csv_data):
2802 """Encodes the unicode object as utf-8 encoded str instance"""
2803 for line in unicode_csv_data:
2804 yield line.encode('utf-8')
2805
2806 def unicode_csv_reader(unicode_csv_data, **kwargs):
2807 """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
2808 """
2809 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
2810 for row in csv_reader:
2811 # Decode str utf-8 instances back to unicode instances, cell by
2812 # cell:
2813 yield [cell.decode('utf-8') for cell in row]
2814
2815 # The CSV file is UTF-16 so use codecs.open() to load the file into
2816 # the python internal unicode format (utf-8). Then explicitly
2817 # re-encode as utf8 as str instances so csv can parse it fine. Then
2818 # decode the utf-8 str back into python unicode instances. This
2819 # sounds about right.
2820 for line in unicode_csv_reader(
2821 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
2822 # line is a list of unicode objects
2823 # So much white space!
2824 yield [i.strip() for i in line]
2825
2826 elif logformat == 'csv':
2827 def load_file():
2828 def ansi_csv_reader(ansi_csv_data, **kwargs):
2829 """Loads an 'ANSI' code page and returns unicode() objects."""
2830 assert sys.getfilesystemencoding() == 'mbcs'
2831 encoding = get_current_encoding()
2832 for row in csv.reader(ansi_csv_data, **kwargs):
2833 # Decode str 'ansi' instances to unicode instances, cell by cell:
2834 yield [cell.decode(encoding) for cell in row]
2835
2836 # The fastest and smallest format but only supports 'ANSI' file paths.
2837 # E.g. the filenames are encoded in the 'current' encoding.
2838 for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
2839 # line is a list of unicode objects.
2840 yield [i.strip() for i in line]
2841
2842 supported_events = LogmanTrace.Context.supported_events()
2843
2844 def trim(generator):
2845 for index, line in enumerate(generator):
2846 if not index:
2847 if line != self.EXPECTED_HEADER:
2848 raise TracingFailure(
2849 'Found malformed header: %s' % ' '.join(line),
2850 None, None, None)
2851 continue
2852 # As you can see, the CSV is full of useful non-redundant information:
2853 if (line[2] != '0' or # Event ID
2854 line[3] not in ('2', '3') or # Version
2855 line[4] != '0' or # Channel
2856 line[5] != '0' or # Level
2857 line[7] != '0' or # Task
2858 line[8] != '0x0000000000000000' or # Keyword
2859 line[12] != '' or # Instance ID
2860 line[13] != '' or # Parent Instance ID
2861 line[14] != self.NULL_GUID or # Activity ID
2862 line[15] != ''): # Related Activity ID
2863 raise TracingFailure(
2864 'Found unexpected values in line: %s' % ' '.join(line),
2865 None, None, None)
2866
2867 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
2868 continue
2869
2870 # Convert the PID in-place from hex.
2871 line[self.PID] = int(line[self.PID], 16)
2872
2873 yield [
2874 line[self.EVENT_NAME],
2875 line[self.TYPE],
2876 line[self.PID],
2877 line[self.TID],
2878 line[self.PROCESSOR_ID],
2879 line[self.TIMESTAMP],
2880 ] + line[self.USER_DATA:]
2881
2882 write_json('%s.json' % self._logname, list(trim(load_file())), True)
2883
2884 def _convert_log(self, logformat):
2885 """Converts the ETL trace to text representation.
2886
2887 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
2888 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
2889
2890 Arguments:
2891 - logformat: Text format to be generated, csv, csv_utf16 or xml.
2892
2893 Use "tracerpt -?" for help.
2894 """
2895 LOCALE_INVARIANT = 0x7F
2896 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
2897 cmd_convert = [
2898 'tracerpt.exe',
2899 '-l', self._logname + '.etl',
2900 '-o', self._logname + '.' + logformat,
2901 '-gmt', # Use UTC
2902 '-y', # No prompt
2903 # Use -of XML to get the header of each items after column 19, e.g. all
2904 # the actual headers of 'User Data'.
2905 ]
2906
2907 if logformat == 'csv':
2908 # tracerpt localizes the 'Type' column, for major brainfuck
2909 # entertainment. I can't imagine any sane reason to do that.
2910 cmd_convert.extend(['-of', 'CSV'])
2911 elif logformat == 'csv_utf16':
2912 # This causes it to use UTF-16, which doubles the log size but ensures
2913 # the log is readable for non-ASCII characters.
2914 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
2915 elif logformat == 'xml':
2916 cmd_convert.extend(['-of', 'XML'])
2917 else:
2918 raise ValueError('Unexpected log format \'%s\'' % logformat)
2919 logging.debug('Running: %s' % cmd_convert)
2920 # This can take tens of minutes for large logs.
2921 # Redirects all output to stderr.
2922 subprocess.check_call(
2923 cmd_convert,
2924 stdin=subprocess.PIPE,
2925 stdout=sys.stderr,
2926 stderr=sys.stderr)
2927
2928 @staticmethod
2929 def clean_trace(logname):
2930 for ext in ('', '.csv', '.etl', '.json', '.xml'):
2931 if os.path.isfile(logname + ext):
2932 os.remove(logname + ext)
2933
2934 @classmethod
2935 def parse_log(cls, logname, blacklist):
2936 logging.info('parse_log(%s, %s)' % (logname, blacklist))
2937 assert os.path.isabs(logname)
2938
2939 def blacklist_more(filepath):
2940 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
2941 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
2942
2943 data = read_json(logname)
2944 lines = read_json(logname + '.json')
2945 out = []
2946 for item in data['traces']:
2947 context = cls.Context(blacklist_more, item['pid'])
2948 for line in lines:
2949 context.on_line(line)
2950 out.append(
2951 {
2952 'results': context.to_results(),
2953 'trace': item['trace'],
2954 'output': item['output'],
2955 })
2956 return out
2957
2958
2959def get_api():
2960 """Returns the correct implementation for the current OS."""
2961 if sys.platform == 'cygwin':
2962 raise NotImplementedError(
2963 'Not implemented for cygwin, start the script from Win32 python')
2964 flavors = {
2965 'win32': LogmanTrace,
2966 'darwin': Dtrace,
2967 'sunos5': Dtrace,
2968 'freebsd7': Dtrace,
2969 'freebsd8': Dtrace,
2970 }
2971 # Defaults to strace.
2972 return flavors.get(sys.platform, Strace)()
2973
2974
2975def extract_directories(root_dir, files, blacklist):
2976 """Detects if all the files in a directory are in |files| and if so, replaces
2977 the individual files with a Results.Directory instance.
2978
2979 Takes a list of Results.File instances and returns a shorter list of
2980 Results.File and Results.Directory instances.
2981
2982 Arguments:
2983 - root_dir: Optional base directory that shouldn't be searched further.
2984 - files: list of Results.File instances.
2985 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
2986 """
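# A small sketch of the reduction, assuming root_dir '/src' and traced files
# '/src/a/1.h' and '/src/a/2.h': if os.listdir('/src/a') contains nothing
# else once the blacklist is applied, both File entries are replaced by a
# single Results.Directory for 'a/' carrying the summed size and file count.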
2987 logging.info(
2988 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
2989 assert not (root_dir or '').endswith(os.path.sep), root_dir
2990 assert not root_dir or (get_native_path_case(root_dir) == root_dir)
2991 assert not any(isinstance(f, Results.Directory) for f in files)
2992 # Remove non existent files.
2993 files = [f for f in files if f.existent]
2994 if not files:
2995 return files
2996 # All files must share the same root, which can be None.
2997 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
2998
2999 # Creates a {directory: {filename: File}} mapping, up to root.
3000 buckets = {}
3001 if root_dir:
3002 buckets[root_dir] = {}
3003 for fileobj in files:
3004 path = fileobj.full_path
3005 directory = os.path.dirname(path)
3006 assert directory
3007 # Do not use os.path.basename() so trailing os.path.sep is kept.
3008 basename = path[len(directory)+1:]
3009 files_in_directory = buckets.setdefault(directory, {})
3010 files_in_directory[basename] = fileobj
3011 # Add all the directories recursively up to root.
3012 while True:
3013 old_d = directory
3014 directory = os.path.dirname(directory)
3015 if directory + os.path.sep == root_dir or directory == old_d:
3016 break
3017 buckets.setdefault(directory, {})
3018
3019 root_prefix = len(root_dir) + 1 if root_dir else 0
3020 for directory in sorted(buckets, reverse=True):
3021 actual = set(f for f in os.listdir(directory) if not blacklist(f))
3022 expected = set(buckets[directory])
3023 if not (actual - expected):
3024 parent = os.path.dirname(directory)
3025 buckets[parent][os.path.basename(directory)] = Results.Directory(
3026 root_dir,
3027 directory[root_prefix:],
3028 False,
3029 sum(f.size for f in buckets[directory].itervalues()),
3030 sum(f.nb_files for f in buckets[directory].itervalues()))
3031 # Remove the whole bucket.
3032 del buckets[directory]
3033
3034 # Reverse the mapping with what remains. The original instances are returned,
3035 # so the cached meta data is kept.
3036 files = sum((x.values() for x in buckets.itervalues()), [])
3037 return sorted(files, key=lambda x: x.path)
3038
3039
3040def trace(logfile, cmd, cwd, api, output):
3041 """Traces an executable. Returns (returncode, output) from api.
3042
3043 Arguments:
3044 - logfile: file to write to.
3045 - cmd: command to run.
3046 - cwd: current directory to start the process in.
3047 - api: a tracing api instance.
3048 - output: if True, returns output, otherwise prints it at the console.
3049 """
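# Minimal usage sketch (the log path and command are hypothetical):
#   api = get_api()
#   returncode, out = trace(
#       '/tmp/app.log', [sys.executable, 'app.py'], os.getcwd(), api, True)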
3050 cmd = fix_python_path(cmd)
3051 api.clean_trace(logfile)
3052 with api.get_tracer(logfile) as tracer:
3053 return tracer.trace(cmd, cwd, 'default', output)
3054
3055
3056def load_trace(logfile, root_dir, api, blacklist):
3057 """Loads a trace file and returns the Results instance.
3058
3059 Arguments:
3060 - logfile: File to load.
3061 - root_dir: Root directory to use to determine if a file is relevant to the
3062 trace or not.
3063 - api: A tracing api instance.
3064 - blacklist: Optional blacklist function to filter out unimportant files.
3065 """
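# Minimal usage sketch, continuing the trace() example above (hypothetical
# paths; blacklist left as None so no file is filtered out):
#   results = load_trace('/tmp/app.log', ROOT_DIR, get_api(), None)
#   print('\n'.join(f.path for f in results.existent))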
3066 data = api.parse_log(logfile, (blacklist or (lambda _: False)))
3067 assert len(data) == 1, 'More than one trace was detected!'
3068 if 'exception' in data[0]:
3069 # It got an exception, raise it.
3070 raise \
3071 data[0]['exception'][0], \
3072 data[0]['exception'][1], \
3073 data[0]['exception'][2]
3074 results = data[0]['results']
3075 if root_dir:
3076 results = results.strip_root(root_dir)
3077 return results
3078
3079
3080def CMDclean(args):
3081 """Cleans up traces."""
3082 parser = OptionParserTraceInputs(command='clean')
3083 options, args = parser.parse_args(args)
3084 api = get_api()
3085 api.clean_trace(options.log)
3086 return 0
3087
3088
3089def CMDtrace(args):
3090 """Traces an executable."""
3091 parser = OptionParserTraceInputs(command='trace')
3092 parser.allow_interspersed_args = False
3093 parser.add_option(
3094 '-q', '--quiet', action='store_true',
3095 help='Redirects traced executable output to /dev/null')
3096 options, args = parser.parse_args(args)
3097
3098 if not args:
3099 parser.error('Please provide a command to run')
3100
3101 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3102 args[0] = os.path.abspath(args[0])
3103
3104 api = get_api()
3105 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3106
3107
3108def CMDread(args):
3109 """Reads the logs and prints the result."""
3110 parser = OptionParserTraceInputs(command='read')
3111 parser.add_option(
3112 '-V', '--variable',
3113 nargs=2,
3114 action='append',
3115 dest='variables',
3116 metavar='VAR_NAME directory',
3117 default=[],
3118 help=('Variables to replace relative directories against. Example: '
3119 '"-V \'$HOME\' \'/home/%s\'" will replace all occurrences of your '
3120 'home dir with $HOME') % getpass.getuser())
3121 parser.add_option(
3122 '--root-dir',
3123 help='Root directory to base everything off of. Anything outside of this '
3124 'directory will not be reported')
3125 parser.add_option(
3126 '-j', '--json', action='store_true',
3127 help='Outputs raw result data as json')
3128 parser.add_option(
3129 '-b', '--blacklist', action='append', default=[],
3130 help='List of regexp to use as blacklist filter')
3131 options, args = parser.parse_args(args)
3132
3133 if options.root_dir:
3134 options.root_dir = os.path.abspath(options.root_dir)
3135
3136 variables = dict(options.variables)
3137 api = get_api()
3138 def blacklist(f):
3139 return any(re.match(b, f) for b in options.blacklist)
3140 data = api.parse_log(options.log, blacklist)
3141 # Process each trace.
3142 output_as_json = []
3143 for item in data:
3144 if 'exception' in item:
3145 # Do not abort the other traces.
3146 print >> sys.stderr, (
3147 'Trace %s: Got an exception: %s' % (
3148 item['trace'], item['exception'][1]))
3149 continue
3150 results = item['results']
3151 if options.root_dir:
3152 results = results.strip_root(options.root_dir)
3153
3154 if options.json:
3155 output_as_json.append(results.flatten())
3156 else:
3157 simplified = extract_directories(
3158 options.root_dir, results.files, blacklist)
3159 simplified = [f.replace_variables(variables) for f in simplified]
3160 if len(data) > 1:
3161 print('Trace: %s' % item['trace'])
3162 print('Total: %d' % len(results.files))
3163 print('Non existent: %d' % len(results.non_existent))
3164 for f in results.non_existent:
3165 print(' %s' % f.path)
3166 print(
3167 'Interesting: %d reduced to %d' % (
3168 len(results.existent), len(simplified)))
3169 for f in simplified:
3170 print(' %s' % f.path)
3171
3172 if options.json:
3173 write_json(sys.stdout, output_as_json, False)
3174 return 0
3175
3176
3177class OptionParserWithLogging(optparse.OptionParser):
3178 """Adds --verbose option."""
3179 def __init__(self, verbose=0, **kwargs):
3180 optparse.OptionParser.__init__(self, **kwargs)
3181 self.add_option(
3182 '-v', '--verbose',
3183 action='count',
3184 default=verbose,
3185 help='Use multiple times to increase verbosity')
3186
3187 def parse_args(self, *args, **kwargs):
3188 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3189 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3190 logging.basicConfig(
3191 level=levels[min(len(levels)-1, options.verbose)],
3192 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3193 return options, args
3194
3195
3196class OptionParserWithNiceDescription(OptionParserWithLogging):
3197 """Generates the description with the command's docstring."""
3198 def __init__(self, **kwargs):
3199 """Sets 'description' and 'usage' if not already specified."""
3200 command = kwargs.pop('command', 'help')
3201 kwargs.setdefault(
3202 'description',
3203 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3204 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3205 OptionParserWithLogging.__init__(self, **kwargs)
3206
3207
3208class OptionParserTraceInputs(OptionParserWithNiceDescription):
3209 """Adds automatic --log handling."""
3210 def __init__(self, **kwargs):
3211 OptionParserWithNiceDescription.__init__(self, **kwargs)
3212 self.add_option(
3213 '-l', '--log', help='Log file to generate or read, required')
3214
3215 def parse_args(self, *args, **kwargs):
3216 """Makes sure the paths make sense.
3217
3218 On Windows, / and \ are often mixed together in a path.
3219 """
3220 options, args = OptionParserWithNiceDescription.parse_args(
3221 self, *args, **kwargs)
3222 if not options.log:
3223 self.error('Must supply a log file with -l')
3224 options.log = os.path.abspath(options.log)
3225 return options, args
3226
3227
3228def extract_documentation():
3229 """Returns a dict {command: description} for each documented command."""
3230 commands = (
3231 fn[3:]
3232 for fn in dir(sys.modules['__main__'])
3233 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3234 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3235
3236
3237def CMDhelp(args):
3238 """Prints list of commands or help for a specific command."""
3239 doc = extract_documentation()
3240 # Calculates the optimal offset.
3241 offset = max(len(cmd) for cmd in doc)
3242 format_str = ' %-' + str(offset + 2) + 's %s'
3243 # Generate a one-line documentation string for each command.
3244 commands_description = '\n'.join(
3245 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3246
3247 parser = OptionParserWithNiceDescription(
3248 usage='%prog <command> [options]',
3249 description='Commands are:\n%s\n' % commands_description)
3250 parser.format_description = lambda _: parser.description
3251
3252 # Strip out any -h or --help argument.
3253 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3254 if len(args) == 1:
3255 if not get_command_handler(args[0]):
3256 parser.error('Unknown command %s' % args[0])
3257 # The command was "%prog help command", replaces ourself with
3258 # "%prog command --help" so help is correctly printed out.
3259 return main(args + ['--help'])
3260 elif args:
3261 parser.error('Unknown argument "%s"' % ' '.join(args))
3262 parser.print_help()
3263 return 0
3264
3265
3266def get_command_handler(name):
3267 """Returns the command handler or CMDhelp if it doesn't exist."""
3268 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3269
3270
3271def main_impl(argv):
3272 command = get_command_handler(argv[0] if argv else 'help')
3273 if not command:
3274 return CMDhelp(argv)
3275 return command(argv[1:])
3276
3277def main(argv):
3278 try:
3279 return main_impl(argv)
3280 except TracingFailure, e:
3281 sys.stderr.write('\nError: ')
3282 sys.stderr.write(str(e))
3283 sys.stderr.write('\n')
3284 return 1
3285
3286
3287if __name__ == '__main__':
3288 sys.exit(main(sys.argv[1:]))