Blame - trace_inputs.py - chromium.googlesource.com/infra/luci/client-py

blob: ab2c95d50e66f7511a7614ec63c639da90aeeef5 [file] [log] [blame]

maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1	#!/usr/bin/env python
				2	# coding=utf-8
				3	# Copyright (c) 2012 The Chromium Authors. All rights reserved.
				4	# Use of this source code is governed by a BSD-style license that can be
				5	# found in the LICENSE file.
				6
				7	"""Traces an executable and its child processes and extract the files accessed
				8	by them.
				9
				10	The implementation uses OS-specific API. The native Kernel logger and the ETL
				11	interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
				12	The OS-specific implementation is hidden in an 'API' interface.
				13
				14	The results are embedded in a Results instance. The tracing is done in two
				15	phases, the first is to do the actual trace and generate an
				16	implementation-specific log file. Then the log file is parsed to extract the
				17	information, including the individual child processes and the files accessed
				18	from the log.
				19	"""
				20
				21	import codecs
				22	import csv
				23	import getpass
				24	import glob
				25	import json
				26	import logging
				27	import optparse
				28	import os
				29	import re
				30	import subprocess
				31	import sys
				32	import tempfile
				33	import threading
				34	import time
				35	import weakref
				36
				37	## OS-specific imports
				38
				39	if sys.platform == 'win32':
				40	from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
				41	from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
				42	from ctypes.wintypes import GetLastError # pylint: disable=E0611
				43	elif sys.platform == 'darwin':
				44	import Carbon.File # pylint: disable=F0401
				45	import MacOS # pylint: disable=F0401
				46
				47
				48	BASE_DIR = os.path.dirname(os.path.abspath(__file__))
				49	ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
				50
				51
				52	class TracingFailure(Exception):
				53	"""An exception occured during tracing."""
				54	def __init__(self, description, pid, line_number, line, *args):
				55	super(TracingFailure, self).__init__(
				56	description, pid, line_number, line, *args)
				57	self.description = description
				58	self.pid = pid
				59	self.line_number = line_number
				60	self.line = line
				61	self.extra = args
				62
				63	def __str__(self):
				64	out = self.description
				65	if self.pid:
				66	out += '\npid: %d' % self.pid
				67	if self.line_number:
				68	out += '\nline: %d' % self.line_number
				69	if self.line:
				70	out += '\n%s' % self.line
				71	if self.extra:
				72	out += '\n' + ', '.join(map(str, filter(None, self.extra)))
				73	return out
				74
				75
				76	## OS-specific functions
				77
				78	if sys.platform == 'win32':
				79	def QueryDosDevice(drive_letter):
				80	"""Returns the Windows 'native' path for a DOS drive letter."""
				81	assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
				82	# Guesswork. QueryDosDeviceW never returns the required number of bytes.
				83	chars = 1024
				84	drive_letter = unicode(drive_letter)
				85	p = create_unicode_buffer(chars)
				86	if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
				87	err = GetLastError()
				88	if err:
				89	# pylint: disable=E0602
				90	raise WindowsError(
				91	err,
				92	'QueryDosDevice(%s): %s (%d)' % (
				93	str(drive_letter), FormatError(err), err))
				94	return p.value
				95
				96
				97	def GetShortPathName(long_path):
				98	"""Returns the Windows short path equivalent for a 'long' path."""
				99	long_path = unicode(long_path)
				100	# Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
				101	# not enforced.
				102	if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
				103	long_path = '\\\\?\\' + long_path
				104	chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
				105	if chars:
				106	p = create_unicode_buffer(chars)
				107	if windll.kernel32.GetShortPathNameW(long_path, p, chars):
				108	return p.value
				109
				110	err = GetLastError()
				111	if err:
				112	# pylint: disable=E0602
				113	raise WindowsError(
				114	err,
				115	'GetShortPathName(%s): %s (%d)' % (
				116	str(long_path), FormatError(err), err))
				117
				118
				119	def GetLongPathName(short_path):
				120	"""Returns the Windows long path equivalent for a 'short' path."""
				121	short_path = unicode(short_path)
				122	# Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
				123	# not enforced.
				124	if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
				125	short_path = '\\\\?\\' + short_path
				126	chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
				127	if chars:
				128	p = create_unicode_buffer(chars)
				129	if windll.kernel32.GetLongPathNameW(short_path, p, chars):
				130	return p.value
				131
				132	err = GetLastError()
				133	if err:
				134	# pylint: disable=E0602
				135	raise WindowsError(
				136	err,
				137	'GetLongPathName(%s): %s (%d)' % (
				138	str(short_path), FormatError(err), err))
				139
				140
				141	def get_current_encoding():
				142	"""Returns the 'ANSI' code page associated to the process."""
				143	return 'cp%d' % int(windll.kernel32.GetACP())
				144
				145
				146	class DosDriveMap(object):
				147	"""Maps \Device\HarddiskVolumeN to N: on Windows."""
				148	# Keep one global cache.
				149	_MAPPING = {}
				150
				151	def __init__(self):
				152	"""Lazy loads the cache."""
				153	if not self._MAPPING:
				154	# This is related to UNC resolver on windows. Ignore that.
				155	self._MAPPING['\\Device\\Mup'] = None
				156	self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot']
				157
				158	for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
				159	try:
				160	letter = '%s:' % letter
				161	mapped = QueryDosDevice(letter)
				162	if mapped in self._MAPPING:
				163	logging.warn(
				164	('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
				165	'. Drive letters are a user-mode concept and the kernel '
				166	'traces only have NT path, so all accesses will be '
				167	'associated with the first drive letter, independent of the '
				168	'actual letter used by the code') % (
				169	self._MAPPING[mapped], letter))
				170	else:
				171	self._MAPPING[mapped] = letter
				172	except WindowsError: # pylint: disable=E0602
				173	pass
				174
				175	def to_win32(self, path):
				176	"""Converts a native NT path to Win32/DOS compatible path."""
				177	match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
				178	if not match:
				179	raise ValueError(
				180	'Can\'t convert %s into a Win32 compatible path' % path,
				181	path)
				182	if not match.group(1) in self._MAPPING:
				183	# Unmapped partitions may be accessed by windows for the
				184	# fun of it while the test is running. Discard these.
				185	return None
				186	drive = self._MAPPING[match.group(1)]
				187	if not drive or not match.group(2):
				188	return drive
				189	return drive + match.group(2)
				190
				191
				192	def isabs(path):
				193	"""Accepts X: as an absolute path, unlike python's os.path.isabs()."""
				194	return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
				195
				196
				197	def get_native_path_case(p):
				198	"""Returns the native path case for an existing file.
				199
				200	On Windows, removes any leading '\\?\'.
				201	"""
				202	if not isabs(p):
				203	raise ValueError(
				204	'Can\'t get native path case for a non-absolute path: %s' % p,
				205	p)
				206	# Windows used to have an option to turn on case sensitivity on non Win32
				207	# subsystem but that's out of scope here and isn't supported anymore.
				208	# Go figure why GetShortPathName() is needed.
				209	try:
				210	out = GetLongPathName(GetShortPathName(p))
				211	except OSError, e:
				212	if e.args[0] in (2, 3, 5):
				213	# The path does not exist. Try to recurse and reconstruct the path.
				214	base = os.path.dirname(p)
				215	rest = os.path.basename(p)
				216	return os.path.join(get_native_path_case(base), rest)
				217	raise
				218	if out.startswith('\\\\?\\'):
				219	out = out[4:]
				220	# Always upper case the first letter since GetLongPathName() will return the
				221	# drive letter in the case it was given.
				222	return out[0].upper() + out[1:]
				223
				224
				225	def CommandLineToArgvW(command_line):
				226	"""Splits a commandline into argv using CommandLineToArgvW()."""
				227	# http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
				228	size = c_int()
				229	ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size))
				230	try:
				231	return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
				232	finally:
				233	windll.kernel32.LocalFree(ptr)
				234
				235
				236	elif sys.platform == 'darwin':
				237
				238
				239	# On non-windows, keep the stdlib behavior.
				240	isabs = os.path.isabs
				241
				242
				243	def _find_item_native_case(root_path, item):
				244	"""Gets the native path case of a single item based at root_path.
				245
				246	There is no API to get the native path case of symlinks on OSX. So it
				247	needs to be done the slow way.
				248	"""
				249	item = item.lower()
				250	for element in os.listdir(root_path):
				251	if element.lower() == item:
				252	return element
				253
				254
				255	def _native_case(p):
				256	"""Gets the native path case. Warning: this function resolves symlinks."""
				257	logging.debug('native_case(%s)' % p)
				258	try:
				259	rel_ref, _ = Carbon.File.FSPathMakeRef(p)
				260	out = rel_ref.FSRefMakePath()
				261	if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
				262	return out + os.path.sep
				263	return out
				264	except MacOS.Error, e:
				265	if e.args[0] in (-43, -120):
				266	# The path does not exist. Try to recurse and reconstruct the path.
				267	# -43 means file not found.
				268	# -120 means directory not found.
				269	base = os.path.dirname(p)
				270	rest = os.path.basename(p)
				271	return os.path.join(_native_case(base), rest)
				272	raise OSError(
				273	e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
				274
				275
				276	def _split_at_symlink_native(base_path, rest):
				277	"""Returns the native path for a symlink."""
				278	base, symlink, rest = split_at_symlink(base_path, rest)
				279	if symlink:
				280	if not base_path:
				281	base_path = base
				282	else:
				283	base_path = safe_join(base_path, base)
				284	symlink = _find_item_native_case(base_path, symlink)
				285	return base, symlink, rest
				286
				287
				288	def get_native_path_case(path):
				289	"""Returns the native path case for an existing file.
				290
				291	Technically, it's only HFS+ on OSX that is case preserving and
				292	insensitive. It's the default setting on HFS+ but can be changed.
				293	"""
				294	if not isabs(path):
				295	raise ValueError(
				296	'Can\'t get native path case for a non-absolute path: %s' % path,
				297	path)
				298	if path.startswith('/dev'):
				299	# /dev is not visible from Carbon, causing an exception.
				300	return path
				301
				302	# Starts assuming there is no symlink along the path.
				303	resolved = _native_case(path)
				304	if resolved.lower() == path.lower():
				305	# This code path is incredibly faster.
				306	return resolved
				307
				308	# There was a symlink, process it.
				309	base, symlink, rest = _split_at_symlink_native(None, path)
				310	assert symlink, (path, base, symlink, rest, resolved)
				311	prev = base
				312	base = safe_join(_native_case(base), symlink)
				313	assert len(base) > len(prev)
				314	while rest:
				315	prev = base
				316	relbase, symlink, rest = _split_at_symlink_native(base, rest)
				317	base = safe_join(base, relbase)
				318	assert len(base) > len(prev), (prev, base, symlink)
				319	if symlink:
				320	base = safe_join(base, symlink)
				321	assert len(base) > len(prev), (prev, base, symlink)
				322	# Make sure no symlink was resolved.
				323	assert base.lower() == path.lower(), (base, path)
				324	return base
				325
				326
				327	else: # OSes other than Windows and OSX.
				328
				329
				330	# On non-windows, keep the stdlib behavior.
				331	isabs = os.path.isabs
				332
				333
				334	def get_native_path_case(path):
				335	"""Returns the native path case for an existing file.
				336
				337	On OSes other than OSX and Windows, assume the file system is
				338	case-sensitive.
				339
				340	TODO(maruel): This is not strictly true. Implement if necessary.
				341	"""
				342	if not isabs(path):
				343	raise ValueError(
				344	'Can\'t get native path case for a non-absolute path: %s' % path,
				345	path)
				346	# Give up on cygwin, as GetLongPathName() can't be called.
				347	# Linux traces tends to not be normalized so use this occasion to normalize
				348	# it. This function implementation already normalizes the path on the other
				349	# OS so this needs to be done here to be coherent between OSes.
				350	out = os.path.normpath(path)
				351	if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
				352	return out + os.path.sep
				353	return out
				354
				355
				356	if sys.platform != 'win32': # All non-Windows OSes.
				357
				358
				359	def safe_join(*args):
				360	"""Joins path elements like os.path.join() but doesn't abort on absolute
				361	path.
				362
				363	os.path.join('foo', '/bar') == '/bar'
				364	but safe_join('foo', '/bar') == 'foo/bar'.
				365	"""
				366	out = ''
				367	for element in args:
				368	if element.startswith(os.path.sep):
				369	if out.endswith(os.path.sep):
				370	out += element[1:]
				371	else:
				372	out += element
				373	else:
				374	if out.endswith(os.path.sep):
				375	out += element
				376	else:
				377	out += os.path.sep + element
				378	return out
				379
				380
				381	def split_at_symlink(base_dir, relfile):
				382	"""Scans each component of relfile and cut the string at the symlink if
				383	there is any.
				384
				385	Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
				386	not symlink was found.
				387	"""
				388	if base_dir:
				389	assert relfile
				390	assert os.path.isabs(base_dir)
				391	index = 0
				392	else:
				393	assert os.path.isabs(relfile)
				394	index = 1
				395
				396	def at_root(rest):
				397	if base_dir:
				398	return safe_join(base_dir, rest)
				399	return rest
				400
				401	while True:
				402	try:
				403	index = relfile.index(os.path.sep, index)
				404	except ValueError:
				405	index = len(relfile)
				406	full = at_root(relfile[:index])
				407	if os.path.islink(full):
				408	# A symlink!
				409	base = os.path.dirname(relfile[:index])
				410	symlink = os.path.basename(relfile[:index])
				411	rest = relfile[index:]
				412	logging.debug(
				413	'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
				414	(base_dir, relfile, base, symlink, rest))
				415	return base, symlink, rest
				416	if index == len(relfile):
				417	break
				418	index += 1
				419	return relfile, None, None
				420
				421
				422	def fix_python_path(cmd):
				423	"""Returns the fixed command line to call the right python executable."""
				424	out = cmd[:]
				425	if out[0] == 'python':
				426	out[0] = sys.executable
				427	elif out[0].endswith('.py'):
				428	out.insert(0, sys.executable)
				429	return out
				430
				431
				432	def create_thunk():
				433	handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
				434	os.write(
				435	handle,
				436	(
				437	'import subprocess\n'
				438	'import sys\n'
				439	'sys.exit(subprocess.call(sys.argv[2:]))\n'
				440	))
				441	os.close(handle)
				442	return name
				443
				444
				445	def strace_process_quoted_arguments(text):
				446	"""Extracts quoted arguments on a string and return the arguments as a list.
				447
				448	Implemented as an automaton. Supports incomplete strings in the form
				449	'"foo"...'.
				450
				451	Example:
				452	With text = '"foo", "bar"', the function will return ['foo', 'bar']
				453
				454	TODO(maruel): Implement escaping.
				455	"""
				456	# All the possible states of the DFA.
				457	( NEED_QUOTE, # Begining of a new arguments.
				458	INSIDE_STRING, # Inside an argument.
				459	ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
				460	NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
				461	# a serie of 3 dots or a comma.
				462	NEED_SPACE, # Right after a comma
				463	NEED_DOT_2, # Found a dot, need a second one.
				464	NEED_DOT_3, # Found second dot, need a third one.
				465	NEED_COMMA, # Found third dot, need a comma.
				466	) = range(8)
				467
				468	state = NEED_QUOTE
				469	out = []
				470	for index, char in enumerate(text):
				471	if char == '"':
				472	if state == NEED_QUOTE:
				473	state = INSIDE_STRING
				474	# A new argument was found.
				475	out.append('')
				476	elif state == INSIDE_STRING:
				477	# The argument is now closed.
				478	state = NEED_COMMA_OR_DOT
				479	elif state == ESCAPED:
				480	out[-1] += char
				481	state = INSIDE_STRING
				482	else:
				483	raise ValueError(
				484	'Can\'t process char at column %d for: %r' % (index, text),
				485	index,
				486	text)
				487	elif char == ',':
				488	if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
				489	state = NEED_SPACE
				490	elif state == INSIDE_STRING:
				491	out[-1] += char
				492	elif state == ESCAPED:
				493	out[-1] += char
				494	state = INSIDE_STRING
				495	else:
				496	raise ValueError(
				497	'Can\'t process char at column %d for: %r' % (index, text),
				498	index,
				499	text)
				500	elif char == ' ':
				501	if state == NEED_SPACE:
				502	state = NEED_QUOTE
				503	elif state == INSIDE_STRING:
				504	out[-1] += char
				505	elif state == ESCAPED:
				506	out[-1] += char
				507	state = INSIDE_STRING
				508	else:
				509	raise ValueError(
				510	'Can\'t process char at column %d for: %r' % (index, text),
				511	index,
				512	text)
				513	elif char == '.':
				514	if state == NEED_COMMA_OR_DOT:
				515	# The string is incomplete, this mean the strace -s flag should be
				516	# increased.
				517	state = NEED_DOT_2
				518	elif state == NEED_DOT_2:
				519	state = NEED_DOT_3
				520	elif state == NEED_DOT_3:
				521	state = NEED_COMMA
				522	elif state == INSIDE_STRING:
				523	out[-1] += char
				524	elif state == ESCAPED:
				525	out[-1] += char
				526	state = INSIDE_STRING
				527	else:
				528	raise ValueError(
				529	'Can\'t process char at column %d for: %r' % (index, text),
				530	index,
				531	text)
				532	elif char == '\\':
				533	if state == ESCAPED:
				534	out[-1] += char
				535	state = INSIDE_STRING
				536	elif state == INSIDE_STRING:
				537	state = ESCAPED
				538	else:
				539	raise ValueError(
				540	'Can\'t process char at column %d for: %r' % (index, text),
				541	index,
				542	text)
				543	else:
				544	if state == INSIDE_STRING:
				545	out[-1] += char
				546	else:
				547	raise ValueError(
				548	'Can\'t process char at column %d for: %r' % (index, text),
				549	index,
				550	text)
				551	if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
				552	raise ValueError(
				553	'String is incorrectly terminated: %r' % text,
				554	text)
				555	return out
				556
				557
				558	def read_json(filepath):
				559	with open(filepath, 'r') as f:
				560	return json.load(f)
				561
				562
				563	def write_json(filepath_or_handle, data, dense):
				564	"""Writes data into filepath or file handle encoded as json.
				565
				566	If dense is True, the json is packed. Otherwise, it is human readable.
				567	"""
				568	if hasattr(filepath_or_handle, 'write'):
				569	if dense:
				570	filepath_or_handle.write(
				571	json.dumps(data, sort_keys=True, separators=(',',':')))
				572	else:
				573	filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
				574	else:
				575	with open(filepath_or_handle, 'wb') as f:
				576	if dense:
				577	json.dump(data, f, sort_keys=True, separators=(',',':'))
				578	else:
				579	json.dump(data, f, sort_keys=True, indent=2)
				580
				581
				582	class Results(object):
				583	"""Results of a trace session."""
				584
				585	class _TouchedObject(object):
				586	"""Something, a file or a directory, that was accessed."""
				587	def __init__(self, root, path, tainted, size, nb_files):
				588	logging.debug(
				589	'%s(%s, %s, %s, %s, %s)' %
				590	(self.__class__.__name__, root, path, tainted, size, nb_files))
				591	self.root = root
				592	self.path = path
				593	self.tainted = tainted
				594	self.nb_files = nb_files
				595	# Can be used as a cache or a default value, depending on context.
				596	self._size = size
				597	# These are cache only.
				598	self._real_path = None
				599
				600	# Check internal consistency.
				601	assert path, path
				602	assert tainted or bool(root) != bool(isabs(path)), (root, path)
				603	assert tainted or (
				604	not os.path.exists(self.full_path) or
				605	(self.full_path == get_native_path_case(self.full_path))), (
				606	tainted, self.full_path, get_native_path_case(self.full_path))
				607
				608	@property
				609	def existent(self):
				610	return self.size != -1
				611
				612	@property
				613	def full_path(self):
				614	if self.root:
				615	return os.path.join(self.root, self.path)
				616	return self.path
				617
				618	@property
				619	def real_path(self):
				620	"""Returns the path with symlinks resolved."""
				621	if not self._real_path:
				622	self._real_path = os.path.realpath(self.full_path)
				623	return self._real_path
				624
				625	@property
				626	def size(self):
				627	"""File's size. -1 is not existent."""
				628	if self._size is None and not self.tainted:
				629	try:
				630	self._size = os.stat(self.full_path).st_size
				631	except OSError:
				632	self._size = -1
				633	return self._size
				634
				635	def flatten(self):
				636	"""Returns a dict representing this object.
				637
				638	A 'size' of 0 means the file was only touched and not read.
				639	"""
				640	return {
				641	'path': self.path,
				642	'size': self.size,
				643	}
				644
				645	def replace_variables(self, variables):
				646	"""Replaces the root of this File with one of the variables if it matches.
				647
				648	If a variable replacement occurs, the cloned object becomes tainted.
				649	"""
				650	for variable, root_path in variables.iteritems():
				651	if self.path.startswith(root_path):
				652	return self._clone(
				653	self.root, variable + self.path[len(root_path):], True)
				654	# No need to clone, returns ourself.
				655	return self
				656
				657	def strip_root(self, root):
				658	"""Returns a clone of itself with 'root' stripped off."""
				659	# Check internal consistency.
				660	assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
				661	if not self.full_path.startswith(root):
				662	# Now try to resolve the symlinks to see if it can be reached this way.
				663	# Only try after trying without resolving symlink.
				664	if not self.real_path.startswith(root):
				665	return None
				666	path = self.real_path
				667	else:
				668	path = self.full_path
				669	return self._clone(root, path[len(root):], self.tainted)
				670
				671	def _clone(self, new_root, new_path, tainted):
				672	raise NotImplementedError(self.__class__.__name__)
				673
				674	class File(_TouchedObject):
				675	"""A file that was accessed. May not be present anymore.
				676
				677	If tainted is true, it means it is not a real path anymore as a variable
				678	replacement occured.
				679
				680	If touched_only is True, this means the file was probed for existence, and
				681	it is existent, but was never _opened_. If touched_only is True, the file
				682	must have existed.
				683	"""
				684	def __init__(self, root, path, tainted, size):
				685	super(Results.File, self).__init__(root, path, tainted, size, 1)
				686
				687	def _clone(self, new_root, new_path, tainted):
				688	"""Clones itself keeping meta-data."""
				689	# Keep the self.size and self._real_path caches for performance reason. It
				690	# is also important when the file becomes tainted (with a variable instead
				691	# of the real path) since self.path is not an on-disk path anymore so
				692	# out._size cannot be updated.
				693	out = self.__class__(new_root, new_path, tainted, self.size)
				694	out._real_path = self._real_path
				695	return out
				696
				697	class Directory(_TouchedObject):
				698	"""A directory of files. Must exist."""
				699	def __init__(self, root, path, tainted, size, nb_files):
				700	"""path='.' is a valid value and must be handled appropriately."""
				701	assert not path.endswith(os.path.sep), path
				702	super(Results.Directory, self).__init__(
				703	root, path + os.path.sep, tainted, size, nb_files)
				704	# For a Directory instance, self.size is not a cache, it's an actual value
				705	# that is never modified and represents the total size of the files
				706	# contained in this directory. It is possible that the directory is empty
				707	# so that size == 0; this happens if there's only an invalid symlink in
				708	# it.
				709
				710	def flatten(self):
				711	out = super(Results.Directory, self).flatten()
				712	out['nb_files'] = self.nb_files
				713	return out
				714
				715	def _clone(self, new_root, new_path, tainted):
				716	"""Clones itself keeping meta-data."""
				717	out = self.__class__(
				718	new_root,
				719	new_path.rstrip(os.path.sep),
				720	tainted,
				721	self.size,
				722	self.nb_files)
				723	out._real_path = self._real_path
				724	return out
				725
				726	class Process(object):
				727	"""A process that was traced.
				728
				729	Contains references to the files accessed by this process and its children.
				730	"""
				731	def __init__(self, pid, files, executable, command, initial_cwd, children):
				732	logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
				733	self.pid = pid
				734	self.files = sorted(files, key=lambda x: x.path)
				735	self.children = children
				736	self.executable = executable
				737	self.command = command
				738	self.initial_cwd = initial_cwd
				739
				740	# Check internal consistency.
				741	assert len(set(f.path for f in self.files)) == len(self.files), sorted(
				742	f.path for f in self.files)
				743	assert isinstance(self.children, list)
				744	assert isinstance(self.files, list)
				745
				746	@property
				747	def all(self):
				748	for child in self.children:
				749	for i in child.all:
				750	yield i
				751	yield self
				752
				753	def flatten(self):
				754	return {
				755	'children': [c.flatten() for c in self.children],
				756	'command': self.command,
				757	'executable': self.executable,
				758	'files': [f.flatten() for f in self.files],
				759	'initial_cwd': self.initial_cwd,
				760	'pid': self.pid,
				761	}
				762
				763	def strip_root(self, root):
				764	assert isabs(root) and root.endswith(os.path.sep), root
				765	# Loads the files after since they are constructed as objects.
				766	out = self.__class__(
				767	self.pid,
				768	filter(None, (f.strip_root(root) for f in self.files)),
				769	self.executable,
				770	self.command,
				771	self.initial_cwd,
				772	[c.strip_root(root) for c in self.children])
				773	logging.debug(
				774	'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
				775	return out
				776
				777	def __init__(self, process):
				778	self.process = process
				779	# Cache.
				780	self._files = None
				781
				782	def flatten(self):
				783	return {
				784	'root': self.process.flatten(),
				785	}
				786
				787	@property
				788	def files(self):
				789	if self._files is None:
				790	self._files = sorted(
				791	sum((p.files for p in self.process.all), []),
				792	key=lambda x: x.path)
				793	return self._files
				794
				795	@property
				796	def existent(self):
				797	return [f for f in self.files if f.existent]
				798
				799	@property
				800	def non_existent(self):
				801	return [f for f in self.files if not f.existent]
				802
				803	def strip_root(self, root):
				804	"""Returns a clone with all the files outside the directory \|root\| removed
				805	and converts all the path to be relative paths.
				806	"""
				807	# Resolve any symlink
				808	root = os.path.realpath(root)
				809	root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
				810	logging.debug('strip_root(%s)' % root)
				811	return Results(self.process.strip_root(root))
				812
				813
				814	class ApiBase(object):
				815	"""OS-agnostic API to trace a process and its children."""
				816	class Context(object):
				817	"""Processes one log line at a time and keeps the list of traced processes.
				818
				819	The parsing is complicated by the fact that logs are traced out of order for
				820	strace but in-order for dtrace and logman. In addition, on Windows it is
				821	very frequent that processids are reused so a flat list cannot be used. But
				822	at the same time, it is impossible to faithfully construct a graph when the
				823	logs are processed out of order. So both a tree and a flat mapping are used,
				824	the tree is the real process tree, while the flat mapping stores the last
				825	valid process for the corresponding processid. For the strace case, the
				826	tree's head is guessed at the last moment.
				827	"""
				828	class Process(object):
				829	"""Keeps context for one traced child process.
				830
				831	Logs all the files this process touched. Ignores directories.
				832	"""
				833	def __init__(self, blacklist, pid, initial_cwd):
				834	# Check internal consistency.
				835	assert isinstance(pid, int), repr(pid)
				836	self.pid = pid
				837	# children are Process instances.
				838	self.children = []
				839	self.initial_cwd = initial_cwd
				840	self.cwd = None
				841	self.files = set()
				842	self.only_touched = set()
				843	self.executable = None
				844	self.command = None
				845	self._blacklist = blacklist
				846
				847	def to_results_process(self):
				848	"""Resolves file case sensitivity and or late-bound strings."""
				849	# When resolving files, it's normal to get dupe because a file could be
				850	# opened multiple times with different case. Resolve the deduplication
				851	# here.
				852	def fix_path(x):
				853	"""Returns the native file path case.
				854
				855	Converts late-bound strings.
				856	"""
				857	if not x:
				858	# Do not convert None instance to 'None'.
				859	return x
				860	# TODO(maruel): Do not upconvert to unicode here, on linux we don't
				861	# know the file path encoding so they must be treated as bytes.
				862	x = unicode(x)
				863	if os.path.isabs(x):
				864	# If the path is not absolute, which tends to happen occasionally on
				865	# Windows, it is not possible to get the native path case so ignore
				866	# that trace. It mostly happens for 'executable' value.
				867	x = get_native_path_case(x)
				868	return x
				869
				870	def fix_and_blacklist_path(x):
				871	x = fix_path(x)
				872	if not x:
				873	return
				874	# The blacklist needs to be reapplied, since path casing could
				875	# influence blacklisting.
				876	if self._blacklist(x):
				877	return
				878	return x
				879
				880	# Filters out directories. Some may have passed through.
				881	files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
				882	only_touched = set(
				883	f for f in map(fix_and_blacklist_path, self.only_touched) if f)
				884	only_touched -= files
				885
				886	files = [
				887	Results.File(None, f, False, None) for f in files
				888	if not os.path.isdir(f)
				889	]
				890	# Using 0 as size means the file's content is ignored since the file was
				891	# never opened for I/O.
				892	files.extend(
				893	Results.File(None, f, False, 0) for f in only_touched
				894	if not os.path.isdir(f)
				895	)
				896	return Results.Process(
				897	self.pid,
				898	files,
				899	fix_path(self.executable),
				900	self.command,
				901	fix_path(self.initial_cwd),
				902	[c.to_results_process() for c in self.children])
				903
				904	def add_file(self, filepath, touch_only):
				905	"""Adds a file if it passes the blacklist."""
				906	if self._blacklist(unicode(filepath)):
				907	return
				908	logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
				909	# Note that filepath and not unicode(filepath) is added. It is because
				910	# filepath could be something else than a string, like a RelativePath
				911	# instance for dtrace logs.
				912	if touch_only:
				913	self.only_touched.add(filepath)
				914	else:
				915	self.files.add(filepath)
				916
				917	def __init__(self, blacklist):
				918	self.blacklist = blacklist
				919	# Initial process.
				920	self.root_process = None
				921	# dict to accelerate process lookup, to not have to lookup the whole graph
				922	# each time.
				923	self._process_lookup = {}
				924
				925	class Tracer(object):
				926	"""During it's lifetime, the tracing subsystem is enabled."""
				927	def __init__(self, logname):
				928	self._logname = logname
				929	self._lock = threading.Lock()
				930	self._traces = []
				931	self._initialized = True
				932
				933	def trace(self, cmd, cwd, tracename, output):
				934	"""Runs the OS-specific trace program on an executable.
				935
				936	Arguments:
				937	- cmd: The command (a list) to run.
				938	- cwd: Current directory to start the child process in.
				939	- tracename: Name of the trace in the logname file.
				940	- output: If False, redirects output to PIPEs.
				941
				942	Returns a tuple (resultcode, output) and updates the internal trace
				943	entries.
				944	"""
				945	# The implementation adds an item to self._traces.
				946	raise NotImplementedError(self.__class__.__name__)
				947
				948	def close(self, _timeout=None):
				949	"""Saves the meta-data in the logname file.
				950
				951	For kernel-based tracing, stops the tracing subsystem.
				952
				953	Must not be used manually when using 'with' construct.
				954	"""
				955	with self._lock:
				956	assert self._initialized
				957	try:
				958	data = {
				959	'traces': self._traces,
				960	}
				961	write_json(self._logname, data, False)
				962	finally:
				963	self._initialized = False
				964
				965	def post_process_log(self):
				966	"""Post-processes the log so it becomes faster to load afterward.
				967
				968	Must not be used manually when using 'with' construct.
				969	"""
				970	assert not self._initialized, 'Must stop tracing first.'
				971
				972	def __enter__(self):
				973	"""Enables 'with' statement."""
				974	return self
				975
				976	def __exit__(self, exc_type, exc_value, traceback):
				977	"""Enables 'with' statement."""
				978	self.close()
				979	# If an exception was thrown, do not process logs.
				980	if not exc_type:
				981	self.post_process_log()
				982
				983	def get_tracer(self, logname):
				984	"""Returns an ApiBase.Tracer instance.
				985
				986	Initializes the tracing subsystem, which is a requirement for kernel-based
				987	tracers. Only one tracer instance should be live at a time!
				988
				989	logname is the filepath to the json file that will contain the meta-data
				990	about the logs.
				991	"""
				992	return self.Tracer(logname)
				993
				994	@staticmethod
				995	def clean_trace(logname):
				996	"""Deletes an old log."""
				997	raise NotImplementedError()
				998
				999	@classmethod
				1000	def parse_log(cls, logname, blacklist):
				1001	"""Processes trace logs and returns the files opened and the files that do
				1002	not exist.
				1003
				1004	It does not track directories.
				1005
maruel@chromium.org	8b056ba	2012-10-16 14:04:49 +0000	[diff] [blame]	1006	\|logname\| must be an absolute path.
				1007
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1008	Most of the time, files that do not exist are temporary test files that
				1009	should be put in /tmp instead. See http://crbug.com/116251.
				1010
				1011	Returns a list of dict with keys:
				1012	- results: A Results instance.
				1013	- trace: The corresponding tracename parameter provided to
				1014	get_tracer().trace().
				1015	- output: Output gathered during execution, if get_tracer().trace(...,
				1016	output=False) was used.
				1017	"""
				1018	raise NotImplementedError(cls.__class__.__name__)
				1019
				1020
				1021	class Strace(ApiBase):
				1022	"""strace implies linux."""
				1023	class Context(ApiBase.Context):
				1024	"""Processes a strace log line and keeps the list of existent and non
				1025	existent files accessed.
				1026
				1027	Ignores directories.
				1028
				1029	Uses late-binding to processes the cwd of each process. The problem is that
				1030	strace generates one log file per process it traced but doesn't give any
				1031	information about which process was started when and by who. So we don't
				1032	even know which process is the initial one. So process the logs out of
				1033	order and use late binding with RelativePath to be able to deduce the
				1034	initial directory of each process once all the logs are parsed.
				1035	"""
				1036	class Process(ApiBase.Context.Process):
				1037	"""Represents the state of a process.
				1038
				1039	Contains all the information retrieved from the pid-specific log.
				1040	"""
				1041	# Function names are using ([a-z_0-9]+)
				1042	# This is the most common format. function(args) = result
				1043	RE_HEADER = re.compile(r'^([a-z_0-9]+)$(.+?)$\s+= (.+)$')
				1044	# An interrupted function call, only grab the minimal header.
				1045	RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
maruel@chromium.org	9f72e0f	2012-10-18 14:48:49 +0000	[diff] [blame]	1046	# An interrupted function call, with the process exiting. It must be the
				1047	# last line in the log.
				1048	RE_UNFINISHED_EXIT = re.compile(
				1049	r'^([^\(]+)(.*) \<unfinished \.\.\.\ exit status \d+>$')
maruel@chromium.org	4e50650	2012-10-18 15:02:58 +0000	[diff] [blame]	1050	# An interrupted function call the hard way. Usually observed with futex()
				1051	# on ubuntu 12.04.
csharp@chromium.org	4d75355	2012-10-25 19:31:55 +0000	[diff] [blame^]	1052	RE_INTERRUPTED_HARD = re.compile(r'^([^\(]+)\([A-Z0-9a-fx\,\_\\|\{\} ]*$')
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1053	# A resumed function call.
				1054	RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
				1055	# A process received a signal.
				1056	RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
				1057	# A process didn't handle a signal. Ignore any junk appearing before,
				1058	# because the process was forcibly killed so it won't open any new file.
				1059	RE_KILLED = re.compile(
				1060	r'^.*\+\+\+ killed by ([A-Z]+)( $core dumped$)? \+\+\+$')
				1061	# The process has exited.
				1062	RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
				1063	# A call was canceled. Ignore any prefix.
				1064	RE_UNAVAILABLE = re.compile(r'^.\)\s= \? <unavailable>$')
				1065	# Happens when strace fails to even get the function name.
				1066	UNNAMED_FUNCTION = '????'
				1067
				1068	# Corner-case in python, a class member function decorator must not be
				1069	# @staticmethod.
				1070	def parse_args(regexp, expect_zero): # pylint: disable=E0213
				1071	"""Automatically convert the str 'args' into a list of processed
				1072	arguments.
				1073
				1074	Arguments:
				1075	- regexp is used to parse args.
				1076	- expect_zero: one of True, False or None.
				1077	- True: will check for result.startswith('0') first and will ignore
				1078	the trace line completely otherwise. This is important because for
				1079	many functions, the regexp will not process if the call failed.
				1080	- False: will check for not result.startswith(('?', '-1')) for the
				1081	same reason than with True.
				1082	- None: ignore result.
				1083	"""
				1084	def meta_hook(function):
				1085	assert function.__name__.startswith('handle_')
				1086	def hook(self, args, result):
				1087	if expect_zero is True and not result.startswith('0'):
				1088	return
				1089	if expect_zero is False and result.startswith(('?', '-1')):
				1090	return
				1091	match = re.match(regexp, args)
				1092	if not match:
				1093	raise TracingFailure(
				1094	'Failed to parse %s(%s) = %s' %
				1095	(function.__name__[len('handle_'):], args, result),
				1096	None, None, None)
				1097	return function(self, match.groups(), result)
				1098	return hook
				1099	return meta_hook
				1100
				1101	class RelativePath(object):
				1102	"""A late-bound relative path."""
				1103	def __init__(self, parent, value):
				1104	self.parent = parent
				1105	self.value = value
				1106
				1107	def render(self):
				1108	"""Returns the current directory this instance is representing.
				1109
				1110	This function is used to return the late-bound value.
				1111	"""
				1112	if self.value and self.value.startswith(u'/'):
				1113	# An absolute path.
				1114	return self.value
				1115	parent = self.parent.render() if self.parent else u'<None>'
				1116	if self.value:
				1117	return os.path.normpath(os.path.join(parent, self.value))
				1118	return parent
				1119
				1120	def __unicode__(self):
				1121	"""Acts as a string whenever needed."""
				1122	return unicode(self.render())
				1123
				1124	def __str__(self):
				1125	"""Acts as a string whenever needed."""
				1126	return str(self.render())
				1127
				1128	def __init__(self, root, pid):
				1129	"""Keeps enough information to be able to guess the original process
				1130	root.
				1131
				1132	strace doesn't store which process was the initial process. So more
				1133	information needs to be kept so the graph can be reconstructed from the
				1134	flat map.
				1135	"""
				1136	logging.info('%s(%d)' % (self.__class__.__name__, pid))
				1137	super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
				1138	assert isinstance(root, ApiBase.Context)
				1139	self._root = weakref.ref(root)
				1140	# The dict key is the function name of the pending call, like 'open'
				1141	# or 'execve'.
				1142	self._pending_calls = {}
				1143	self._line_number = 0
				1144	# Current directory when the process started.
				1145	self.initial_cwd = self.RelativePath(self._root(), None)
				1146	self.parentid = None
maruel@chromium.org	9f72e0f	2012-10-18 14:48:49 +0000	[diff] [blame]	1147	self._done = False
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1148
				1149	def get_cwd(self):
				1150	"""Returns the best known value of cwd."""
				1151	return self.cwd or self.initial_cwd
				1152
				1153	def render(self):
				1154	"""Returns the string value of the RelativePath() object.
				1155
				1156	Used by RelativePath. Returns the initial directory and not the
				1157	current one since the current directory 'cwd' validity is time-limited.
				1158
				1159	The validity is only guaranteed once all the logs are processed.
				1160	"""
				1161	return self.initial_cwd.render()
				1162
				1163	def on_line(self, line):
				1164	self._line_number += 1
maruel@chromium.org	9f72e0f	2012-10-18 14:48:49 +0000	[diff] [blame]	1165	if self._done:
				1166	raise TracingFailure(
				1167	'Found a trace for a terminated process',
				1168	None, None, None)
				1169
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1170	if self.RE_SIGNAL.match(line):
				1171	# Ignore signals.
				1172	return
				1173
				1174	try:
				1175	match = self.RE_KILLED.match(line)
				1176	if match:
				1177	# Converts a '+++ killed by Foo +++' trace into an exit_group().
				1178	self.handle_exit_group(match.group(1), None)
				1179	return
				1180
				1181	match = self.RE_PROCESS_EXITED.match(line)
				1182	if match:
				1183	# Converts a '+++ exited with 1 +++' trace into an exit_group()
				1184	self.handle_exit_group(match.group(1), None)
				1185	return
				1186
				1187	match = self.RE_UNFINISHED.match(line)
				1188	if match:
				1189	if match.group(1) in self._pending_calls:
				1190	raise TracingFailure(
				1191	'Found two unfinished calls for the same function',
				1192	None, None, None,
				1193	self._pending_calls)
				1194	self._pending_calls[match.group(1)] = (
				1195	match.group(1) + match.group(2))
				1196	return
				1197
maruel@chromium.org	4e50650	2012-10-18 15:02:58 +0000	[diff] [blame]	1198	match = (
				1199	self.RE_UNFINISHED_EXIT.match(line) or
				1200	self.RE_INTERRUPTED_HARD.match(line))
maruel@chromium.org	9f72e0f	2012-10-18 14:48:49 +0000	[diff] [blame]	1201	if match:
				1202	# The process died. No other line can be processed afterward.
				1203	self._done = True
				1204	return
				1205
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1206	match = self.RE_UNAVAILABLE.match(line)
				1207	if match:
				1208	# This usually means a process was killed and a pending call was
				1209	# canceled.
				1210	# TODO(maruel): Look up the last exit_group() trace just above and
				1211	# make sure any self._pending_calls[anything] is properly flushed.
				1212	return
				1213
				1214	match = self.RE_RESUMED.match(line)
				1215	if match:
				1216	if match.group(1) not in self._pending_calls:
				1217	raise TracingFailure(
				1218	'Found a resumed call that was not logged as unfinished',
				1219	None, None, None,
				1220	self._pending_calls)
				1221	pending = self._pending_calls.pop(match.group(1))
				1222	# Reconstruct the line.
				1223	line = pending + match.group(2)
				1224
				1225	match = self.RE_HEADER.match(line)
				1226	if not match:
				1227	raise TracingFailure(
				1228	'Found an invalid line: %s' % line,
				1229	None, None, None)
				1230	if match.group(1) == self.UNNAMED_FUNCTION:
				1231	return
				1232
				1233	# It's a valid line, handle it.
				1234	handler = getattr(self, 'handle_%s' % match.group(1), None)
				1235	if not handler:
				1236	self._handle_unknown(match.group(1), match.group(2), match.group(3))
				1237	return handler(match.group(2), match.group(3))
				1238	except TracingFailure, e:
				1239	# Hack in the values since the handler could be a static function.
				1240	e.pid = self.pid
				1241	e.line = line
				1242	e.line_number = self._line_number
				1243	# Re-raise the modified exception.
				1244	raise
				1245	except (KeyError, NotImplementedError, ValueError), e:
				1246	raise TracingFailure(
				1247	'Trace generated a %s exception: %s' % (
				1248	e.__class__.__name__, str(e)),
				1249	self.pid,
				1250	self._line_number,
				1251	line,
				1252	e)
				1253
				1254	@parse_args(r'^\"(.+?)\", [FKORWX_\|]+$', True)
				1255	def handle_access(self, args, _result):
				1256	self._handle_file(args[0], True)
				1257
				1258	@parse_args(r'^\"(.+?)\"$', True)
				1259	def handle_chdir(self, args, _result):
				1260	"""Updates cwd."""
				1261	self.cwd = self.RelativePath(self, args[0])
				1262	logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
				1263
				1264	def handle_clone(self, _args, result):
				1265	"""Transfers cwd."""
				1266	if result.startswith(('?', '-1')):
				1267	# The call failed.
				1268	return
				1269	# Update the other process right away.
				1270	childpid = int(result)
				1271	child = self._root().get_or_set_proc(childpid)
				1272	if child.parentid is not None or childpid in self.children:
				1273	raise TracingFailure(
				1274	'Found internal inconsitency in process lifetime detection '
				1275	'during a clone() call',
				1276	None, None, None)
				1277
				1278	# Copy the cwd object.
				1279	child.initial_cwd = self.get_cwd()
				1280	child.parentid = self.pid
				1281	# It is necessary because the logs are processed out of order.
				1282	self.children.append(child)
				1283
				1284	def handle_close(self, _args, _result):
				1285	pass
				1286
				1287	def handle_chmod(self, _args, _result):
				1288	pass
				1289
				1290	def handle_creat(self, _args, _result):
				1291	# Ignore files created, since they didn't need to exist.
				1292	pass
				1293
				1294	@parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
				1295	def handle_execve(self, args, _result):
				1296	# Even if in practice execve() doesn't returns when it succeeds, strace
				1297	# still prints '0' as the result.
				1298	filepath = args[0]
				1299	self._handle_file(filepath, False)
				1300	self.executable = self.RelativePath(self.get_cwd(), filepath)
				1301	self.command = strace_process_quoted_arguments(args[1])
				1302
				1303	def handle_exit_group(self, _args, _result):
				1304	"""Removes cwd."""
				1305	self.cwd = None
				1306
				1307	def handle_fork(self, args, result):
				1308	self._handle_unknown('fork', args, result)
				1309
maruel@chromium.org	9f72e0f	2012-10-18 14:48:49 +0000	[diff] [blame]	1310	def handle_futex(self, _args, _result):
				1311	pass
				1312
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1313	def handle_getcwd(self, _args, _result):
				1314	pass
				1315
				1316	@parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
				1317	def handle_link(self, args, _result):
				1318	self._handle_file(args[0], False)
				1319	self._handle_file(args[1], False)
				1320
				1321	@parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
				1322	def handle_lstat(self, args, _result):
				1323	self._handle_file(args[0], True)
				1324
				1325	def handle_mkdir(self, _args, _result):
				1326	pass
				1327
				1328	@parse_args(r'^\"(.*?)\", ([A-Z\_\\|]+)(\|, \d+)$', False)
				1329	def handle_open(self, args, _result):
				1330	if 'O_DIRECTORY' in args[1]:
				1331	return
				1332	self._handle_file(args[0], False)
				1333
				1334	@parse_args(r'^(\d+\|AT_FDCWD), \"(.*?)\", ([A-Z\_\\|]+)(\|, \d+)$', False)
				1335	def handle_openat(self, args, _result):
				1336	if 'O_DIRECTORY' in args[2]:
				1337	return
				1338	if args[1] == 'AT_FDCWD':
				1339	self._handle_file(args[1], False)
				1340	else:
				1341	# TODO(maruel): Implement relative open if necessary instead of the
				1342	# AT_FDCWD flag, let's hope not since this means tracking all active
				1343	# directory handles.
				1344	raise Exception('Relative open via openat not implemented.')
				1345
				1346	@parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
				1347	def handle_readlink(self, args, _result):
				1348	self._handle_file(args[0], False)
				1349
				1350	@parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
				1351	def handle_rename(self, args, _result):
				1352	self._handle_file(args[0], False)
				1353	self._handle_file(args[1], False)
				1354
				1355	def handle_rmdir(self, _args, _result):
				1356	pass
				1357
				1358	def handle_setxattr(self, _args, _result):
				1359	pass
				1360
				1361	@parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
				1362	def handle_stat(self, args, _result):
				1363	self._handle_file(args[0], True)
				1364
				1365	def handle_symlink(self, _args, _result):
				1366	pass
				1367
				1368	def handle_unlink(self, _args, _result):
				1369	# In theory, the file had to be created anyway.
				1370	pass
				1371
				1372	def handle_statfs(self, _args, _result):
				1373	pass
				1374
				1375	def handle_vfork(self, args, result):
				1376	self._handle_unknown('vfork', args, result)
				1377
				1378	@staticmethod
				1379	def _handle_unknown(function, args, result):
				1380	raise TracingFailure(
				1381	'Unexpected/unimplemented trace %s(%s)= %s' %
				1382	(function, args, result),
				1383	None, None, None)
				1384
				1385	def _handle_file(self, filepath, touch_only):
				1386	filepath = self.RelativePath(self.get_cwd(), filepath)
				1387	#assert not touch_only, unicode(filepath)
				1388	self.add_file(filepath, touch_only)
				1389
				1390	def __init__(self, blacklist, initial_cwd):
				1391	super(Strace.Context, self).__init__(blacklist)
				1392	self.initial_cwd = initial_cwd
				1393
				1394	def render(self):
				1395	"""Returns the string value of the initial cwd of the root process.
				1396
				1397	Used by RelativePath.
				1398	"""
				1399	return self.initial_cwd
				1400
				1401	def on_line(self, pid, line):
				1402	"""Transfers control into the Process.on_line() function."""
				1403	self.get_or_set_proc(pid).on_line(line.strip())
				1404
				1405	def to_results(self):
				1406	"""Finds back the root process and verify consistency."""
				1407	# TODO(maruel): Absolutely unecessary, fix me.
				1408	root = [p for p in self._process_lookup.itervalues() if not p.parentid]
				1409	if len(root) != 1:
				1410	raise TracingFailure(
				1411	'Found internal inconsitency in process lifetime detection '
				1412	'while finding the root process',
				1413	None,
				1414	None,
				1415	None,
				1416	sorted(p.pid for p in root))
				1417	self.root_process = root[0]
				1418	process = self.root_process.to_results_process()
				1419	if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
				1420	raise TracingFailure(
				1421	'Found internal inconsitency in process lifetime detection '
				1422	'while looking for len(tree) == len(list)',
				1423	None,
				1424	None,
				1425	None,
				1426	sorted(self._process_lookup),
				1427	sorted(p.pid for p in process.all))
				1428	return Results(process)
				1429
				1430	def get_or_set_proc(self, pid):
				1431	"""Returns the Context.Process instance for this pid or creates a new one.
				1432	"""
				1433	if not pid or not isinstance(pid, int):
				1434	raise TracingFailure(
				1435	'Unpexpected value for pid: %r' % pid,
				1436	pid,
				1437	None,
				1438	None,
				1439	pid)
				1440	if pid not in self._process_lookup:
				1441	self._process_lookup[pid] = self.Process(self, pid)
				1442	return self._process_lookup[pid]
				1443
				1444	@classmethod
				1445	def traces(cls):
				1446	"""Returns the list of all handled traces to pass this as an argument to
				1447	strace.
				1448	"""
				1449	prefix = 'handle_'
				1450	return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
				1451
				1452	class Tracer(ApiBase.Tracer):
				1453	MAX_LEN = 256
				1454
				1455	def trace(self, cmd, cwd, tracename, output):
				1456	"""Runs strace on an executable."""
				1457	logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
				1458	assert os.path.isabs(cmd[0]), cmd[0]
				1459	assert os.path.isabs(cwd), cwd
				1460	assert os.path.normpath(cwd) == cwd, cwd
				1461	with self._lock:
				1462	if not self._initialized:
				1463	raise TracingFailure(
				1464	'Called Tracer.trace() on an unitialized object',
				1465	None, None, None, tracename)
				1466	assert tracename not in (i['trace'] for i in self._traces)
				1467	stdout = stderr = None
				1468	if output:
				1469	stdout = subprocess.PIPE
				1470	stderr = subprocess.STDOUT
				1471	# Ensure all file related APIs are hooked.
				1472	traces = ','.join(Strace.Context.traces() + ['file'])
				1473	trace_cmd = [
				1474	'strace',
				1475	'-ff',
				1476	'-s', '%d' % self.MAX_LEN,
				1477	'-e', 'trace=%s' % traces,
				1478	'-o', self._logname + '.' + tracename,
				1479	]
				1480	child = subprocess.Popen(
				1481	trace_cmd + cmd,
				1482	cwd=cwd,
				1483	stdin=subprocess.PIPE,
				1484	stdout=stdout,
				1485	stderr=stderr)
				1486	out = child.communicate()[0]
				1487	# TODO(maruel): Walk the logs and figure out the root process would
				1488	# simplify parsing the logs a lot.
				1489	with self._lock:
				1490	assert tracename not in (i['trace'] for i in self._traces)
				1491	self._traces.append(
				1492	{
				1493	'cmd': cmd,
				1494	'cwd': cwd,
				1495	# The pid of strace process, not very useful.
				1496	'pid': child.pid,
				1497	'trace': tracename,
				1498	'output': out,
				1499	})
				1500	return child.returncode, out
				1501
				1502	@staticmethod
				1503	def clean_trace(logname):
				1504	if os.path.isfile(logname):
				1505	os.remove(logname)
				1506	# Also delete any pid specific file from previous traces.
				1507	for i in glob.iglob(logname + '.*'):
				1508	if i.rsplit('.', 1)[1].isdigit():
				1509	os.remove(i)
				1510
				1511	@classmethod
				1512	def parse_log(cls, logname, blacklist):
				1513	logging.info('parse_log(%s, %s)' % (logname, blacklist))
maruel@chromium.org	8b056ba	2012-10-16 14:04:49 +0000	[diff] [blame]	1514	assert os.path.isabs(logname)
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1515	data = read_json(logname)
				1516	out = []
				1517	for item in data['traces']:
				1518	result = {
				1519	'trace': item['trace'],
				1520	'output': item['output'],
				1521	}
				1522	try:
				1523	context = cls.Context(blacklist, item['cwd'])
				1524	for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
				1525	pid = pidfile.rsplit('.', 1)[1]
				1526	if pid.isdigit():
				1527	pid = int(pid)
				1528	# TODO(maruel): Load as utf-8
				1529	for line in open(pidfile, 'rb'):
				1530	context.on_line(pid, line)
				1531	result['results'] = context.to_results()
maruel@chromium.org	424f569	2012-10-15 15:42:38 +0000	[diff] [blame]	1532	except TracingFailure:
				1533	result['exception'] = sys.exc_info()
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	1534	out.append(result)
				1535	return out
				1536
				1537
				1538	class Dtrace(ApiBase):
				1539	"""Uses DTrace framework through dtrace. Requires root access.
				1540
				1541	Implies Mac OSX.
				1542
				1543	dtruss can't be used because it has compatibility issues with python.
				1544
				1545	Also, the pid->cwd handling needs to be done manually since OSX has no way to
				1546	get the absolute path of the 'cwd' dtrace variable from the probe.
				1547
				1548	Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
				1549	https://discussions.apple.com/thread/1980539. So resort to handling execve()
				1550	manually.
				1551
				1552	errno is not printed in the log since this implementation currently only cares
				1553	about files that were successfully opened.
				1554	"""
				1555	class Context(ApiBase.Context):
				1556	# Format: index pid function(args)
				1557	RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)$(.*?)$$')
				1558
				1559	# Arguments parsing.
				1560	RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
				1561	RE_CHDIR = re.compile(r'^\"(.+?)\"$')
				1562	RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
				1563	RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
				1564	RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
				1565	RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
				1566
				1567	O_DIRECTORY = 0x100000
				1568
				1569	class Process(ApiBase.Context.Process):
				1570	def __init__(self, *args):
				1571	super(Dtrace.Context.Process, self).__init__(*args)
				1572	self.cwd = self.initial_cwd
				1573
				1574	def __init__(self, blacklist, tracer_pid, initial_cwd):
				1575	logging.info(
				1576	'%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd))
				1577	super(Dtrace.Context, self).__init__(blacklist)
				1578	# Process ID of the temporary script created by create_thunk().
				1579	self._tracer_pid = tracer_pid
				1580	self._initial_cwd = initial_cwd
				1581	self._line_number = 0
				1582
				1583	def on_line(self, line):
				1584	self._line_number += 1
				1585	match = self.RE_HEADER.match(line)
				1586	if not match:
				1587	raise TracingFailure(
				1588	'Found malformed line: %s' % line,
				1589	None,
				1590	self._line_number,
				1591	line)
				1592	fn = getattr(
				1593	self,
				1594	'handle_%s' % match.group(2).replace('-', '_'),
				1595	self._handle_ignored)
				1596	# It is guaranteed to succeed because of the regexp. Or at least I thought
				1597	# it would.
				1598	pid = int(match.group(1))
				1599	try:
				1600	return fn(pid, match.group(3))
				1601	except TracingFailure, e:
				1602	# Hack in the values since the handler could be a static function.
				1603	e.pid = pid
				1604	e.line = line
				1605	e.line_number = self._line_number
				1606	# Re-raise the modified exception.
				1607	raise
				1608	except (KeyError, NotImplementedError, ValueError), e:
				1609	raise TracingFailure(
				1610	'Trace generated a %s exception: %s' % (
				1611	e.__class__.__name__, str(e)),
				1612	pid,
				1613	self._line_number,
				1614	line,
				1615	e)
				1616
				1617	def to_results(self):
				1618	process = self.root_process.to_results_process()
				1619	# Internal concistency check.
				1620	if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
				1621	raise TracingFailure(
				1622	'Found internal inconsitency in process lifetime detection '
				1623	'while looking for len(tree) == len(list)',
				1624	None,
				1625	None,
				1626	None,
				1627	sorted(self._process_lookup),
				1628	sorted(p.pid for p in process.all))
				1629	return Results(process)
				1630
				1631	def handle_dtrace_BEGIN(self, _pid, args):
				1632	if not self.RE_DTRACE_BEGIN.match(args):
				1633	raise TracingFailure(
				1634	'Found internal inconsitency in dtrace_BEGIN log line',
				1635	None, None, None)
				1636
				1637	def handle_proc_start(self, pid, args):
				1638	"""Transfers cwd.
				1639
				1640	The dtrace script already takes care of only tracing the processes that
				1641	are child of the traced processes so there is no need to verify the
				1642	process hierarchy.
				1643	"""
				1644	if pid in self._process_lookup:
				1645	raise TracingFailure(
				1646	'Found internal inconsitency in proc_start: %d started two times' %
				1647	pid,
				1648	None, None, None)
				1649	match = self.RE_PROC_START.match(args)
				1650	if not match:
				1651	raise TracingFailure(
				1652	'Failed to parse arguments: %s' % args,
				1653	None, None, None)
				1654	ppid = int(match.group(1))
				1655	if ppid == self._tracer_pid and not self.root_process:
				1656	proc = self.root_process = self.Process(
				1657	self.blacklist, pid, self._initial_cwd)
				1658	elif ppid in self._process_lookup:
				1659	proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
				1660	self._process_lookup[ppid].children.append(proc)
				1661	else:
				1662	# Another process tree, ignore.
				1663	return
				1664	self._process_lookup[pid] = proc
				1665	logging.debug(
				1666	'New child: %s -> %d cwd:%s' %
				1667	(ppid, pid, unicode(proc.initial_cwd)))
				1668
				1669	def handle_proc_exit(self, pid, _args):
				1670	"""Removes cwd."""
				1671	if pid in self._process_lookup:
				1672	# self._tracer_pid is not traced itself and other traces run neither.
				1673	self._process_lookup[pid].cwd = None
				1674
				1675	def handle_execve(self, pid, args):
				1676	"""Sets the process' executable.
				1677
				1678	TODO(maruel): Read command line arguments. See
				1679	https://discussions.apple.com/thread/1980539 for an example.
				1680	https://gist.github.com/1242279
				1681
				1682	Will have to put the answer at http://stackoverflow.com/questions/7556249.
				1683	:)
				1684	"""
				1685	if not pid in self._process_lookup:
				1686	# Another process tree, ignore.
				1687	return
				1688	match = self.RE_EXECVE.match(args)
				1689	if not match:
				1690	raise TracingFailure(
				1691	'Failed to parse arguments: %r' % args,
				1692	None, None, None)
				1693	proc = self._process_lookup[pid]
				1694	proc.executable = match.group(1)
				1695	proc.command = self.process_escaped_arguments(match.group(3))
				1696	if int(match.group(2)) != len(proc.command):
				1697	raise TracingFailure(
				1698	'Failed to parse execve() arguments: %s' % args,
				1699	None, None, None)
				1700
				1701	def handle_chdir(self, pid, args):
				1702	"""Updates cwd."""
				1703	if pid not in self._process_lookup:
				1704	# Another process tree, ignore.
				1705	return
				1706	cwd = self.RE_CHDIR.match(args).group(1)
				1707	if not cwd.startswith('/'):
				1708	cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
				1709	logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
				1710	else:
				1711	logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
				1712	cwd2 = cwd
				1713	self._process_lookup[pid].cwd = cwd2
				1714
				1715	def handle_open_nocancel(self, pid, args):
				1716	"""Redirects to handle_open()."""
				1717	return self.handle_open(pid, args)
				1718
				1719	def handle_open(self, pid, args):
				1720	if pid not in self._process_lookup:
				1721	# Another process tree, ignore.
				1722	return
				1723	match = self.RE_OPEN.match(args)
				1724	if not match:
				1725	raise TracingFailure(
				1726	'Failed to parse arguments: %s' % args,
				1727	None, None, None)
				1728	flag = int(match.group(2), 16)
				1729	if self.O_DIRECTORY & flag == self.O_DIRECTORY:
				1730	# Ignore directories.
				1731	return
				1732	self._handle_file(pid, match.group(1))
				1733
				1734	def handle_rename(self, pid, args):
				1735	if pid not in self._process_lookup:
				1736	# Another process tree, ignore.
				1737	return
				1738	match = self.RE_RENAME.match(args)
				1739	if not match:
				1740	raise TracingFailure(
				1741	'Failed to parse arguments: %s' % args,
				1742	None, None, None)
				1743	self._handle_file(pid, match.group(1))
				1744	self._handle_file(pid, match.group(2))
				1745
				1746	def _handle_file(self, pid, filepath):
				1747	if not filepath.startswith('/'):
				1748	filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
				1749	# We can get '..' in the path.
				1750	filepath = os.path.normpath(filepath)
				1751	# Sadly, still need to filter out directories here;
				1752	# saw open_nocancel(".", 0, 0) = 0 lines.
				1753	if os.path.isdir(filepath):
				1754	return
				1755	self._process_lookup[pid].add_file(filepath, False)
				1756
				1757	def handle_ftruncate(self, pid, args):
				1758	"""Just used as a signal to kill dtrace, ignoring."""
				1759	pass
				1760
				1761	@staticmethod
				1762	def _handle_ignored(pid, args):
				1763	"""Is called for all the event traces that are not handled."""
				1764	raise NotImplementedError('Please implement me')
				1765
				1766	@staticmethod
				1767	def process_escaped_arguments(text):
				1768	"""Extracts escaped arguments on a string and return the arguments as a
				1769	list.
				1770
				1771	Implemented as an automaton.
				1772
				1773	Example:
				1774	With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
				1775	function will return ['python2.7', '-c', 'print("hi")]
				1776	"""
				1777	if not text.endswith('\\0'):
				1778	raise ValueError('String is not null terminated: %r' % text, text)
				1779	text = text[:-2]
				1780
				1781	def unescape(x):
				1782	"""Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
				1783	out = []
				1784	escaped = False
				1785	for i in x:
				1786	if i == '\\' and not escaped:
				1787	escaped = True
				1788	continue
				1789	escaped = False
				1790	out.append(i)
				1791	return ''.join(out)
				1792
				1793	return [unescape(i) for i in text.split('\\001')]
				1794
				1795	class Tracer(ApiBase.Tracer):
				1796	# pylint: disable=C0301
				1797	#
				1798	# To understand the following code, you'll want to take a look at:
				1799	# http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
				1800	# https://wikis.oracle.com/display/DTrace/Variables
				1801	# http://docs.oracle.com/cd/E19205-01/820-4221/
				1802	#
				1803	# 0. Dump all the valid probes into a text file. It is important, you
				1804	# want to redirect into a file and you don't want to constantly 'sudo'.
				1805	# $ sudo dtrace -l > probes.txt
				1806	#
				1807	# 1. Count the number of probes:
				1808	# $ wc -l probes.txt
				1809	# 81823 # On OSX 10.7, including 1 header line.
				1810	#
				1811	# 2. List providers, intentionally skipping all the 'syspolicy10925' and the
				1812	# likes and skipping the header with NR>1:
				1813	# $ awk 'NR>1 { print $2 }' probes.txt \| sort \| uniq \| grep -v '[[:digit:]]'
				1814	# dtrace
				1815	# fbt
				1816	# io
				1817	# ip
				1818	# lockstat
				1819	# mach_trap
				1820	# proc
				1821	# profile
				1822	# sched
				1823	# syscall
				1824	# tcp
				1825	# vminfo
				1826	#
				1827	# 3. List of valid probes:
				1828	# $ grep syscall probes.txt \| less
				1829	# or use dtrace directly:
				1830	# $ sudo dtrace -l -P syscall \| less
				1831	#
				1832	# trackedpid is an associative array where its value can be 0, 1 or 2.
				1833	# 0 is for untracked processes and is the default value for items not
				1834	# in the associative array.
				1835	# 1 is for tracked processes.
				1836	# 2 is for the script created by create_thunk() only. It is not tracked
				1837	# itself but all its decendants are.
				1838	#
				1839	# The script will kill itself only once waiting_to_die == 1 and
				1840	# current_processes == 0, so that both getlogin() was called and that
				1841	# all traced processes exited.
				1842	#
				1843	# TODO(maruel): Use cacheable predicates. See
				1844	# https://wikis.oracle.com/display/DTrace/Performance+Considerations
				1845	D_CODE = """
				1846	dtrace:::BEGIN {
				1847	waiting_to_die = 0;
				1848	current_processes = 0;
				1849	logindex = 0;
				1850	printf("%d %d %s_%s(\\"%s\\")\\n",
				1851	logindex, PID, probeprov, probename, SCRIPT);
				1852	logindex++;
				1853	}
				1854
				1855	proc:::start /trackedpid[ppid]/ {
				1856	trackedpid[pid] = 1;
				1857	current_processes += 1;
				1858	printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
				1859	logindex, pid, probeprov, probename,
				1860	ppid,
				1861	execname,
				1862	current_processes);
				1863	logindex++;
				1864	}
				1865	/* Should use SCRIPT but there is no access to this variable at that
				1866	* point. */
				1867	proc:::start /ppid == PID && execname == "Python"/ {
				1868	trackedpid[pid] = 2;
				1869	current_processes += 1;
				1870	printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
				1871	logindex, pid, probeprov, probename,
				1872	ppid,
				1873	execname,
				1874	current_processes);
				1875	logindex++;
				1876	}
				1877	proc:::exit /trackedpid[pid] &&
				1878	current_processes == 1 &&
				1879	waiting_to_die == 1/ {
				1880	trackedpid[pid] = 0;
				1881	current_processes -= 1;
				1882	printf("%d %d %s_%s(%d)\\n",
				1883	logindex, pid, probeprov, probename,
				1884	current_processes);
				1885	logindex++;
				1886	exit(0);
				1887	}
				1888	proc:::exit /trackedpid[pid]/ {
				1889	trackedpid[pid] = 0;
				1890	current_processes -= 1;
				1891	printf("%d %d %s_%s(%d)\\n",
				1892	logindex, pid, probeprov, probename,
				1893	current_processes);
				1894	logindex++;
				1895	}
				1896
				1897	/* Use an arcane function to detect when we need to die */
				1898	syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
				1899	waiting_to_die = 1;
				1900	printf("%d %d %s()\\n", logindex, pid, probefunc);
				1901	logindex++;
				1902	}
				1903	syscall::ftruncate:entry /
				1904	pid == PID && arg0 == FILE_ID && current_processes == 0/ {
				1905	exit(0);
				1906	}
				1907
				1908	syscall::open*:entry /trackedpid[pid] == 1/ {
				1909	self->open_arg0 = arg0;
				1910	self->open_arg1 = arg1;
				1911	self->open_arg2 = arg2;
				1912	}
				1913	syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
				1914	this->open_arg0 = copyinstr(self->open_arg0);
				1915	printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
				1916	logindex, pid, probefunc,
				1917	this->open_arg0,
				1918	self->open_arg1,
				1919	self->open_arg2);
				1920	logindex++;
				1921	this->open_arg0 = 0;
				1922	}
				1923	syscall::open*:return /trackedpid[pid] == 1/ {
				1924	self->open_arg0 = 0;
				1925	self->open_arg1 = 0;
				1926	self->open_arg2 = 0;
				1927	}
				1928
				1929	syscall::rename:entry /trackedpid[pid] == 1/ {
				1930	self->rename_arg0 = arg0;
				1931	self->rename_arg1 = arg1;
				1932	}
				1933	syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
				1934	this->rename_arg0 = copyinstr(self->rename_arg0);
				1935	this->rename_arg1 = copyinstr(self->rename_arg1);
				1936	printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
				1937	logindex, pid, probefunc,
				1938	this->rename_arg0,
				1939	this->rename_arg1);
				1940	logindex++;
				1941	this->rename_arg0 = 0;
				1942	this->rename_arg1 = 0;
				1943	}
				1944	syscall::rename:return /trackedpid[pid] == 1/ {
				1945	self->rename_arg0 = 0;
				1946	self->rename_arg1 = 0;
				1947	}
				1948
				1949	/* Track chdir, it's painful because it is only receiving relative path.
				1950	*/
				1951	syscall::chdir:entry /trackedpid[pid] == 1/ {
				1952	self->chdir_arg0 = arg0;
				1953	}
				1954	syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
				1955	this->chdir_arg0 = copyinstr(self->chdir_arg0);
				1956	printf("%d %d %s(\\"%s\\")\\n",
				1957	logindex, pid, probefunc,
				1958	this->chdir_arg0);
				1959	logindex++;
				1960	this->chdir_arg0 = 0;
				1961	}
				1962	syscall::chdir:return /trackedpid[pid] == 1/ {
				1963	self->chdir_arg0 = 0;
				1964	}
				1965	"""
				1966
				1967	# execve-specific code, tends to throw a lot of exceptions.
				1968	D_CODE_EXECVE = """
				1969	/* Finally what we care about! */
				1970	syscall::exec*:entry /trackedpid[pid]/ {
				1971	self->exec_arg0 = copyinstr(arg0);
				1972	/* Incrementally probe for a NULL in the argv parameter of execve() to
				1973	* figure out argc. */
				1974	/* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
				1975	* found. */
				1976	self->exec_argc = 0;
				1977	/* Probe for argc==1 */
				1978	this->exec_argv = (user_addr_t*)copyin(
				1979	arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
				1980	self->exec_argc = this->exec_argv[self->exec_argc] ?
				1981	(self->exec_argc + 1) : self->exec_argc;
				1982
				1983	/* Probe for argc==2 */
				1984	this->exec_argv = (user_addr_t*)copyin(
				1985	arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
				1986	self->exec_argc = this->exec_argv[self->exec_argc] ?
				1987	(self->exec_argc + 1) : self->exec_argc;
				1988
				1989	/* Probe for argc==3 */
				1990	this->exec_argv = (user_addr_t*)copyin(
				1991	arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
				1992	self->exec_argc = this->exec_argv[self->exec_argc] ?
				1993	(self->exec_argc + 1) : self->exec_argc;
				1994
				1995	/* Probe for argc==4 */
				1996	this->exec_argv = (user_addr_t*)copyin(
				1997	arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
				1998	self->exec_argc = this->exec_argv[self->exec_argc] ?
				1999	(self->exec_argc + 1) : self->exec_argc;
				2000
				2001	/* Copy the inputs strings since there is no guarantee they'll be
				2002	* present after the call completed. */
				2003	self->exec_argv0 = (self->exec_argc > 0) ?
				2004	copyinstr(this->exec_argv[0]) : "";
				2005	self->exec_argv1 = (self->exec_argc > 1) ?
				2006	copyinstr(this->exec_argv[1]) : "";
				2007	self->exec_argv2 = (self->exec_argc > 2) ?
				2008	copyinstr(this->exec_argv[2]) : "";
				2009	self->exec_argv3 = (self->exec_argc > 3) ?
				2010	copyinstr(this->exec_argv[3]) : "";
				2011	this->exec_argv = 0;
				2012	}
				2013	syscall::exec*:return /trackedpid[pid] && errno == 0/ {
				2014	/* We need to join strings here, as using multiple printf() would
				2015	* cause tearing when multiple threads/processes are traced.
				2016	* Since it is impossible to escape a string and join it to another one,
				2017	* like sprintf("%s%S", previous, more), use hackery.
				2018	* Each of the elements are split with a \\1. \\0 cannot be used because
				2019	* it is simply ignored. This will conflict with any program putting a
				2020	* \\1 in their execve() string but this should be "rare enough" */
				2021	this->args = "";
				2022	/* Process exec_argv[0] */
				2023	this->args = strjoin(
				2024	this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
				2025
				2026	/* Process exec_argv[1] */
				2027	this->args = strjoin(
				2028	this->args, (self->exec_argc > 1) ? "\\1" : "");
				2029	this->args = strjoin(
				2030	this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
				2031
				2032	/* Process exec_argv[2] */
				2033	this->args = strjoin(
				2034	this->args, (self->exec_argc > 2) ? "\\1" : "");
				2035	this->args = strjoin(
				2036	this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
				2037
				2038	/* Process exec_argv[3] */
				2039	this->args = strjoin(
				2040	this->args, (self->exec_argc > 3) ? "\\1" : "");
				2041	this->args = strjoin(
				2042	this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
				2043
				2044	/* Prints self->exec_argc to permits verifying the internal
				2045	* consistency since this code is quite fishy. */
				2046	printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
				2047	logindex, pid, probefunc,
				2048	self->exec_arg0,
				2049	self->exec_argc,
				2050	this->args);
				2051	logindex++;
				2052	this->args = 0;
				2053	}
				2054	syscall::exec*:return /trackedpid[pid]/ {
				2055	self->exec_arg0 = 0;
				2056	self->exec_argc = 0;
				2057	self->exec_argv0 = 0;
				2058	self->exec_argv1 = 0;
				2059	self->exec_argv2 = 0;
				2060	self->exec_argv3 = 0;
				2061	}
				2062	"""
				2063
				2064	# Code currently not used.
				2065	D_EXTRANEOUS = """
				2066	/* This is a good learning experience, since it traces a lot of things
				2067	* related to the process and child processes.
				2068	* Warning: it generates a gigantic log. For example, tracing
				2069	* "data/trace_inputs/child1.py --child" generates a 2mb log and takes
				2070	* several minutes to execute.
				2071	*/
				2072	/*
				2073	mach_trap::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2074	printf("%d %d %s_%s() = %d\\n",
				2075	logindex, pid, probeprov, probefunc, errno);
				2076	logindex++;
				2077	}
				2078	proc::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2079	printf("%d %d %s_%s() = %d\\n",
				2080	logindex, pid, probeprov, probefunc, errno);
				2081	logindex++;
				2082	}
				2083	sched::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2084	printf("%d %d %s_%s() = %d\\n",
				2085	logindex, pid, probeprov, probefunc, errno);
				2086	logindex++;
				2087	}
				2088	syscall::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2089	printf("%d %d %s_%s() = %d\\n",
				2090	logindex, pid, probeprov, probefunc, errno);
				2091	logindex++;
				2092	}
				2093	vminfo::: /trackedpid[pid] == 1 \|\| trackedpid[ppid]/ {
				2094	printf("%d %d %s_%s() = %d\\n",
				2095	logindex, pid, probeprov, probefunc, errno);
				2096	logindex++;
				2097	}
				2098	*/
				2099	/* TODO(maruel): stat functions and friends
				2100	syscall::access:return,
				2101	syscall::chdir:return,
				2102	syscall::chflags:return,
				2103	syscall::chown:return,
				2104	syscall::chroot:return,
				2105	syscall::getattrlist:return,
				2106	syscall::getxattr:return,
				2107	syscall::lchown:return,
				2108	syscall::lstat64:return,
				2109	syscall::lstat:return,
				2110	syscall::mkdir:return,
				2111	syscall::pathconf:return,
				2112	syscall::readlink:return,
				2113	syscall::removexattr:return,
				2114	syscall::setxattr:return,
				2115	syscall::stat64:return,
				2116	syscall::stat:return,
				2117	syscall::truncate:return,
				2118	syscall::unlink:return,
				2119	syscall::utimes:return,
				2120	*/
				2121	"""
				2122
				2123	def __init__(self, logname):
				2124	"""Starts the log collection with dtrace.
				2125
				2126	Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
				2127	this needs to wait for dtrace to be "warmed up".
				2128	"""
				2129	super(Dtrace.Tracer, self).__init__(logname)
				2130	self._script = create_thunk()
				2131	# This unique dummy temp file is used to signal the dtrace script that it
				2132	# should stop as soon as all the child processes are done. A bit hackish
				2133	# but works fine enough.
				2134	self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
				2135	prefix='trace_signal_file')
				2136
				2137	# Note: do not use the -p flag. It's useless if the initial process quits
				2138	# too fast, resulting in missing traces from the grand-children. The D
				2139	# code manages the dtrace lifetime itself.
				2140	trace_cmd = [
				2141	'sudo',
				2142	'dtrace',
				2143	# Use a larger buffer if getting 'out of scratch space' errors.
				2144	# Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
				2145	'-b', '10m',
				2146	'-x', 'dynvarsize=10m',
				2147	#'-x', 'dtrace_global_maxsize=1m',
				2148	'-x', 'evaltime=exec',
				2149	'-o', '/dev/stderr',
				2150	'-q',
				2151	'-n', self._get_dtrace_code(),
				2152	]
				2153	with open(self._logname + '.log', 'wb') as logfile:
				2154	self._dtrace = subprocess.Popen(
				2155	trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
				2156	logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
				2157
				2158	# Reads until one line is printed, which signifies dtrace is up and ready.
				2159	with open(self._logname + '.log', 'rb') as logfile:
				2160	while 'dtrace_BEGIN' not in logfile.readline():
				2161	if self._dtrace.poll() is not None:
				2162	# Do a busy wait. :/
				2163	break
				2164	logging.debug('dtrace started')
				2165
				2166	def _get_dtrace_code(self):
				2167	"""Setups the D code to implement child process tracking.
				2168
				2169	Injects the cookie in the script so it knows when to stop.
				2170
				2171	The script will detect any instance of the script created with
				2172	create_thunk() and will start tracing it.
				2173	"""
				2174	return (
				2175	'inline int PID = %d;\n'
				2176	'inline string SCRIPT = "%s";\n'
				2177	'inline int FILE_ID = %d;\n'
				2178	'\n'
				2179	'%s') % (
				2180	os.getpid(),
				2181	self._script,
				2182	self._dummy_file_id,
				2183	self.D_CODE) + self.D_CODE_EXECVE
				2184
				2185	def trace(self, cmd, cwd, tracename, output):
				2186	"""Runs dtrace on an executable.
				2187
				2188	This dtruss is broken when it starts the process itself or when tracing
				2189	child processes, this code starts a wrapper process
				2190	generated with create_thunk() which starts the executable to trace.
				2191	"""
				2192	logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
				2193	assert os.path.isabs(cmd[0]), cmd[0]
				2194	assert os.path.isabs(cwd), cwd
				2195	assert os.path.normpath(cwd) == cwd, cwd
				2196	with self._lock:
				2197	if not self._initialized:
				2198	raise TracingFailure(
				2199	'Called Tracer.trace() on an unitialized object',
				2200	None, None, None, tracename)
				2201	assert tracename not in (i['trace'] for i in self._traces)
				2202
				2203	# Starts the script wrapper to start the child process. This signals the
				2204	# dtrace script that this process is to be traced.
				2205	stdout = stderr = None
				2206	if output:
				2207	stdout = subprocess.PIPE
				2208	stderr = subprocess.STDOUT
				2209	child_cmd = [
				2210	sys.executable,
				2211	self._script,
				2212	tracename,
				2213	]
				2214	# Call a dummy function so that dtrace knows I'm about to launch a process
				2215	# that needs to be traced.
				2216	# Yummy.
				2217	child = subprocess.Popen(
				2218	child_cmd + fix_python_path(cmd),
				2219	stdin=subprocess.PIPE,
				2220	stdout=stdout,
				2221	stderr=stderr,
				2222	cwd=cwd)
				2223	logging.debug('Started child pid: %d' % child.pid)
				2224
				2225	out = child.communicate()[0]
				2226	# This doesn't mean tracing is done, one of the grand-child process may
				2227	# still be alive. It will be tracked with the dtrace script.
				2228
				2229	with self._lock:
				2230	assert tracename not in (i['trace'] for i in self._traces)
				2231	self._traces.append(
				2232	{
				2233	'cmd': cmd,
				2234	'cwd': cwd,
				2235	# The pid of strace process, not very useful.
				2236	'pid': child.pid,
				2237	'trace': tracename,
				2238	'output': out,
				2239	})
				2240	return child.returncode, out
				2241
				2242	def close(self, timeout=None):
				2243	"""Terminates dtrace."""
				2244	logging.debug('close(%s)' % timeout)
				2245	try:
				2246	try:
				2247	super(Dtrace.Tracer, self).close(timeout)
				2248	# Signal dtrace that it should stop now.
maruel@chromium.org	f43e68b	2012-10-15 20:23:10 +0000	[diff] [blame]	2249	# ftruncate doesn't exist on Windows.
				2250	os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	2251	if timeout:
				2252	start = time.time()
				2253	# Use polling. :/
				2254	while (self._dtrace.poll() is None and
				2255	(time.time() - start) < timeout):
				2256	time.sleep(0.1)
				2257	self._dtrace.kill()
				2258	self._dtrace.wait()
				2259	finally:
				2260	# Make sure to kill it in any case.
				2261	if self._dtrace.poll() is None:
				2262	try:
				2263	self._dtrace.kill()
				2264	self._dtrace.wait()
				2265	except OSError:
				2266	pass
				2267
				2268	if self._dtrace.returncode != 0:
				2269	# Warn about any dtrace failure but basically ignore it.
				2270	print 'dtrace failure: %s' % self._dtrace.returncode
				2271	finally:
				2272	os.close(self._dummy_file_id)
				2273	os.remove(self._dummy_file_name)
				2274	os.remove(self._script)
				2275
				2276	def post_process_log(self):
				2277	"""Sorts the log back in order when each call occured.
				2278
				2279	dtrace doesn't save the buffer in strict order since it keeps one buffer
				2280	per CPU.
				2281	"""
				2282	super(Dtrace.Tracer, self).post_process_log()
				2283	logname = self._logname + '.log'
				2284	with open(logname, 'rb') as logfile:
				2285	lines = [l for l in logfile if l.strip()]
				2286	errors = [l for l in lines if l.startswith('dtrace:')]
				2287	if errors:
				2288	raise TracingFailure(
				2289	'Found errors in the trace: %s' % '\n'.join(errors),
				2290	None, None, None, logname)
				2291	try:
				2292	lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
				2293	except ValueError:
				2294	raise TracingFailure(
				2295	'Found errors in the trace: %s' % '\n'.join(
				2296	l for l in lines if l.split(' ', 1)[0].isdigit()),
				2297	None, None, None, logname)
				2298	with open(logname, 'wb') as logfile:
				2299	logfile.write(''.join(lines))
				2300
				2301	@staticmethod
				2302	def clean_trace(logname):
				2303	for ext in ('', '.log'):
				2304	if os.path.isfile(logname + ext):
				2305	os.remove(logname + ext)
				2306
				2307	@classmethod
				2308	def parse_log(cls, logname, blacklist):
				2309	logging.info('parse_log(%s, ...)' % logname)
maruel@chromium.org	8b056ba	2012-10-16 14:04:49 +0000	[diff] [blame]	2310	assert os.path.isabs(logname)
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	2311
				2312	def blacklist_more(filepath):
				2313	# All the HFS metadata is in the form /.vol/...
				2314	return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
				2315
				2316	data = read_json(logname)
				2317	out = []
				2318	for item in data['traces']:
				2319	context = cls.Context(blacklist_more, item['pid'], item['cwd'])
				2320	for line in open(logname + '.log', 'rb'):
				2321	context.on_line(line)
				2322	out.append(
				2323	{
				2324	'results': context.to_results(),
				2325	'trace': item['trace'],
				2326	'output': item['output'],
				2327	})
				2328	return out
				2329
				2330
				2331	class LogmanTrace(ApiBase):
				2332	"""Uses the native Windows ETW based tracing functionality to trace a child
				2333	process.
				2334
				2335	Caveat: this implementations doesn't track cwd or initial_cwd. It is because
				2336	the Windows Kernel doesn't have a concept of 'current working directory' at
				2337	all. A Win32 process has a map of current directories, one per drive letter
				2338	and it is managed by the user mode kernel32.dll. In kernel, a file is always
				2339	opened relative to another file_object or as an absolute path. All the current
				2340	working directory logic is done in user mode.
				2341	"""
				2342	class Context(ApiBase.Context):
				2343	"""Processes a ETW log line and keeps the list of existent and non
				2344	existent files accessed.
				2345
				2346	Ignores directories.
				2347	"""
				2348	# These indexes are for the stripped version in json.
				2349	EVENT_NAME = 0
				2350	TYPE = 1
				2351	PID = 2
				2352	TID = 3
				2353	PROCESSOR_ID = 4
				2354	TIMESTAMP = 5
				2355	USER_DATA = 6
				2356
				2357	class Process(ApiBase.Context.Process):
				2358	def __init__(self, *args):
				2359	super(LogmanTrace.Context.Process, self).__init__(*args)
				2360	# Handle file objects that succeeded.
				2361	self.file_objects = {}
				2362
				2363	def __init__(self, blacklist, tracer_pid):
				2364	logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid))
				2365	super(LogmanTrace.Context, self).__init__(blacklist)
				2366	self._drive_map = DosDriveMap()
				2367	# Threads mapping to the corresponding process id.
				2368	self._threads_active = {}
				2369	# Process ID of the tracer, e.g. tracer_inputs.py
				2370	self._tracer_pid = tracer_pid
				2371	self._line_number = 0
				2372
				2373	def on_line(self, line):
				2374	"""Processes a json Event line."""
				2375	self._line_number += 1
				2376	try:
				2377	# By Opcode
				2378	handler = getattr(
				2379	self,
				2380	'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
				2381	None)
				2382	if not handler:
				2383	raise TracingFailure(
				2384	'Unexpected event %s_%s' % (
				2385	line[self.EVENT_NAME], line[self.TYPE]),
				2386	None, None, None)
				2387	handler(line)
				2388	except TracingFailure, e:
				2389	# Hack in the values since the handler could be a static function.
				2390	e.pid = line[self.PID]
				2391	e.line = line
				2392	e.line_number = self._line_number
				2393	# Re-raise the modified exception.
				2394	raise
				2395	except (KeyError, NotImplementedError, ValueError), e:
				2396	raise TracingFailure(
				2397	'Trace generated a %s exception: %s' % (
				2398	e.__class__.__name__, str(e)),
				2399	line[self.PID],
				2400	self._line_number,
				2401	line,
				2402	e)
				2403
				2404	def to_results(self):
				2405	if not self.root_process:
				2406	raise TracingFailure(
				2407	'Failed to detect the initial process',
				2408	None, None, None)
				2409	process = self.root_process.to_results_process()
				2410	return Results(process)
				2411
				2412	def _thread_to_process(self, tid):
				2413	"""Finds the process from the thread id."""
				2414	tid = int(tid, 16)
				2415	pid = self._threads_active.get(tid)
				2416	if not pid or not self._process_lookup.get(pid):
				2417	return
				2418	return self._process_lookup[pid]
				2419
				2420	@classmethod
				2421	def handle_EventTrace_Header(cls, line):
				2422	"""Verifies no event was dropped, e.g. no buffer overrun occured."""
				2423	BUFFER_SIZE = cls.USER_DATA
				2424	#VERSION = cls.USER_DATA + 1
				2425	#PROVIDER_VERSION = cls.USER_DATA + 2
				2426	#NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
				2427	#END_TIME = cls.USER_DATA + 4
				2428	#TIMER_RESOLUTION = cls.USER_DATA + 5
				2429	#MAX_FILE_SIZE = cls.USER_DATA + 6
				2430	#LOG_FILE_MODE = cls.USER_DATA + 7
				2431	#BUFFERS_WRITTEN = cls.USER_DATA + 8
				2432	#START_BUFFERS = cls.USER_DATA + 9
				2433	#POINTER_SIZE = cls.USER_DATA + 10
				2434	EVENTS_LOST = cls.USER_DATA + 11
				2435	#CPU_SPEED = cls.USER_DATA + 12
				2436	#LOGGER_NAME = cls.USER_DATA + 13
				2437	#LOG_FILE_NAME = cls.USER_DATA + 14
				2438	#BOOT_TIME = cls.USER_DATA + 15
				2439	#PERF_FREQ = cls.USER_DATA + 16
				2440	#START_TIME = cls.USER_DATA + 17
				2441	#RESERVED_FLAGS = cls.USER_DATA + 18
				2442	#BUFFERS_LOST = cls.USER_DATA + 19
				2443	#SESSION_NAME_STRING = cls.USER_DATA + 20
				2444	#LOG_FILE_NAME_STRING = cls.USER_DATA + 21
				2445	if line[EVENTS_LOST] != '0':
				2446	raise TracingFailure(
				2447	( '%s events were lost during trace, please increase the buffer '
				2448	'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
				2449	None, None, None)
				2450
				2451	def handle_FileIo_Cleanup(self, line):
				2452	"""General wisdom: if a file is closed, it's because it was opened.
				2453
				2454	Note that FileIo_Close is not used since if a file was opened properly but
				2455	not closed before the process exits, only Cleanup will be logged.
				2456	"""
				2457	#IRP = self.USER_DATA
				2458	TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
				2459	FILE_OBJECT = self.USER_DATA + 2
				2460	#FILE_KEY = self.USER_DATA + 3
				2461	proc = self._thread_to_process(line[TTID])
				2462	if not proc:
				2463	# Not a process we care about.
				2464	return
				2465	file_object = line[FILE_OBJECT]
				2466	if file_object in proc.file_objects:
				2467	proc.add_file(proc.file_objects.pop(file_object), False)
				2468
				2469	def handle_FileIo_Create(self, line):
				2470	"""Handles a file open.
				2471
				2472	All FileIo events are described at
				2473	http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
				2474	for some value of 'description'.
				2475
				2476	" (..) process and thread id values of the IO events (..) are not valid "
				2477	http://msdn.microsoft.com/magazine/ee358703.aspx
				2478
				2479	The FileIo.Create event doesn't return if the CreateFile() call
				2480	succeeded, so keep track of the file_object and check that it is
				2481	eventually closed with FileIo_Cleanup.
				2482	"""
				2483	#IRP = self.USER_DATA
				2484	TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
				2485	FILE_OBJECT = self.USER_DATA + 2
				2486	#CREATE_OPTIONS = self.USER_DATA + 3
				2487	#FILE_ATTRIBUTES = self.USER_DATA + 4
				2488	#self.USER_DATA + SHARE_ACCESS = 5
				2489	OPEN_PATH = self.USER_DATA + 6
				2490
				2491	proc = self._thread_to_process(line[TTID])
				2492	if not proc:
				2493	# Not a process we care about.
				2494	return
				2495
				2496	match = re.match(r'^\"(.+)\"$', line[OPEN_PATH])
				2497	raw_path = match.group(1)
				2498	# Ignore directories and bare drive right away.
				2499	if raw_path.endswith(os.path.sep):
				2500	return
				2501	filepath = self._drive_map.to_win32(raw_path)
				2502	# Ignore bare drive right away. Some may still fall through with format
				2503	# like '\\?\X:'
				2504	if len(filepath) == 2:
				2505	return
				2506	file_object = line[FILE_OBJECT]
				2507	if os.path.isdir(filepath):
				2508	# There is no O_DIRECTORY equivalent on Windows. The closed is
				2509	# FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
				2510	# simply discard directories are they are found.
				2511	return
				2512	# Override any stale file object
				2513	proc.file_objects[file_object] = filepath
				2514
				2515	def handle_FileIo_Rename(self, line):
				2516	# TODO(maruel): Handle?
				2517	pass
				2518
				2519	def handle_Process_End(self, line):
				2520	pid = line[self.PID]
				2521	if self._process_lookup.get(pid):
				2522	logging.info('Terminated: %d' % pid)
				2523	self._process_lookup[pid] = None
				2524	else:
				2525	logging.debug('Terminated: %d' % pid)
				2526
				2527	def handle_Process_Start(self, line):
				2528	"""Handles a new child process started by PID."""
				2529	#UNIQUE_PROCESS_KEY = self.USER_DATA
				2530	PROCESS_ID = self.USER_DATA + 1
				2531	#PARENT_PID = self.USER_DATA + 2
				2532	#SESSION_ID = self.USER_DATA + 3
				2533	#EXIT_STATUS = self.USER_DATA + 4
				2534	#DIRECTORY_TABLE_BASE = self.USER_DATA + 5
				2535	#USER_SID = self.USER_DATA + 6
				2536	IMAGE_FILE_NAME = self.USER_DATA + 7
				2537	COMMAND_LINE = self.USER_DATA + 8
				2538
				2539	ppid = line[self.PID]
				2540	pid = int(line[PROCESS_ID], 16)
				2541	logging.debug(
				2542	'New process %d->%d (%s) %s' %
				2543	(ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE]))
				2544
				2545	if ppid == self._tracer_pid:
				2546	# Need to ignore processes we don't know about because the log is
				2547	# system-wide. self._tracer_pid shall start only one process.
				2548	if self.root_process:
				2549	raise TracingFailure(
				2550	( 'Parent process is _tracer_pid(%d) but root_process(%d) is '
				2551	'already set') % (self._tracer_pid, self.root_process.pid),
				2552	None, None, None)
				2553	proc = self.Process(self.blacklist, pid, None)
				2554	self.root_process = proc
				2555	ppid = None
				2556	elif self._process_lookup.get(ppid):
				2557	proc = self.Process(self.blacklist, pid, None)
				2558	self._process_lookup[ppid].children.append(proc)
				2559	else:
				2560	# Ignore
				2561	return
				2562	self._process_lookup[pid] = proc
				2563
				2564	if (not line[IMAGE_FILE_NAME].startswith('"') or
				2565	not line[IMAGE_FILE_NAME].endswith('"')):
				2566	raise TracingFailure(
				2567	'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME],
				2568	None, None, None)
				2569
				2570	# TODO(maruel): Process escapes.
				2571	if (not line[COMMAND_LINE].startswith('"') or
				2572	not line[COMMAND_LINE].endswith('"')):
				2573	raise TracingFailure(
				2574	'Command line is not properly quoted: %s' % line[COMMAND_LINE],
				2575	None, None, None)
				2576	proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1])
				2577	proc.executable = line[IMAGE_FILE_NAME][1:-1]
				2578	# proc.command[0] may be the absolute path of 'executable' but it may be
				2579	# anything else too. If it happens that command[0] ends with executable,
				2580	# use it, otherwise defaults to the base name.
				2581	cmd0 = proc.command[0].lower()
				2582	if not cmd0.endswith('.exe'):
				2583	# TODO(maruel): That's not strictly true either.
				2584	cmd0 += '.exe'
				2585	if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
				2586	# Fix the path.
				2587	cmd0 = cmd0.replace('/', os.path.sep)
				2588	cmd0 = os.path.normpath(cmd0)
				2589	proc.executable = get_native_path_case(cmd0)
				2590	logging.info(
				2591	'New child: %s -> %d %s' % (ppid, pid, proc.executable))
				2592
				2593	def handle_Thread_End(self, line):
				2594	"""Has the same parameters as Thread_Start."""
				2595	tid = int(line[self.TID], 16)
				2596	self._threads_active.pop(tid, None)
				2597
				2598	def handle_Thread_Start(self, line):
				2599	"""Handles a new thread created.
				2600
				2601	Do not use self.PID here since a process' initial thread is created by
				2602	the parent process.
				2603	"""
				2604	PROCESS_ID = self.USER_DATA
				2605	TTHREAD_ID = self.USER_DATA + 1
				2606	#STACK_BASE = self.USER_DATA + 2
				2607	#STACK_LIMIT = self.USER_DATA + 3
				2608	#USER_STACK_BASE = self.USER_DATA + 4
				2609	#USER_STACK_LIMIT = self.USER_DATA + 5
				2610	#AFFINITY = self.USER_DATA + 6
				2611	#WIN32_START_ADDR = self.USER_DATA + 7
				2612	#TEB_BASE = self.USER_DATA + 8
				2613	#SUB_PROCESS_TAG = self.USER_DATA + 9
				2614	#BASE_PRIORITY = self.USER_DATA + 10
				2615	#PAGE_PRIORITY = self.USER_DATA + 11
				2616	#IO_PRIORITY = self.USER_DATA + 12
				2617	#THREAD_FLAGS = self.USER_DATA + 13
				2618	# Do not use self.PID here since a process' initial thread is created by
				2619	# the parent process.
				2620	pid = int(line[PROCESS_ID], 16)
				2621	tid = int(line[TTHREAD_ID], 16)
				2622	logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
				2623	self._threads_active[tid] = pid
				2624
				2625	@classmethod
				2626	def supported_events(cls):
				2627	"""Returns all the procesed events."""
				2628	out = []
				2629	for member in dir(cls):
				2630	match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
				2631	if match:
				2632	out.append(match.groups())
				2633	return out
				2634
				2635	class Tracer(ApiBase.Tracer):
				2636	# The basic headers.
				2637	EXPECTED_HEADER = [
				2638	u'Event Name',
				2639	u'Type',
				2640	u'Event ID',
				2641	u'Version',
				2642	u'Channel',
				2643	u'Level', # 5
				2644	u'Opcode',
				2645	u'Task',
				2646	u'Keyword',
				2647	u'PID',
				2648	u'TID', # 10
				2649	u'Processor Number',
				2650	u'Instance ID',
				2651	u'Parent Instance ID',
				2652	u'Activity ID',
				2653	u'Related Activity ID', # 15
				2654	u'Clock-Time',
				2655	u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
				2656	u'User(ms)', # pretty much useless.
				2657	u'User Data', # Extra arguments that are event-specific.
				2658	]
				2659	# Only the useful headers common to all entries are listed there. Any column
				2660	# at 19 or higher is dependent on the specific event.
				2661	EVENT_NAME = 0
				2662	TYPE = 1
				2663	PID = 9
				2664	TID = 10
				2665	PROCESSOR_ID = 11
				2666	TIMESTAMP = 16
				2667	NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
				2668	USER_DATA = 19
				2669
				2670	def __init__(self, logname):
				2671	"""Starts the log collection.
				2672
				2673	Requires administrative access. logman.exe is synchronous so no need for a
				2674	"warmup" call. 'Windows Kernel Trace' is localized so use its GUID
				2675	instead. The GUID constant name is SystemTraceControlGuid. Lovely.
				2676
				2677	One can get the list of potentially interesting providers with:
				2678	"logman query providers \| findstr /i file"
				2679	"""
				2680	super(LogmanTrace.Tracer, self).__init__(logname)
				2681	self._script = create_thunk()
				2682	cmd_start = [
				2683	'logman.exe',
				2684	'start',
				2685	'NT Kernel Logger',
				2686	'-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
				2687	# splitio,fileiocompletion,syscall,file,cswitch,img
				2688	'(process,fileio,thread)',
				2689	'-o', self._logname + '.etl',
				2690	'-ets', # Send directly to kernel
				2691	# Values extracted out of thin air.
				2692	# Event Trace Session buffer size in kb.
				2693	'-bs', '10240',
				2694	# Number of Event Trace Session buffers.
				2695	'-nb', '16', '256',
				2696	]
				2697	logging.debug('Running: %s' % cmd_start)
				2698	try:
				2699	subprocess.check_call(
				2700	cmd_start,
				2701	stdin=subprocess.PIPE,
				2702	stdout=subprocess.PIPE,
				2703	stderr=subprocess.STDOUT)
				2704	except subprocess.CalledProcessError, e:
				2705	if e.returncode == -2147024891:
				2706	print >> sys.stderr, 'Please restart with an elevated admin prompt'
				2707	elif e.returncode == -2144337737:
				2708	print >> sys.stderr, (
				2709	'A kernel trace was already running, stop it and try again')
				2710	raise
				2711
				2712	def trace(self, cmd, cwd, tracename, output):
				2713	logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
				2714	assert os.path.isabs(cmd[0]), cmd[0]
				2715	assert os.path.isabs(cwd), cwd
				2716	assert os.path.normpath(cwd) == cwd, cwd
				2717	with self._lock:
				2718	if not self._initialized:
				2719	raise TracingFailure(
				2720	'Called Tracer.trace() on an unitialized object',
				2721	None, None, None, tracename)
				2722	assert tracename not in (i['trace'] for i in self._traces)
				2723
				2724	# Use "logman -?" for help.
				2725
				2726	stdout = stderr = None
				2727	if output:
				2728	stdout = subprocess.PIPE
				2729	stderr = subprocess.STDOUT
				2730
				2731	# Run the child process.
				2732	logging.debug('Running: %s' % cmd)
				2733	# Use the temporary script generated with create_thunk() so we have a
				2734	# clear pid owner. Since trace_inputs.py can be used as a library and
				2735	# could trace multiple processes simultaneously, it makes it more complex
				2736	# if the executable to be traced is executed directly here. It also solves
				2737	# issues related to logman.exe that needs to be executed to control the
				2738	# kernel trace.
				2739	child_cmd = [
				2740	sys.executable,
				2741	self._script,
				2742	tracename,
				2743	]
				2744	child = subprocess.Popen(
				2745	child_cmd + fix_python_path(cmd),
				2746	cwd=cwd,
				2747	stdin=subprocess.PIPE,
				2748	stdout=stdout,
				2749	stderr=stderr)
				2750	logging.debug('Started child pid: %d' % child.pid)
				2751	out = child.communicate()[0]
				2752	# This doesn't mean all the grand-children are done. Sadly, we don't have
				2753	# a good way to determine that.
				2754
				2755	with self._lock:
				2756	assert tracename not in (i['trace'] for i in self._traces)
				2757	self._traces.append({
maruel@chromium.org	f43e68b	2012-10-15 20:23:10 +0000	[diff] [blame]	2758	'cmd': cmd,
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	2759	'cwd': cwd,
				2760	'pid': child.pid,
				2761	'trace': tracename,
				2762	'output': out,
				2763	})
				2764
				2765	return child.returncode, out
				2766
				2767	def close(self, _timeout=None):
				2768	"""Stops the kernel log collection and converts the traces to text
				2769	representation.
				2770	"""
				2771	with self._lock:
				2772	if not self._initialized:
				2773	raise TracingFailure(
				2774	'Called Tracer.close() on an unitialized object',
				2775	None, None, None)
				2776	os.remove(self._script)
				2777	# Save metadata, add 'format' key..
				2778	data = {
				2779	'format': 'csv',
				2780	'traces': self._traces,
				2781	}
				2782	write_json(self._logname, data, False)
				2783
				2784	cmd_stop = [
				2785	'logman.exe',
				2786	'stop',
				2787	'NT Kernel Logger',
				2788	'-ets', # Sends the command directly to the kernel.
				2789	]
				2790	logging.debug('Running: %s' % cmd_stop)
				2791	subprocess.check_call(
				2792	cmd_stop,
				2793	stdin=subprocess.PIPE,
				2794	stdout=subprocess.PIPE,
				2795	stderr=subprocess.STDOUT)
				2796	self._initialized = False
				2797
				2798	def post_process_log(self):
				2799	"""Converts the .etl file into .csv then into .json."""
				2800	super(LogmanTrace.Tracer, self).post_process_log()
				2801	logformat = 'csv'
				2802	self._convert_log(logformat)
				2803
				2804	if logformat == 'csv_utf16':
				2805	def load_file():
				2806	def utf_8_encoder(unicode_csv_data):
				2807	"""Encodes the unicode object as utf-8 encoded str instance"""
				2808	for line in unicode_csv_data:
				2809	yield line.encode('utf-8')
				2810
				2811	def unicode_csv_reader(unicode_csv_data, **kwargs):
				2812	"""Encodes temporarily as UTF-8 since csv module doesn't do unicode.
				2813	"""
				2814	csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
				2815	for row in csv_reader:
				2816	# Decode str utf-8 instances back to unicode instances, cell by
				2817	# cell:
				2818	yield [cell.decode('utf-8') for cell in row]
				2819
				2820	# The CSV file is UTF-16 so use codecs.open() to load the file into
				2821	# the python internal unicode format (utf-8). Then explicitly
				2822	# re-encode as utf8 as str instances so csv can parse it fine. Then
				2823	# decode the utf-8 str back into python unicode instances. This
				2824	# sounds about right.
				2825	for line in unicode_csv_reader(
				2826	codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
				2827	# line is a list of unicode objects
				2828	# So much white space!
				2829	yield [i.strip() for i in line]
				2830
				2831	elif logformat == 'csv':
				2832	def load_file():
				2833	def ansi_csv_reader(ansi_csv_data, **kwargs):
				2834	"""Loads an 'ANSI' code page and returns unicode() objects."""
				2835	assert sys.getfilesystemencoding() == 'mbcs'
				2836	encoding = get_current_encoding()
				2837	for row in csv.reader(ansi_csv_data, **kwargs):
				2838	# Decode str 'ansi' instances to unicode instances, cell by cell:
				2839	yield [cell.decode(encoding) for cell in row]
				2840
				2841	# The fastest and smallest format but only supports 'ANSI' file paths.
				2842	# E.g. the filenames are encoding in the 'current' encoding.
				2843	for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
				2844	# line is a list of unicode objects.
				2845	yield [i.strip() for i in line]
				2846
				2847	supported_events = LogmanTrace.Context.supported_events()
				2848
				2849	def trim(generator):
				2850	for index, line in enumerate(generator):
				2851	if not index:
				2852	if line != self.EXPECTED_HEADER:
				2853	raise TracingFailure(
				2854	'Found malformed header: %s' % ' '.join(line),
				2855	None, None, None)
				2856	continue
				2857	# As you can see, the CSV is full of useful non-redundant information:
				2858	if (line[2] != '0' or # Event ID
				2859	line[3] not in ('2', '3') or # Version
				2860	line[4] != '0' or # Channel
				2861	line[5] != '0' or # Level
				2862	line[7] != '0' or # Task
				2863	line[8] != '0x0000000000000000' or # Keyword
				2864	line[12] != '' or # Instance ID
				2865	line[13] != '' or # Parent Instance ID
				2866	line[14] != self.NULL_GUID or # Activity ID
				2867	line[15] != ''): # Related Activity ID
				2868	raise TracingFailure(
				2869	'Found unexpected values in line: %s' % ' '.join(line),
				2870	None, None, None)
				2871
				2872	if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
				2873	continue
				2874
				2875	# Convert the PID in-place from hex.
				2876	line[self.PID] = int(line[self.PID], 16)
				2877
				2878	yield [
				2879	line[self.EVENT_NAME],
				2880	line[self.TYPE],
				2881	line[self.PID],
				2882	line[self.TID],
				2883	line[self.PROCESSOR_ID],
				2884	line[self.TIMESTAMP],
				2885	] + line[self.USER_DATA:]
				2886
				2887	write_json('%s.json' % self._logname, list(trim(load_file())), True)
				2888
				2889	def _convert_log(self, logformat):
				2890	"""Converts the ETL trace to text representation.
				2891
				2892	Normally, 'csv' is sufficient. If complex scripts are used (like eastern
				2893	languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
				2894
				2895	Arguments:
				2896	- logformat: Text format to be generated, csv, csv_utf16 or xml.
				2897
				2898	Use "tracerpt -?" for help.
				2899	"""
				2900	LOCALE_INVARIANT = 0x7F
				2901	windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
				2902	cmd_convert = [
				2903	'tracerpt.exe',
				2904	'-l', self._logname + '.etl',
				2905	'-o', self._logname + '.' + logformat,
				2906	'-gmt', # Use UTC
				2907	'-y', # No prompt
				2908	# Use -of XML to get the header of each items after column 19, e.g. all
				2909	# the actual headers of 'User Data'.
				2910	]
				2911
				2912	if logformat == 'csv':
				2913	# tracerpt localizes the 'Type' column, for major brainfuck
				2914	# entertainment. I can't imagine any sane reason to do that.
				2915	cmd_convert.extend(['-of', 'CSV'])
				2916	elif logformat == 'csv_utf16':
				2917	# This causes it to use UTF-16, which doubles the log size but ensures
				2918	# the log is readable for non-ASCII characters.
				2919	cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
				2920	elif logformat == 'xml':
				2921	cmd_convert.extend(['-of', 'XML'])
				2922	else:
				2923	raise ValueError('Unexpected log format \'%s\'' % logformat)
				2924	logging.debug('Running: %s' % cmd_convert)
				2925	# This can takes tens of minutes for large logs.
				2926	# Redirects all output to stderr.
				2927	subprocess.check_call(
				2928	cmd_convert,
				2929	stdin=subprocess.PIPE,
				2930	stdout=sys.stderr,
				2931	stderr=sys.stderr)
				2932
				2933	@staticmethod
				2934	def clean_trace(logname):
				2935	for ext in ('', '.csv', '.etl', '.json', '.xml'):
				2936	if os.path.isfile(logname + ext):
				2937	os.remove(logname + ext)
				2938
				2939	@classmethod
				2940	def parse_log(cls, logname, blacklist):
				2941	logging.info('parse_log(%s, %s)' % (logname, blacklist))
maruel@chromium.org	8b056ba	2012-10-16 14:04:49 +0000	[diff] [blame]	2942	assert os.path.isabs(logname)
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	2943
				2944	def blacklist_more(filepath):
				2945	# All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
				2946	return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
				2947
				2948	data = read_json(logname)
				2949	lines = read_json(logname + '.json')
				2950	out = []
				2951	for item in data['traces']:
				2952	context = cls.Context(blacklist_more, item['pid'])
				2953	for line in lines:
				2954	context.on_line(line)
				2955	out.append(
				2956	{
				2957	'results': context.to_results(),
				2958	'trace': item['trace'],
				2959	'output': item['output'],
				2960	})
				2961	return out
				2962
				2963
				2964	def get_api():
				2965	"""Returns the correct implementation for the current OS."""
				2966	if sys.platform == 'cygwin':
				2967	raise NotImplementedError(
				2968	'Not implemented for cygwin, start the script from Win32 python')
				2969	flavors = {
				2970	'win32': LogmanTrace,
				2971	'darwin': Dtrace,
				2972	'sunos5': Dtrace,
				2973	'freebsd7': Dtrace,
				2974	'freebsd8': Dtrace,
				2975	}
				2976	# Defaults to strace.
				2977	return flavors.get(sys.platform, Strace)()
				2978
				2979
				2980	def extract_directories(root_dir, files, blacklist):
				2981	"""Detects if all the files in a directory are in \|files\| and if so, replace
				2982	the individual files by a Results.Directory instance.
				2983
				2984	Takes a list of Results.File instances and returns a shorter list of
				2985	Results.File and Results.Directory instances.
				2986
				2987	Arguments:
				2988	- root_dir: Optional base directory that shouldn't be search further.
				2989	- files: list of Results.File instances.
				2990	- blacklist: regexp of files to ignore, for example r'.+\.pyc'.
				2991	"""
				2992	logging.info(
				2993	'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
				2994	assert not (root_dir or '').endswith(os.path.sep), root_dir
				2995	assert not root_dir or (get_native_path_case(root_dir) == root_dir)
				2996	assert not any(isinstance(f, Results.Directory) for f in files)
				2997	# Remove non existent files.
				2998	files = [f for f in files if f.existent]
				2999	if not files:
				3000	return files
				3001	# All files must share the same root, which can be None.
				3002	assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
				3003
				3004	# Creates a {directory: {filename: File}} mapping, up to root.
				3005	buckets = {}
				3006	if root_dir:
				3007	buckets[root_dir] = {}
				3008	for fileobj in files:
				3009	path = fileobj.full_path
				3010	directory = os.path.dirname(path)
				3011	assert directory
				3012	# Do not use os.path.basename() so trailing os.path.sep is kept.
				3013	basename = path[len(directory)+1:]
				3014	files_in_directory = buckets.setdefault(directory, {})
				3015	files_in_directory[basename] = fileobj
				3016	# Add all the directories recursively up to root.
				3017	while True:
				3018	old_d = directory
				3019	directory = os.path.dirname(directory)
				3020	if directory + os.path.sep == root_dir or directory == old_d:
				3021	break
				3022	buckets.setdefault(directory, {})
				3023
				3024	root_prefix = len(root_dir) + 1 if root_dir else 0
				3025	for directory in sorted(buckets, reverse=True):
				3026	actual = set(f for f in os.listdir(directory) if not blacklist(f))
				3027	expected = set(buckets[directory])
				3028	if not (actual - expected):
				3029	parent = os.path.dirname(directory)
				3030	buckets[parent][os.path.basename(directory)] = Results.Directory(
				3031	root_dir,
				3032	directory[root_prefix:],
				3033	False,
				3034	sum(f.size for f in buckets[directory].itervalues()),
				3035	sum(f.nb_files for f in buckets[directory].itervalues()))
				3036	# Remove the whole bucket.
				3037	del buckets[directory]
				3038
				3039	# Reverse the mapping with what remains. The original instances are returned,
				3040	# so the cached meta data is kept.
				3041	files = sum((x.values() for x in buckets.itervalues()), [])
				3042	return sorted(files, key=lambda x: x.path)
				3043
				3044
				3045	def trace(logfile, cmd, cwd, api, output):
				3046	"""Traces an executable. Returns (returncode, output) from api.
				3047
				3048	Arguments:
				3049	- logfile: file to write to.
				3050	- cmd: command to run.
				3051	- cwd: current directory to start the process in.
				3052	- api: a tracing api instance.
				3053	- output: if True, returns output, otherwise prints it at the console.
				3054	"""
				3055	cmd = fix_python_path(cmd)
				3056	api.clean_trace(logfile)
				3057	with api.get_tracer(logfile) as tracer:
				3058	return tracer.trace(cmd, cwd, 'default', output)
				3059
				3060
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	3061	def CMDclean(args):
				3062	"""Cleans up traces."""
				3063	parser = OptionParserTraceInputs(command='clean')
				3064	options, args = parser.parse_args(args)
				3065	api = get_api()
				3066	api.clean_trace(options.log)
				3067	return 0
				3068
				3069
				3070	def CMDtrace(args):
				3071	"""Traces an executable."""
				3072	parser = OptionParserTraceInputs(command='trace')
				3073	parser.allow_interspersed_args = False
				3074	parser.add_option(
				3075	'-q', '--quiet', action='store_true',
				3076	help='Redirects traced executable output to /dev/null')
				3077	options, args = parser.parse_args(args)
				3078
				3079	if not args:
				3080	parser.error('Please provide a command to run')
				3081
				3082	if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
				3083	args[0] = os.path.abspath(args[0])
				3084
				3085	api = get_api()
				3086	return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
				3087
				3088
				3089	def CMDread(args):
				3090	"""Reads the logs and prints the result."""
				3091	parser = OptionParserTraceInputs(command='read')
				3092	parser.add_option(
				3093	'-V', '--variable',
				3094	nargs=2,
				3095	action='append',
				3096	dest='variables',
				3097	metavar='VAR_NAME directory',
				3098	default=[],
				3099	help=('Variables to replace relative directories against. Example: '
				3100	'"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '
				3101	'home dir with $HOME') % getpass.getuser())
				3102	parser.add_option(
				3103	'--root-dir',
				3104	help='Root directory to base everything off it. Anything outside of this '
				3105	'this directory will not be reported')
				3106	parser.add_option(
				3107	'-j', '--json', action='store_true',
				3108	help='Outputs raw result data as json')
				3109	parser.add_option(
				3110	'-b', '--blacklist', action='append', default=[],
				3111	help='List of regexp to use as blacklist filter')
				3112	options, args = parser.parse_args(args)
				3113
				3114	if options.root_dir:
				3115	options.root_dir = os.path.abspath(options.root_dir)
				3116
				3117	variables = dict(options.variables)
				3118	api = get_api()
				3119	def blacklist(f):
				3120	return any(re.match(b, f) for b in options.blacklist)
				3121	data = api.parse_log(options.log, blacklist)
				3122	# Process each trace.
				3123	output_as_json = []
				3124	for item in data:
				3125	if 'exception' in item:
maruel@chromium.org	424f569	2012-10-15 15:42:38 +0000	[diff] [blame]	3126	# Do not abort the other traces.
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	3127	print >> sys.stderr, (
maruel@chromium.org	424f569	2012-10-15 15:42:38 +0000	[diff] [blame]	3128	'Trace %s: Got an exception: %s' % (
				3129	item['trace'], item['exception'][1]))
maruel@chromium.org	8fb47fe	2012-10-03 20:13:15 +0000	[diff] [blame]	3130	continue
				3131	results = item['results']
				3132	if options.root_dir:
				3133	results = results.strip_root(options.root_dir)
				3134
				3135	if options.json:
				3136	output_as_json.append(results.flatten())
				3137	else:
				3138	simplified = extract_directories(
				3139	options.root_dir, results.files, blacklist)
				3140	simplified = [f.replace_variables(variables) for f in simplified]
				3141	if len(data) > 1:
				3142	print('Trace: %s' % item['trace'])
				3143	print('Total: %d' % len(results.files))
				3144	print('Non existent: %d' % len(results.non_existent))
				3145	for f in results.non_existent:
				3146	print(' %s' % f.path)
				3147	print(
				3148	'Interesting: %d reduced to %d' % (
				3149	len(results.existent), len(simplified)))
				3150	for f in simplified:
				3151	print(' %s' % f.path)
				3152
				3153	if options.json:
				3154	write_json(sys.stdout, output_as_json, False)
				3155	return 0
				3156
				3157
				3158	class OptionParserWithLogging(optparse.OptionParser):
				3159	"""Adds --verbose option."""
				3160	def __init__(self, verbose=0, **kwargs):
				3161	optparse.OptionParser.__init__(self, **kwargs)
				3162	self.add_option(
				3163	'-v', '--verbose',
				3164	action='count',
				3165	default=verbose,
				3166	help='Use multiple times to increase verbosity')
				3167
				3168	def parse_args(self, args, *kwargs):
				3169	options, args = optparse.OptionParser.parse_args(self, args, *kwargs)
				3170	levels = [logging.ERROR, logging.INFO, logging.DEBUG]
				3171	logging.basicConfig(
				3172	level=levels[min(len(levels)-1, options.verbose)],
				3173	format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
				3174	return options, args
				3175
				3176
				3177	class OptionParserWithNiceDescription(OptionParserWithLogging):
				3178	"""Generates the description with the command's docstring."""
				3179	def __init__(self, **kwargs):
				3180	"""Sets 'description' and 'usage' if not already specified."""
				3181	command = kwargs.pop('command', 'help')
				3182	kwargs.setdefault(
				3183	'description',
				3184	re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
				3185	kwargs.setdefault('usage', '%%prog %s [options]' % command)
				3186	OptionParserWithLogging.__init__(self, **kwargs)
				3187
				3188
				3189	class OptionParserTraceInputs(OptionParserWithNiceDescription):
				3190	"""Adds automatic --log handling."""
				3191	def __init__(self, **kwargs):
				3192	OptionParserWithNiceDescription.__init__(self, **kwargs)
				3193	self.add_option(
				3194	'-l', '--log', help='Log file to generate or read, required')
				3195
				3196	def parse_args(self, args, *kwargs):
				3197	"""Makes sure the paths make sense.
				3198
				3199	On Windows, / and \ are often mixed together in a path.
				3200	"""
				3201	options, args = OptionParserWithNiceDescription.parse_args(
				3202	self, args, *kwargs)
				3203	if not options.log:
				3204	self.error('Must supply a log file with -l')
				3205	options.log = os.path.abspath(options.log)
				3206	return options, args
				3207
				3208
				3209	def extract_documentation():
				3210	"""Returns a dict {command: description} for each of documented command."""
				3211	commands = (
				3212	fn[3:]
				3213	for fn in dir(sys.modules['__main__'])
				3214	if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
				3215	return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
				3216
				3217
				3218	def CMDhelp(args):
				3219	"""Prints list of commands or help for a specific command."""
				3220	doc = extract_documentation()
				3221	# Calculates the optimal offset.
				3222	offset = max(len(cmd) for cmd in doc)
				3223	format_str = ' %-' + str(offset + 2) + 's %s'
				3224	# Generate a one-liner documentation of each commands.
				3225	commands_description = '\n'.join(
				3226	format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
				3227
				3228	parser = OptionParserWithNiceDescription(
				3229	usage='%prog <command> [options]',
				3230	description='Commands are:\n%s\n' % commands_description)
				3231	parser.format_description = lambda _: parser.description
				3232
				3233	# Strip out any -h or --help argument.
				3234	_, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
				3235	if len(args) == 1:
				3236	if not get_command_handler(args[0]):
				3237	parser.error('Unknown command %s' % args[0])
				3238	# The command was "%prog help command", replaces ourself with
				3239	# "%prog command --help" so help is correctly printed out.
				3240	return main(args + ['--help'])
				3241	elif args:
				3242	parser.error('Unknown argument "%s"' % ' '.join(args))
				3243	parser.print_help()
				3244	return 0
				3245
				3246
				3247	def get_command_handler(name):
				3248	"""Returns the command handler or CMDhelp if it doesn't exist."""
				3249	return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
				3250
				3251
				3252	def main_impl(argv):
				3253	command = get_command_handler(argv[0] if argv else 'help')
				3254	if not command:
				3255	return CMDhelp(argv)
				3256	return command(argv[1:])
				3257
				3258	def main(argv):
				3259	try:
				3260	main_impl(argv)
				3261	except TracingFailure, e:
				3262	sys.stderr.write('\nError: ')
				3263	sys.stderr.write(str(e))
				3264	sys.stderr.write('\n')
				3265	return 1
				3266
				3267
				3268	if __name__ == '__main__':
				3269	sys.exit(main(sys.argv[1:]))