1#!/usr/bin/env python
2# coding=utf-8
3# Copyright (c) 2012 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Traces an executable and its child processes and extract the files accessed
8by them.
9
10The implementation uses OS-specific APIs. The native kernel logger and the ETL
11interface are used on Windows. DTrace is used on OSX. strace is used otherwise.
12The OS-specific implementation is hidden in an 'API' interface.
13
14The results are embedded in a Results instance. The tracing is done in two
15phases: the first does the actual trace and generates an
16implementation-specific log file. Then the log file is parsed to extract the
17information, including the individual child processes and the files accessed
18from the log.
19"""
20
21import codecs
22import csv
23import getpass
24import glob
25import json
26import logging
27import optparse
28import os
29import re
30import subprocess
31import sys
32import tempfile
33import threading
34import time
35import weakref
36
37## OS-specific imports
38
39if sys.platform == 'win32':
40 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
41 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
42 from ctypes.wintypes import GetLastError # pylint: disable=E0611
43elif sys.platform == 'darwin':
44 import Carbon.File # pylint: disable=F0401
45 import MacOS # pylint: disable=F0401
46
47
48BASE_DIR = os.path.dirname(os.path.abspath(__file__))
49ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
50
51
52class TracingFailure(Exception):
53 """An exception occured during tracing."""
54 def __init__(self, description, pid, line_number, line, *args):
55 super(TracingFailure, self).__init__(
56 description, pid, line_number, line, *args)
57 self.description = description
58 self.pid = pid
59 self.line_number = line_number
60 self.line = line
61 self.extra = args
62
63 def __str__(self):
64 out = self.description
65 if self.pid:
66 out += '\npid: %d' % self.pid
67 if self.line_number:
68 out += '\nline: %d' % self.line_number
69 if self.line:
70 out += '\n%s' % self.line
71 if self.extra:
72 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
73 return out
74
75
76## OS-specific functions
77
78if sys.platform == 'win32':
79 def QueryDosDevice(drive_letter):
80 """Returns the Windows 'native' path for a DOS drive letter."""
81 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
82 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
83 chars = 1024
84 drive_letter = unicode(drive_letter)
85 p = create_unicode_buffer(chars)
86 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
87 err = GetLastError()
88 if err:
89 # pylint: disable=E0602
90 raise WindowsError(
91 err,
92 'QueryDosDevice(%s): %s (%d)' % (
93 str(drive_letter), FormatError(err), err))
94 return p.value
95
96
97 def GetShortPathName(long_path):
98 """Returns the Windows short path equivalent for a 'long' path."""
99 long_path = unicode(long_path)
100 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
101 # not enforced.
102 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
103 long_path = '\\\\?\\' + long_path
104 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
105 if chars:
106 p = create_unicode_buffer(chars)
107 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
108 return p.value
109
110 err = GetLastError()
111 if err:
112 # pylint: disable=E0602
113 raise WindowsError(
114 err,
115 'GetShortPathName(%s): %s (%d)' % (
116 str(long_path), FormatError(err), err))
117
118
119 def GetLongPathName(short_path):
120 """Returns the Windows long path equivalent for a 'short' path."""
121 short_path = unicode(short_path)
122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
123 # not enforced.
124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
125 short_path = '\\\\?\\' + short_path
126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
127 if chars:
128 p = create_unicode_buffer(chars)
129 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
130 return p.value
131
132 err = GetLastError()
133 if err:
134 # pylint: disable=E0602
135 raise WindowsError(
136 err,
137 'GetLongPathName(%s): %s (%d)' % (
138 str(short_path), FormatError(err), err))
139
140
141 def get_current_encoding():
142 """Returns the 'ANSI' code page associated to the process."""
143 return 'cp%d' % int(windll.kernel32.GetACP())
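  # Example (assuming an en-US install of Windows): GetACP() returns 1252, so
  # get_current_encoding() returns 'cp1252', which can be passed directly to
  # str.decode().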
144
145
146 class DosDriveMap(object):
147 """Maps \Device\HarddiskVolumeN to N: on Windows."""
148 # Keep one global cache.
149 _MAPPING = {}
150
151 def __init__(self):
152 """Lazy loads the cache."""
153 if not self._MAPPING:
154 # This is related to UNC resolver on windows. Ignore that.
155 self._MAPPING['\\Device\\Mup'] = None
156 self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot']
157
158 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
159 try:
160 letter = '%s:' % letter
161 mapped = QueryDosDevice(letter)
162 if mapped in self._MAPPING:
163 logging.warn(
164 ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
165 '. Drive letters are a user-mode concept and the kernel '
166 'traces only have NT path, so all accesses will be '
167 'associated with the first drive letter, independent of the '
168 'actual letter used by the code') % (
169 self._MAPPING[mapped], letter))
170 else:
171 self._MAPPING[mapped] = letter
172 except WindowsError: # pylint: disable=E0602
173 pass
174
175 def to_win32(self, path):
176 """Converts a native NT path to Win32/DOS compatible path."""
177 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
178 if not match:
179 raise ValueError(
180 'Can\'t convert %s into a Win32 compatible path' % path,
181 path)
182 if not match.group(1) in self._MAPPING:
183 # Unmapped partitions may be accessed by windows for the
184 # fun of it while the test is running. Discard these.
185 return None
186 drive = self._MAPPING[match.group(1)]
187 if not drive or not match.group(2):
188 return drive
189 return drive + match.group(2)
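    # Example (assuming \Device\HarddiskVolume1 is mapped to C: on this
    # machine):
    #   DosDriveMap().to_win32('\\Device\\HarddiskVolume1\\Windows\\notepad.exe')
    #   == 'C:\\Windows\\notepad.exe'
    # Devices not present in the mapping return None so spurious kernel
    # accesses can be discarded.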
190
191
192 def isabs(path):
193 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
194 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
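  # Example: isabs('X:') and isabs(u'X:\\foo') are both True here, whereas
  # os.path.isabs('X:') is False because it only looks at what follows the
  # drive letter.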
195
196
197 def get_native_path_case(p):
198 """Returns the native path case for an existing file.
199
200 On Windows, removes any leading '\\?\'.
201 """
202 if not isabs(p):
203 raise ValueError(
204 'Can\'t get native path case for a non-absolute path: %s' % p,
205 p)
206 # Windows used to have an option to turn on case sensitivity on non Win32
207 # subsystem but that's out of scope here and isn't supported anymore.
208 # Go figure why GetShortPathName() is needed.
209 try:
210 out = GetLongPathName(GetShortPathName(p))
211 except OSError, e:
212 if e.args[0] in (2, 3, 5):
213 # The path does not exist. Try to recurse and reconstruct the path.
214 base = os.path.dirname(p)
215 rest = os.path.basename(p)
216 return os.path.join(get_native_path_case(base), rest)
217 raise
218 if out.startswith('\\\\?\\'):
219 out = out[4:]
220 # Always upper case the first letter since GetLongPathName() will return the
221 # drive letter in the case it was given.
222 return out[0].upper() + out[1:]
223
224
225 def CommandLineToArgvW(command_line):
226 """Splits a commandline into argv using CommandLineToArgvW()."""
227 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
228 size = c_int()
229 ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size))
230 try:
231 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
232 finally:
233 windll.kernel32.LocalFree(ptr)
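  # Example: CommandLineToArgvW(u'child.exe "C:\\Program Files\\foo" -v')
  # should yield [u'child.exe', u'C:\\Program Files\\foo', u'-v'], following
  # the usual Win32 argument quoting rules.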
234
235
236elif sys.platform == 'darwin':
237
238
239 # On non-windows, keep the stdlib behavior.
240 isabs = os.path.isabs
241
242
243 def _find_item_native_case(root_path, item):
244 """Gets the native path case of a single item based at root_path.
245
246 There is no API to get the native path case of symlinks on OSX. So it
247 needs to be done the slow way.
248 """
249 item = item.lower()
250 for element in os.listdir(root_path):
251 if element.lower() == item:
252 return element
253
254
255 def _native_case(p):
256 """Gets the native path case. Warning: this function resolves symlinks."""
257 logging.debug('native_case(%s)' % p)
258 try:
259 rel_ref, _ = Carbon.File.FSPathMakeRef(p)
260 out = rel_ref.FSRefMakePath()
261 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
262 return out + os.path.sep
263 return out
264 except MacOS.Error, e:
265 if e.args[0] in (-43, -120):
266 # The path does not exist. Try to recurse and reconstruct the path.
267 # -43 means file not found.
268 # -120 means directory not found.
269 base = os.path.dirname(p)
270 rest = os.path.basename(p)
271 return os.path.join(_native_case(base), rest)
272 raise OSError(
273 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
274
275
276 def _split_at_symlink_native(base_path, rest):
277 """Returns the native path for a symlink."""
278 base, symlink, rest = split_at_symlink(base_path, rest)
279 if symlink:
280 if not base_path:
281 base_path = base
282 else:
283 base_path = safe_join(base_path, base)
284 symlink = _find_item_native_case(base_path, symlink)
285 return base, symlink, rest
286
287
288 def get_native_path_case(path):
289 """Returns the native path case for an existing file.
290
291 Technically, it's only HFS+ on OSX that is case preserving and
292 insensitive. It's the default setting on HFS+ but can be changed.
293 """
294 if not isabs(path):
295 raise ValueError(
296 'Can\'t get native path case for a non-absolute path: %s' % path,
297 path)
298 if path.startswith('/dev'):
299 # /dev is not visible from Carbon, causing an exception.
300 return path
301
302 # Starts assuming there is no symlink along the path.
303 resolved = _native_case(path)
304 if resolved.lower() == path.lower():
305 # This code path is much faster.
306 return resolved
307
308 # There was a symlink, process it.
309 base, symlink, rest = _split_at_symlink_native(None, path)
310 assert symlink, (path, base, symlink, rest, resolved)
311 prev = base
312 base = safe_join(_native_case(base), symlink)
313 assert len(base) > len(prev)
314 while rest:
315 prev = base
316 relbase, symlink, rest = _split_at_symlink_native(base, rest)
317 base = safe_join(base, relbase)
318 assert len(base) > len(prev), (prev, base, symlink)
319 if symlink:
320 base = safe_join(base, symlink)
321 assert len(base) > len(prev), (prev, base, symlink)
322 # Make sure no symlink was resolved.
323 assert base.lower() == path.lower(), (base, path)
324 return base
325
326
327else: # OSes other than Windows and OSX.
328
329
330 # On non-windows, keep the stdlib behavior.
331 isabs = os.path.isabs
332
333
334 def get_native_path_case(path):
335 """Returns the native path case for an existing file.
336
337 On OSes other than OSX and Windows, assume the file system is
338 case-sensitive.
339
340 TODO(maruel): This is not strictly true. Implement if necessary.
341 """
342 if not isabs(path):
343 raise ValueError(
344 'Can\'t get native path case for a non-absolute path: %s' % path,
345 path)
346 # Give up on cygwin, as GetLongPathName() can't be called.
347 # Linux traces tend not to be normalized, so take this opportunity to
348 # normalize them. The implementations for the other OSes already normalize
349 # the path, so it needs to be done here too to stay coherent across OSes.
350 out = os.path.normpath(path)
351 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
352 return out + os.path.sep
353 return out
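  # Example: on Linux this reduces to normalization, e.g.
  # get_native_path_case('/usr/local/../bin/') == '/usr/bin/'; a trailing
  # separator is preserved.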
354
355
356if sys.platform != 'win32': # All non-Windows OSes.
357
358
359 def safe_join(*args):
360 """Joins path elements like os.path.join() but doesn't abort on absolute
361 path.
362
363 os.path.join('foo', '/bar') == '/bar'
364 but safe_join('foo', '/bar') == 'foo/bar'.
365 """
366 out = ''
367 for element in args:
368 if element.startswith(os.path.sep):
369 if out.endswith(os.path.sep):
370 out += element[1:]
371 else:
372 out += element
373 else:
374 if out.endswith(os.path.sep):
375 out += element
376 else:
377 out += os.path.sep + element
378 return out
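  # Example: safe_join('/foo', '/bar', 'baz') == '/foo/bar/baz', while
  # os.path.join('/foo', '/bar', 'baz') == '/bar/baz' because os.path.join()
  # restarts at the absolute component.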
379
380
381 def split_at_symlink(base_dir, relfile):
382 """Scans each component of relfile and cut the string at the symlink if
383 there is any.
384
385 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
386 no symlink was found.
387 """
388 if base_dir:
389 assert relfile
390 assert os.path.isabs(base_dir)
391 index = 0
392 else:
393 assert os.path.isabs(relfile)
394 index = 1
395
396 def at_root(rest):
397 if base_dir:
398 return safe_join(base_dir, rest)
399 return rest
400
401 while True:
402 try:
403 index = relfile.index(os.path.sep, index)
404 except ValueError:
405 index = len(relfile)
406 full = at_root(relfile[:index])
407 if os.path.islink(full):
408 # A symlink!
409 base = os.path.dirname(relfile[:index])
410 symlink = os.path.basename(relfile[:index])
411 rest = relfile[index:]
412 logging.debug(
413 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
414 (base_dir, relfile, base, symlink, rest))
415 return base, symlink, rest
416 if index == len(relfile):
417 break
418 index += 1
419 return relfile, None, None
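  # Examples (assuming /src/out is a symlink and no other component is one):
  #   split_at_symlink('/src', 'out/Release/foo.txt')
  #   == ('', 'out', '/Release/foo.txt')
  #   split_at_symlink(None, '/src/Release/foo.txt')
  #   == ('/src/Release/foo.txt', None, None)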
420
421
422def fix_python_path(cmd):
423 """Returns the fixed command line to call the right python executable."""
424 out = cmd[:]
425 if out[0] == 'python':
426 out[0] = sys.executable
427 elif out[0].endswith('.py'):
428 out.insert(0, sys.executable)
429 return out
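# Example: with sys.executable == '/usr/bin/python2.7':
#   fix_python_path(['python', 'foo.py']) == ['/usr/bin/python2.7', 'foo.py']
#   fix_python_path(['foo.py', '-v']) == ['/usr/bin/python2.7', 'foo.py', '-v']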
430
431
432def create_thunk():
433 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
434 os.write(
435 handle,
436 (
437 'import subprocess\n'
438 'import sys\n'
439 'sys.exit(subprocess.call(sys.argv[2:]))\n'
440 ))
441 os.close(handle)
442 return name
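# The generated thunk ignores sys.argv[1] (presumably a slot reserved for the
# tracer's own use) and executes sys.argv[2:], so a caller is expected to
# invoke it roughly as (illustrative sketch):
#   subprocess.call([sys.executable, thunk_path, marker] + actual_cmd)
# where thunk_path is the value returned by create_thunk().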
443
444
445def strace_process_quoted_arguments(text):
446 """Extracts quoted arguments on a string and return the arguments as a list.
447
448 Implemented as an automaton. Supports incomplete strings in the form
449 '"foo"...'.
450
451 Example:
452 With text = '"foo", "bar"', the function will return ['foo', 'bar']
453
454 TODO(maruel): Implement escaping.
455 """
456 # All the possible states of the DFA.
457 ( NEED_QUOTE, # Beginning of a new argument.
458 INSIDE_STRING, # Inside an argument.
459 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
460 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
461 # a series of 3 dots or a comma.
462 NEED_SPACE, # Right after a comma.
463 NEED_DOT_2, # Found a dot, need a second one.
464 NEED_DOT_3, # Found second dot, need a third one.
465 NEED_COMMA, # Found third dot, need a comma.
466 ) = range(8)
467
468 state = NEED_QUOTE
469 out = []
470 for index, char in enumerate(text):
471 if char == '"':
472 if state == NEED_QUOTE:
473 state = INSIDE_STRING
474 # A new argument was found.
475 out.append('')
476 elif state == INSIDE_STRING:
477 # The argument is now closed.
478 state = NEED_COMMA_OR_DOT
479 elif state == ESCAPED:
480 out[-1] += char
481 state = INSIDE_STRING
482 else:
483 raise ValueError(
484 'Can\'t process char at column %d for: %r' % (index, text),
485 index,
486 text)
487 elif char == ',':
488 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
489 state = NEED_SPACE
490 elif state == INSIDE_STRING:
491 out[-1] += char
492 elif state == ESCAPED:
493 out[-1] += char
494 state = INSIDE_STRING
495 else:
496 raise ValueError(
497 'Can\'t process char at column %d for: %r' % (index, text),
498 index,
499 text)
500 elif char == ' ':
501 if state == NEED_SPACE:
502 state = NEED_QUOTE
503 elif state == INSIDE_STRING:
504 out[-1] += char
505 elif state == ESCAPED:
506 out[-1] += char
507 state = INSIDE_STRING
508 else:
509 raise ValueError(
510 'Can\'t process char at column %d for: %r' % (index, text),
511 index,
512 text)
513 elif char == '.':
514 if state == NEED_COMMA_OR_DOT:
515 # The string is incomplete; this means the strace -s flag should be
516 # increased.
517 state = NEED_DOT_2
518 elif state == NEED_DOT_2:
519 state = NEED_DOT_3
520 elif state == NEED_DOT_3:
521 state = NEED_COMMA
522 elif state == INSIDE_STRING:
523 out[-1] += char
524 elif state == ESCAPED:
525 out[-1] += char
526 state = INSIDE_STRING
527 else:
528 raise ValueError(
529 'Can\'t process char at column %d for: %r' % (index, text),
530 index,
531 text)
532 elif char == '\\':
533 if state == ESCAPED:
534 out[-1] += char
535 state = INSIDE_STRING
536 elif state == INSIDE_STRING:
537 state = ESCAPED
538 else:
539 raise ValueError(
540 'Can\'t process char at column %d for: %r' % (index, text),
541 index,
542 text)
543 else:
544 if state == INSIDE_STRING:
545 out[-1] += char
546 else:
547 raise ValueError(
548 'Can\'t process char at column %d for: %r' % (index, text),
549 index,
550 text)
551 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
552 raise ValueError(
553 'String is incorrectly terminated: %r' % text,
554 text)
555 return out
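# Examples:
#   strace_process_quoted_arguments('"/bin/echo", "hello world"')
#   == ['/bin/echo', 'hello world']
#   strace_process_quoted_arguments('"/bin/ech"..., "hello"')
#   == ['/bin/ech', 'hello']   # the first argument was truncated by strace -s
# A malformed string raises ValueError.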
556
557
558def read_json(filepath):
559 with open(filepath, 'r') as f:
560 return json.load(f)
561
562
563def write_json(filepath_or_handle, data, dense):
564 """Writes data into filepath or file handle encoded as json.
565
566 If dense is True, the json is packed. Otherwise, it is human readable.
567 """
568 if hasattr(filepath_or_handle, 'write'):
569 if dense:
570 filepath_or_handle.write(
571 json.dumps(data, sort_keys=True, separators=(',',':')))
572 else:
573 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
574 else:
575 with open(filepath_or_handle, 'wb') as f:
576 if dense:
577 json.dump(data, f, sort_keys=True, separators=(',',':'))
578 else:
579 json.dump(data, f, sort_keys=True, indent=2)
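# Example: write_json('out.json', {'b': 1, 'a': [2]}, True) writes the packed
# form '{"a":[2],"b":1}', while passing dense=False writes the same data
# pretty-printed with a 2-space indent.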
580
581
582class Results(object):
583 """Results of a trace session."""
584
585 class _TouchedObject(object):
586 """Something, a file or a directory, that was accessed."""
587 def __init__(self, root, path, tainted, size, nb_files):
588 logging.debug(
589 '%s(%s, %s, %s, %s, %s)' %
590 (self.__class__.__name__, root, path, tainted, size, nb_files))
591 self.root = root
592 self.path = path
593 self.tainted = tainted
594 self.nb_files = nb_files
595 # Can be used as a cache or a default value, depending on context.
596 self._size = size
597 # These are cache only.
598 self._real_path = None
599
600 # Check internal consistency.
601 assert path, path
602 assert tainted or bool(root) != bool(isabs(path)), (root, path)
603 assert tainted or (
604 not os.path.exists(self.full_path) or
605 (self.full_path == get_native_path_case(self.full_path))), (
606 tainted, self.full_path, get_native_path_case(self.full_path))
607
608 @property
609 def existent(self):
610 return self.size != -1
611
612 @property
613 def full_path(self):
614 if self.root:
615 return os.path.join(self.root, self.path)
616 return self.path
617
618 @property
619 def real_path(self):
620 """Returns the path with symlinks resolved."""
621 if not self._real_path:
622 self._real_path = os.path.realpath(self.full_path)
623 return self._real_path
624
625 @property
626 def size(self):
627 """File's size. -1 is not existent."""
628 if self._size is None and not self.tainted:
629 try:
630 self._size = os.stat(self.full_path).st_size
631 except OSError:
632 self._size = -1
633 return self._size
634
635 def flatten(self):
636 """Returns a dict representing this object.
637
638 A 'size' of 0 means the file was only touched and not read.
639 """
640 return {
641 'path': self.path,
642 'size': self.size,
643 }
644
645 def replace_variables(self, variables):
646 """Replaces the root of this File with one of the variables if it matches.
647
648 If a variable replacement occurs, the cloned object becomes tainted.
649 """
650 for variable, root_path in variables.iteritems():
651 if self.path.startswith(root_path):
652 return self._clone(
653 self.root, variable + self.path[len(root_path):], True)
654 # No need to clone, returns ourself.
655 return self
656
657 def strip_root(self, root):
658 """Returns a clone of itself with 'root' stripped off."""
659 # Check internal consistency.
660 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
661 if not self.full_path.startswith(root):
662 # Now try to resolve the symlinks to see if it can be reached this way.
663 # Only try *after* trying without resolving symlink.
664 if not self.real_path.startswith(root):
665 return None
666 path = self.real_path
667 else:
668 path = self.full_path
669 return self._clone(root, path[len(root):], self.tainted)
670
671 def _clone(self, new_root, new_path, tainted):
672 raise NotImplementedError(self.__class__.__name__)
673
674 class File(_TouchedObject):
675 """A file that was accessed. May not be present anymore.
676
677 If tainted is true, it means it is not a real path anymore as a variable
678 replacement occurred.
679
680 If touched_only is True, the file was only probed for existence and was
681 never _opened_; since the existence check succeeded, the file must have
682 existed.
683 """
684 def __init__(self, root, path, tainted, size):
685 super(Results.File, self).__init__(root, path, tainted, size, 1)
686
687 def _clone(self, new_root, new_path, tainted):
688 """Clones itself keeping meta-data."""
689 # Keep the self.size and self._real_path caches for performance reason. It
690 # is also important when the file becomes tainted (with a variable instead
691 # of the real path) since self.path is not an on-disk path anymore so
692 # out._size cannot be updated.
693 out = self.__class__(new_root, new_path, tainted, self.size)
694 out._real_path = self._real_path
695 return out
696
697 class Directory(_TouchedObject):
698 """A directory of files. Must exist."""
699 def __init__(self, root, path, tainted, size, nb_files):
700 """path='.' is a valid value and must be handled appropriately."""
701 assert not path.endswith(os.path.sep), path
702 super(Results.Directory, self).__init__(
703 root, path + os.path.sep, tainted, size, nb_files)
704 # For a Directory instance, self.size is not a cache, it's an actual value
705 # that is never modified and represents the total size of the files
706 # contained in this directory. It is possible that the directory is empty
707 # so that size == 0; this happens if there's only an invalid symlink in
708 # it.
709
710 def flatten(self):
711 out = super(Results.Directory, self).flatten()
712 out['nb_files'] = self.nb_files
713 return out
714
715 def _clone(self, new_root, new_path, tainted):
716 """Clones itself keeping meta-data."""
717 out = self.__class__(
718 new_root,
719 new_path.rstrip(os.path.sep),
720 tainted,
721 self.size,
722 self.nb_files)
723 out._real_path = self._real_path
724 return out
725
726 class Process(object):
727 """A process that was traced.
728
729 Contains references to the files accessed by this process and its children.
730 """
731 def __init__(self, pid, files, executable, command, initial_cwd, children):
732 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
733 self.pid = pid
734 self.files = sorted(files, key=lambda x: x.path)
735 self.children = children
736 self.executable = executable
737 self.command = command
738 self.initial_cwd = initial_cwd
739
740 # Check internal consistency.
741 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
742 f.path for f in self.files)
743 assert isinstance(self.children, list)
744 assert isinstance(self.files, list)
745
746 @property
747 def all(self):
748 for child in self.children:
749 for i in child.all:
750 yield i
751 yield self
752
753 def flatten(self):
754 return {
755 'children': [c.flatten() for c in self.children],
756 'command': self.command,
757 'executable': self.executable,
758 'files': [f.flatten() for f in self.files],
759 'initial_cwd': self.initial_cwd,
760 'pid': self.pid,
761 }
762
763 def strip_root(self, root):
764 assert isabs(root) and root.endswith(os.path.sep), root
765 # Loads the files after since they are constructed as objects.
766 out = self.__class__(
767 self.pid,
768 filter(None, (f.strip_root(root) for f in self.files)),
769 self.executable,
770 self.command,
771 self.initial_cwd,
772 [c.strip_root(root) for c in self.children])
773 logging.debug(
774 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
775 return out
776
777 def __init__(self, process):
778 self.process = process
779 # Cache.
780 self._files = None
781
782 def flatten(self):
783 return {
784 'root': self.process.flatten(),
785 }
786
787 @property
788 def files(self):
789 if self._files is None:
790 self._files = sorted(
791 sum((p.files for p in self.process.all), []),
792 key=lambda x: x.path)
793 return self._files
794
795 @property
796 def existent(self):
797 return [f for f in self.files if f.existent]
798
799 @property
800 def non_existent(self):
801 return [f for f in self.files if not f.existent]
802
803 def strip_root(self, root):
804 """Returns a clone with all the files outside the directory |root| removed
805 and converts all the paths to relative paths.
806 """
807 # Resolve any symlink
808 root = os.path.realpath(root)
809 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
810 logging.debug('strip_root(%s)' % root)
811 return Results(self.process.strip_root(root))
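  # Illustrative usage, assuming `results` came out of ApiBase.parse_log():
  #   rel = results.strip_root(u'/home/user/src/project/')
  #   for f in rel.existent:
  #     print '%s (%d bytes)' % (f.path, f.size)
  # Files outside the root (even after symlink resolution) are silently
  # dropped.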
812
813
814class ApiBase(object):
815 """OS-agnostic API to trace a process and its children."""
816 class Context(object):
817 """Processes one log line at a time and keeps the list of traced processes.
818
819 The parsing is complicated by the fact that logs are traced out of order for
820 strace but in-order for dtrace and logman. In addition, on Windows it is
821 very frequent that process IDs are reused, so a flat list cannot be used.
822 But at the same time, it is impossible to faithfully construct a graph when
823 the logs are processed out of order. So both a tree and a flat mapping are
824 used: the tree is the real process tree, while the flat mapping stores the
825 last valid process for the corresponding process ID. For the strace case, the
826 tree's head is guessed at the last moment.
827 """
828 class Process(object):
829 """Keeps context for one traced child process.
830
831 Logs all the files this process touched. Ignores directories.
832 """
833 def __init__(self, blacklist, pid, initial_cwd):
834 # Check internal consistency.
835 assert isinstance(pid, int), repr(pid)
836 self.pid = pid
837 # children are Process instances.
838 self.children = []
839 self.initial_cwd = initial_cwd
840 self.cwd = None
841 self.files = set()
842 self.only_touched = set()
843 self.executable = None
844 self.command = None
845 self._blacklist = blacklist
846
847 def to_results_process(self):
848 """Resolves file case sensitivity and or late-bound strings."""
849 # When resolving files, it's normal to get duplicates because a file could
850 # be opened multiple times with different casing. Perform the deduplication
851 # here.
852 def fix_path(x):
853 """Returns the native file path case.
854
855 Converts late-bound strings.
856 """
857 if not x:
858 # Do not convert None instance to 'None'.
859 return x
860 # TODO(maruel): Do not upconvert to unicode here, on linux we don't
861 # know the file path encoding so they must be treated as bytes.
862 x = unicode(x)
863 if os.path.isabs(x):
864 # If the path is not absolute, which tends to happen occasionally on
865 # Windows, it is not possible to get the native path case so ignore
866 # that trace. It mostly happens for 'executable' value.
867 x = get_native_path_case(x)
868 return x
869
870 def fix_and_blacklist_path(x):
871 x = fix_path(x)
872 if not x:
873 return
874 # The blacklist needs to be reapplied, since path casing could
875 # influence blacklisting.
876 if self._blacklist(x):
877 return
878 return x
879
880 # Filters out directories. Some may have passed through.
881 files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
882 only_touched = set(
883 f for f in map(fix_and_blacklist_path, self.only_touched) if f)
884 only_touched -= files
885
886 files = [
887 Results.File(None, f, False, None) for f in files
888 if not os.path.isdir(f)
889 ]
890 # Using 0 as size means the file's content is ignored since the file was
891 # never opened for I/O.
892 files.extend(
893 Results.File(None, f, False, 0) for f in only_touched
894 if not os.path.isdir(f)
895 )
896 return Results.Process(
897 self.pid,
898 files,
899 fix_path(self.executable),
900 self.command,
901 fix_path(self.initial_cwd),
902 [c.to_results_process() for c in self.children])
903
904 def add_file(self, filepath, touch_only):
905 """Adds a file if it passes the blacklist."""
906 if self._blacklist(unicode(filepath)):
907 return
908 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
909 # Note that filepath and not unicode(filepath) is added. It is because
910 # filepath could be something other than a string, like a RelativePath
911 # instance for dtrace logs.
912 if touch_only:
913 self.only_touched.add(filepath)
914 else:
915 self.files.add(filepath)
916
917 def __init__(self, blacklist):
918 self.blacklist = blacklist
919 # Initial process.
920 self.root_process = None
921 # dict to accelerate process lookup, so the whole graph doesn't have to be
922 # walked each time.
923 self._process_lookup = {}
924
925 class Tracer(object):
926 """During it's lifetime, the tracing subsystem is enabled."""
927 def __init__(self, logname):
928 self._logname = logname
929 self._lock = threading.Lock()
930 self._traces = []
931 self._initialized = True
932
933 def trace(self, cmd, cwd, tracename, output):
934 """Runs the OS-specific trace program on an executable.
935
936 Arguments:
937 - cmd: The command (a list) to run.
938 - cwd: Current directory to start the child process in.
939 - tracename: Name of the trace in the logname file.
940 - output: If True, the child's output is captured via pipes and returned.
941
942 Returns a tuple (resultcode, output) and updates the internal trace
943 entries.
944 """
945 # The implementation adds an item to self._traces.
946 raise NotImplementedError(self.__class__.__name__)
947
948 def close(self, _timeout=None):
949 """Saves the meta-data in the logname file.
950
951 For kernel-based tracing, stops the tracing subsystem.
952
953 Must not be used manually when using 'with' construct.
954 """
955 with self._lock:
956 assert self._initialized
957 try:
958 data = {
959 'traces': self._traces,
960 }
961 write_json(self._logname, data, False)
962 finally:
963 self._initialized = False
964
965 def post_process_log(self):
966 """Post-processes the log so it becomes faster to load afterward.
967
968 Must not be used manually when using 'with' construct.
969 """
970 assert not self._initialized, 'Must stop tracing first.'
971
972 def __enter__(self):
973 """Enables 'with' statement."""
974 return self
975
976 def __exit__(self, exc_type, exc_value, traceback):
977 """Enables 'with' statement."""
978 self.close()
979 # If an exception was thrown, do not process logs.
980 if not exc_type:
981 self.post_process_log()
982
983 def get_tracer(self, logname):
984 """Returns an ApiBase.Tracer instance.
985
986 Initializes the tracing subsystem, which is a requirement for kernel-based
987 tracers. Only one tracer instance should be live at a time!
988
989 logname is the filepath to the json file that will contain the meta-data
990 about the logs.
991 """
992 return self.Tracer(logname)
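  # Sketch of the expected calling sequence (hypothetical paths and names):
  #   api = Strace()
  #   with api.get_tracer('/abs/path/to/logs.json') as tracer:
  #     returncode, output = tracer.trace(cmd, cwd, 'trace1', True)
  #   for item in Strace.parse_log('/abs/path/to/logs.json', lambda _: False):
  #     results = item['results']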
993
994 @staticmethod
995 def clean_trace(logname):
996 """Deletes an old log."""
997 raise NotImplementedError()
998
999 @classmethod
1000 def parse_log(cls, logname, blacklist):
1001 """Processes trace logs and returns the files opened and the files that do
1002 not exist.
1003
1004 It does not track directories.
1005
1006 |logname| must be an absolute path.
1007
1008 Most of the time, files that do not exist are temporary test files that
1009 should be put in /tmp instead. See http://crbug.com/116251.
1010
1011 Returns a list of dict with keys:
1012 - results: A Results instance.
1013 - trace: The corresponding tracename parameter provided to
1014 get_tracer().trace().
1015 - output: Output gathered during execution, if get_tracer().trace(...,
1016 output=True) was used.
1017 """
1018 raise NotImplementedError(cls.__name__)
1019
1020
1021class Strace(ApiBase):
1022 """strace implies linux."""
1023 class Context(ApiBase.Context):
1024 """Processes a strace log line and keeps the list of existent and non
1025 existent files accessed.
1026
1027 Ignores directories.
1028
1029 Uses late-binding to process the cwd of each process. The problem is that
1030 strace generates one log file per process it traced but doesn't give any
1031 information about which process was started when and by whom. So we don't
1032 even know which process is the initial one. So process the logs out of
1033 order and use late binding with RelativePath to be able to deduce the
1034 initial directory of each process once all the logs are parsed.
1035 """
1036 class Process(ApiBase.Context.Process):
1037 """Represents the state of a process.
1038
1039 Contains all the information retrieved from the pid-specific log.
1040 """
1041 # Function names are using ([a-z_0-9]+)
1042 # This is the most common format. function(args) = result
1043 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$')
1044 # An interrupted function call, only grab the minimal header.
1045 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
1046 # An interrupted function call, with the process exiting. It must be the
1047 # last line in the log.
1048 RE_UNFINISHED_EXIT = re.compile(
1049 r'^([^\(]+)(.*) \<unfinished \.\.\.\ exit status \d+>$')
1050 # An interrupted function call the hard way. Usually observed with futex()
1051 # on ubuntu 12.04.
1052 RE_INTERRUPTED_HARD = re.compile(r'^([^\(]+)\([A-Z0-9a-fx\,\_\|\{\} ]*$')
1053 # A resumed function call.
1054 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1055 # A process received a signal.
1056 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1057 # A process didn't handle a signal. Ignore any junk appearing before,
1058 # because the process was forcibly killed so it won't open any new file.
1059 RE_KILLED = re.compile(
1060 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1061 # The process has exited.
1062 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1063 # A call was canceled. Ignore any prefix.
1064 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1065 # Happens when strace fails to even get the function name.
1066 UNNAMED_FUNCTION = '????'
1067
1068 # Corner case in Python: a class member function decorator must not be
1069 # @staticmethod.
1070 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1071 """Automatically convert the str 'args' into a list of processed
1072 arguments.
1073
1074 Arguments:
1075 - regexp is used to parse args.
1076 - expect_zero: one of True, False or None.
1077 - True: will check for result.startswith('0') first and will ignore
1078 the trace line completely otherwise. This is important because for
1079 many functions, the regexp will not process if the call failed.
1080 - False: will check for not result.startswith(('?', '-1')) for the
1081 same reason as with True.
1082 - None: ignore result.
1083 """
1084 def meta_hook(function):
1085 assert function.__name__.startswith('handle_')
1086 def hook(self, args, result):
1087 if expect_zero is True and not result.startswith('0'):
1088 return
1089 if expect_zero is False and result.startswith(('?', '-1')):
1090 return
1091 match = re.match(regexp, args)
1092 if not match:
1093 raise TracingFailure(
1094 'Failed to parse %s(%s) = %s' %
1095 (function.__name__[len('handle_'):], args, result),
1096 None, None, None)
1097 return function(self, match.groups(), result)
1098 return hook
1099 return meta_hook
1100
1101 class RelativePath(object):
1102 """A late-bound relative path."""
1103 def __init__(self, parent, value):
1104 self.parent = parent
1105 self.value = value
1106
1107 def render(self):
1108 """Returns the current directory this instance is representing.
1109
1110 This function is used to return the late-bound value.
1111 """
1112 if self.value and self.value.startswith(u'/'):
1113 # An absolute path.
1114 return self.value
1115 parent = self.parent.render() if self.parent else u'<None>'
1116 if self.value:
1117 return os.path.normpath(os.path.join(parent, self.value))
1118 return parent
1119
1120 def __unicode__(self):
1121 """Acts as a string whenever needed."""
1122 return unicode(self.render())
1123
1124 def __str__(self):
1125 """Acts as a string whenever needed."""
1126 return str(self.render())
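      # Example of the late binding: if the root context's cwd only turns out
      # to be, say, '/build/slave' once every log has been parsed, then
      #   RelativePath(RelativePath(root, None), 'out/Release')
      # renders as '/build/slave/out/Release' at that point, not before.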
1127
1128 def __init__(self, root, pid):
1129 """Keeps enough information to be able to guess the original process
1130 root.
1131
1132 strace doesn't store which process was the initial process. So more
1133 information needs to be kept so the graph can be reconstructed from the
1134 flat map.
1135 """
1136 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1137 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1138 assert isinstance(root, ApiBase.Context)
1139 self._root = weakref.ref(root)
1140 # The dict key is the function name of the pending call, like 'open'
1141 # or 'execve'.
1142 self._pending_calls = {}
1143 self._line_number = 0
1144 # Current directory when the process started.
1145 self.initial_cwd = self.RelativePath(self._root(), None)
1146 self.parentid = None
1147 self._done = False
1148
1149 def get_cwd(self):
1150 """Returns the best known value of cwd."""
1151 return self.cwd or self.initial_cwd
1152
1153 def render(self):
1154 """Returns the string value of the RelativePath() object.
1155
1156 Used by RelativePath. Returns the initial directory and not the
1157 current one since the current directory 'cwd' validity is time-limited.
1158
1159 The validity is only guaranteed once all the logs are processed.
1160 """
1161 return self.initial_cwd.render()
1162
1163 def on_line(self, line):
1164 self._line_number += 1
1165 if self._done:
1166 raise TracingFailure(
1167 'Found a trace for a terminated process',
1168 None, None, None)
1169
1170 if self.RE_SIGNAL.match(line):
1171 # Ignore signals.
1172 return
1173
1174 try:
1175 match = self.RE_KILLED.match(line)
1176 if match:
1177 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1178 self.handle_exit_group(match.group(1), None)
1179 return
1180
1181 match = self.RE_PROCESS_EXITED.match(line)
1182 if match:
1183 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1184 self.handle_exit_group(match.group(1), None)
1185 return
1186
1187 match = self.RE_UNFINISHED.match(line)
1188 if match:
1189 if match.group(1) in self._pending_calls:
1190 raise TracingFailure(
1191 'Found two unfinished calls for the same function',
1192 None, None, None,
1193 self._pending_calls)
1194 self._pending_calls[match.group(1)] = (
1195 match.group(1) + match.group(2))
1196 return
1197
1198 match = (
1199 self.RE_UNFINISHED_EXIT.match(line) or
1200 self.RE_INTERRUPTED_HARD.match(line))
1201 if match:
1202 # The process died. No other line can be processed afterward.
1203 self._done = True
1204 return
1205
1206 match = self.RE_UNAVAILABLE.match(line)
1207 if match:
1208 # This usually means a process was killed and a pending call was
1209 # canceled.
1210 # TODO(maruel): Look up the last exit_group() trace just above and
1211 # make sure any self._pending_calls[anything] is properly flushed.
1212 return
1213
1214 match = self.RE_RESUMED.match(line)
1215 if match:
1216 if match.group(1) not in self._pending_calls:
1217 raise TracingFailure(
1218 'Found a resumed call that was not logged as unfinished',
1219 None, None, None,
1220 self._pending_calls)
1221 pending = self._pending_calls.pop(match.group(1))
1222 # Reconstruct the line.
1223 line = pending + match.group(2)
1224
1225 match = self.RE_HEADER.match(line)
1226 if not match:
1227 raise TracingFailure(
1228 'Found an invalid line: %s' % line,
1229 None, None, None)
1230 if match.group(1) == self.UNNAMED_FUNCTION:
1231 return
1232
1233 # It's a valid line, handle it.
1234 handler = getattr(self, 'handle_%s' % match.group(1), None)
1235 if not handler:
1236 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1237 return handler(match.group(2), match.group(3))
1238 except TracingFailure, e:
1239 # Hack in the values since the handler could be a static function.
1240 e.pid = self.pid
1241 e.line = line
1242 e.line_number = self._line_number
1243 # Re-raise the modified exception.
1244 raise
1245 except (KeyError, NotImplementedError, ValueError), e:
1246 raise TracingFailure(
1247 'Trace generated a %s exception: %s' % (
1248 e.__class__.__name__, str(e)),
1249 self.pid,
1250 self._line_number,
1251 line,
1252 e)
1253
1254 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1255 def handle_access(self, args, _result):
1256 self._handle_file(args[0], True)
1257
1258 @parse_args(r'^\"(.+?)\"$', True)
1259 def handle_chdir(self, args, _result):
1260 """Updates cwd."""
1261 self.cwd = self.RelativePath(self, args[0])
1262 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1263
1264 def handle_clone(self, _args, result):
1265 """Transfers cwd."""
1266 if result.startswith(('?', '-1')):
1267 # The call failed.
1268 return
1269 # Update the other process right away.
1270 childpid = int(result)
1271 child = self._root().get_or_set_proc(childpid)
1272 if child.parentid is not None or childpid in self.children:
1273 raise TracingFailure(
1274 'Found internal inconsistency in process lifetime detection '
1275 'during a clone() call',
1276 None, None, None)
1277
1278 # Copy the cwd object.
1279 child.initial_cwd = self.get_cwd()
1280 child.parentid = self.pid
1281 # It is necessary because the logs are processed out of order.
1282 self.children.append(child)
1283
1284 def handle_close(self, _args, _result):
1285 pass
1286
1287 def handle_chmod(self, _args, _result):
1288 pass
1289
1290 def handle_creat(self, _args, _result):
1291 # Ignore files created, since they didn't need to exist.
1292 pass
1293
1294 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1295 def handle_execve(self, args, _result):
1296 # Even if in practice execve() doesn't return when it succeeds, strace
1297 # still prints '0' as the result.
1298 filepath = args[0]
1299 self._handle_file(filepath, False)
1300 self.executable = self.RelativePath(self.get_cwd(), filepath)
1301 self.command = strace_process_quoted_arguments(args[1])
1302
1303 def handle_exit_group(self, _args, _result):
1304 """Removes cwd."""
1305 self.cwd = None
1306
1307 def handle_fork(self, args, result):
1308 self._handle_unknown('fork', args, result)
1309
1310 def handle_futex(self, _args, _result):
1311 pass
1312
1313 def handle_getcwd(self, _args, _result):
1314 pass
1315
1316 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1317 def handle_link(self, args, _result):
1318 self._handle_file(args[0], False)
1319 self._handle_file(args[1], False)
1320
1321 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1322 def handle_lstat(self, args, _result):
1323 self._handle_file(args[0], True)
1324
1325 def handle_mkdir(self, _args, _result):
1326 pass
1327
1328 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1329 def handle_open(self, args, _result):
1330 if 'O_DIRECTORY' in args[1]:
1331 return
1332 self._handle_file(args[0], False)
1333
1334 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1335 def handle_openat(self, args, _result):
1336 if 'O_DIRECTORY' in args[2]:
1337 return
1338 if args[0] == 'AT_FDCWD':
1339 self._handle_file(args[1], False)
1340 else:
1341 # TODO(maruel): Implement relative open if necessary instead of the
1342 # AT_FDCWD flag, let's hope not since this means tracking all active
1343 # directory handles.
1344 raise Exception('Relative open via openat not implemented.')
1345
1346 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1347 def handle_readlink(self, args, _result):
1348 self._handle_file(args[0], False)
1349
1350 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1351 def handle_rename(self, args, _result):
1352 self._handle_file(args[0], False)
1353 self._handle_file(args[1], False)
1354
1355 def handle_rmdir(self, _args, _result):
1356 pass
1357
1358 def handle_setxattr(self, _args, _result):
1359 pass
1360
1361 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1362 def handle_stat(self, args, _result):
1363 self._handle_file(args[0], True)
1364
1365 def handle_symlink(self, _args, _result):
1366 pass
1367
1368 def handle_unlink(self, _args, _result):
1369 # In theory, the file had to be created anyway.
1370 pass
1371
1372 def handle_statfs(self, _args, _result):
1373 pass
1374
1375 def handle_vfork(self, args, result):
1376 self._handle_unknown('vfork', args, result)
1377
1378 @staticmethod
1379 def _handle_unknown(function, args, result):
1380 raise TracingFailure(
1381 'Unexpected/unimplemented trace %s(%s)= %s' %
1382 (function, args, result),
1383 None, None, None)
1384
1385 def _handle_file(self, filepath, touch_only):
1386 filepath = self.RelativePath(self.get_cwd(), filepath)
1387 #assert not touch_only, unicode(filepath)
1388 self.add_file(filepath, touch_only)
1389
1390 def __init__(self, blacklist, initial_cwd):
1391 super(Strace.Context, self).__init__(blacklist)
1392 self.initial_cwd = initial_cwd
1393
1394 def render(self):
1395 """Returns the string value of the initial cwd of the root process.
1396
1397 Used by RelativePath.
1398 """
1399 return self.initial_cwd
1400
1401 def on_line(self, pid, line):
1402 """Transfers control into the Process.on_line() function."""
1403 self.get_or_set_proc(pid).on_line(line.strip())
1404
1405 def to_results(self):
1406 """Finds back the root process and verify consistency."""
1407 # TODO(maruel): Absolutely unnecessary, fix me.
1408 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1409 if len(root) != 1:
1410 raise TracingFailure(
1411 'Found internal inconsistency in process lifetime detection '
1412 'while finding the root process',
1413 None,
1414 None,
1415 None,
1416 sorted(p.pid for p in root))
1417 self.root_process = root[0]
1418 process = self.root_process.to_results_process()
1419 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1420 raise TracingFailure(
1421 'Found internal inconsistency in process lifetime detection '
1422 'while looking for len(tree) == len(list)',
1423 None,
1424 None,
1425 None,
1426 sorted(self._process_lookup),
1427 sorted(p.pid for p in process.all))
1428 return Results(process)
1429
1430 def get_or_set_proc(self, pid):
1431 """Returns the Context.Process instance for this pid or creates a new one.
1432 """
1433 if not pid or not isinstance(pid, int):
1434 raise TracingFailure(
1435 'Unexpected value for pid: %r' % pid,
1436 pid,
1437 None,
1438 None,
1439 pid)
1440 if pid not in self._process_lookup:
1441 self._process_lookup[pid] = self.Process(self, pid)
1442 return self._process_lookup[pid]
1443
1444 @classmethod
1445 def traces(cls):
1446 """Returns the list of all handled traces to pass this as an argument to
1447 strace.
1448 """
1449 prefix = 'handle_'
1450 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1451
1452 class Tracer(ApiBase.Tracer):
1453 MAX_LEN = 256
1454
1455 def trace(self, cmd, cwd, tracename, output):
1456 """Runs strace on an executable."""
1457 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1458 assert os.path.isabs(cmd[0]), cmd[0]
1459 assert os.path.isabs(cwd), cwd
1460 assert os.path.normpath(cwd) == cwd, cwd
1461 with self._lock:
1462 if not self._initialized:
1463 raise TracingFailure(
1464 'Called Tracer.trace() on an uninitialized object',
1465 None, None, None, tracename)
1466 assert tracename not in (i['trace'] for i in self._traces)
1467 stdout = stderr = None
1468 if output:
1469 stdout = subprocess.PIPE
1470 stderr = subprocess.STDOUT
1471 # Ensure all file related APIs are hooked.
1472 traces = ','.join(Strace.Context.traces() + ['file'])
1473 trace_cmd = [
1474 'strace',
1475 '-ff',
1476 '-s', '%d' % self.MAX_LEN,
1477 '-e', 'trace=%s' % traces,
1478 '-o', self._logname + '.' + tracename,
1479 ]
1480 child = subprocess.Popen(
1481 trace_cmd + cmd,
1482 cwd=cwd,
1483 stdin=subprocess.PIPE,
1484 stdout=stdout,
1485 stderr=stderr)
1486 out = child.communicate()[0]
1487 # TODO(maruel): Walking the logs to figure out the root process would
1488 # simplify parsing the logs a *lot*.
1489 with self._lock:
1490 assert tracename not in (i['trace'] for i in self._traces)
1491 self._traces.append(
1492 {
1493 'cmd': cmd,
1494 'cwd': cwd,
1495 'output': out,
1496 # The pid of the strace process, not very useful.
1497 'pid': child.pid,
1498 'trace': tracename,
1499 })
1500 return child.returncode, out
1501
1502 @staticmethod
1503 def clean_trace(logname):
1504 if os.path.isfile(logname):
1505 os.remove(logname)
1506 # Also delete any pid specific file from previous traces.
1507 for i in glob.iglob(logname + '.*'):
1508 if i.rsplit('.', 1)[1].isdigit():
1509 os.remove(i)
1510
1511 @classmethod
1512 def parse_log(cls, logname, blacklist):
1513 logging.info('parse_log(%s, %s)' % (logname, blacklist))
1514 assert os.path.isabs(logname)
1515 data = read_json(logname)
1516 out = []
1517 for item in data['traces']:
1518 result = {
1519 'output': item['output'],
1520 'trace': item['trace'],
1521 }
1522 try:
1523 context = cls.Context(blacklist, item['cwd'])
1524 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1525 pid = pidfile.rsplit('.', 1)[1]
1526 if pid.isdigit():
1527 pid = int(pid)
1528 # TODO(maruel): Load as utf-8
1529 for line in open(pidfile, 'rb'):
1530 context.on_line(pid, line)
1531 result['results'] = context.to_results()
1532 except TracingFailure:
1533 result['exception'] = sys.exc_info()
1534 out.append(result)
1535 return out
1536
1537
1538class Dtrace(ApiBase):
1539 """Uses DTrace framework through dtrace. Requires root access.
1540
1541 Implies Mac OSX.
1542
1543 dtruss can't be used because it has compatibility issues with python.
1544
1545 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1546 get the absolute path of the 'cwd' dtrace variable from the probe.
1547
1548 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1549 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1550 manually.
1551
1552 errno is not printed in the log since this implementation currently only cares
1553 about files that were successfully opened.
1554 """
1555 class Context(ApiBase.Context):
1556 # Format: index pid function(args)
1557 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1558
1559 # Arguments parsing.
1560 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1561 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1562 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1563 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1564 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1565 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1566
1567 O_DIRECTORY = 0x100000
1568
1569 class Process(ApiBase.Context.Process):
1570 def __init__(self, *args):
1571 super(Dtrace.Context.Process, self).__init__(*args)
1572 self.cwd = self.initial_cwd
1573
1574 def __init__(self, blacklist, tracer_pid, initial_cwd):
1575 logging.info(
1576 '%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd))
1577 super(Dtrace.Context, self).__init__(blacklist)
1578 # Process ID of the temporary script created by create_thunk().
1579 self._tracer_pid = tracer_pid
1580 self._initial_cwd = initial_cwd
1581 self._line_number = 0
1582
1583 def on_line(self, line):
1584 self._line_number += 1
1585 match = self.RE_HEADER.match(line)
1586 if not match:
1587 raise TracingFailure(
1588 'Found malformed line: %s' % line,
1589 None,
1590 self._line_number,
1591 line)
1592 fn = getattr(
1593 self,
1594 'handle_%s' % match.group(2).replace('-', '_'),
1595 self._handle_ignored)
1596 # It is guaranteed to succeed because of the regexp. Or at least I thought
1597 # it would.
1598 pid = int(match.group(1))
1599 try:
1600 return fn(pid, match.group(3))
1601 except TracingFailure, e:
1602 # Hack in the values since the handler could be a static function.
1603 e.pid = pid
1604 e.line = line
1605 e.line_number = self._line_number
1606 # Re-raise the modified exception.
1607 raise
1608 except (KeyError, NotImplementedError, ValueError), e:
1609 raise TracingFailure(
1610 'Trace generated a %s exception: %s' % (
1611 e.__class__.__name__, str(e)),
1612 pid,
1613 self._line_number,
1614 line,
1615 e)
1616
1617 def to_results(self):
1618 process = self.root_process.to_results_process()
1619 # Internal consistency check.
1620 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1621 raise TracingFailure(
1622 'Found internal inconsistency in process lifetime detection '
1623 'while looking for len(tree) == len(list)',
1624 None,
1625 None,
1626 None,
1627 sorted(self._process_lookup),
1628 sorted(p.pid for p in process.all))
1629 return Results(process)
1630
1631 def handle_dtrace_BEGIN(self, _pid, args):
1632 if not self.RE_DTRACE_BEGIN.match(args):
1633 raise TracingFailure(
1634 'Found internal inconsistency in dtrace_BEGIN log line',
1635 None, None, None)
1636
1637 def handle_proc_start(self, pid, args):
1638 """Transfers cwd.
1639
1640 The dtrace script already takes care of only tracing the processes that
1641 are child of the traced processes so there is no need to verify the
1642 process hierarchy.
1643 """
1644 if pid in self._process_lookup:
1645 raise TracingFailure(
1646 'Found internal inconsistency in proc_start: %d started twice' %
1647 pid,
1648 None, None, None)
1649 match = self.RE_PROC_START.match(args)
1650 if not match:
1651 raise TracingFailure(
1652 'Failed to parse arguments: %s' % args,
1653 None, None, None)
1654 ppid = int(match.group(1))
1655 if ppid == self._tracer_pid and not self.root_process:
1656 proc = self.root_process = self.Process(
1657 self.blacklist, pid, self._initial_cwd)
1658 elif ppid in self._process_lookup:
1659 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1660 self._process_lookup[ppid].children.append(proc)
1661 else:
1662 # Another process tree, ignore.
1663 return
1664 self._process_lookup[pid] = proc
1665 logging.debug(
1666 'New child: %s -> %d cwd:%s' %
1667 (ppid, pid, unicode(proc.initial_cwd)))
1668
1669 def handle_proc_exit(self, pid, _args):
1670 """Removes cwd."""
1671 if pid in self._process_lookup:
1672 # self._tracer_pid is not traced itself and other traces run neither.
1673 self._process_lookup[pid].cwd = None
1674
1675 def handle_execve(self, pid, args):
1676 """Sets the process' executable.
1677
1678 TODO(maruel): Read command line arguments. See
1679 https://discussions.apple.com/thread/1980539 for an example.
1680 https://gist.github.com/1242279
1681
1682 Will have to put the answer at http://stackoverflow.com/questions/7556249.
1683 :)
1684 """
1685 if not pid in self._process_lookup:
1686 # Another process tree, ignore.
1687 return
1688 match = self.RE_EXECVE.match(args)
1689 if not match:
1690 raise TracingFailure(
1691 'Failed to parse arguments: %r' % args,
1692 None, None, None)
1693 proc = self._process_lookup[pid]
1694 proc.executable = match.group(1)
1695 proc.command = self.process_escaped_arguments(match.group(3))
1696 if int(match.group(2)) != len(proc.command):
1697 raise TracingFailure(
1698 'Failed to parse execve() arguments: %s' % args,
1699 None, None, None)
1700
1701 def handle_chdir(self, pid, args):
1702 """Updates cwd."""
1703 if pid not in self._process_lookup:
1704 # Another process tree, ignore.
1705 return
1706 cwd = self.RE_CHDIR.match(args).group(1)
1707 if not cwd.startswith('/'):
1708 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1709 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1710 else:
1711 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1712 cwd2 = cwd
1713 self._process_lookup[pid].cwd = cwd2
1714
1715 def handle_open_nocancel(self, pid, args):
1716 """Redirects to handle_open()."""
1717 return self.handle_open(pid, args)
1718
1719 def handle_open(self, pid, args):
1720 if pid not in self._process_lookup:
1721 # Another process tree, ignore.
1722 return
1723 match = self.RE_OPEN.match(args)
1724 if not match:
1725 raise TracingFailure(
1726 'Failed to parse arguments: %s' % args,
1727 None, None, None)
1728 flag = int(match.group(2), 16)
1729 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1730 # Ignore directories.
1731 return
1732 self._handle_file(pid, match.group(1))
1733
1734 def handle_rename(self, pid, args):
1735 if pid not in self._process_lookup:
1736 # Another process tree, ignore.
1737 return
1738 match = self.RE_RENAME.match(args)
1739 if not match:
1740 raise TracingFailure(
1741 'Failed to parse arguments: %s' % args,
1742 None, None, None)
1743 self._handle_file(pid, match.group(1))
1744 self._handle_file(pid, match.group(2))
1745
1746 def _handle_file(self, pid, filepath):
1747 if not filepath.startswith('/'):
1748 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1749 # We can get '..' in the path.
1750 filepath = os.path.normpath(filepath)
1751 # Sadly, still need to filter out directories here;
1752 # saw open_nocancel(".", 0, 0) = 0 lines.
1753 if os.path.isdir(filepath):
1754 return
1755 self._process_lookup[pid].add_file(filepath, False)
1756
1757 def handle_ftruncate(self, pid, args):
1758 """Just used as a signal to kill dtrace, ignoring."""
1759 pass
1760
1761 @staticmethod
1762 def _handle_ignored(pid, args):
1763      """Called for all trace events that are not handled."""
1764 raise NotImplementedError('Please implement me')
1765
1766 @staticmethod
1767 def process_escaped_arguments(text):
1768      """Extracts escaped arguments from a string and returns the arguments as a
1769 list.
1770
1771 Implemented as an automaton.
1772
1773 Example:
1774        With text = 'python2.7\\001-c\\001print(\\"hi\\")\\0', the
1775        function will return ['python2.7', '-c', 'print("hi")']
1776 """
1777 if not text.endswith('\\0'):
1778 raise ValueError('String is not null terminated: %r' % text, text)
1779 text = text[:-2]
1780
1781 def unescape(x):
1782 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
1783 out = []
1784 escaped = False
1785 for i in x:
1786 if i == '\\' and not escaped:
1787 escaped = True
1788 continue
1789 escaped = False
1790 out.append(i)
1791 return ''.join(out)
1792
1793 return [unescape(i) for i in text.split('\\001')]
1794
1795 class Tracer(ApiBase.Tracer):
1796 # pylint: disable=C0301
1797 #
1798 # To understand the following code, you'll want to take a look at:
1799 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1800 # https://wikis.oracle.com/display/DTrace/Variables
1801 # http://docs.oracle.com/cd/E19205-01/820-4221/
1802 #
1803 # 0. Dump all the valid probes into a text file. It is important, you
1804 # want to redirect into a file and you don't want to constantly 'sudo'.
1805 # $ sudo dtrace -l > probes.txt
1806 #
1807 # 1. Count the number of probes:
1808 # $ wc -l probes.txt
1809 # 81823 # On OSX 10.7, including 1 header line.
1810 #
1811 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1812 # likes and skipping the header with NR>1:
1813 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1814 # dtrace
1815 # fbt
1816 # io
1817 # ip
1818 # lockstat
1819 # mach_trap
1820 # proc
1821 # profile
1822 # sched
1823 # syscall
1824 # tcp
1825 # vminfo
1826 #
1827 # 3. List of valid probes:
1828 # $ grep syscall probes.txt | less
1829 # or use dtrace directly:
1830 # $ sudo dtrace -l -P syscall | less
1831 #
1832 # trackedpid is an associative array where its value can be 0, 1 or 2.
1833 # 0 is for untracked processes and is the default value for items not
1834 # in the associative array.
1835 # 1 is for tracked processes.
1836 # 2 is for the script created by create_thunk() only. It is not tracked
1837      # itself but all its descendants are.
1838 #
1839 # The script will kill itself only once waiting_to_die == 1 and
1840 # current_processes == 0, so that both getlogin() was called and that
1841      # current_processes == 0, so that both getlogin() was called and
1842 #
1843 # TODO(maruel): Use cacheable predicates. See
1844 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
1845 D_CODE = """
1846 dtrace:::BEGIN {
1847 waiting_to_die = 0;
1848 current_processes = 0;
1849 logindex = 0;
1850 printf("%d %d %s_%s(\\"%s\\")\\n",
1851 logindex, PID, probeprov, probename, SCRIPT);
1852 logindex++;
1853 }
1854
1855 proc:::start /trackedpid[ppid]/ {
1856 trackedpid[pid] = 1;
1857 current_processes += 1;
1858 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1859 logindex, pid, probeprov, probename,
1860 ppid,
1861 execname,
1862 current_processes);
1863 logindex++;
1864 }
1865 /* Should use SCRIPT but there is no access to this variable at that
1866 * point. */
1867 proc:::start /ppid == PID && execname == "Python"/ {
1868 trackedpid[pid] = 2;
1869 current_processes += 1;
1870 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1871 logindex, pid, probeprov, probename,
1872 ppid,
1873 execname,
1874 current_processes);
1875 logindex++;
1876 }
1877 proc:::exit /trackedpid[pid] &&
1878 current_processes == 1 &&
1879 waiting_to_die == 1/ {
1880 trackedpid[pid] = 0;
1881 current_processes -= 1;
1882 printf("%d %d %s_%s(%d)\\n",
1883 logindex, pid, probeprov, probename,
1884 current_processes);
1885 logindex++;
1886 exit(0);
1887 }
1888 proc:::exit /trackedpid[pid]/ {
1889 trackedpid[pid] = 0;
1890 current_processes -= 1;
1891 printf("%d %d %s_%s(%d)\\n",
1892 logindex, pid, probeprov, probename,
1893 current_processes);
1894 logindex++;
1895 }
1896
1897 /* Use an arcane function to detect when we need to die */
1898 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
1899 waiting_to_die = 1;
1900 printf("%d %d %s()\\n", logindex, pid, probefunc);
1901 logindex++;
1902 }
1903 syscall::ftruncate:entry /
1904 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
1905 exit(0);
1906 }
1907
1908 syscall::open*:entry /trackedpid[pid] == 1/ {
1909 self->open_arg0 = arg0;
1910 self->open_arg1 = arg1;
1911 self->open_arg2 = arg2;
1912 }
1913 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
1914 this->open_arg0 = copyinstr(self->open_arg0);
1915 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
1916 logindex, pid, probefunc,
1917 this->open_arg0,
1918 self->open_arg1,
1919 self->open_arg2);
1920 logindex++;
1921 this->open_arg0 = 0;
1922 }
1923 syscall::open*:return /trackedpid[pid] == 1/ {
1924 self->open_arg0 = 0;
1925 self->open_arg1 = 0;
1926 self->open_arg2 = 0;
1927 }
1928
1929 syscall::rename:entry /trackedpid[pid] == 1/ {
1930 self->rename_arg0 = arg0;
1931 self->rename_arg1 = arg1;
1932 }
1933 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
1934 this->rename_arg0 = copyinstr(self->rename_arg0);
1935 this->rename_arg1 = copyinstr(self->rename_arg1);
1936 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
1937 logindex, pid, probefunc,
1938 this->rename_arg0,
1939 this->rename_arg1);
1940 logindex++;
1941 this->rename_arg0 = 0;
1942 this->rename_arg1 = 0;
1943 }
1944 syscall::rename:return /trackedpid[pid] == 1/ {
1945 self->rename_arg0 = 0;
1946 self->rename_arg1 = 0;
1947 }
1948
1949      /* Track chdir, it's painful because it only receives relative paths.
1950 */
1951 syscall::chdir:entry /trackedpid[pid] == 1/ {
1952 self->chdir_arg0 = arg0;
1953 }
1954 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
1955 this->chdir_arg0 = copyinstr(self->chdir_arg0);
1956 printf("%d %d %s(\\"%s\\")\\n",
1957 logindex, pid, probefunc,
1958 this->chdir_arg0);
1959 logindex++;
1960 this->chdir_arg0 = 0;
1961 }
1962 syscall::chdir:return /trackedpid[pid] == 1/ {
1963 self->chdir_arg0 = 0;
1964 }
1965 """
1966
1967 # execve-specific code, tends to throw a lot of exceptions.
1968 D_CODE_EXECVE = """
1969 /* Finally what we care about! */
1970 syscall::exec*:entry /trackedpid[pid]/ {
1971 self->exec_arg0 = copyinstr(arg0);
1972 /* Incrementally probe for a NULL in the argv parameter of execve() to
1973 * figure out argc. */
1974 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
1975 * found. */
1976 self->exec_argc = 0;
1977 /* Probe for argc==1 */
1978 this->exec_argv = (user_addr_t*)copyin(
1979 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1980 self->exec_argc = this->exec_argv[self->exec_argc] ?
1981 (self->exec_argc + 1) : self->exec_argc;
1982
1983 /* Probe for argc==2 */
1984 this->exec_argv = (user_addr_t*)copyin(
1985 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1986 self->exec_argc = this->exec_argv[self->exec_argc] ?
1987 (self->exec_argc + 1) : self->exec_argc;
1988
1989 /* Probe for argc==3 */
1990 this->exec_argv = (user_addr_t*)copyin(
1991 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1992 self->exec_argc = this->exec_argv[self->exec_argc] ?
1993 (self->exec_argc + 1) : self->exec_argc;
1994
1995 /* Probe for argc==4 */
1996 this->exec_argv = (user_addr_t*)copyin(
1997 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1998 self->exec_argc = this->exec_argv[self->exec_argc] ?
1999 (self->exec_argc + 1) : self->exec_argc;
2000
2001        /* Copy the input strings since there is no guarantee they'll be
2002         * present after the call completes. */
2003 self->exec_argv0 = (self->exec_argc > 0) ?
2004 copyinstr(this->exec_argv[0]) : "";
2005 self->exec_argv1 = (self->exec_argc > 1) ?
2006 copyinstr(this->exec_argv[1]) : "";
2007 self->exec_argv2 = (self->exec_argc > 2) ?
2008 copyinstr(this->exec_argv[2]) : "";
2009 self->exec_argv3 = (self->exec_argc > 3) ?
2010 copyinstr(this->exec_argv[3]) : "";
2011 this->exec_argv = 0;
2012 }
2013 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
2014 /* We need to join strings here, as using multiple printf() would
2015 * cause tearing when multiple threads/processes are traced.
2016 * Since it is impossible to escape a string and join it to another one,
2017 * like sprintf("%s%S", previous, more), use hackery.
2018         * Each element is separated with a \\1. \\0 cannot be used because
2019 * it is simply ignored. This will conflict with any program putting a
2020 * \\1 in their execve() string but this should be "rare enough" */
2021 this->args = "";
2022 /* Process exec_argv[0] */
2023 this->args = strjoin(
2024 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
2025
2026 /* Process exec_argv[1] */
2027 this->args = strjoin(
2028 this->args, (self->exec_argc > 1) ? "\\1" : "");
2029 this->args = strjoin(
2030 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2031
2032 /* Process exec_argv[2] */
2033 this->args = strjoin(
2034 this->args, (self->exec_argc > 2) ? "\\1" : "");
2035 this->args = strjoin(
2036 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2037
2038 /* Process exec_argv[3] */
2039 this->args = strjoin(
2040 this->args, (self->exec_argc > 3) ? "\\1" : "");
2041 this->args = strjoin(
2042 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2043
2044        /* Prints self->exec_argc to permit verifying the internal
2045 * consistency since this code is quite fishy. */
2046 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2047 logindex, pid, probefunc,
2048 self->exec_arg0,
2049 self->exec_argc,
2050 this->args);
2051 logindex++;
2052 this->args = 0;
2053 }
2054 syscall::exec*:return /trackedpid[pid]/ {
2055 self->exec_arg0 = 0;
2056 self->exec_argc = 0;
2057 self->exec_argv0 = 0;
2058 self->exec_argv1 = 0;
2059 self->exec_argv2 = 0;
2060 self->exec_argv3 = 0;
2061 }
2062 """
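    # Note: the \1 separators built with strjoin() above show up in the log as
    # the literal text '\001' (the %S format escapes the raw byte), which is
    # what Context.process_escaped_arguments() splits on when handle_execve()
    # rebuilds the argument list.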
2063
2064 # Code currently not used.
2065 D_EXTRANEOUS = """
2066 /* This is a good learning experience, since it traces a lot of things
2067 * related to the process and child processes.
2068 * Warning: it generates a gigantic log. For example, tracing
2069 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2070 * several minutes to execute.
2071 */
2072 /*
2073 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2074 printf("%d %d %s_%s() = %d\\n",
2075 logindex, pid, probeprov, probefunc, errno);
2076 logindex++;
2077 }
2078 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2079 printf("%d %d %s_%s() = %d\\n",
2080 logindex, pid, probeprov, probefunc, errno);
2081 logindex++;
2082 }
2083 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2084 printf("%d %d %s_%s() = %d\\n",
2085 logindex, pid, probeprov, probefunc, errno);
2086 logindex++;
2087 }
2088 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2089 printf("%d %d %s_%s() = %d\\n",
2090 logindex, pid, probeprov, probefunc, errno);
2091 logindex++;
2092 }
2093 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2094 printf("%d %d %s_%s() = %d\\n",
2095 logindex, pid, probeprov, probefunc, errno);
2096 logindex++;
2097 }
2098 */
2099 /* TODO(maruel): *stat* functions and friends
2100 syscall::access:return,
2101 syscall::chdir:return,
2102 syscall::chflags:return,
2103 syscall::chown:return,
2104 syscall::chroot:return,
2105 syscall::getattrlist:return,
2106 syscall::getxattr:return,
2107 syscall::lchown:return,
2108 syscall::lstat64:return,
2109 syscall::lstat:return,
2110 syscall::mkdir:return,
2111 syscall::pathconf:return,
2112 syscall::readlink:return,
2113 syscall::removexattr:return,
2114 syscall::setxattr:return,
2115 syscall::stat64:return,
2116 syscall::stat:return,
2117 syscall::truncate:return,
2118 syscall::unlink:return,
2119 syscall::utimes:return,
2120 */
2121 """
2122
2123 def __init__(self, logname):
2124 """Starts the log collection with dtrace.
2125
2126 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2127 this needs to wait for dtrace to be "warmed up".
2128 """
2129 super(Dtrace.Tracer, self).__init__(logname)
2130 self._script = create_thunk()
2131 # This unique dummy temp file is used to signal the dtrace script that it
2132 # should stop as soon as all the child processes are done. A bit hackish
2133      # but works well enough.
2134 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2135 prefix='trace_signal_file')
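      # close() below signals the D script by calling
      # os.ftruncate(self._dummy_file_id, 0); the syscall::ftruncate:entry
      # probe in D_CODE matches on FILE_ID and treats it as the cue that no
      # more child processes will be started.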
2136
2137 # Note: do not use the -p flag. It's useless if the initial process quits
2138 # too fast, resulting in missing traces from the grand-children. The D
2139 # code manages the dtrace lifetime itself.
2140 trace_cmd = [
2141 'sudo',
2142 'dtrace',
2143 # Use a larger buffer if getting 'out of scratch space' errors.
2144 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2145 '-b', '10m',
2146 '-x', 'dynvarsize=10m',
2147 #'-x', 'dtrace_global_maxsize=1m',
2148 '-x', 'evaltime=exec',
2149 '-o', '/dev/stderr',
2150 '-q',
2151 '-n', self._get_dtrace_code(),
2152 ]
2153 with open(self._logname + '.log', 'wb') as logfile:
2154 self._dtrace = subprocess.Popen(
2155 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2156 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2157
2158 # Reads until one line is printed, which signifies dtrace is up and ready.
2159 with open(self._logname + '.log', 'rb') as logfile:
2160 while 'dtrace_BEGIN' not in logfile.readline():
2161 if self._dtrace.poll() is not None:
2162 # Do a busy wait. :/
2163 break
2164 logging.debug('dtrace started')
2165
2166 def _get_dtrace_code(self):
2167      """Sets up the D code to implement child process tracking.
2168
2169 Injects the cookie in the script so it knows when to stop.
2170
2171 The script will detect any instance of the script created with
2172 create_thunk() and will start tracing it.
2173 """
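      # For illustration only, with hypothetical values (pid 1234, a thunk at
      # /tmp/trace_thunk.py and dummy file id 5), the generated prologue
      # prepended to D_CODE would look like:
      #   inline int PID = 1234;
      #   inline string SCRIPT = "/tmp/trace_thunk.py";
      #   inline int FILE_ID = 5;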
2174 return (
2175 'inline int PID = %d;\n'
2176 'inline string SCRIPT = "%s";\n'
2177 'inline int FILE_ID = %d;\n'
2178 '\n'
2179 '%s') % (
2180 os.getpid(),
2181 self._script,
2182 self._dummy_file_id,
2183 self.D_CODE) + self.D_CODE_EXECVE
2184
2185 def trace(self, cmd, cwd, tracename, output):
2186 """Runs dtrace on an executable.
2187
2188      dtruss is broken when it starts the process itself or when tracing
2189      child processes, so this code starts a wrapper process generated with
2190      create_thunk(), which starts the executable to trace.
2191 """
2192 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2193 assert os.path.isabs(cmd[0]), cmd[0]
2194 assert os.path.isabs(cwd), cwd
2195 assert os.path.normpath(cwd) == cwd, cwd
2196 with self._lock:
2197 if not self._initialized:
2198 raise TracingFailure(
2199            'Called Tracer.trace() on an uninitialized object',
2200 None, None, None, tracename)
2201 assert tracename not in (i['trace'] for i in self._traces)
2202
2203 # Starts the script wrapper to start the child process. This signals the
2204 # dtrace script that this process is to be traced.
2205 stdout = stderr = None
2206 if output:
2207 stdout = subprocess.PIPE
2208 stderr = subprocess.STDOUT
2209 child_cmd = [
2210 sys.executable,
2211 self._script,
2212 tracename,
2213 ]
2214 # Call a dummy function so that dtrace knows I'm about to launch a process
2215 # that needs to be traced.
2216 # Yummy.
2217 child = subprocess.Popen(
2218 child_cmd + fix_python_path(cmd),
2219 stdin=subprocess.PIPE,
2220 stdout=stdout,
2221 stderr=stderr,
2222 cwd=cwd)
2223 logging.debug('Started child pid: %d' % child.pid)
2224
2225 out = child.communicate()[0]
2226 # This doesn't mean tracing is done, one of the grand-child process may
2227 # still be alive. It will be tracked with the dtrace script.
2228
2229 with self._lock:
2230 assert tracename not in (i['trace'] for i in self._traces)
2231 self._traces.append(
2232 {
2233 'cmd': cmd,
2234 'cwd': cwd,
2235            # The pid of the wrapper script, not very useful.
2236 'pid': child.pid,
2237            'output': out,
2238            'trace': tracename,
2239          })
2240 return child.returncode, out
2241
2242 def close(self, timeout=None):
2243 """Terminates dtrace."""
2244 logging.debug('close(%s)' % timeout)
2245 try:
2246 try:
2247 super(Dtrace.Tracer, self).close(timeout)
2248 # Signal dtrace that it should stop now.
2249          # ftruncate doesn't exist on Windows.
2250 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
2251          if timeout:
2252 start = time.time()
2253 # Use polling. :/
2254 while (self._dtrace.poll() is None and
2255 (time.time() - start) < timeout):
2256 time.sleep(0.1)
2257 self._dtrace.kill()
2258 self._dtrace.wait()
2259 finally:
2260 # Make sure to kill it in any case.
2261 if self._dtrace.poll() is None:
2262 try:
2263 self._dtrace.kill()
2264 self._dtrace.wait()
2265 except OSError:
2266 pass
2267
2268 if self._dtrace.returncode != 0:
2269 # Warn about any dtrace failure but basically ignore it.
2270 print 'dtrace failure: %s' % self._dtrace.returncode
2271 finally:
2272 os.close(self._dummy_file_id)
2273 os.remove(self._dummy_file_name)
2274 os.remove(self._script)
2275
2276 def post_process_log(self):
2277      """Sorts the log back into the order in which each call occurred.
2278
2279 dtrace doesn't save the buffer in strict order since it keeps one buffer
2280 per CPU.
2281 """
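      # Each line written by the D script starts with the monotonically
      # increasing logindex, then the pid, then the call, e.g. with
      # hypothetical values:
      #   12 345 open("/usr/lib/foo.dylib", 0x0, 0x1b6)
      # so sorting on the first space-separated token restores a global order
      # across the per-CPU buffers.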
2282 super(Dtrace.Tracer, self).post_process_log()
2283 logname = self._logname + '.log'
2284 with open(logname, 'rb') as logfile:
2285 lines = [l for l in logfile if l.strip()]
2286 errors = [l for l in lines if l.startswith('dtrace:')]
2287 if errors:
2288 raise TracingFailure(
2289 'Found errors in the trace: %s' % '\n'.join(errors),
2290 None, None, None, logname)
2291 try:
2292 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2293 except ValueError:
2294 raise TracingFailure(
2295 'Found errors in the trace: %s' % '\n'.join(
2296 l for l in lines if l.split(' ', 1)[0].isdigit()),
2297 None, None, None, logname)
2298 with open(logname, 'wb') as logfile:
2299 logfile.write(''.join(lines))
2300
2301 @staticmethod
2302 def clean_trace(logname):
2303 for ext in ('', '.log'):
2304 if os.path.isfile(logname + ext):
2305 os.remove(logname + ext)
2306
2307 @classmethod
2308 def parse_log(cls, logname, blacklist):
2309 logging.info('parse_log(%s, ...)' % logname)
2310      assert os.path.isabs(logname)
2311
2312 def blacklist_more(filepath):
2313 # All the HFS metadata is in the form /.vol/...
2314 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2315
2316 data = read_json(logname)
2317 out = []
2318 for item in data['traces']:
2319        result = {
2320 'output': item['output'],
2321 'trace': item['trace'],
2322 }
2323 try:
2324 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
2325 for line in open(logname + '.log', 'rb'):
2326 context.on_line(line)
2327 result['results'] = context.to_results()
2328 except TracingFailure:
2329 result['exception'] = sys.exc_info()
2330 out.append(result)
2331      return out
2332
2333
2334class LogmanTrace(ApiBase):
2335 """Uses the native Windows ETW based tracing functionality to trace a child
2336 process.
2337
2338  Caveat: this implementation doesn't track cwd or initial_cwd, because
2339 the Windows Kernel doesn't have a concept of 'current working directory' at
2340  all. A Win32 process has a map of current directories, one per drive letter,
2341  and it is managed by the user-mode kernel32.dll. In the kernel, a file is always
2342 opened relative to another file_object or as an absolute path. All the current
2343 working directory logic is done in user mode.
2344 """
2345 class Context(ApiBase.Context):
2346    """Processes an ETW log line and keeps the list of existent and non
2347 existent files accessed.
2348
2349 Ignores directories.
2350 """
2351 # These indexes are for the stripped version in json.
2352 EVENT_NAME = 0
2353 TYPE = 1
2354 PID = 2
2355 TID = 3
2356 PROCESSOR_ID = 4
2357 TIMESTAMP = 5
2358 USER_DATA = 6
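    # A stripped event therefore looks like (hypothetical values):
    #   ['FileIo', 'Create', 1234, '0xd58', '2', '12345.6789', ...]
    # where index 6 onward is the event-specific 'User Data'.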
2359
2360 class Process(ApiBase.Context.Process):
2361 def __init__(self, *args):
2362 super(LogmanTrace.Context.Process, self).__init__(*args)
2363 # Handle file objects that succeeded.
2364 self.file_objects = {}
2365
2366 def __init__(self, blacklist, tracer_pid):
2367 logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid))
2368 super(LogmanTrace.Context, self).__init__(blacklist)
2369 self._drive_map = DosDriveMap()
2370      # Maps thread ids to the corresponding process id.
2371 self._threads_active = {}
2372      # Process ID of the tracer, e.g. trace_inputs.py
2373 self._tracer_pid = tracer_pid
2374 self._line_number = 0
2375
2376 def on_line(self, line):
2377 """Processes a json Event line."""
2378 self._line_number += 1
2379 try:
2380 # By Opcode
2381 handler = getattr(
2382 self,
2383 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2384 None)
2385 if not handler:
2386 raise TracingFailure(
2387 'Unexpected event %s_%s' % (
2388 line[self.EVENT_NAME], line[self.TYPE]),
2389 None, None, None)
2390 handler(line)
2391 except TracingFailure, e:
2392 # Hack in the values since the handler could be a static function.
2393 e.pid = line[self.PID]
2394 e.line = line
2395 e.line_number = self._line_number
2396 # Re-raise the modified exception.
2397 raise
2398 except (KeyError, NotImplementedError, ValueError), e:
2399 raise TracingFailure(
2400 'Trace generated a %s exception: %s' % (
2401 e.__class__.__name__, str(e)),
2402 line[self.PID],
2403 self._line_number,
2404 line,
2405 e)
2406
2407 def to_results(self):
2408 if not self.root_process:
2409 raise TracingFailure(
2410 'Failed to detect the initial process',
2411 None, None, None)
2412 process = self.root_process.to_results_process()
2413 return Results(process)
2414
2415 def _thread_to_process(self, tid):
2416 """Finds the process from the thread id."""
2417 tid = int(tid, 16)
2418 pid = self._threads_active.get(tid)
2419 if not pid or not self._process_lookup.get(pid):
2420 return
2421 return self._process_lookup[pid]
2422
2423 @classmethod
2424 def handle_EventTrace_Header(cls, line):
2425      """Verifies no event was dropped, e.g. no buffer overrun occurred."""
2426 BUFFER_SIZE = cls.USER_DATA
2427 #VERSION = cls.USER_DATA + 1
2428 #PROVIDER_VERSION = cls.USER_DATA + 2
2429 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2430 #END_TIME = cls.USER_DATA + 4
2431 #TIMER_RESOLUTION = cls.USER_DATA + 5
2432 #MAX_FILE_SIZE = cls.USER_DATA + 6
2433 #LOG_FILE_MODE = cls.USER_DATA + 7
2434 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2435 #START_BUFFERS = cls.USER_DATA + 9
2436 #POINTER_SIZE = cls.USER_DATA + 10
2437 EVENTS_LOST = cls.USER_DATA + 11
2438 #CPU_SPEED = cls.USER_DATA + 12
2439 #LOGGER_NAME = cls.USER_DATA + 13
2440 #LOG_FILE_NAME = cls.USER_DATA + 14
2441 #BOOT_TIME = cls.USER_DATA + 15
2442 #PERF_FREQ = cls.USER_DATA + 16
2443 #START_TIME = cls.USER_DATA + 17
2444 #RESERVED_FLAGS = cls.USER_DATA + 18
2445 #BUFFERS_LOST = cls.USER_DATA + 19
2446 #SESSION_NAME_STRING = cls.USER_DATA + 20
2447 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2448 if line[EVENTS_LOST] != '0':
2449 raise TracingFailure(
2450 ( '%s events were lost during trace, please increase the buffer '
2451 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2452 None, None, None)
2453
2454 def handle_FileIo_Cleanup(self, line):
2455 """General wisdom: if a file is closed, it's because it was opened.
2456
2457 Note that FileIo_Close is not used since if a file was opened properly but
2458 not closed before the process exits, only Cleanup will be logged.
2459 """
2460 #IRP = self.USER_DATA
2461 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2462 FILE_OBJECT = self.USER_DATA + 2
2463 #FILE_KEY = self.USER_DATA + 3
2464 proc = self._thread_to_process(line[TTID])
2465 if not proc:
2466 # Not a process we care about.
2467 return
2468 file_object = line[FILE_OBJECT]
2469 if file_object in proc.file_objects:
2470 proc.add_file(proc.file_objects.pop(file_object), False)
2471
2472 def handle_FileIo_Create(self, line):
2473 """Handles a file open.
2474
2475 All FileIo events are described at
2476 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2477 for some value of 'description'.
2478
2479 " (..) process and thread id values of the IO events (..) are not valid "
2480 http://msdn.microsoft.com/magazine/ee358703.aspx
2481
2482      The FileIo.Create event doesn't indicate whether the CreateFile() call
2483 succeeded, so keep track of the file_object and check that it is
2484 eventually closed with FileIo_Cleanup.
2485 """
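      # Sketch of the bookkeeping with hypothetical values: this handler stores
      #   proc.file_objects['0xfffffa80deadbeef'] = u'c:\\foo\\bar.txt'
      # and handle_FileIo_Cleanup() later pops the same file object and only
      # then records the path via proc.add_file().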
2486 #IRP = self.USER_DATA
2487 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2488 FILE_OBJECT = self.USER_DATA + 2
2489 #CREATE_OPTIONS = self.USER_DATA + 3
2490 #FILE_ATTRIBUTES = self.USER_DATA + 4
2491      #SHARE_ACCESS = self.USER_DATA + 5
2492 OPEN_PATH = self.USER_DATA + 6
2493
2494 proc = self._thread_to_process(line[TTID])
2495 if not proc:
2496 # Not a process we care about.
2497 return
2498
2499 match = re.match(r'^\"(.+)\"$', line[OPEN_PATH])
2500 raw_path = match.group(1)
2501 # Ignore directories and bare drive right away.
2502 if raw_path.endswith(os.path.sep):
2503 return
2504 filepath = self._drive_map.to_win32(raw_path)
2505 # Ignore bare drive right away. Some may still fall through with format
2506 # like '\\?\X:'
2507 if len(filepath) == 2:
2508 return
2509 file_object = line[FILE_OBJECT]
2510 if os.path.isdir(filepath):
2511        # There is no O_DIRECTORY equivalent on Windows. The closest is
2512        # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2513        # simply discard directories as they are found.
2514 return
2515 # Override any stale file object
2516 proc.file_objects[file_object] = filepath
2517
2518 def handle_FileIo_Rename(self, line):
2519 # TODO(maruel): Handle?
2520 pass
2521
2522 def handle_Process_End(self, line):
2523 pid = line[self.PID]
2524 if self._process_lookup.get(pid):
2525 logging.info('Terminated: %d' % pid)
2526 self._process_lookup[pid] = None
2527 else:
2528 logging.debug('Terminated: %d' % pid)
2529
2530 def handle_Process_Start(self, line):
2531 """Handles a new child process started by PID."""
2532 #UNIQUE_PROCESS_KEY = self.USER_DATA
2533 PROCESS_ID = self.USER_DATA + 1
2534 #PARENT_PID = self.USER_DATA + 2
2535 #SESSION_ID = self.USER_DATA + 3
2536 #EXIT_STATUS = self.USER_DATA + 4
2537 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2538 #USER_SID = self.USER_DATA + 6
2539 IMAGE_FILE_NAME = self.USER_DATA + 7
2540 COMMAND_LINE = self.USER_DATA + 8
2541
2542 ppid = line[self.PID]
2543 pid = int(line[PROCESS_ID], 16)
2544 logging.debug(
2545 'New process %d->%d (%s) %s' %
2546 (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE]))
2547
2548 if ppid == self._tracer_pid:
2549 # Need to ignore processes we don't know about because the log is
2550 # system-wide. self._tracer_pid shall start only one process.
2551 if self.root_process:
2552 raise TracingFailure(
2553 ( 'Parent process is _tracer_pid(%d) but root_process(%d) is '
2554 'already set') % (self._tracer_pid, self.root_process.pid),
2555 None, None, None)
2556 proc = self.Process(self.blacklist, pid, None)
2557 self.root_process = proc
2558 ppid = None
2559 elif self._process_lookup.get(ppid):
2560 proc = self.Process(self.blacklist, pid, None)
2561 self._process_lookup[ppid].children.append(proc)
2562 else:
2563 # Ignore
2564 return
2565 self._process_lookup[pid] = proc
2566
2567 if (not line[IMAGE_FILE_NAME].startswith('"') or
2568 not line[IMAGE_FILE_NAME].endswith('"')):
2569 raise TracingFailure(
2570 'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME],
2571 None, None, None)
2572
2573 # TODO(maruel): Process escapes.
2574 if (not line[COMMAND_LINE].startswith('"') or
2575 not line[COMMAND_LINE].endswith('"')):
2576 raise TracingFailure(
2577 'Command line is not properly quoted: %s' % line[COMMAND_LINE],
2578 None, None, None)
2579 proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1])
2580 proc.executable = line[IMAGE_FILE_NAME][1:-1]
2581 # proc.command[0] may be the absolute path of 'executable' but it may be
2582 # anything else too. If it happens that command[0] ends with executable,
2583 # use it, otherwise defaults to the base name.
2584 cmd0 = proc.command[0].lower()
2585 if not cmd0.endswith('.exe'):
2586 # TODO(maruel): That's not strictly true either.
2587 cmd0 += '.exe'
2588 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2589 # Fix the path.
2590 cmd0 = cmd0.replace('/', os.path.sep)
2591 cmd0 = os.path.normpath(cmd0)
2592 proc.executable = get_native_path_case(cmd0)
2593 logging.info(
2594 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
2595
2596 def handle_Thread_End(self, line):
2597 """Has the same parameters as Thread_Start."""
2598 tid = int(line[self.TID], 16)
2599 self._threads_active.pop(tid, None)
2600
2601 def handle_Thread_Start(self, line):
2602      """Handles a newly created thread.
2603
2604 Do not use self.PID here since a process' initial thread is created by
2605 the parent process.
2606 """
2607 PROCESS_ID = self.USER_DATA
2608 TTHREAD_ID = self.USER_DATA + 1
2609 #STACK_BASE = self.USER_DATA + 2
2610 #STACK_LIMIT = self.USER_DATA + 3
2611 #USER_STACK_BASE = self.USER_DATA + 4
2612 #USER_STACK_LIMIT = self.USER_DATA + 5
2613 #AFFINITY = self.USER_DATA + 6
2614 #WIN32_START_ADDR = self.USER_DATA + 7
2615 #TEB_BASE = self.USER_DATA + 8
2616 #SUB_PROCESS_TAG = self.USER_DATA + 9
2617 #BASE_PRIORITY = self.USER_DATA + 10
2618 #PAGE_PRIORITY = self.USER_DATA + 11
2619 #IO_PRIORITY = self.USER_DATA + 12
2620 #THREAD_FLAGS = self.USER_DATA + 13
2621 # Do not use self.PID here since a process' initial thread is created by
2622 # the parent process.
2623 pid = int(line[PROCESS_ID], 16)
2624 tid = int(line[TTHREAD_ID], 16)
2625 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2626 self._threads_active[tid] = pid
2627
2628 @classmethod
2629 def supported_events(cls):
2630      """Returns all the processed events."""
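      # For this class the pairs include ('EventTrace', 'Header'),
      # ('FileIo', 'Create'), ('Process', 'Start'), ('Thread', 'Start'), etc.,
      # derived from the handle_<EventName>_<Type>() methods defined above.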
2631 out = []
2632 for member in dir(cls):
2633 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2634 if match:
2635 out.append(match.groups())
2636 return out
2637
2638 class Tracer(ApiBase.Tracer):
2639 # The basic headers.
2640 EXPECTED_HEADER = [
2641 u'Event Name',
2642 u'Type',
2643 u'Event ID',
2644 u'Version',
2645 u'Channel',
2646 u'Level', # 5
2647 u'Opcode',
2648 u'Task',
2649 u'Keyword',
2650 u'PID',
2651 u'TID', # 10
2652 u'Processor Number',
2653 u'Instance ID',
2654 u'Parent Instance ID',
2655 u'Activity ID',
2656 u'Related Activity ID', # 15
2657 u'Clock-Time',
2658 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2659 u'User(ms)', # pretty much useless.
2660 u'User Data', # Extra arguments that are event-specific.
2661 ]
2662 # Only the useful headers common to all entries are listed there. Any column
2663 # at 19 or higher is dependent on the specific event.
2664 EVENT_NAME = 0
2665 TYPE = 1
2666 PID = 9
2667 TID = 10
2668 PROCESSOR_ID = 11
2669 TIMESTAMP = 16
2670 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2671 USER_DATA = 19
2672
2673 def __init__(self, logname):
2674 """Starts the log collection.
2675
2676 Requires administrative access. logman.exe is synchronous so no need for a
2677 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
2678 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
2679
2680 One can get the list of potentially interesting providers with:
2681 "logman query providers | findstr /i file"
2682 """
2683 super(LogmanTrace.Tracer, self).__init__(logname)
2684 self._script = create_thunk()
2685 cmd_start = [
2686 'logman.exe',
2687 'start',
2688 'NT Kernel Logger',
2689 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2690 # splitio,fileiocompletion,syscall,file,cswitch,img
2691 '(process,fileio,thread)',
2692 '-o', self._logname + '.etl',
2693 '-ets', # Send directly to kernel
2694 # Values extracted out of thin air.
2695 # Event Trace Session buffer size in kb.
2696 '-bs', '10240',
2697 # Number of Event Trace Session buffers.
2698 '-nb', '16', '256',
2699 ]
2700 logging.debug('Running: %s' % cmd_start)
2701 try:
2702 subprocess.check_call(
2703 cmd_start,
2704 stdin=subprocess.PIPE,
2705 stdout=subprocess.PIPE,
2706 stderr=subprocess.STDOUT)
2707 except subprocess.CalledProcessError, e:
2708 if e.returncode == -2147024891:
2709 print >> sys.stderr, 'Please restart with an elevated admin prompt'
2710 elif e.returncode == -2144337737:
2711 print >> sys.stderr, (
2712 'A kernel trace was already running, stop it and try again')
2713 raise
2714
2715 def trace(self, cmd, cwd, tracename, output):
2716 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2717 assert os.path.isabs(cmd[0]), cmd[0]
2718 assert os.path.isabs(cwd), cwd
2719 assert os.path.normpath(cwd) == cwd, cwd
2720 with self._lock:
2721 if not self._initialized:
2722 raise TracingFailure(
2723            'Called Tracer.trace() on an uninitialized object',
2724 None, None, None, tracename)
2725 assert tracename not in (i['trace'] for i in self._traces)
2726
2727 # Use "logman -?" for help.
2728
2729 stdout = stderr = None
2730 if output:
2731 stdout = subprocess.PIPE
2732 stderr = subprocess.STDOUT
2733
2734 # Run the child process.
2735 logging.debug('Running: %s' % cmd)
2736 # Use the temporary script generated with create_thunk() so we have a
2737 # clear pid owner. Since trace_inputs.py can be used as a library and
2738 # could trace multiple processes simultaneously, it makes it more complex
2739 # if the executable to be traced is executed directly here. It also solves
2740 # issues related to logman.exe that needs to be executed to control the
2741 # kernel trace.
2742 child_cmd = [
2743 sys.executable,
2744 self._script,
2745 tracename,
2746 ]
2747 child = subprocess.Popen(
2748 child_cmd + fix_python_path(cmd),
2749 cwd=cwd,
2750 stdin=subprocess.PIPE,
2751 stdout=stdout,
2752 stderr=stderr)
2753 logging.debug('Started child pid: %d' % child.pid)
2754 out = child.communicate()[0]
2755 # This doesn't mean all the grand-children are done. Sadly, we don't have
2756 # a good way to determine that.
2757
2758 with self._lock:
2759 assert tracename not in (i['trace'] for i in self._traces)
2760 self._traces.append({
2761          'cmd': cmd,
2762          'cwd': cwd,
2763          'output': out,
2764          'pid': child.pid,
2765 'trace': tracename,
2766        })
2767
2768 return child.returncode, out
2769
2770 def close(self, _timeout=None):
2771 """Stops the kernel log collection and converts the traces to text
2772 representation.
2773 """
2774 with self._lock:
2775 if not self._initialized:
2776 raise TracingFailure(
2777            'Called Tracer.close() on an uninitialized object',
2778 None, None, None)
2779 os.remove(self._script)
2780      # Save metadata, add 'format' key.
2781 data = {
2782 'format': 'csv',
2783 'traces': self._traces,
2784 }
2785 write_json(self._logname, data, False)
2786
2787 cmd_stop = [
2788 'logman.exe',
2789 'stop',
2790 'NT Kernel Logger',
2791 '-ets', # Sends the command directly to the kernel.
2792 ]
2793 logging.debug('Running: %s' % cmd_stop)
2794 subprocess.check_call(
2795 cmd_stop,
2796 stdin=subprocess.PIPE,
2797 stdout=subprocess.PIPE,
2798 stderr=subprocess.STDOUT)
2799 self._initialized = False
2800
2801 def post_process_log(self):
2802 """Converts the .etl file into .csv then into .json."""
2803 super(LogmanTrace.Tracer, self).post_process_log()
2804 logformat = 'csv'
2805 self._convert_log(logformat)
2806
2807 if logformat == 'csv_utf16':
2808 def load_file():
2809 def utf_8_encoder(unicode_csv_data):
2810 """Encodes the unicode object as utf-8 encoded str instance"""
2811 for line in unicode_csv_data:
2812 yield line.encode('utf-8')
2813
2814 def unicode_csv_reader(unicode_csv_data, **kwargs):
2815 """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
2816 """
2817 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
2818 for row in csv_reader:
2819 # Decode str utf-8 instances back to unicode instances, cell by
2820 # cell:
2821 yield [cell.decode('utf-8') for cell in row]
2822
2823 # The CSV file is UTF-16 so use codecs.open() to load the file into
2824 # the python internal unicode format (utf-8). Then explicitly
2825 # re-encode as utf8 as str instances so csv can parse it fine. Then
2826 # decode the utf-8 str back into python unicode instances. This
2827 # sounds about right.
2828 for line in unicode_csv_reader(
2829 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
2830 # line is a list of unicode objects
2831 # So much white space!
2832 yield [i.strip() for i in line]
2833
2834 elif logformat == 'csv':
2835 def load_file():
2836 def ansi_csv_reader(ansi_csv_data, **kwargs):
2837 """Loads an 'ANSI' code page and returns unicode() objects."""
2838 assert sys.getfilesystemencoding() == 'mbcs'
2839 encoding = get_current_encoding()
2840 for row in csv.reader(ansi_csv_data, **kwargs):
2841 # Decode str 'ansi' instances to unicode instances, cell by cell:
2842 yield [cell.decode(encoding) for cell in row]
2843
2844 # The fastest and smallest format but only supports 'ANSI' file paths.
2845          # E.g. the filenames are encoded in the 'current' encoding.
2846 for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
2847 # line is a list of unicode objects.
2848 yield [i.strip() for i in line]
2849
2850 supported_events = LogmanTrace.Context.supported_events()
2851
2852 def trim(generator):
2853 for index, line in enumerate(generator):
2854 if not index:
2855 if line != self.EXPECTED_HEADER:
2856 raise TracingFailure(
2857 'Found malformed header: %s' % ' '.join(line),
2858 None, None, None)
2859 continue
2860 # As you can see, the CSV is full of useful non-redundant information:
2861 if (line[2] != '0' or # Event ID
2862 line[3] not in ('2', '3') or # Version
2863 line[4] != '0' or # Channel
2864 line[5] != '0' or # Level
2865 line[7] != '0' or # Task
2866 line[8] != '0x0000000000000000' or # Keyword
2867 line[12] != '' or # Instance ID
2868 line[13] != '' or # Parent Instance ID
2869 line[14] != self.NULL_GUID or # Activity ID
2870 line[15] != ''): # Related Activity ID
2871 raise TracingFailure(
2872 'Found unexpected values in line: %s' % ' '.join(line),
2873 None, None, None)
2874
2875 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
2876 continue
2877
2878 # Convert the PID in-place from hex.
2879 line[self.PID] = int(line[self.PID], 16)
2880
2881 yield [
2882 line[self.EVENT_NAME],
2883 line[self.TYPE],
2884 line[self.PID],
2885 line[self.TID],
2886 line[self.PROCESSOR_ID],
2887 line[self.TIMESTAMP],
2888 ] + line[self.USER_DATA:]
2889
2890 write_json('%s.json' % self._logname, list(trim(load_file())), True)
2891
2892 def _convert_log(self, logformat):
2893 """Converts the ETL trace to text representation.
2894
2895 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
2896 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
2897
2898 Arguments:
2899 - logformat: Text format to be generated, csv, csv_utf16 or xml.
2900
2901 Use "tracerpt -?" for help.
2902 """
2903 LOCALE_INVARIANT = 0x7F
2904 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
2905 cmd_convert = [
2906 'tracerpt.exe',
2907 '-l', self._logname + '.etl',
2908 '-o', self._logname + '.' + logformat,
2909 '-gmt', # Use UTC
2910 '-y', # No prompt
2911 # Use -of XML to get the header of each items after column 19, e.g. all
2912 # the actual headers of 'User Data'.
2913 ]
2914
2915 if logformat == 'csv':
2916 # tracerpt localizes the 'Type' column, for major brainfuck
2917 # entertainment. I can't imagine any sane reason to do that.
2918 cmd_convert.extend(['-of', 'CSV'])
2919 elif logformat == 'csv_utf16':
2920 # This causes it to use UTF-16, which doubles the log size but ensures
2921 # the log is readable for non-ASCII characters.
2922 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
2923 elif logformat == 'xml':
2924 cmd_convert.extend(['-of', 'XML'])
2925 else:
2926 raise ValueError('Unexpected log format \'%s\'' % logformat)
2927 logging.debug('Running: %s' % cmd_convert)
2928      # This can take tens of minutes for large logs.
2929 # Redirects all output to stderr.
2930 subprocess.check_call(
2931 cmd_convert,
2932 stdin=subprocess.PIPE,
2933 stdout=sys.stderr,
2934 stderr=sys.stderr)
2935
2936 @staticmethod
2937 def clean_trace(logname):
2938 for ext in ('', '.csv', '.etl', '.json', '.xml'):
2939 if os.path.isfile(logname + ext):
2940 os.remove(logname + ext)
2941
2942 @classmethod
2943 def parse_log(cls, logname, blacklist):
2944 logging.info('parse_log(%s, %s)' % (logname, blacklist))
2945      assert os.path.isabs(logname)
2946
2947 def blacklist_more(filepath):
2948 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
2949 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
2950
2951 data = read_json(logname)
2952 lines = read_json(logname + '.json')
2953 out = []
2954 for item in data['traces']:
2955        result = {
2956 'output': item['output'],
2957 'trace': item['trace'],
2958 }
2959 try:
2960 context = cls.Context(blacklist_more, item['pid'], item['trace'])
2961 for line in lines:
2962 context.on_line(line)
2963 result['results'] = context.to_results()
2964 except TracingFailure:
2965 result['exception'] = sys.exc_info()
2966 out.append(result)
2967      return out
2968
2969
2970def get_api():
2971 """Returns the correct implementation for the current OS."""
2972 if sys.platform == 'cygwin':
2973 raise NotImplementedError(
2974 'Not implemented for cygwin, start the script from Win32 python')
2975 flavors = {
2976 'win32': LogmanTrace,
2977 'darwin': Dtrace,
2978 'sunos5': Dtrace,
2979 'freebsd7': Dtrace,
2980 'freebsd8': Dtrace,
2981 }
2982 # Defaults to strace.
2983 return flavors.get(sys.platform, Strace)()
2984
2985
2986def extract_directories(root_dir, files, blacklist):
2987  """Detects if all the files in a directory are in |files| and if so, replaces
2988  the individual files with a Results.Directory instance.
2989
2990 Takes a list of Results.File instances and returns a shorter list of
2991 Results.File and Results.Directory instances.
2992
2993 Arguments:
2994  - root_dir: Optional base directory that shouldn't be searched further.
2995 - files: list of Results.File instances.
2996 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
2997 """
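  # For example, with a hypothetical layout: if files lists out/a.txt and
  # out/b.txt and the out/ directory contains exactly those two entries on
  # disk, both are collapsed into a single Results.Directory for out/.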
2998 logging.info(
2999 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
3000 assert not (root_dir or '').endswith(os.path.sep), root_dir
3001 assert not root_dir or (get_native_path_case(root_dir) == root_dir)
3002 assert not any(isinstance(f, Results.Directory) for f in files)
3003 # Remove non existent files.
3004 files = [f for f in files if f.existent]
3005 if not files:
3006 return files
3007 # All files must share the same root, which can be None.
3008 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
3009
3010 # Creates a {directory: {filename: File}} mapping, up to root.
3011 buckets = {}
3012 if root_dir:
3013 buckets[root_dir] = {}
3014 for fileobj in files:
3015 path = fileobj.full_path
3016 directory = os.path.dirname(path)
3017 assert directory
3018 # Do not use os.path.basename() so trailing os.path.sep is kept.
3019 basename = path[len(directory)+1:]
3020 files_in_directory = buckets.setdefault(directory, {})
3021 files_in_directory[basename] = fileobj
3022 # Add all the directories recursively up to root.
3023 while True:
3024 old_d = directory
3025 directory = os.path.dirname(directory)
3026 if directory + os.path.sep == root_dir or directory == old_d:
3027 break
3028 buckets.setdefault(directory, {})
3029
3030 root_prefix = len(root_dir) + 1 if root_dir else 0
3031 for directory in sorted(buckets, reverse=True):
3032 actual = set(f for f in os.listdir(directory) if not blacklist(f))
3033 expected = set(buckets[directory])
3034 if not (actual - expected):
3035 parent = os.path.dirname(directory)
3036 buckets[parent][os.path.basename(directory)] = Results.Directory(
3037 root_dir,
3038 directory[root_prefix:],
3039 False,
3040 sum(f.size for f in buckets[directory].itervalues()),
3041 sum(f.nb_files for f in buckets[directory].itervalues()))
3042 # Remove the whole bucket.
3043 del buckets[directory]
3044
3045 # Reverse the mapping with what remains. The original instances are returned,
3046 # so the cached meta data is kept.
3047 files = sum((x.values() for x in buckets.itervalues()), [])
3048 return sorted(files, key=lambda x: x.path)
3049
3050
3051def trace(logfile, cmd, cwd, api, output):
3052 """Traces an executable. Returns (returncode, output) from api.
3053
3054 Arguments:
3055 - logfile: file to write to.
3056 - cmd: command to run.
3057 - cwd: current directory to start the process in.
3058 - api: a tracing api instance.
3059  - output: if True, returns output, otherwise prints it to the console.
3060 """
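  # Minimal usage sketch with hypothetical paths, mirroring what CMDtrace()
  # does:
  #   api = get_api()
  #   returncode, out = trace(
  #       '/tmp/example.log', ['/usr/bin/python', 'foo.py'], os.getcwd(), api,
  #       True)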
3061 cmd = fix_python_path(cmd)
3062 api.clean_trace(logfile)
3063 with api.get_tracer(logfile) as tracer:
3064 return tracer.trace(cmd, cwd, 'default', output)
3065
3066
3067def CMDclean(args):
3068 """Cleans up traces."""
3069 parser = OptionParserTraceInputs(command='clean')
3070 options, args = parser.parse_args(args)
3071 api = get_api()
3072 api.clean_trace(options.log)
3073 return 0
3074
3075
3076def CMDtrace(args):
3077 """Traces an executable."""
3078 parser = OptionParserTraceInputs(command='trace')
3079 parser.allow_interspersed_args = False
3080 parser.add_option(
3081 '-q', '--quiet', action='store_true',
3082 help='Redirects traced executable output to /dev/null')
3083 options, args = parser.parse_args(args)
3084
3085 if not args:
3086 parser.error('Please provide a command to run')
3087
3088 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3089 args[0] = os.path.abspath(args[0])
3090
3091 api = get_api()
3092 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3093
3094
3095def CMDread(args):
3096 """Reads the logs and prints the result."""
3097 parser = OptionParserTraceInputs(command='read')
3098 parser.add_option(
3099 '-V', '--variable',
3100 nargs=2,
3101 action='append',
3102 dest='variables',
3103 metavar='VAR_NAME directory',
3104 default=[],
3105 help=('Variables to replace relative directories against. Example: '
3106            '"-V \'$HOME\' \'/home/%s\'" will replace all occurrences of your '
3107 'home dir with $HOME') % getpass.getuser())
3108 parser.add_option(
3109 '--root-dir',
3110      help='Root directory to base everything off of. Anything outside of this '
3111           'directory will not be reported')
3112 parser.add_option(
3113 '-j', '--json', action='store_true',
3114 help='Outputs raw result data as json')
3115 parser.add_option(
3116 '-b', '--blacklist', action='append', default=[],
3117 help='List of regexp to use as blacklist filter')
3118 options, args = parser.parse_args(args)
3119
3120 if options.root_dir:
3121 options.root_dir = os.path.abspath(options.root_dir)
3122
3123 variables = dict(options.variables)
3124 api = get_api()
3125 def blacklist(f):
3126 return any(re.match(b, f) for b in options.blacklist)
3127 data = api.parse_log(options.log, blacklist)
3128 # Process each trace.
3129 output_as_json = []
3130 for item in data:
3131 if 'exception' in item:
3132      # Do not abort the other traces.
3133      print >> sys.stderr, (
3134          'Trace %s: Got an exception: %s' % (
3135 item['trace'], item['exception'][1]))
3136      continue
3137 results = item['results']
3138 if options.root_dir:
3139 results = results.strip_root(options.root_dir)
3140
3141 if options.json:
3142 output_as_json.append(results.flatten())
3143 else:
3144 simplified = extract_directories(
3145 options.root_dir, results.files, blacklist)
3146 simplified = [f.replace_variables(variables) for f in simplified]
3147 if len(data) > 1:
3148 print('Trace: %s' % item['trace'])
3149 print('Total: %d' % len(results.files))
3150 print('Non existent: %d' % len(results.non_existent))
3151 for f in results.non_existent:
3152 print(' %s' % f.path)
3153 print(
3154 'Interesting: %d reduced to %d' % (
3155 len(results.existent), len(simplified)))
3156 for f in simplified:
3157 print(' %s' % f.path)
3158
3159 if options.json:
3160 write_json(sys.stdout, output_as_json, False)
3161 return 0
3162
3163
3164class OptionParserWithLogging(optparse.OptionParser):
3165 """Adds --verbose option."""
3166 def __init__(self, verbose=0, **kwargs):
3167 optparse.OptionParser.__init__(self, **kwargs)
3168 self.add_option(
3169 '-v', '--verbose',
3170 action='count',
3171 default=verbose,
3172 help='Use multiple times to increase verbosity')
3173
3174 def parse_args(self, *args, **kwargs):
3175 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3176 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3177 logging.basicConfig(
3178 level=levels[min(len(levels)-1, options.verbose)],
3179 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3180 return options, args
3181
3182
3183class OptionParserWithNiceDescription(OptionParserWithLogging):
3184 """Generates the description with the command's docstring."""
3185 def __init__(self, **kwargs):
3186 """Sets 'description' and 'usage' if not already specified."""
3187 command = kwargs.pop('command', 'help')
3188 kwargs.setdefault(
3189 'description',
3190 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3191 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3192 OptionParserWithLogging.__init__(self, **kwargs)
3193
3194
3195class OptionParserTraceInputs(OptionParserWithNiceDescription):
3196 """Adds automatic --log handling."""
3197 def __init__(self, **kwargs):
3198 OptionParserWithNiceDescription.__init__(self, **kwargs)
3199 self.add_option(
3200 '-l', '--log', help='Log file to generate or read, required')
3201
3202 def parse_args(self, *args, **kwargs):
3203 """Makes sure the paths make sense.
3204
3205 On Windows, / and \ are often mixed together in a path.
3206 """
3207 options, args = OptionParserWithNiceDescription.parse_args(
3208 self, *args, **kwargs)
3209 if not options.log:
3210 self.error('Must supply a log file with -l')
3211 options.log = os.path.abspath(options.log)
3212 return options, args
3213
3214
3215def extract_documentation():
3216  """Returns a dict {command: description} for each documented command."""
3217 commands = (
3218 fn[3:]
3219 for fn in dir(sys.modules['__main__'])
3220 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3221 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3222
3223
3224def CMDhelp(args):
3225 """Prints list of commands or help for a specific command."""
3226 doc = extract_documentation()
3227 # Calculates the optimal offset.
3228 offset = max(len(cmd) for cmd in doc)
3229 format_str = ' %-' + str(offset + 2) + 's %s'
3230  # Generate a one-line documentation summary for each command.
3231 commands_description = '\n'.join(
3232 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3233
3234 parser = OptionParserWithNiceDescription(
3235 usage='%prog <command> [options]',
3236 description='Commands are:\n%s\n' % commands_description)
3237 parser.format_description = lambda _: parser.description
3238
3239 # Strip out any -h or --help argument.
3240 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3241 if len(args) == 1:
3242 if not get_command_handler(args[0]):
3243 parser.error('Unknown command %s' % args[0])
3244    # The command was "%prog help command"; replace ourselves with
3245 # "%prog command --help" so help is correctly printed out.
3246 return main(args + ['--help'])
3247 elif args:
3248 parser.error('Unknown argument "%s"' % ' '.join(args))
3249 parser.print_help()
3250 return 0
3251
3252
3253def get_command_handler(name):
3254 """Returns the command handler or CMDhelp if it doesn't exist."""
3255 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3256
3257
3258def main_impl(argv):
3259 command = get_command_handler(argv[0] if argv else 'help')
3260 if not command:
3261 return CMDhelp(argv)
3262 return command(argv[1:])
3263
3264def main(argv):
3265 try:
3266    return main_impl(argv)
3267 except TracingFailure, e:
3268 sys.stderr.write('\nError: ')
3269 sys.stderr.write(str(e))
3270 sys.stderr.write('\n')
3271 return 1
3272
3273
3274if __name__ == '__main__':
3275 sys.exit(main(sys.argv[1:]))