blob: 6649bbe6f1418edbfb102f63136f4e77f83eab36 [file] [log] [blame]
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00001#!/usr/bin/env python
Marc-Antoine Ruel8add1242013-11-05 17:28:27 -05002# Copyright 2013 The Swarming Authors. All rights reserved.
Marc-Antoine Ruele98b1122013-11-05 20:27:57 -05003# Use of this source code is governed under the Apache License, Version 2.0 that
4# can be found in the LICENSE file.
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00005
Marc-Antoine Ruelfcc3cd82013-11-19 16:31:38 -05006"""Archives a set of files or directories to a server."""
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00007
Marc-Antoine Ruelfcc3cd82013-11-19 16:31:38 -05008__version__ = '0.3'
maruel@chromium.orgfb78d432013-08-28 21:22:40 +00009
vadimsh@chromium.org7b5dae32013-10-03 16:59:59 +000010import functools
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000011import hashlib
maruel@chromium.org41601642013-09-18 19:40:46 +000012import json
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000013import logging
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000014import os
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +000015import re
Marc-Antoine Ruelfcc3cd82013-11-19 16:31:38 -050016import shutil
17import stat
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000018import sys
Marc-Antoine Ruelfcc3cd82013-11-19 16:31:38 -050019import tempfile
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +000020import threading
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000021import time
maruel@chromium.orge82112e2013-04-24 14:41:55 +000022import urllib
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +000023import zlib
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000024
maruel@chromium.orgfb78d432013-08-28 21:22:40 +000025from third_party import colorama
26from third_party.depot_tools import fix_encoding
27from third_party.depot_tools import subcommand
28
Marc-Antoine Ruel37989932013-11-19 16:28:08 -050029from utils import file_path
vadimsh@chromium.org6b706212013-08-28 15:03:46 +000030from utils import net
vadimsh@chromium.orgb074b162013-08-22 17:55:46 +000031from utils import threading_utils
vadimsh@chromium.orga4326472013-08-24 02:05:41 +000032from utils import tools
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000033
34
# Version of isolate protocol passed to the server in /handshake request.
ISOLATE_PROTOCOL_VERSION = '1.0'


# The number of files to check the isolate server per /pre-upload query.
# All files are sorted by likelihood of a change in the file content
# (currently file size is used to estimate this: larger the file -> larger the
# possibility it has changed). Then first ITEMS_PER_CONTAINS_QUERIES[0] files
# are taken and send to '/pre-upload', then next ITEMS_PER_CONTAINS_QUERIES[1],
# and so on. Numbers here is a trade-off; the more per request, the lower the
# effect of HTTP round trip latency and TCP-level chattiness. On the other hand,
# larger values cause longer lookups, increasing the initial latency to start
# uploading, which is especially an issue for large files. This value is
# optimized for the "few thousands files to look up with minimal number of large
# files missing" case.
ITEMS_PER_CONTAINS_QUERIES = [20, 20, 50, 50, 50, 100]


# A list of already compressed extension types that should not receive any
# compression before being uploaded.
# NOTE(review): entries are bare extensions with no leading dot, while
# os.path.splitext() returns extensions WITH the dot ('.zip') — callers must
# strip the dot before testing membership here.
ALREADY_COMPRESSED_TYPES = [
  '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
  'wav', 'zip'
]


# The file size to be used when we don't know the correct file size,
# generally used for .isolated files.
UNKNOWN_FILE_SIZE = None


# The size of each chunk to read when downloading and unzipping files.
ZIPPED_FILE_CHUNK = 16 * 1024

# Chunk size to use when doing disk I/O.
DISK_FILE_CHUNK = 1024 * 1024

# Chunk size to use when reading from network stream.
NET_IO_FILE_CHUNK = 16 * 1024


# Read timeout in seconds for downloads from isolate storage. If there's no
# response from the server within this timeout whole download will be aborted.
DOWNLOAD_READ_TIMEOUT = 60

# Maximum expected delay (in seconds) between successive file fetches
# in run_tha_test. If it takes longer than that, a deadlock might be happening
# and all stack frames for all threads are dumped to log.
DEADLOCK_TIMEOUT = 5 * 60


# The delay (in seconds) to wait between logging statements when retrieving
# the required files. This is intended to let the user (or buildbot) know that
# the program is still running.
DELAY_BETWEEN_UPDATES_IN_SECS = 30


# Sadly, hashlib uses 'sha1' instead of the standard 'sha-1' so explicitly
# specify the names here. Maps wire algorithm name -> hashlib constructor.
SUPPORTED_ALGOS = {
  'md5': hashlib.md5,
  'sha-1': hashlib.sha1,
  'sha-512': hashlib.sha512,
}


# Used for serialization: inverse of SUPPORTED_ALGOS, maps a hashlib
# constructor back to its wire name.
# NOTE(review): dict.iteritems() is Python 2 only, consistent with the rest of
# this file.
SUPPORTED_ALGOS_REVERSE = dict((v, k) for k, v in SUPPORTED_ALGOS.iteritems())


# Regexps of paths that are not archived by default.
DEFAULT_BLACKLIST = (
  # Temporary vim or python files.
  r'^.+\.(?:pyc|swp)$',
  # .git or .svn directory.
  r'^(?:.+' + re.escape(os.path.sep) + r'|)\.(?:git|svn)$',
)


# Chromium-specific.
DEFAULT_BLACKLIST += (
  r'^.+\.(?:run_test_cases)$',
  r'^(?:.+' + re.escape(os.path.sep) + r'|)testserver\.log$',
)
118
119
class Error(Exception):
  """Base class for generic run-time failures raised by this script."""
123
124
class ConfigError(ValueError):
  """Raised when a .isolated file cannot be loaded or is malformed."""
128
129
class MappingError(OSError):
  """Raised when the file tree cannot be recreated on disk."""
133
134
def is_valid_hash(value, algo):
  """Checks whether |value| looks like a hex digest produced by |algo|.

  |algo| is a hashlib constructor; a valid digest is exactly twice
  digest_size hex characters (case insensitive).
  """
  expected_len = algo().digest_size * 2
  return re.match(r'^[a-fA-F0-9]{%d}$' % expected_len, value) is not None
139
140
def hash_file(filepath, algo):
  """Returns the hex digest of |filepath| computed with |algo|.

  |algo| should be one of hashlib hashing algorithm constructors. The file is
  streamed in DISK_FILE_CHUNK pieces so it is never fully loaded in memory.
  """
  hasher = algo()
  with open(filepath, 'rb') as f:
    # iter() with a sentinel keeps reading until the first empty chunk.
    for chunk in iter(lambda: f.read(DISK_FILE_CHUNK), b''):
      hasher.update(chunk)
  return hasher.hexdigest()
154
155
def stream_read(stream, chunk_size):
  """Yields successive |chunk_size| reads from |stream| until it is drained."""
  data = stream.read(chunk_size)
  while data:
    yield data
    data = stream.read(chunk_size)
163
164
def file_read(filepath, chunk_size=DISK_FILE_CHUNK):
  """Opens |filepath| and yields its content in pieces of |chunk_size| bytes."""
  with open(filepath, 'rb') as f:
    piece = f.read(chunk_size)
    while piece:
      yield piece
      piece = f.read(chunk_size)
173
174
def file_write(filepath, content_generator):
  """Writes the chunks produced by |content_generator| to |filepath|.

  Any missing intermediary directories are created first.

  Returns the number of bytes written.

  Meant to be mocked out in unit tests.
  """
  parent_dir = os.path.dirname(filepath)
  if not os.path.isdir(parent_dir):
    os.makedirs(parent_dir)
  written = 0
  with open(filepath, 'wb') as out:
    for chunk in content_generator:
      written += len(chunk)
      out.write(chunk)
  return written
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +0000193
194
vadimsh@chromium.org35122be2013-09-19 02:48:00 +0000195def zip_compress(content_generator, level=7):
196 """Reads chunks from |content_generator| and yields zip compressed chunks."""
197 compressor = zlib.compressobj(level)
198 for chunk in content_generator:
199 compressed = compressor.compress(chunk)
200 if compressed:
201 yield compressed
202 tail = compressor.flush(zlib.Z_FINISH)
203 if tail:
204 yield tail
205
206
def zip_decompress(content_generator, chunk_size=DISK_FILE_CHUNK):
  """Yields decompressed data for the zlib stream read from |content_generator|.

  Decompression happens in pieces no larger than |chunk_size| so that a zip
  bomb cannot force zlib to preallocate a huge output buffer at once.

  Raises IOError if data is corrupted or incomplete.
  """
  inflater = zlib.decompressobj()
  bytes_seen = 0
  try:
    for piece in content_generator:
      bytes_seen += len(piece)
      out = inflater.decompress(piece, chunk_size)
      if out:
        yield out
      # Keep draining what did not fit into the size-capped output buffer.
      while inflater.unconsumed_tail:
        out = inflater.decompress(inflater.unconsumed_tail, chunk_size)
        if out:
          yield out
    leftover = inflater.flush()
    if leftover:
      yield leftover
  except zlib.error as e:
    raise IOError(
        'Corrupted zip stream (read %d bytes) - %s' % (bytes_seen, e))
  # Ensure all data was read and decompressed.
  if inflater.unused_data or inflater.unconsumed_tail:
    raise IOError('Not all data was decompressed')
236
237
def get_zip_compression_level(filename):
  """Given a filename calculates the ideal zip compression level to use.

  Returns 0 (store, no compression) for file types that are already
  compressed, and 7 for everything else.
  """
  # os.path.splitext() keeps the leading dot (e.g. '.zip') while
  # ALREADY_COMPRESSED_TYPES lists bare extensions (e.g. 'zip'). Strip the dot
  # so the membership test can actually match; previously it never did, so
  # already-compressed files were pointlessly recompressed at level 7.
  file_ext = os.path.splitext(filename)[1].lower().lstrip('.')
  # TODO(csharp): Profile to find what compression level works best.
  return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7
243
244
def create_directories(base_directory, files):
  """Creates under |base_directory| every directory needed by |files|."""
  logging.debug('create_directories(%s, %d)', base_directory, len(files))
  # Accumulate every ancestor directory of every file path.
  to_create = set()
  for filepath in files:
    ancestor = os.path.dirname(filepath)
    # Stop early: if an ancestor is already known, so is its whole chain.
    while ancestor and ancestor not in to_create:
      to_create.add(ancestor)
      ancestor = os.path.dirname(ancestor)
  # Lexicographic order guarantees a parent is created before its children.
  for directory in sorted(to_create):
    os.mkdir(os.path.join(base_directory, directory))
257
258
def create_symlinks(base_directory, files):
  """Creates every symlink described by the 'l' entries in |files|.

  |files| is an iterable of (relative path, properties dict) pairs; entries
  without an 'l' key are not symlinks and are skipped.
  """
  for relpath, properties in files:
    if 'l' not in properties:
      continue
    if sys.platform == 'win32':
      # TODO(maruel): Create symlink via the win32 api.
      logging.warning('Ignoring symlink %s', relpath)
      continue
    link_target = properties['l']
    destination = os.path.join(base_directory, relpath)
    # os.symlink() doesn't exist on Windows.
    os.symlink(link_target, destination)  # pylint: disable=E1101
maruel@chromium.orgaf254852013-09-17 17:48:14 +0000271
272
def is_valid_file(filepath, size):
  """Checks that the file at |filepath| appears valid.

  Currently only the size is verified; passing UNKNOWN_FILE_SIZE means mere
  existence is enough.
  """
  if size == UNKNOWN_FILE_SIZE:
    return os.path.isfile(filepath)
  found_size = os.stat(filepath).st_size
  if found_size == size:
    return True
  logging.warning(
      'Found invalid item %s; %d != %d',
      os.path.basename(filepath), found_size, size)
  return False
287
288
class WorkerPool(threading_utils.AutoRetryThreadPool):
  """Thread pool that automatically retries on IOError and runs a preconfigured
  function.
  """
  # Initial and maximum number of worker threads.
  INITIAL_WORKERS = 2
  MAX_WORKERS = 16
  # Number of times a task is retried (on IOError) before giving up.
  RETRIES = 5

  def __init__(self):
    # NOTE(review): arguments are positional per AutoRetryThreadPool.__init__;
    # the trailing 0 and 'remote' are presumably the queue size and the worker
    # thread name prefix — confirm against utils/threading_utils.py.
    super(WorkerPool, self).__init__(
        [IOError],
        self.RETRIES,
        self.INITIAL_WORKERS,
        self.MAX_WORKERS,
        0,
        'remote')
maruel@chromium.orge45728d2013-09-16 23:23:22 +0000306
307
class Item(object):
  """An item to push to Storage.

  It starts its life in a main thread, travels to 'contains' thread, then to
  'push' thread and then finally back to the main thread. It is never used
  concurrently from multiple threads.
  """

  def __init__(self, digest, size, is_isolated=False):
    # Opaque state a StorageApi implementation may attach during push.
    self.push_state = None
    # Default zlib compression level; subclasses may choose a better one.
    self.compression_level = 6
    self.digest = digest
    self.size = size
    self.is_isolated = is_isolated

  def content(self, chunk_size):
    """Iterable with content of this item in chunks of given size.

    Arguments:
      chunk_size: preferred size of the chunk to produce, may be ignored.
    """
    raise NotImplementedError()
331
332
class FileItem(Item):
  """A file on disk to push to Storage."""

  def __init__(self, path, digest, size, is_isolated):
    super(FileItem, self).__init__(digest, size, is_isolated)
    self.path = path
    # Already-compressed formats (zip, jpg, ...) are stored, not recompressed.
    self.compression_level = get_zip_compression_level(path)

  def content(self, chunk_size):
    """Yields the file's bytes in |chunk_size| pieces."""
    return file_read(self.path, chunk_size)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000343
344
class BufferItem(Item):
  """An in-memory byte buffer to push to Storage."""

  def __init__(self, buf, algo, is_isolated=False):
    # The digest is computed eagerly from the buffer with |algo|.
    super(BufferItem, self).__init__(
        algo(buf).hexdigest(), len(buf), is_isolated)
    self.buffer = buf

  def content(self, _chunk_size):
    # The whole buffer is produced as a single chunk; chunk size is ignored.
    return [self.buffer]
355
356
class Storage(object):
  """Efficiently downloads or uploads large set of files via StorageApi.

  Owns two lazily-created thread pools: one for CPU-bound zipping and one for
  network I/O (with automatic retry on IOError). Also usable as a context
  manager that joins both pools on exit.
  """

  def __init__(self, storage_api, use_zip):
    # storage_api: StorageApi instance performing the raw fetch/push/contains.
    # use_zip: if True, content is zlib-compressed before push and decompressed
    # after fetch.
    self.use_zip = use_zip
    self._storage_api = storage_api
    self._cpu_thread_pool = None
    self._net_thread_pool = None

  @property
  def cpu_thread_pool(self):
    """ThreadPool for CPU-bound tasks like zipping (created on first use)."""
    if self._cpu_thread_pool is None:
      self._cpu_thread_pool = threading_utils.ThreadPool(
          2, max(threading_utils.num_processors(), 2), 0, 'zip')
    return self._cpu_thread_pool

  @property
  def net_thread_pool(self):
    """AutoRetryThreadPool for IO-bound tasks, retries IOError."""
    if self._net_thread_pool is None:
      self._net_thread_pool = WorkerPool()
    return self._net_thread_pool

  def close(self):
    """Waits for all pending tasks to finish, then shuts both pools down."""
    if self._cpu_thread_pool:
      self._cpu_thread_pool.join()
      self._cpu_thread_pool.close()
      self._cpu_thread_pool = None
    if self._net_thread_pool:
      self._net_thread_pool.join()
      self._net_thread_pool.close()
      self._net_thread_pool = None

  def __enter__(self):
    """Context manager interface."""
    return self

  def __exit__(self, _exc_type, _exc_value, _traceback):
    """Context manager interface. Never swallows exceptions."""
    self.close()
    return False

  def upload_tree(self, indir, infiles):
    """Uploads the given tree to the isolate server.

    Arguments:
      indir: root directory the infiles are based in.
      infiles: dict of files to upload from |indir|: relative path ->
          metadata dict with 'h' (digest), 's' (size), optional 'priority'
          and 'l' (symlink target).

    Returns:
      List of items that were uploaded. All other items are already there.
    """
    logging.info('upload tree(indir=%s, files=%d)', indir, len(infiles))

    # Convert |indir| + |infiles| into a list of FileItem objects.
    # Filter out symlinks, since they are not represented by items on isolate
    # server side.
    # NOTE(review): iteritems() is Python 2 only.
    items = [
        FileItem(
            path=os.path.join(indir, filepath),
            digest=metadata['h'],
            size=metadata['s'],
            is_isolated=metadata.get('priority') == '0')
        for filepath, metadata in infiles.iteritems()
        if 'l' not in metadata
    ]

    return self.upload_items(items)

  def upload_items(self, items):
    """Uploads bunch of items to the isolate server.

    Will upload only items that are missing.

    Arguments:
      items: list of Item instances that represents data to upload.

    Returns:
      List of items that were uploaded. All other items are already there.
    """
    # TODO(vadimsh): Optimize special case of len(items) == 1 that is frequently
    # used by swarming.py. There's no need to spawn multiple threads and try to
    # do stuff in parallel: there's nothing to parallelize. 'contains' check and
    # 'push' should be performed sequentially in the context of current thread.

    # For each digest keep only first Item that matches it. All other items
    # are just indistinguishable copies from the point of view of isolate
    # server (it doesn't care about paths at all, only content and digests).
    seen = {}
    duplicates = 0
    for item in items:
      # setdefault returns the already-stored item when the digest was seen.
      if seen.setdefault(item.digest, item) is not item:
        duplicates += 1
    items = seen.values()
    if duplicates:
      logging.info('Skipped %d duplicated files', duplicates)

    # Enqueue all upload tasks.
    missing = set()
    channel = threading_utils.TaskChannel()
    for missing_item in self.get_missing_items(items):
      missing.add(missing_item)
      # .isolated files unblock the rest of the flow, so push them first.
      self.async_push(
          channel,
          WorkerPool.HIGH if missing_item.is_isolated else WorkerPool.MED,
          missing_item)

    uploaded = []
    # No need to spawn deadlock detector thread if there's nothing to upload.
    if missing:
      with threading_utils.DeadlockDetector(DEADLOCK_TIMEOUT) as detector:
        # Wait for all started uploads to finish.
        while len(uploaded) != len(missing):
          detector.ping()
          item = channel.pull()
          uploaded.append(item)
          logging.debug(
              'Uploaded %d / %d: %s', len(uploaded), len(missing), item.digest)
      logging.info('All files are uploaded')

    # Print stats.
    # NOTE(review): if |items| is empty, total == 0 and the percentage
    # computations below raise ZeroDivisionError — confirm callers never pass
    # an empty list, or guard like the total_size divisions are.
    total = len(items)
    total_size = sum(f.size for f in items)
    logging.info(
        'Total: %6d, %9.1fkb',
        total,
        total_size / 1024.)
    cache_hit = set(items) - missing
    cache_hit_size = sum(f.size for f in cache_hit)
    logging.info(
        'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
        len(cache_hit),
        cache_hit_size / 1024.,
        len(cache_hit) * 100. / total,
        cache_hit_size * 100. / total_size if total_size else 0)
    cache_miss = missing
    cache_miss_size = sum(f.size for f in cache_miss)
    logging.info(
        'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
        len(cache_miss),
        cache_miss_size / 1024.,
        len(cache_miss) * 100. / total,
        cache_miss_size * 100. / total_size if total_size else 0)

    return uploaded

  def get_fetch_url(self, digest):
    """Returns an URL that can be used to fetch an item with given digest.

    Arguments:
      digest: hex digest of item to fetch.

    Returns:
      An URL or None if underlying protocol doesn't support this.
    """
    return self._storage_api.get_fetch_url(digest)

  def async_push(self, channel, priority, item):
    """Starts asynchronous push to the server in a parallel thread.

    Arguments:
      channel: TaskChannel that receives back |item| when upload ends.
      priority: thread pool task priority for the push.
      item: item to upload as instance of Item class.
    """
    def push(content):
      """Pushes an item and returns its id, to pass as a result to |channel|."""
      self._storage_api.push(item, content)
      return item

    # If zipping is not required, just start a push task.
    if not self.use_zip:
      self.net_thread_pool.add_task_with_channel(channel, priority, push,
          item.content(DISK_FILE_CHUNK))
      return

    # If zipping is enabled, zip in a separate thread.
    def zip_and_push():
      # TODO(vadimsh): Implement streaming uploads. Before it's done, assemble
      # content right here. It will block until all file is zipped.
      try:
        stream = zip_compress(item.content(ZIPPED_FILE_CHUNK),
                              item.compression_level)
        data = ''.join(stream)
      except Exception as exc:
        logging.error('Failed to zip \'%s\': %s', item, exc)
        # Propagate the failure to whoever is pulling from |channel|.
        channel.send_exception()
        return
      # Zipping is done; hand the actual network push to the net pool.
      self.net_thread_pool.add_task_with_channel(
          channel, priority, push, [data])
    self.cpu_thread_pool.add_task(priority, zip_and_push)

  def async_fetch(self, channel, priority, digest, size, sink):
    """Starts asynchronous fetch from the server in a parallel thread.

    Arguments:
      channel: TaskChannel that receives back |digest| when download ends.
      priority: thread pool task priority for the fetch.
      digest: hex digest of an item to download.
      size: expected size of the item (after decompression).
      sink: function that will be called as sink(generator).
    """
    def fetch():
      try:
        # Prepare reading pipeline.
        stream = self._storage_api.fetch(digest)
        if self.use_zip:
          stream = zip_decompress(stream, DISK_FILE_CHUNK)
        # Run |stream| through verifier that will assert its size.
        verifier = FetchStreamVerifier(stream, size)
        # Verified stream goes to |sink|.
        sink(verifier.run())
      except Exception as err:
        logging.error('Failed to fetch %s: %s', digest, err)
        # Re-raise so the WorkerPool retry logic / channel sees the failure.
        raise
      return digest

    # Don't bother with zip_thread_pool for decompression. Decompression is
    # really fast and most probably IO bound anyway.
    self.net_thread_pool.add_task_with_channel(channel, priority, fetch)

  def get_missing_items(self, items):
    """Yields items that are missing from the server.

    Issues multiple parallel queries via StorageApi's 'contains' method.

    Arguments:
      items: a list of Item objects to check.

    Yields:
      Item objects that are missing from the server.
    """
    channel = threading_utils.TaskChannel()
    pending = 0
    # Enqueue all requests.
    for batch in self.batch_items_for_check(items):
      self.net_thread_pool.add_task_with_channel(channel, WorkerPool.HIGH,
          self._storage_api.contains, batch)
      pending += 1
    # Yield results as they come in. Each pull() returns one batch's worth of
    # missing items. NOTE(review): xrange() is Python 2 only.
    for _ in xrange(pending):
      for missing in channel.pull():
        yield missing

  @staticmethod
  def batch_items_for_check(items):
    """Splits list of items to check for existence on the server into batches.

    Each batch corresponds to a single 'exists?' query to the server via a call
    to StorageApi's 'contains' method.

    Arguments:
      items: a list of Item objects.

    Yields:
      Batches of items to query for existence in a single operation,
      each batch is a list of Item objects.
    """
    batch_count = 0
    batch_size_limit = ITEMS_PER_CONTAINS_QUERIES[0]
    next_queries = []
    # Largest files first: size is used as a proxy for "likely changed", so
    # likely-missing items get queried (and thus uploaded) earlier.
    for item in sorted(items, key=lambda x: x.size, reverse=True):
      next_queries.append(item)
      if len(next_queries) == batch_size_limit:
        yield next_queries
        next_queries = []
        batch_count += 1
        # Later batches may grow, per the ITEMS_PER_CONTAINS_QUERIES schedule;
        # the last entry is reused once the schedule runs out.
        batch_size_limit = ITEMS_PER_CONTAINS_QUERIES[
            min(batch_count, len(ITEMS_PER_CONTAINS_QUERIES) - 1)]
    if next_queries:
      yield next_queries
630
631
class FetchQueue(object):
  """Fetches items from Storage and places them into LocalCache.

  It manages multiple concurrent fetch operations. Acts as a bridge between
  Storage and LocalCache so that Storage and LocalCache don't depend on each
  other at all.
  """

  def __init__(self, storage, cache):
    # storage: Storage instance used to start asynchronous fetches.
    # cache: LocalCache-like object; must provide cached_set(), touch(),
    # evict() and write().
    self.storage = storage
    self.cache = cache
    self._channel = threading_utils.TaskChannel()
    # Digests currently being downloaded.
    self._pending = set()
    # Every digest ever requested through add(); used by verify_all_cached().
    self._accessed = set()
    # Digests already present in the cache.
    self._fetched = cache.cached_set()

  def add(self, priority, digest, size=UNKNOWN_FILE_SIZE):
    """Starts asynchronous fetch of item |digest|.

    No-op if the item is already being fetched or is valid in the cache.
    """
    # Fetching it now?
    if digest in self._pending:
      return

    # Mark this file as in use, verify_all_cached will later ensure it is still
    # in cache.
    self._accessed.add(digest)

    # Already fetched? Notify cache to update item's LRU position.
    if digest in self._fetched:
      # 'touch' returns True if item is in cache and not corrupted.
      if self.cache.touch(digest, size):
        return
      # Item is corrupted, remove it from cache and fetch it again.
      self._fetched.remove(digest)
      self.cache.evict(digest)

    # TODO(maruel): It should look at the free disk space, the current cache
    # size and the size of the new item on every new item:
    # - Trim the cache as more entries are listed when free disk space is low,
    #   otherwise if the amount of data downloaded during the run > free disk
    #   space, it'll crash.
    # - Make sure there's enough free disk space to fit all dependencies of
    #   this run! If not, abort early.

    # Start fetching. The fetched stream is piped straight into the cache via
    # cache.write(digest, <generator>).
    self._pending.add(digest)
    self.storage.async_fetch(
        self._channel, priority, digest, size,
        functools.partial(self.cache.write, digest))

  def wait(self, digests):
    """Starts a loop that waits for at least one of |digests| to be retrieved.

    Returns the first digest retrieved. All |digests| must have been passed to
    add() beforehand (asserted below).
    """
    # Flush any already fetched items.
    for digest in digests:
      if digest in self._fetched:
        return digest

    # Ensure all requested items are being fetched now.
    assert all(digest in self._pending for digest in digests), (
        digests, self._pending)

    # Wait for some requested item to finish fetching.
    while self._pending:
      digest = self._channel.pull()
      self._pending.remove(digest)
      self._fetched.add(digest)
      if digest in digests:
        return digest

    # Should never reach this point due to assert above.
    raise RuntimeError('Impossible state')

  def inject_local_file(self, path, algo):
    """Adds local file to the cache as if it was fetched from storage.

    |algo| is a hashlib constructor used to compute the digest. The whole file
    is read into memory. Returns the digest.
    """
    with open(path, 'rb') as f:
      data = f.read()
    digest = algo(data).hexdigest()
    self.cache.write(digest, [data])
    self._fetched.add(digest)
    return digest

  @property
  def pending_count(self):
    """Returns number of items to be fetched."""
    return len(self._pending)

  def verify_all_cached(self):
    """True if all accessed items are in cache."""
    return self._accessed.issubset(self.cache.cached_set())
723
724
class FetchStreamVerifier(object):
  """Verifies that fetched file is valid before passing it to the LocalCache."""

  def __init__(self, stream, expected_size):
    # |stream| is an iterable of str chunks, as produced by StorageApi.fetch.
    self.stream = stream
    # Total expected size in bytes, or UNKNOWN_FILE_SIZE to skip the check.
    self.expected_size = expected_size
    # Number of bytes seen so far across all inspected chunks.
    self.current_size = 0

  def run(self):
    """Generator that yields same items as |stream|.

    Verifies |stream| is complete before yielding a last chunk to consumer.

    Also wraps IOError produced by consumer into MappingError exceptions since
    otherwise Storage will retry fetch on unrelated local cache errors.
    """
    # Read one chunk ahead, keep it in |stored|.
    # That way a complete stream can be verified before pushing last chunk
    # to consumer.
    stored = None
    for chunk in self.stream:
      assert chunk is not None
      if stored is not None:
        self._inspect_chunk(stored, is_last=False)
        try:
          yield stored
        except IOError as exc:
          raise MappingError('Failed to store an item in cache: %s' % exc)
      stored = chunk
    if stored is not None:
      self._inspect_chunk(stored, is_last=True)
      try:
        yield stored
      except IOError as exc:
        raise MappingError('Failed to store an item in cache: %s' % exc)
    else:
      # The stream yielded no chunks at all. Still run the final size check so
      # a download truncated to nothing doesn't pass as valid: an empty stream
      # is only correct when 0 (or an unknown number of) bytes were expected.
      self._inspect_chunk('', is_last=True)

  def _inspect_chunk(self, chunk, is_last):
    """Called for each fetched chunk before passing it to consumer.

    Raises IOError when the completed stream doesn't match |expected_size|.
    """
    self.current_size += len(chunk)
    if (is_last and (self.expected_size != UNKNOWN_FILE_SIZE) and
        (self.expected_size != self.current_size)):
      raise IOError('Incorrect file size: expected %d, got %d' % (
          self.expected_size, self.current_size))
768
769
class StorageApi(object):
  """Base class describing the low-level storage protocol.

  Concrete subclasses talk to a particular backend (Isolate Server, a mounted
  file share, ...) and must implement every method below.
  """

  def get_fetch_url(self, digest):
    """Maps a digest to a directly fetchable URL.

    Arguments:
      digest: hex digest of item to fetch.

    Returns:
      An URL string, or None when the backend provides no URL based access.
    """
    raise NotImplementedError()

  def fetch(self, digest):
    """Retrieves an item's content from the backend.

    Arguments:
      digest: hash digest of item to download.

    Yields:
      Chunks of the item content, as str objects.
    """
    raise NotImplementedError()

  def push(self, item, content):
    """Sends the content of an item to the backend.

    Arguments:
      item: Item object describing what is being uploaded.
      content: generator yielding str chunks to upload.

    Returns:
      None.
    """
    raise NotImplementedError()

  def contains(self, items):
    """Determines which of |items| are missing from the backend.

    Mutates |items|: an opaque implementation specific object is assigned to
    the push_state attribute of each entry missing in the datastore.

    Arguments:
      items: list of Item objects.

    Returns:
      The subset of |items| not present on the server, as a list of Item
      objects.
    """
    raise NotImplementedError()
820
821
class IsolateServer(StorageApi):
  """StorageApi implementation that downloads and uploads to Isolate Server.

  It uploads and downloads directly from Google Storage whenever appropriate.
  """

  class _PushState(object):
    """State needed to call .push(), to be stored in Item.push_state."""
    def __init__(self, upload_url, finalize_url):
      # URL to PUT the item's raw content to.
      self.upload_url = upload_url
      # URL to POST to once the PUT succeeded; may be empty/None when the
      # server requires no finalization for this item (see push()).
      self.finalize_url = finalize_url
      # True once the content PUT succeeded; lets a retried push skip the
      # (possibly large) re-upload and only redo finalization.
      self.uploaded = False
      # True once the finalize POST succeeded.
      self.finalized = False

  def __init__(self, base_url, namespace):
    super(IsolateServer, self).__init__()
    assert base_url.startswith('http'), base_url
    self.base_url = base_url.rstrip('/')
    self.namespace = namespace
    # Guards lazy initialization of |_server_caps| in _server_capabilities.
    self._lock = threading.Lock()
    # Memoized /handshake response; None until the first handshake.
    self._server_caps = None

  @staticmethod
  def _generate_handshake_request():
    """Returns a dict to be sent as handshake request body."""
    # TODO(vadimsh): Set 'pusher' and 'fetcher' according to intended usage.
    return {
      'client_app_version': __version__,
      'fetcher': True,
      'protocol_version': ISOLATE_PROTOCOL_VERSION,
      'pusher': True,
    }

  @staticmethod
  def _validate_handshake_response(caps):
    """Validates and normalizes handshake response."""
    logging.info('Protocol version: %s', caps['protocol_version'])
    logging.info('Server version: %s', caps['server_app_version'])
    if caps.get('error'):
      raise MappingError(caps['error'])
    if not caps['access_token']:
      raise ValueError('access_token is missing')
    return caps

  @property
  def _server_capabilities(self):
    """Performs handshake with the server if not yet done.

    Returns:
      Server capabilities dictionary as returned by /handshake endpoint.

    Raises:
      MappingError if server rejects the handshake.
    """
    # TODO(maruel): Make this request much earlier asynchronously while the
    # files are being enumerated.
    with self._lock:
      if self._server_caps is None:
        request_body = json.dumps(
            self._generate_handshake_request(), separators=(',', ':'))
        response = net.url_read(
            url=self.base_url + '/content-gs/handshake',
            data=request_body,
            content_type='application/json',
            method='POST')
        if response is None:
          raise MappingError('Failed to perform handshake.')
        try:
          caps = json.loads(response)
          if not isinstance(caps, dict):
            raise ValueError('Expecting JSON dict')
          self._server_caps = self._validate_handshake_response(caps)
        except (ValueError, KeyError, TypeError) as exc:
          # KeyError exception has very confusing str conversion: it's just a
          # missing key value and nothing else. So print exception class name
          # as well.
          raise MappingError('Invalid handshake response (%s): %s' % (
              exc.__class__.__name__, exc))
      return self._server_caps

  def get_fetch_url(self, digest):
    # See StorageApi.get_fetch_url for the contract.
    assert isinstance(digest, basestring)
    return '%s/content-gs/retrieve/%s/%s' % (
        self.base_url, self.namespace, digest)

  def fetch(self, digest):
    # See StorageApi.fetch for the contract.
    source_url = self.get_fetch_url(digest)
    logging.debug('download_file(%s)', source_url)

    # Because the app engine DB is only eventually consistent, retry 404 errors
    # because the file might just not be visible yet (even though it has been
    # uploaded).
    connection = net.url_open(
        source_url, retry_404=True, read_timeout=DOWNLOAD_READ_TIMEOUT)
    if not connection:
      raise IOError('Unable to open connection to %s' % source_url)
    return stream_read(connection, NET_IO_FILE_CHUNK)

  def push(self, item, content):
    # See StorageApi.push for the contract. contains() must have been called
    # first: it is what attaches the _PushState with the upload URLs.
    assert isinstance(item, Item)
    assert isinstance(item.push_state, IsolateServer._PushState)
    assert not item.push_state.finalized

    # TODO(vadimsh): Do not read from |content| generator when retrying push.
    # If |content| is indeed a generator, it can not be re-winded back
    # to the beginning of the stream. A retry will find it exhausted. A possible
    # solution is to wrap |content| generator with some sort of caching
    # restartable generator. It should be done alongside streaming support
    # implementation.

    # This push operation may be a retry after failed finalization call below,
    # no need to reupload contents in that case.
    if not item.push_state.uploaded:
      # A cheezy way to avoid memcpy of (possibly huge) file, until streaming
      # upload support is implemented.
      if isinstance(content, list) and len(content) == 1:
        content = content[0]
      else:
        content = ''.join(content)
      # PUT file to |upload_url|.
      response = net.url_read(
          url=item.push_state.upload_url,
          data=content,
          content_type='application/octet-stream',
          method='PUT')
      if response is None:
        raise IOError('Failed to upload a file %s to %s' % (
            item.digest, item.push_state.upload_url))
      item.push_state.uploaded = True
    else:
      logging.info(
          'A file %s already uploaded, retrying finalization only', item.digest)

    # Optionally notify the server that it's done.
    if item.push_state.finalize_url:
      # TODO(vadimsh): Calculate MD5 or CRC32C sum while uploading a file and
      # send it to isolated server. That way isolate server can verify that
      # the data safely reached Google Storage (GS provides MD5 and CRC32C of
      # stored files).
      response = net.url_read(
          url=item.push_state.finalize_url,
          data='',
          content_type='application/json',
          method='POST')
      if response is None:
        raise IOError('Failed to finalize an upload of %s' % item.digest)
      item.push_state.finalized = True

  def contains(self, items):
    # See StorageApi.contains for the contract.
    logging.info('Checking existence of %d files...', len(items))

    # Request body is a json encoded list of dicts.
    body = [
        {
          'h': item.digest,
          's': item.size,
          'i': int(item.is_isolated),
        } for item in items
    ]

    query_url = '%s/content-gs/pre-upload/%s?token=%s' % (
        self.base_url,
        self.namespace,
        urllib.quote(self._server_capabilities['access_token']))
    response_body = net.url_read(
        url=query_url,
        data=json.dumps(body, separators=(',', ':')),
        content_type='application/json',
        method='POST')
    if response_body is None:
      raise MappingError('Failed to execute /pre-upload query')

    # Response body is a list of push_urls (or null if file is already present).
    try:
      response = json.loads(response_body)
      if not isinstance(response, list):
        raise ValueError('Expecting response with json-encoded list')
      if len(response) != len(items):
        raise ValueError(
            'Incorrect number of items in the list, expected %d, '
            'but got %d' % (len(items), len(response)))
    except ValueError as err:
      raise MappingError(
          'Invalid response from server: %s, body is %s' % (err, response_body))

    # Pick Items that are missing, attach _PushState to them.
    missing_items = []
    for i, push_urls in enumerate(response):
      if push_urls:
        # Expected to be an (upload_url, finalize_url) pair.
        assert len(push_urls) == 2, str(push_urls)
        item = items[i]
        assert item.push_state is None
        item.push_state = IsolateServer._PushState(push_urls[0], push_urls[1])
        missing_items.append(item)
    logging.info('Queried %d files, %d cache hit',
        len(items), len(items) - len(missing_items))
    return missing_items
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00001019
1020
class FileSystem(StorageApi):
  """StorageApi backed by a plain directory on disk.

  Typically |base_path| lives on a locally mounted NFS/CIFS file server, so
  'remote' items are simply files inside that directory.
  """

  def __init__(self, base_path):
    super(FileSystem, self).__init__()
    self.base_path = base_path

  def _path(self, digest):
    """Returns the path of the file that stores the item with |digest|."""
    return os.path.join(self.base_path, digest)

  def get_fetch_url(self, digest):
    # No URL based access for plain files.
    return None

  def fetch(self, digest):
    assert isinstance(digest, basestring)
    return file_read(self._path(digest))

  def push(self, item, content):
    assert isinstance(item, Item)
    file_write(self._path(item.digest), content)

  def contains(self, items):
    missing = []
    for item in items:
      if not os.path.exists(self._path(item.digest)):
        missing.append(item)
    return missing
1048
1049
class LocalCache(object):
  """Store on the local machine for objects fetched via Storage.

  Instances may be hit from several threads at once and are responsible for
  guarding their internal state with a lock.
  """

  def __enter__(self):
    """Context manager interface."""
    return self

  def __exit__(self, _exc_type, _exc_value, _traceback):
    """Context manager interface."""
    # Exceptions are never swallowed.
    return False

  def cached_set(self):
    """Returns a set of all cached digests (always a new object)."""
    raise NotImplementedError()

  def touch(self, digest, size):
    """Checks an item for corruption and refreshes its LRU position.

    Arguments:
      digest: hash digest of item to check.
      size: expected size of this item.

    Returns:
      True if item is in cache and not corrupted.
    """
    raise NotImplementedError()

  def evict(self, digest):
    """Removes item from cache if it's there."""
    raise NotImplementedError()

  def read(self, digest):
    """Returns contents of the cached item as a single str."""
    raise NotImplementedError()

  def write(self, digest, content):
    """Consumes the |content| chunk generator and stores it in the cache."""
    raise NotImplementedError()

  def hardlink(self, digest, dest, file_mode):
    """Ensures file at |dest| has same content as cached |digest|.

    If file_mode is provided, it is used to set the executable bit if
    applicable.
    """
    raise NotImplementedError()
1100
1101
1102class MemoryCache(LocalCache):
1103 """LocalCache implementation that stores everything in memory."""
1104
1105 def __init__(self):
1106 super(MemoryCache, self).__init__()
1107 # Let's not assume dict is thread safe.
1108 self._lock = threading.Lock()
1109 self._contents = {}
1110
1111 def cached_set(self):
1112 with self._lock:
1113 return set(self._contents)
1114
1115 def touch(self, digest, size):
1116 with self._lock:
1117 return digest in self._contents
1118
1119 def evict(self, digest):
1120 with self._lock:
1121 self._contents.pop(digest, None)
1122
1123 def read(self, digest):
1124 with self._lock:
1125 return self._contents[digest]
1126
1127 def write(self, digest, content):
1128 # Assemble whole stream before taking the lock.
1129 data = ''.join(content)
1130 with self._lock:
1131 self._contents[digest] = data
1132
Marc-Antoine Ruelfb199cf2013-11-12 15:38:12 -05001133 def hardlink(self, digest, dest, file_mode):
1134 """Since data is kept in memory, there is no filenode to hardlink."""
vadimsh@chromium.org7b5dae32013-10-03 16:59:59 +00001135 file_write(dest, [self.read(digest)])
Marc-Antoine Ruelfb199cf2013-11-12 15:38:12 -05001136 if file_mode is not None:
1137 # Ignores all other bits.
1138 os.chmod(dest, file_mode & 0500)
vadimsh@chromium.org7b5dae32013-10-03 16:59:59 +00001139
1140
def get_hash_algo(_namespace):
  """Returns the hashing algorithm class to use for the given |namespace|.

  Every namespace currently hashes with SHA-1; per-namespace selection is not
  implemented yet.
  """
  # TODO(vadimsh): Implement this at some point.
  return hashlib.sha1
1145
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +00001146
def is_namespace_with_compression(namespace):
  """Returns True if given |namespace| stores compressed objects."""
  # Compressed namespaces are identified purely by their name suffix.
  return namespace.endswith('-gzip') or namespace.endswith('-deflate')
1150
1151
def get_storage_api(file_or_url, namespace):
  """Instantiates the right StorageApi implementation for |file_or_url|.

  An URL gets an IsolateServer client; anything else is treated as a local
  directory path served by FileSystem.
  """
  if not file_path.is_url(file_or_url):
    return FileSystem(file_or_url)
  return IsolateServer(file_or_url, namespace)
1158
1159
def get_storage(file_or_url, namespace):
  """Returns a Storage wrapping the right StorageApi for |file_or_url|.

  Compression is enabled when |namespace| requests it.
  """
  api = get_storage_api(file_or_url, namespace)
  return Storage(api, is_namespace_with_compression(namespace))
maruel@chromium.orgdedbf492013-09-12 20:42:11 +00001165
maruel@chromium.orgdedbf492013-09-12 20:42:11 +00001166
def expand_symlinks(indir, relfile):
  """Follows symlinks in |relfile|, but treating symlinks that point outside the
  build tree as if they were ordinary directories/files. Returns the final
  symlink-free target and a list of paths to symlinks encountered in the
  process.

  The rule about symlinks outside the build tree is for the benefit of the
  Chromium OS ebuild, which symlinks the output directory to an unrelated path
  in the chroot.

  Fails when a directory loop is detected, although in theory we could support
  that case.
  """
  # Remember whether the caller referred to a directory so the trailing
  # separator can be restored on the returned path.
  is_directory = relfile.endswith(os.path.sep)
  # |done| is the already-resolved prefix, |todo| the part still to resolve.
  done = indir
  todo = relfile.strip(os.path.sep)
  symlinks = []

  while todo:
    pre_symlink, symlink, post_symlink = file_path.split_at_symlink(
        done, todo)
    if not symlink:
      # No symlink left in |todo|: normalize the path case and finish.
      todo = file_path.fix_native_path_case(done, todo)
      done = os.path.join(done, todo)
      break
    symlink_path = os.path.join(done, pre_symlink, symlink)
    post_symlink = post_symlink.lstrip(os.path.sep)
    # readlink doesn't exist on Windows.
    # pylint: disable=E1101
    target = os.path.normpath(os.path.join(done, pre_symlink))
    symlink_target = os.readlink(symlink_path)
    if os.path.isabs(symlink_target):
      # Absolute path are considered a normal directories. The use case is
      # generally someone who puts the output directory on a separate drive.
      target = symlink_target
    else:
      # The symlink itself could be using the wrong path case.
      target = file_path.fix_native_path_case(target, symlink_target)

    if not os.path.exists(target):
      raise MappingError(
          'Symlink target doesn\'t exist: %s -> %s' % (symlink_path, target))
    target = file_path.get_native_path_case(target)
    if not file_path.path_starts_with(indir, target):
      # Symlink points outside the build tree: treat it as an ordinary
      # directory/file and keep walking past it without recording it.
      done = symlink_path
      todo = post_symlink
      continue
    if file_path.path_starts_with(target, symlink_path):
      raise MappingError(
          'Can\'t map recursive symlink reference %s -> %s' %
          (symlink_path, target))
    logging.info('Found symlink: %s -> %s', symlink_path, target)
    symlinks.append(os.path.relpath(symlink_path, indir))
    # Treat the common prefix of the old and new paths as done, and start
    # scanning again.
    target = target.split(os.path.sep)
    symlink_path = symlink_path.split(os.path.sep)
    prefix_length = 0
    for target_piece, symlink_path_piece in zip(target, symlink_path):
      if target_piece == symlink_path_piece:
        prefix_length += 1
      else:
        break
    done = os.path.sep.join(target[:prefix_length])
    todo = os.path.join(
        os.path.sep.join(target[prefix_length:]), post_symlink)

  relfile = os.path.relpath(done, indir)
  # Restore the trailing separator when the original |relfile| had one.
  relfile = relfile.rstrip(os.path.sep) + is_directory * os.path.sep
  return relfile, symlinks
1237
1238
def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
  """Expands a single input. It can result in multiple outputs.

  This function is recursive when relfile is a directory.

  Note: this code doesn't properly handle recursive symlink like one created
  with:
    ln -s .. foo

  Arguments:
    indir: root directory; all results are relative to it.
    relfile: relative path to expand; a trailing os.path.sep marks a directory.
    blacklist: optional callable returning True for relative paths to skip;
        only consulted for entries discovered while walking a directory.
    follow_symlinks: True to resolve symlinks through expand_symlinks().

  Returns:
    List of relative paths: symlinks encountered plus the actual files.

  Raises:
    MappingError on absolute, out-of-tree, missing or wrong-case paths.
  """
  if os.path.isabs(relfile):
    raise MappingError('Can\'t map absolute path %s' % relfile)

  infile = file_path.normpath(os.path.join(indir, relfile))
  if not infile.startswith(indir):
    raise MappingError('Can\'t map file %s outside %s' % (infile, indir))

  filepath = os.path.join(indir, relfile)
  native_filepath = file_path.get_native_path_case(filepath)
  if filepath != native_filepath:
    # Special case './'.
    if filepath != native_filepath + '.' + os.path.sep:
      # Give up enforcing strict path case on OSX. Really, it's that sad. The
      # case where it happens is very specific and hard to reproduce:
      # get_native_path_case(
      #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
      # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
      #
      # Note that this is really something deep in OSX because running
      # ls Foo.framework/Versions/A
      # will print out 'Resources', while file_path.get_native_path_case()
      # returns a lower case 'r'.
      #
      # So *something* is happening under the hood resulting in the command 'ls'
      # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree. We
      # have no idea why.
      if sys.platform != 'darwin':
        raise MappingError(
            'File path doesn\'t equal native file path\n%s != %s' %
            (filepath, native_filepath))

  symlinks = []
  if follow_symlinks:
    relfile, symlinks = expand_symlinks(indir, relfile)

  if relfile.endswith(os.path.sep):
    # A trailing separator means a directory: recurse into its content.
    if not os.path.isdir(infile):
      raise MappingError(
          '%s is not a directory but ends with "%s"' % (infile, os.path.sep))

    # Special case './'.
    if relfile.startswith('.' + os.path.sep):
      relfile = relfile[2:]
    outfiles = symlinks
    try:
      for filename in os.listdir(infile):
        inner_relfile = os.path.join(relfile, filename)
        if blacklist and blacklist(inner_relfile):
          continue
        if os.path.isdir(os.path.join(indir, inner_relfile)):
          inner_relfile += os.path.sep
        outfiles.extend(
            expand_directory_and_symlink(indir, inner_relfile, blacklist,
                                         follow_symlinks))
      return outfiles
    except OSError as e:
      raise MappingError(
          'Unable to iterate over directory %s.\n%s' % (infile, e))
  else:
    # Always add individual files even if they were blacklisted.
    if os.path.isdir(infile):
      raise MappingError(
          'Input directory %s must have a trailing slash' % infile)

    if not os.path.isfile(infile):
      raise MappingError('Input file %s doesn\'t exist' % infile)

    return symlinks + [relfile]
1316
1317
def process_input(filepath, prevdict, read_only, flavor, algo):
  """Processes an input file, a dependency, and return meta data about it.

  Behaviors:
  - Retrieves the file mode, file size, file timestamp, file link
    destination if it is a file link and calcultate the SHA-1 of the file's
    content if the path points to a file and not a symlink.

  Arguments:
    filepath: File to act on.
    prevdict: the previous dictionary. It is used to retrieve the cached sha-1
              to skip recalculating the hash. Optional.
    read_only: If True, the file mode is manipulated. In practice, only save
               one of 4 modes: 0755 (rwx), 0644 (rw), 0555 (rx), 0444 (r). On
               windows, mode is not set since all files are 'executable' by
               default.
    flavor: One isolated flavor, like 'linux', 'mac' or 'win'.
    algo: Hashing algorithm used.

  Returns:
    The necessary data to create a entry in the 'files' section of an .isolated
    file.
  """
  out = {}
  # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
  # if prevdict.get('T') == True:
  #   # The file's content is ignored. Skip the time and hard code mode.
  #   if get_flavor() != 'win':
  #     out['m'] = stat.S_IRUSR | stat.S_IRGRP
  #   out['s'] = 0
  #   out['h'] = algo().hexdigest()
  #   out['T'] = True
  #   return out

  # Always check the file stat and check if it is a link. The timestamp is used
  # to know if the file's content/symlink destination should be looked into.
  # E.g. only reuse from prevdict if the timestamp hasn't changed.
  # There is the risk of the file's timestamp being reset to its last value
  # manually while its content changed. We don't protect against that use case.
  try:
    # lstat (not stat) so symlinks are described, not followed.
    filestats = os.lstat(filepath)
  except OSError:
    # The file is not present.
    raise MappingError('%s is missing' % filepath)
  is_link = stat.S_ISLNK(filestats.st_mode)

  if flavor != 'win':
    # Ignore file mode on Windows since it's not really useful there.
    filemode = stat.S_IMODE(filestats.st_mode)
    # Remove write access for group and all access to 'others'.
    filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
    if read_only:
      filemode &= ~stat.S_IWUSR
    # Keep group execute permission in sync with the owner's, so only the 4
    # modes documented above can be produced.
    if filemode & stat.S_IXUSR:
      filemode |= stat.S_IXGRP
    else:
      filemode &= ~stat.S_IXGRP
    if not is_link:
      out['m'] = filemode

  # Used to skip recalculating the hash or link destination. Use the most recent
  # update time.
  # TODO(maruel): Save it in the .state file instead of .isolated so the
  # .isolated file is deterministic.
  out['t'] = int(round(filestats.st_mtime))

  if not is_link:
    out['s'] = filestats.st_size
    # If the timestamp wasn't updated and the file size is still the same, carry
    # on the sha-1.
    if (prevdict.get('t') == out['t'] and
        prevdict.get('s') == out['s']):
      # Reuse the previous hash if available.
      out['h'] = prevdict.get('h')
    if not out.get('h'):
      out['h'] = hash_file(filepath, algo)
  else:
    # If the timestamp wasn't updated, carry on the link destination.
    if prevdict.get('t') == out['t']:
      # Reuse the previous link destination if available.
      out['l'] = prevdict.get('l')
    if out.get('l') is None:
      # The link could be in an incorrect path case. In practice, this only
      # happen on OSX on case insensitive HFS.
      # TODO(maruel): It'd be better if it was only done once, in
      # expand_directory_and_symlink(), so it would not be necessary to do again
      # here.
      symlink_value = os.readlink(filepath)  # pylint: disable=E1101
      filedir = file_path.get_native_path_case(os.path.dirname(filepath))
      native_dest = file_path.fix_native_path_case(filedir, symlink_value)
      out['l'] = os.path.relpath(native_dest, filedir)
  return out
1410
1411
def save_isolated(isolated, data):
  """Serializes |data| into the .isolated file at path |isolated|.

  Note: this reference implementation never splits content into child .isolated
  files, so the returned list of included children is always empty.

  Returns the list of child isolated files that are included by |isolated|.
  """
  # Round-trip |data| through load_isolated() to validate it before writing.
  hashing_algo = SUPPORTED_ALGOS[data['algo']]
  load_isolated(json.dumps(data), data.get('flavor'), hashing_algo)
  tools.write_json(isolated, data, True)
  return []
1425
1426
1427
def upload_tree(base_url, indir, infiles, namespace):
  """Uploads the given tree to the given url.

  Arguments:
    base_url: The base url, it is assume that |base_url|/has/ can be used to
              query if an element was already uploaded, and |base_url|/store/
              can be used to upload a new element.
    indir: Root directory the infiles are based in.
    infiles: dict of files to upload from |indir| to |base_url|.
    namespace: The namespace to use on the server.

  Returns:
    0 on success; failures are reported through exceptions.
  """
  storage = get_storage(base_url, namespace)
  with storage:
    storage.upload_tree(indir, infiles)
  return 0
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00001442
1443
def load_isolated(content, os_flavor, algo):
  """Verifies the .isolated file is valid and loads this object with the json
  data.

  Arguments:
  - content: raw serialized content to load.
  - os_flavor: OS to load this file on. Optional; when set, the file's 'os'
    key must match it.
  - algo: hashlib algorithm class. Used to confirm the algorithm matches the
    algorithm used on the Isolate Server. May be None, in which case the
    algorithm named by the file itself (default 'sha-1') is used.

  Returns:
    The decoded dict, with file paths and symlink targets converted to the
    native os.path.sep.

  Raises:
    ConfigError if the content is not valid .isolated data.
  """
  try:
    data = json.loads(content)
  except ValueError:
    raise ConfigError('Failed to parse: %s...' % content[:100])

  if not isinstance(data, dict):
    raise ConfigError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  # TODO(maruel): Drop support for unversioned .isolated file around Jan 2014.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise ConfigError('Expected string, got %r' % value)
  if not re.match(r'^(\d+)\.(\d+)$', value):
    raise ConfigError('Expected a compatible version, got %r' % value)
  # Only major version 1 is understood by this parser.
  if value.split('.', 1)[0] != '1':
    raise ConfigError('Expected compatible \'1.x\' version, got %r' % value)

  if algo is None:
    # TODO(maruel): Remove the default around Jan 2014.
    # Default the algorithm used in the .isolated file itself, falls back to
    # 'sha-1' if unspecified.
    algo = SUPPORTED_ALGOS_REVERSE[data.get('algo', 'sha-1')]

  # Validate every top-level key and its value.
  for key, value in data.iteritems():
    if key == 'algo':
      # Must name a supported algorithm and agree with the |algo| in use.
      if not isinstance(value, basestring):
        raise ConfigError('Expected string, got %r' % value)
      if value not in SUPPORTED_ALGOS:
        raise ConfigError(
            'Expected one of \'%s\', got %r' %
            (', '.join(sorted(SUPPORTED_ALGOS)), value))
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise ConfigError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      # Non-empty list of strings.
      if not isinstance(value, list):
        raise ConfigError('Expected list, got %r' % value)
      if not value:
        raise ConfigError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise ConfigError('Expected string, got %r' % subvalue)

    elif key == 'files':
      # Map of relative path -> properties dict. Per-file keys:
      #   'l': symlink target, 'm': file mode, 'h': content digest, 's': size.
      if not isinstance(value, dict):
        raise ConfigError('Expected dict, got %r' % value)
      for subkey, subvalue in value.iteritems():
        if not isinstance(subkey, basestring):
          raise ConfigError('Expected string, got %r' % subkey)
        if not isinstance(subvalue, dict):
          raise ConfigError('Expected dict, got %r' % subvalue)
        for subsubkey, subsubvalue in subvalue.iteritems():
          if subsubkey == 'l':
            # 'l' is the symlink target path.
            if not isinstance(subsubvalue, basestring):
              raise ConfigError('Expected string, got %r' % subsubvalue)
          elif subsubkey == 'm':
            # 'm' is the file mode (permission bits).
            if not isinstance(subsubvalue, int):
              raise ConfigError('Expected int, got %r' % subsubvalue)
          elif subsubkey == 'h':
            # 'h' is the content digest. NOTE: the error message says 'sha-1'
            # even when another supported algorithm is configured.
            if not is_valid_hash(subsubvalue, algo):
              raise ConfigError('Expected sha-1, got %r' % subsubvalue)
          elif subsubkey == 's':
            # 's' is the file size in bytes; may exceed the int range.
            if not isinstance(subsubvalue, (int, long)):
              raise ConfigError('Expected int or long, got %r' % subsubvalue)
          else:
            raise ConfigError('Unknown subsubkey %s' % subsubkey)
        # Cross-key constraints: exactly one of digest ('h') or link ('l');
        # digest entries must carry a size ('s'); links carry neither size
        # nor mode.
        if bool('h' in subvalue) == bool('l' in subvalue):
          raise ConfigError(
              'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
              subvalue)
        if bool('h' in subvalue) != bool('s' in subvalue):
          raise ConfigError(
              'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
              subvalue)
        if bool('s' in subvalue) == bool('l' in subvalue):
          raise ConfigError(
              'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
              subvalue)
        if bool('l' in subvalue) and bool('m' in subvalue):
          raise ConfigError(
              'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
              subvalue)

    elif key == 'includes':
      # Non-empty list of digests of other .isolated files.
      if not isinstance(value, list):
        raise ConfigError('Expected list, got %r' % value)
      if not value:
        raise ConfigError('Expected non-empty includes list')
      for subvalue in value:
        if not is_valid_hash(subvalue, algo):
          raise ConfigError('Expected sha-1, got %r' % subvalue)

    elif key == 'read_only':
      if not isinstance(value, bool):
        raise ConfigError('Expected bool, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise ConfigError('Expected string, got %r' % value)

    elif key == 'os':
      # Only enforced when the caller requested a specific OS flavor.
      if os_flavor and value != os_flavor:
        raise ConfigError(
            'Expected \'os\' to be \'%s\' but got \'%s\'' %
            (os_flavor, value))

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise ConfigError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are
  # always in the native path format, someone could want to download an
  # .isolated tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data
1584
1585
class IsolatedFile(object):
  """A single .isolated file in the include tree, parsed as it is fetched."""
  def __init__(self, obj_hash, algo):
    """|obj_hash| is really the sha-1 of the file."""
    logging.debug('IsolatedFile(%s)' % obj_hash)
    self.obj_hash = obj_hash
    self.algo = algo
    # Set once all the left-side of the tree is parsed. 'Tree' here means the
    # .isolate and all the .isolated files recursively included by it with
    # 'includes' key. The order of each sha-1 in 'includes', each representing
    # a .isolated file in the hash table, is important, as the later ones are
    # not processed until the firsts are retrieved and read.
    self.can_fetch = False

    # Decoded json content of the file.
    self.data = {}
    # One IsolatedFile instance per digest listed in self.data['includes'].
    self.children = []

    # Set once the .isolated file is loaded.
    self._is_parsed = False
    # Set once every file of this node has been requested.
    self.files_fetched = False

  def load(self, os_flavor, content):
    """Verifies the .isolated file is valid and loads this object with the
    json data.
    """
    logging.debug('IsolatedFile.load(%s)' % self.obj_hash)
    assert not self._is_parsed
    self.data = load_isolated(content, os_flavor, self.algo)
    self.children = [
        IsolatedFile(child_hash, self.algo)
        for child_hash in self.data.get('includes', [])
    ]
    self._is_parsed = True

  def fetch_files(self, fetch_queue, files):
    """Adds this node's files to |files| when not already present, requesting
    their content preemptively.

    Note that |files| is modified in place.
    """
    assert self.can_fetch
    if not self._is_parsed or self.files_fetched:
      return
    logging.debug('fetch_files(%s)' % self.obj_hash)
    for relpath, props in self.data.get('files', {}).iteritems():
      if relpath in files:
        # The root isolated has priority on the files being mapped; an
        # overridden file must not be fetched.
        continue
      files[relpath] = props
      if 'h' in props:
        # Preemptively request the file content.
        logging.debug('fetching %s' % relpath)
        fetch_queue.add(WorkerPool.MED, props['h'], props['s'])
    self.files_fetched = True
1643
1644
class Settings(object):
  """Results of a completely parsed .isolated file."""
  def __init__(self):
    # Command to run, as a list of arguments.
    self.command = []
    # Map of relative path -> properties dict, merged across includes.
    self.files = {}
    # Whether the mapped tree should be read-only; resolved to a bool by
    # load().
    self.read_only = None
    # Directory to run the command from, relative to the output directory;
    # resolved to a string by load().
    self.relative_cwd = None
    # The main .isolated file, a IsolatedFile instance.
    self.root = None

  def load(self, fetch_queue, root_isolated_hash, os_flavor, algo):
    """Loads the .isolated and all the included .isolated asynchronously.

    It enables support for "included" .isolated files. They are processed in
    strict order but fetched asynchronously from the cache. This is important so
    that a file in an included .isolated file that is overridden by an embedding
    .isolated file is not fetched needlessly. The includes are fetched in one
    pass and the files are fetched as soon as all the ones on the left-side
    of the tree were fetched.

    The prioritization is very important here for nested .isolated files.
    'includes' have the highest priority and the algorithm is optimized for both
    deep and wide trees. A deep one is a long link of .isolated files referenced
    one at a time by one item in 'includes'. A wide one has a large number of
    'includes' in a single .isolated file. 'left' is defined as an included
    .isolated file earlier in the 'includes' list. So the order of the elements
    in 'includes' is important.
    """
    self.root = IsolatedFile(root_isolated_hash, algo)

    # Isolated files being retrieved now: hash -> IsolatedFile instance.
    pending = {}
    # Set of hashes of already retrieved items to refuse recursive includes.
    seen = set()

    def retrieve(isolated_file):
      # Requests one .isolated file at HIGH priority and tracks it in
      # |pending| until its content arrives.
      h = isolated_file.obj_hash
      if h in seen:
        raise ConfigError('IsolatedFile %s is retrieved recursively' % h)
      assert h not in pending
      seen.add(h)
      pending[h] = isolated_file
      fetch_queue.add(WorkerPool.HIGH, h)

    retrieve(self.root)

    while pending:
      # Block until any of the pending .isolated files is available, then
      # parse it and queue its own includes.
      item_hash = fetch_queue.wait(pending)
      item = pending.pop(item_hash)
      item.load(os_flavor, fetch_queue.cache.read(item_hash))
      if item_hash == root_isolated_hash:
        # It's the root item.
        item.can_fetch = True

      for new_child in item.children:
        retrieve(new_child)

      # Traverse the whole tree to see if files can now be fetched.
      self._traverse_tree(fetch_queue, self.root)

    def check(n):
      return all(check(x) for x in n.children) and n.files_fetched
    assert check(self.root)

    # Normalize unset values to their defaults.
    self.relative_cwd = self.relative_cwd or ''
    self.read_only = self.read_only or False

  def _traverse_tree(self, fetch_queue, node):
    """Fetches files of every fetchable node and unlocks, per node, only the
    first not-yet-fetchable child, enforcing strict left-to-right 'includes'
    order.
    """
    if node.can_fetch:
      if not node.files_fetched:
        self._update_self(fetch_queue, node)
      will_break = False
      for i in node.children:
        if not i.can_fetch:
          if will_break:
            break
          # Automatically mark the first one as fetchable.
          i.can_fetch = True
          will_break = True
        self._traverse_tree(fetch_queue, i)

  def _update_self(self, fetch_queue, node):
    """Fetches |node|'s files and merges its properties into this Settings.

    The first node (left-most in the tree) to define 'command', 'read_only' or
    'relative_cwd' wins.
    """
    node.fetch_files(fetch_queue, self.files)
    # Grabs properties.
    if not self.command and node.data.get('command'):
      # Ensure paths are correctly separated on windows.
      self.command = node.data['command']
      if self.command:
        self.command[0] = self.command[0].replace('/', os.path.sep)
        self.command = tools.fix_python_path(self.command)
    if self.read_only is None and node.data.get('read_only') is not None:
      self.read_only = node.data['read_only']
    if (self.relative_cwd is None and
        node.data.get('relative_cwd') is not None):
      self.relative_cwd = node.data['relative_cwd']
1740
1741
def fetch_isolated(
    isolated_hash, storage, cache, algo, outdir, os_flavor, require_command):
  """Aggressively downloads the .isolated file(s), then download all the files.

  Arguments:
    isolated_hash: hash of the root *.isolated file.
    storage: Storage class that communicates with isolate storage.
    cache: LocalCache class that knows how to store and map files locally.
    algo: hash algorithm to use.
    outdir: Output directory to map file tree to.
    os_flavor: OS flavor to choose when reading sections of *.isolated file.
    require_command: Ensure *.isolated specifies a command to run.

  Returns:
    Settings object that holds details about loaded *.isolated file.

  Raises:
    ConfigError: if the .isolated data is invalid or no command is specified
        while |require_command| is True.
    MappingError: if the cache evicted items before they could be mapped.
  """
  with cache:
    fetch_queue = FetchQueue(storage, cache)
    settings = Settings()

    with tools.Profiler('GetIsolateds'):
      # Optionally support local files by manually adding them to cache.
      if not is_valid_hash(isolated_hash, algo):
        isolated_hash = fetch_queue.inject_local_file(isolated_hash, algo)

      # Load all *.isolated and start loading rest of the files.
      settings.load(fetch_queue, isolated_hash, os_flavor, algo)
      if require_command and not settings.command:
        # TODO(vadimsh): All fetch operations are already enqueue and there's no
        # easy way to cancel them.
        raise ConfigError('No command to run')

    with tools.Profiler('GetRest'):
      # Create file system hierarchy.
      if not os.path.isdir(outdir):
        os.makedirs(outdir)
      create_directories(outdir, settings.files)
      create_symlinks(outdir, settings.files.iteritems())

      # Ensure working directory exists.
      cwd = os.path.normpath(os.path.join(outdir, settings.relative_cwd))
      if not os.path.isdir(cwd):
        os.makedirs(cwd)

      # Multimap: digest -> list of pairs (path, props). Several paths may
      # share the same content digest.
      remaining = {}
      for filepath, props in settings.files.iteritems():
        if 'h' in props:
          remaining.setdefault(props['h'], []).append((filepath, props))

      # Now block on the remaining files to be downloaded and mapped.
      logging.info('Retrieving remaining files (%d of them)...',
                   fetch_queue.pending_count)
      last_update = time.time()
      with threading_utils.DeadlockDetector(DEADLOCK_TIMEOUT) as detector:
        while remaining:
          detector.ping()

          # Wait for any item to finish fetching to cache.
          digest = fetch_queue.wait(remaining)

          # Link corresponding files to a fetched item in cache.
          for filepath, props in remaining.pop(digest):
            cache.hardlink(
                digest, os.path.join(outdir, filepath), props.get('m'))

          # Report progress.
          duration = time.time() - last_update
          if duration > DELAY_BETWEEN_UPDATES_IN_SECS:
            msg = '%d files remaining...' % len(remaining)
            print msg
            logging.info(msg)
            last_update = time.time()

    # Cache could evict some items we just tried to fetch, it's a fatal error.
    if not fetch_queue.verify_all_cached():
      raise MappingError('Cache is too small to hold all requested files')
  return settings
1820
1821
def directory_to_metadata(root, algo, blacklist):
  """Returns the FileItem list and .isolated metadata for a directory.

  Arguments:
    root: directory to scan.
    algo: hashlib algorithm class used to hash file content.
    blacklist: filter to skip unwanted entries.
  """
  root = file_path.get_native_path_case(root)
  metadata = {}
  for relpath in expand_directory_and_symlink(root, './', blacklist, True):
    meta = process_input(
        os.path.join(root, relpath), {}, False, sys.platform, algo)
    # The timestamp is local-only information; keep it out of the metadata.
    meta.pop('t')
    metadata[relpath] = meta

  items = []
  for relpath, meta in metadata.iteritems():
    # Entries without a digest (e.g. symlinks) have no content to upload.
    if 'h' not in meta:
      continue
    items.append(
        FileItem(
            path=os.path.join(root, relpath),
            digest=meta['h'],
            size=meta['s'],
            is_isolated=relpath.endswith('.isolated')))
  return items, metadata
1842
1843
def archive(storage, algo, files, blacklist):
  """Stores every entry and returns the relevant data.

  Arguments:
    storage: Storage instance used to upload the items.
    algo: hashlib algorithm class used to hash content.
    files: list of unicode paths; each may be a file or a directory. For a
        directory, a .isolated file describing it is generated and uploaded
        along with its content.
    blacklist: filter to skip unwanted entries when walking directories.

  Returns:
    List of (digest, path) tuples, one per input entry.

  Raises:
    Error: on duplicate entries, unknown paths or OS-level failures.
  """
  assert all(isinstance(i, unicode) for i in files), files
  if len(files) != len(set(map(os.path.abspath, files))):
    raise Error('Duplicate entries found.')

  results = []
  # The temporary directory is only created as needed.
  tempdir = None
  try:
    # TODO(maruel): Yield the files to a worker thread.
    items_to_upload = []
    for f in files:
      try:
        filepath = os.path.abspath(f)
        if os.path.isdir(filepath):
          # Uploading a whole directory.
          items, metadata = directory_to_metadata(filepath, algo, blacklist)

          # Create the .isolated file.
          if not tempdir:
            tempdir = tempfile.mkdtemp(prefix='isolateserver')
          handle, isolated = tempfile.mkstemp(dir=tempdir, suffix='.isolated')
          os.close(handle)
          data = {
              'algo': SUPPORTED_ALGOS_REVERSE[algo],
              'files': metadata,
              'version': '1.0',
          }
          save_isolated(isolated, data)
          h = hash_file(isolated, algo)
          items_to_upload.extend(items)
          items_to_upload.append(
              FileItem(
                  path=isolated,
                  digest=h,
                  size=os.stat(isolated).st_size,
                  is_isolated=True))
          results.append((h, f))

        elif os.path.isfile(filepath):
          h = hash_file(filepath, algo)
          items_to_upload.append(
              FileItem(
                  path=filepath,
                  digest=h,
                  size=os.stat(filepath).st_size,
                  is_isolated=f.endswith('.isolated')))
          results.append((h, f))
        else:
          raise Error('%s is neither a file or directory.' % f)
      except OSError as e:
        # Include the underlying OS error so the failure can be diagnosed,
        # instead of silently discarding it.
        raise Error('Failed to process %s: %s' % (f, e))
    # Technically we would care about the uploaded files but we don't much in
    # practice.
    _uploaded_files = storage.upload_items(items_to_upload)
    return results
  finally:
    if tempdir:
      shutil.rmtree(tempdir)
1904
1905
@subcommand.usage('<file1..fileN> or - to read from stdin')
def CMDarchive(parser, args):
  """Archives data to the server.

  If a directory is specified, a .isolated file is created the whole directory
  is uploaded. Then this .isolated file can be included in another one to run
  commands.

  The commands output each file that was processed with its content hash. For
  directories, the .isolated generated for the directory is listed as the
  directory entry itself.
  """
  parser.add_option(
      '--blacklist',
      action='append', default=list(DEFAULT_BLACKLIST),
      help='List of regexp to use as blacklist filter when uploading '
           'directories')
  options, files = parser.parse_args(args)

  if files == ['-']:
    # readlines() keeps the trailing line separator, which would produce
    # paths that do not exist on disk; strip it and ignore blank lines.
    files = [f.rstrip('\n\r') for f in sys.stdin.readlines()]
    files = [f for f in files if f]

  if not files:
    parser.error('Nothing to upload')

  files = [f.decode('utf-8') for f in files]
  algo = get_hash_algo(options.namespace)
  blacklist = tools.gen_blacklist(options.blacklist)
  try:
    with get_storage(options.isolate_server, options.namespace) as storage:
      results = archive(storage, algo, files, blacklist)
  except Error as e:
    parser.error(e.args[0])
  print('\n'.join('%s %s' % (r[0], r[1]) for r in results))
  return 0
maruel@chromium.orgfb78d432013-08-28 21:22:40 +00001941
1942
def CMDdownload(parser, args):
  """Download data from the server.

  It can either download individual files or a complete tree from a .isolated
  file.
  """
  parser.add_option(
      '-i', '--isolated', metavar='HASH',
      help='hash of an isolated file, .isolated file content is discarded, use '
           '--file if you need it')
  parser.add_option(
      '-f', '--file', metavar='HASH DEST', default=[], action='append', nargs=2,
      help='hash and destination of a file, can be used multiple times')
  parser.add_option(
      '-t', '--target', metavar='DIR', default=os.getcwd(),
      help='destination directory')
  options, args = parser.parse_args(args)
  if args:
    parser.error('Unsupported arguments: %s' % args)
  # The two modes are mutually exclusive and exactly one is required.
  if bool(options.isolated) == bool(options.file):
    parser.error('Use one of --isolated or --file, and only one.')

  options.target = os.path.abspath(options.target)
  storage = get_storage(options.isolate_server, options.namespace)
  cache = MemoryCache()
  algo = get_hash_algo(options.namespace)

  # Fetching individual files.
  if options.file:
    channel = threading_utils.TaskChannel()
    pending = {}
    for digest, dest in options.file:
      pending[digest] = dest
      storage.async_fetch(
          channel,
          WorkerPool.MED,
          digest,
          UNKNOWN_FILE_SIZE,
          functools.partial(file_write, os.path.join(options.target, dest)))
    while pending:
      fetched = channel.pull()
      dest = pending.pop(fetched)
      logging.info('%s: %s', fetched, dest)

  # Fetching whole isolated tree.
  if options.isolated:
    settings = fetch_isolated(
        isolated_hash=options.isolated,
        storage=storage,
        cache=cache,
        algo=algo,
        outdir=options.target,
        os_flavor=None,
        require_command=False)
    # |options.target| was made absolute above, so |rel| is already the
    # absolute working directory; do not join it with the target a second
    # time.
    rel = os.path.join(options.target, settings.relative_cwd)
    print('To run this test please run from the directory %s:' % rel)
    print(' ' + ' '.join(settings.command))

  return 0
2003
2004
class OptionParserIsolateServer(tools.OptionParserWithLogging):
  """Adds the --isolate-server and --namespace options to the base parser."""

  def __init__(self, **kwargs):
    prog = os.path.basename(sys.modules[__name__].__file__)
    tools.OptionParserWithLogging.__init__(
        self, version=__version__, prog=prog, **kwargs)
    self.add_option(
        '-I', '--isolate-server',
        metavar='URL', default='',
        help='Isolate server to use')
    self.add_option(
        '--namespace', default='default-gzip',
        help='The namespace to use on the server, default: %default')

  def parse_args(self, *args, **kwargs):
    """Parses the command line and enforces that --isolate-server is set."""
    options, args = tools.OptionParserWithLogging.parse_args(
        self, *args, **kwargs)
    # Strip any trailing '/' so URL paths can be appended consistently.
    options.isolate_server = options.isolate_server.rstrip('/')
    if not options.isolate_server:
      self.error('--isolate-server is required.')
    return options, args
2027
2028
def main(args):
  """Dispatches execution to the requested subcommand.

  Returns the subcommand's exit code, or 1 on unhandled failure.
  """
  dispatcher = subcommand.CommandDispatcher(__name__)
  try:
    return dispatcher.execute(OptionParserIsolateServer(), args)
  except Exception as exc:
    # Report any unexpected failure and map it to a non-zero exit code
    # instead of dumping a raw traceback.
    tools.report_error(exc)
    return 1
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00002036
2037
if __name__ == '__main__':
  # Normalize the terminal environment before dispatching: fix the
  # stdout/stderr encoding, disable output buffering so progress messages
  # appear immediately, and initialize colorama for colored output.
  fix_encoding.fix_encoding()
  tools.disable_buffering()
  colorama.init()
  sys.exit(main(sys.argv[1:]))