#!/usr/bin/env python
# Copyright 2013 The Swarming Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0 that
# can be found in the LICENSE file.

"""Archives a set of files or directories to a server."""

__version__ = '0.3.2'

import functools
import hashlib
import json
import logging
import os
import re
import shutil
import stat
import sys
import tempfile
import threading
import time
import urllib
import urlparse
import zlib

from third_party import colorama
from third_party.depot_tools import fix_encoding
from third_party.depot_tools import subcommand

from utils import file_path
from utils import net
from utils import threading_utils
from utils import tools

import auth


# Version of isolate protocol passed to the server in /handshake request.
ISOLATE_PROTOCOL_VERSION = '1.0'
# Version stored and expected in .isolated files.
ISOLATED_FILE_VERSION = '1.4'


# The number of files to check the isolate server per /pre-upload query.
# All files are sorted by likelihood of a change in the file content
# (currently file size is used to estimate this: the larger the file, the more
# likely it has changed). Then the first ITEMS_PER_CONTAINS_QUERIES[0] files
# are taken and sent to '/pre-upload', then the next
# ITEMS_PER_CONTAINS_QUERIES[1], and so on. The numbers here are a trade-off;
# the more per request, the lower the effect of HTTP round trip latency and
# TCP-level chattiness. On the other hand, larger values cause longer lookups,
# increasing the initial latency to start uploading, which is especially an
# issue for large files. This value is optimized for the "few thousand files
# to look up with a minimal number of large files missing" case.
ITEMS_PER_CONTAINS_QUERIES = [20, 20, 50, 50, 50, 100]


# A list of already compressed extension types that should not receive any
# compression before being uploaded.
ALREADY_COMPRESSED_TYPES = [
    '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
    'wav', 'zip'
]


# The file size to be used when we don't know the correct file size,
# generally used for .isolated files.
UNKNOWN_FILE_SIZE = None


# Chunk size to use when doing disk I/O.
DISK_FILE_CHUNK = 1024 * 1024

# Chunk size to use when reading from network stream.
NET_IO_FILE_CHUNK = 16 * 1024


# Read timeout in seconds for downloads from isolate storage. If there's no
# response from the server within this timeout, the whole download is aborted.
DOWNLOAD_READ_TIMEOUT = 60

# Maximum expected delay (in seconds) between successive file fetches
# in run_tha_test. If it takes longer than that, a deadlock might be happening
# and all stack frames for all threads are dumped to log.
DEADLOCK_TIMEOUT = 5 * 60


# The delay (in seconds) to wait between logging statements when retrieving
# the required files. This is intended to let the user (or buildbot) know that
# the program is still running.
DELAY_BETWEEN_UPDATES_IN_SECS = 30


# Sadly, hashlib uses 'sha1' instead of the standard 'sha-1' so explicitly
# specify the names here.
SUPPORTED_ALGOS = {
    'md5': hashlib.md5,
    'sha-1': hashlib.sha1,
    'sha-512': hashlib.sha512,
}


# Used for serialization.
SUPPORTED_ALGOS_REVERSE = dict((v, k) for k, v in SUPPORTED_ALGOS.iteritems())


DEFAULT_BLACKLIST = (
    # Temporary vim or python files.
    r'^.+\.(?:pyc|swp)$',
    # .git or .svn directory.
    r'^(?:.+' + re.escape(os.path.sep) + r'|)\.(?:git|svn)$',
)


# Chromium-specific.
DEFAULT_BLACKLIST += (
    r'^.+\.(?:run_test_cases)$',
    r'^(?:.+' + re.escape(os.path.sep) + r'|)testserver\.log$',
)

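
# Illustrative sketch (not part of the original file): how DEFAULT_BLACKLIST is
# meant to be applied, assuming paths are relative to the scanned directory.
def _example_is_blacklisted(relpath):
  """True if |relpath| matches any of the default blacklist regexps."""
  return any(re.match(pattern, relpath) for pattern in DEFAULT_BLACKLIST)
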

class Error(Exception):
  """Generic runtime error."""
  pass


class ConfigError(ValueError):
  """Generic failure to load a .isolated file."""
  pass


class MappingError(OSError):
  """Failed to recreate the tree."""
  pass


def is_valid_hash(value, algo):
  """Returns True if |value| is a valid hash for the given algorithm."""
  size = 2 * algo().digest_size
  return bool(re.match(r'^[a-fA-F0-9]{%d}$' % size, value))


def hash_file(filepath, algo):
  """Calculates the hash of a file without reading it all in memory at once.

  |algo| should be one of the hashlib hashing algorithms.
  """
  digest = algo()
  with open(filepath, 'rb') as f:
    while True:
      chunk = f.read(DISK_FILE_CHUNK)
      if not chunk:
        break
      digest.update(chunk)
  return digest.hexdigest()


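# Illustrative sketch (not part of the original file): hash_file() pairs with
# is_valid_hash() when checking digests. The path below is hypothetical.
def _example_hash_file(filepath='/tmp/example.bin'):
  """Hashes a file with SHA-1 and validates the resulting digest string."""
  digest = hash_file(filepath, hashlib.sha1)
  assert is_valid_hash(digest, hashlib.sha1)
  return digest

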
def stream_read(stream, chunk_size):
  """Reads chunks from |stream| and yields them."""
  while True:
    data = stream.read(chunk_size)
    if not data:
      break
    yield data


def file_read(filepath, chunk_size=DISK_FILE_CHUNK, offset=0):
  """Yields file content in chunks of |chunk_size| starting from |offset|."""
  with open(filepath, 'rb') as f:
    if offset:
      f.seek(offset)
    while True:
      data = f.read(chunk_size)
      if not data:
        break
      yield data


def file_write(filepath, content_generator):
  """Writes file content as generated by content_generator.

  Creates the intermediary directory as needed.

  Returns the number of bytes written.

  Meant to be mocked out in unit tests.
  """
  filedir = os.path.dirname(filepath)
  if not os.path.isdir(filedir):
    os.makedirs(filedir)
  total = 0
  with open(filepath, 'wb') as f:
    for d in content_generator:
      total += len(d)
      f.write(d)
  return total


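# Illustrative sketch (not part of the original file): file_write() consumes
# any iterable of chunks, so it composes directly with file_read(). The paths
# below are hypothetical.
def _example_copy_file(src='/tmp/src.bin', dst='/tmp/dst.bin'):
  """Copies a file in DISK_FILE_CHUNK-sized chunks; returns bytes written."""
  return file_write(dst, file_read(src))

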
def zip_compress(content_generator, level=7):
  """Reads chunks from |content_generator| and yields zip compressed chunks."""
  compressor = zlib.compressobj(level)
  for chunk in content_generator:
    compressed = compressor.compress(chunk)
    if compressed:
      yield compressed
  tail = compressor.flush(zlib.Z_FINISH)
  if tail:
    yield tail


def zip_decompress(content_generator, chunk_size=DISK_FILE_CHUNK):
  """Reads zipped data from |content_generator| and yields decompressed data.

  Decompresses data in small chunks (no larger than |chunk_size|) so that a
  zip bomb doesn't cause zlib to preallocate a huge amount of memory.

  Raises IOError if data is corrupted or incomplete.
  """
  decompressor = zlib.decompressobj()
  compressed_size = 0
  try:
    for chunk in content_generator:
      compressed_size += len(chunk)
      data = decompressor.decompress(chunk, chunk_size)
      if data:
        yield data
      while decompressor.unconsumed_tail:
        data = decompressor.decompress(decompressor.unconsumed_tail, chunk_size)
        if data:
          yield data
    tail = decompressor.flush()
    if tail:
      yield tail
  except zlib.error as e:
    raise IOError(
        'Corrupted zip stream (read %d bytes) - %s' % (compressed_size, e))
  # Ensure all data was read and decompressed.
  if decompressor.unused_data or decompressor.unconsumed_tail:
    raise IOError('Not all data was decompressed')


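# Illustrative sketch (not part of the original file): zip_compress() and
# zip_decompress() are stream-to-stream inverses, as this round trip shows.
def _example_zip_round_trip(data='x' * 1000):
  """Compresses a buffer and decompresses it back, asserting equality."""
  compressed = ''.join(zip_compress([data], level=7))
  assert ''.join(zip_decompress([compressed])) == data
  return len(compressed)

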
def get_zip_compression_level(filename):
  """Given a filename calculates the ideal zip compression level to use."""
  # Strip the leading dot so the extension matches ALREADY_COMPRESSED_TYPES.
  file_ext = os.path.splitext(filename)[1].lower().lstrip('.')
  # TODO(csharp): Profile to find what compression level works best.
  return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7


def create_directories(base_directory, files):
  """Creates the directory structure needed by the given list of files."""
  logging.debug('create_directories(%s, %d)', base_directory, len(files))
  # Creates the tree of directories to create.
  directories = set(os.path.dirname(f) for f in files)
  for item in list(directories):
    while item:
      directories.add(item)
      item = os.path.dirname(item)
  for d in sorted(directories):
    if d:
      os.mkdir(os.path.join(base_directory, d))


def create_symlinks(base_directory, files):
  """Creates any symlinks needed by the given set of files."""
  for filepath, properties in files:
    if 'l' not in properties:
      continue
    if sys.platform == 'win32':
      # TODO(maruel): Create symlink via the win32 api.
      logging.warning('Ignoring symlink %s', filepath)
      continue
    outfile = os.path.join(base_directory, filepath)
    # os.symlink() doesn't exist on Windows.
    os.symlink(properties['l'], outfile)  # pylint: disable=E1101


def is_valid_file(filepath, size):
  """Determines if the given file appears valid.

  Currently it just checks the file's size.
  """
  if size == UNKNOWN_FILE_SIZE:
    return os.path.isfile(filepath)
  actual_size = os.stat(filepath).st_size
  if size != actual_size:
    logging.warning(
        'Found invalid item %s; %d != %d',
        os.path.basename(filepath), actual_size, size)
    return False
  return True


class WorkerPool(threading_utils.AutoRetryThreadPool):
  """Thread pool that automatically retries on IOError and runs a preconfigured
  function.
  """
  # Initial and maximum number of worker threads.
  INITIAL_WORKERS = 2
  MAX_WORKERS = 16
  RETRIES = 5

  def __init__(self):
    super(WorkerPool, self).__init__(
        [IOError],
        self.RETRIES,
        self.INITIAL_WORKERS,
        self.MAX_WORKERS,
        0,
        'remote')


class Item(object):
  """An item to push to Storage.

  Its digest and size may be provided in advance, if known. Otherwise they will
  be derived from content(). If digest is provided, it MUST correspond to the
  hash algorithm used by Storage.

  When used with Storage, Item starts its life in a main thread, travels
  to the 'contains' thread, then to the 'push' thread and then finally back to
  the main thread. It is never used concurrently from multiple threads.
  """

  def __init__(self, digest=None, size=None, high_priority=False):
    self.digest = digest
    self.size = size
    self.high_priority = high_priority
    self.compression_level = 6

  def content(self):
    """Iterable with content of this item as byte string (str) chunks."""
    raise NotImplementedError()

  def prepare(self, hash_algo):
    """Ensures self.digest and self.size are set.

    Uses content() as a source of data to calculate them. Does nothing if
    digest and size are already known.

    Arguments:
      hash_algo: hash algorithm to use to calculate digest.
    """
    if self.digest is None or self.size is None:
      digest = hash_algo()
      total = 0
      for chunk in self.content():
        digest.update(chunk)
        total += len(chunk)
      self.digest = digest.hexdigest()
      self.size = total


class FileItem(Item):
  """A file to push to Storage.

  Its digest and size may be provided in advance, if known. Otherwise they will
  be derived from the file content.
  """

  def __init__(self, path, digest=None, size=None, high_priority=False):
    super(FileItem, self).__init__(
        digest,
        size if size is not None else os.stat(path).st_size,
        high_priority)
    self.path = path
    self.compression_level = get_zip_compression_level(path)

  def content(self):
    return file_read(self.path)


class BufferItem(Item):
  """A byte buffer to push to Storage."""

  def __init__(self, buf, high_priority=False):
    super(BufferItem, self).__init__(None, len(buf), high_priority)
    self.buffer = buf

  def content(self):
    return [self.buffer]


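# Illustrative sketch (not part of the original file): digest and size are
# filled in lazily; prepare() derives them from content() when they were not
# supplied up front.
def _example_prepare_buffer_item(data='hello'):
  """Builds a BufferItem and computes its SHA-1 digest via prepare()."""
  item = BufferItem(data)
  item.prepare(hashlib.sha1)
  return item.digest, item.size

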
class Storage(object):
  """Efficiently downloads or uploads large set of files via StorageApi.

  Implements compression support, parallel 'contains' checks, parallel uploads
  and more.

  Works only within single namespace (and thus hashing algorithm and compression
  scheme are fixed).

  Spawns multiple internal threads. Thread safe, but not fork safe.
  """

  def __init__(self, storage_api, use_zip, hash_algo):
    self.use_zip = use_zip
    self.hash_algo = hash_algo
    self._storage_api = storage_api
    self._cpu_thread_pool = None
    self._net_thread_pool = None

  @property
  def cpu_thread_pool(self):
    """ThreadPool for CPU-bound tasks like zipping."""
    if self._cpu_thread_pool is None:
      self._cpu_thread_pool = threading_utils.ThreadPool(
          2, max(threading_utils.num_processors(), 2), 0, 'zip')
    return self._cpu_thread_pool

  @property
  def net_thread_pool(self):
    """AutoRetryThreadPool for IO-bound tasks, retries IOError."""
    if self._net_thread_pool is None:
      self._net_thread_pool = WorkerPool()
    return self._net_thread_pool

  def close(self):
    """Waits for all pending tasks to finish."""
    if self._cpu_thread_pool:
      self._cpu_thread_pool.join()
      self._cpu_thread_pool.close()
      self._cpu_thread_pool = None
    if self._net_thread_pool:
      self._net_thread_pool.join()
      self._net_thread_pool.close()
      self._net_thread_pool = None

  def __enter__(self):
    """Context manager interface."""
    return self

  def __exit__(self, _exc_type, _exc_value, _traceback):
    """Context manager interface."""
    self.close()
    return False

  def upload_items(self, items):
    """Uploads a bunch of items to the isolate server.

    It figures out which items are missing from the server and uploads only
    them.

    Arguments:
      items: list of Item instances that represent data to upload.

    Returns:
      List of items that were uploaded. All other items are already there.
    """
    # TODO(vadimsh): Optimize special case of len(items) == 1 that is frequently
    # used by swarming.py. There's no need to spawn multiple threads and try to
    # do stuff in parallel: there's nothing to parallelize. 'contains' check and
    # 'push' should be performed sequentially in the context of current thread.

    # Ensure all digests are calculated.
    for item in items:
      item.prepare(self.hash_algo)

    # For each digest keep only first Item that matches it. All other items
    # are just indistinguishable copies from the point of view of isolate
    # server (it doesn't care about paths at all, only content and digests).
    seen = {}
    duplicates = 0
    for item in items:
      if seen.setdefault(item.digest, item) is not item:
        duplicates += 1
    items = seen.values()
    if duplicates:
      logging.info('Skipped %d duplicated files', duplicates)

    # Enqueue all upload tasks.
    missing = set()
    uploaded = []
    channel = threading_utils.TaskChannel()
    for missing_item, push_state in self.get_missing_items(items):
      missing.add(missing_item)
      self.async_push(channel, missing_item, push_state)

    # No need to spawn deadlock detector thread if there's nothing to upload.
    if missing:
      with threading_utils.DeadlockDetector(DEADLOCK_TIMEOUT) as detector:
        # Wait for all started uploads to finish.
        while len(uploaded) != len(missing):
          detector.ping()
          item = channel.pull()
          uploaded.append(item)
          logging.debug(
              'Uploaded %d / %d: %s', len(uploaded), len(missing), item.digest)
      logging.info('All files are uploaded')

    # Print stats.
    total = len(items)
    total_size = sum(f.size for f in items)
    logging.info(
        'Total: %6d, %9.1fkb',
        total,
        total_size / 1024.)
    cache_hit = set(items) - missing
    cache_hit_size = sum(f.size for f in cache_hit)
    logging.info(
        'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
        len(cache_hit),
        cache_hit_size / 1024.,
        len(cache_hit) * 100. / total,
        cache_hit_size * 100. / total_size if total_size else 0)
    cache_miss = missing
    cache_miss_size = sum(f.size for f in cache_miss)
    logging.info(
        'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
        len(cache_miss),
        cache_miss_size / 1024.,
        len(cache_miss) * 100. / total,
        cache_miss_size * 100. / total_size if total_size else 0)

    return uploaded

  def get_fetch_url(self, item):
    """Returns an URL that can be used to fetch given item once it's uploaded.

    Note that if namespace uses compression, data at given URL is compressed.

    Arguments:
      item: Item to get fetch URL for.

    Returns:
      An URL or None if underlying protocol doesn't support this.
    """
    item.prepare(self.hash_algo)
    return self._storage_api.get_fetch_url(item.digest)

  def async_push(self, channel, item, push_state):
    """Starts asynchronous push to the server in a parallel thread.

    Can be used only after |item| was checked for presence on a server with
    'get_missing_items' call. 'get_missing_items' returns |push_state| object
    that contains storage specific information describing how to upload
    the item (for example in case of cloud storage, it is signed upload URLs).

    Arguments:
      channel: TaskChannel that receives back |item| when upload ends.
      item: item to upload as instance of Item class.
      push_state: push state returned by 'get_missing_items' call for |item|.

    Returns:
      None, but |channel| later receives back |item| when upload ends.
    """
    # Thread pool task priority.
    priority = WorkerPool.HIGH if item.high_priority else WorkerPool.MED

    def push(content):
      """Pushes an item and returns it to |channel|."""
      item.prepare(self.hash_algo)
      self._storage_api.push(item, push_state, content)
      return item

    # If zipping is not required, just start a push task.
    if not self.use_zip:
      self.net_thread_pool.add_task_with_channel(
          channel, priority, push, item.content())
      return

    # If zipping is enabled, zip in a separate thread.
    def zip_and_push():
      # TODO(vadimsh): Implement streaming uploads. Before it's done, assemble
      # content right here. It will block until the whole file is zipped.
      try:
        stream = zip_compress(item.content(), item.compression_level)
        data = ''.join(stream)
      except Exception as exc:
        logging.error('Failed to zip \'%s\': %s', item, exc)
        channel.send_exception()
        return
      self.net_thread_pool.add_task_with_channel(
          channel, priority, push, [data])
    self.cpu_thread_pool.add_task(priority, zip_and_push)

  def push(self, item, push_state):
    """Synchronously pushes a single item to the server.

    If you need to push many items at once, consider using 'upload_items' or
    'async_push' with an instance of TaskChannel.

    Arguments:
      item: item to upload as instance of Item class.
      push_state: push state returned by 'get_missing_items' call for |item|.

    Returns:
      Pushed item (same object as |item|).
    """
    channel = threading_utils.TaskChannel()
    with threading_utils.DeadlockDetector(DEADLOCK_TIMEOUT):
      self.async_push(channel, item, push_state)
      pushed = channel.pull()
      assert pushed is item
    return item

  def async_fetch(self, channel, priority, digest, size, sink):
    """Starts asynchronous fetch from the server in a parallel thread.

    Arguments:
      channel: TaskChannel that receives back |digest| when download ends.
      priority: thread pool task priority for the fetch.
      digest: hex digest of an item to download.
      size: expected size of the item (after decompression).
      sink: function that will be called as sink(generator).
    """
    def fetch():
      try:
        # Prepare reading pipeline.
        stream = self._storage_api.fetch(digest)
        if self.use_zip:
          stream = zip_decompress(stream, DISK_FILE_CHUNK)
        # Run |stream| through verifier that will assert its size.
        verifier = FetchStreamVerifier(stream, size)
        # Verified stream goes to |sink|.
        sink(verifier.run())
      except Exception as err:
        logging.error('Failed to fetch %s: %s', digest, err)
        raise
      return digest

    # Don't bother with zip_thread_pool for decompression. Decompression is
    # really fast and most probably IO bound anyway.
    self.net_thread_pool.add_task_with_channel(channel, priority, fetch)

  def get_missing_items(self, items):
    """Yields items that are missing from the server.

    Issues multiple parallel queries via StorageApi's 'contains' method.

    Arguments:
      items: a list of Item objects to check.

    Yields:
      For each missing item it yields a pair (item, push_state), where:
        * item - Item object that is missing (one of |items|).
        * push_state - opaque object that contains storage specific information
            describing how to upload the item (for example in case of cloud
            storage, it is signed upload URLs). It can later be passed to
            'async_push'.
    """
    channel = threading_utils.TaskChannel()
    pending = 0

    # Ensure all digests are calculated.
    for item in items:
      item.prepare(self.hash_algo)

    # Enqueue all requests.
    for batch in batch_items_for_check(items):
      self.net_thread_pool.add_task_with_channel(
          channel, WorkerPool.HIGH, self._storage_api.contains, batch)
      pending += 1

    # Yield results as they come in.
    for _ in xrange(pending):
      for missing_item, push_state in channel.pull().iteritems():
        yield missing_item, push_state


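# Illustrative sketch (not part of the original file): a typical archive flow
# through Storage. The server URL and namespace are hypothetical, and
# get_hash_algo() is defined near the end of this file.
def _example_archive_files(paths, url='https://isolate.example.com'):
  """Uploads local files to a hypothetical server; returns their digests."""
  namespace = 'default-gzip'
  api = IsolateServer(url, namespace)
  # Assumes a '-gzip' namespace implies zipped uploads.
  with Storage(api, True, get_hash_algo(namespace)) as storage:
    items = [FileItem(p) for p in paths]
    storage.upload_items(items)
    return [i.digest for i in items]

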
def batch_items_for_check(items):
  """Splits list of items to check for existence on the server into batches.

  Each batch corresponds to a single 'exists?' query to the server via a call
  to StorageApi's 'contains' method.

  Arguments:
    items: a list of Item objects.

  Yields:
    Batches of items to query for existence in a single operation,
    each batch is a list of Item objects.
  """
  batch_count = 0
  batch_size_limit = ITEMS_PER_CONTAINS_QUERIES[0]
  next_queries = []
  for item in sorted(items, key=lambda x: x.size, reverse=True):
    next_queries.append(item)
    if len(next_queries) == batch_size_limit:
      yield next_queries
      next_queries = []
      batch_count += 1
      batch_size_limit = ITEMS_PER_CONTAINS_QUERIES[
          min(batch_count, len(ITEMS_PER_CONTAINS_QUERIES) - 1)]
  if next_queries:
    yield next_queries


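# Illustrative sketch (not part of the original file): with 120 equally sized
# items, batch_items_for_check() yields batches of 20, 20, 50 and 30 items,
# following ITEMS_PER_CONTAINS_QUERIES.
def _example_batch_sizes(item_count=120):
  """Returns the batch sizes produced for |item_count| one-byte items."""
  items = [BufferItem('x') for _ in xrange(item_count)]
  return [len(batch) for batch in batch_items_for_check(items)]

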
class FetchQueue(object):
  """Fetches items from Storage and places them into LocalCache.

  It manages multiple concurrent fetch operations. Acts as a bridge between
  Storage and LocalCache so that Storage and LocalCache don't depend on each
  other at all.
  """

  def __init__(self, storage, cache):
    self.storage = storage
    self.cache = cache
    self._channel = threading_utils.TaskChannel()
    self._pending = set()
    self._accessed = set()
    self._fetched = cache.cached_set()

  def add(self, digest, size=UNKNOWN_FILE_SIZE, priority=WorkerPool.MED):
    """Starts asynchronous fetch of item |digest|."""
    # Fetching it now?
    if digest in self._pending:
      return

    # Mark this file as in use, verify_all_cached will later ensure it is still
    # in cache.
    self._accessed.add(digest)

    # Already fetched? Notify cache to update item's LRU position.
    if digest in self._fetched:
      # 'touch' returns True if item is in cache and not corrupted.
      if self.cache.touch(digest, size):
        return
      # Item is corrupted, remove it from cache and fetch it again.
      self._fetched.remove(digest)
      self.cache.evict(digest)

    # TODO(maruel): It should look at the free disk space, the current cache
    # size and the size of the new item on every new item:
    # - Trim the cache as more entries are listed when free disk space is low,
    #   otherwise if the amount of data downloaded during the run > free disk
    #   space, it'll crash.
    # - Make sure there's enough free disk space to fit all dependencies of
    #   this run! If not, abort early.

    # Start fetching.
    self._pending.add(digest)
    self.storage.async_fetch(
        self._channel, priority, digest, size,
        functools.partial(self.cache.write, digest))

  def wait(self, digests):
    """Starts a loop that waits for at least one of |digests| to be retrieved.

    Returns the first digest retrieved.
    """
    # Flush any already fetched items.
    for digest in digests:
      if digest in self._fetched:
        return digest

    # Ensure all requested items are being fetched now.
    assert all(digest in self._pending for digest in digests), (
        digests, self._pending)

    # Wait for some requested item to finish fetching.
    while self._pending:
      digest = self._channel.pull()
      self._pending.remove(digest)
      self._fetched.add(digest)
      if digest in digests:
        return digest

    # Should never reach this point due to assert above.
    raise RuntimeError('Impossible state')

  def inject_local_file(self, path, algo):
    """Adds local file to the cache as if it was fetched from storage."""
    with open(path, 'rb') as f:
      data = f.read()
    digest = algo(data).hexdigest()
    self.cache.write(digest, [data])
    self._fetched.add(digest)
    return digest

  @property
  def pending_count(self):
    """Returns number of items to be fetched."""
    return len(self._pending)

  def verify_all_cached(self):
    """True if all accessed items are in cache."""
    return self._accessed.issubset(self.cache.cached_set())


class FetchStreamVerifier(object):
  """Verifies that fetched file is valid before passing it to the LocalCache."""

  def __init__(self, stream, expected_size):
    self.stream = stream
    self.expected_size = expected_size
    self.current_size = 0

  def run(self):
    """Generator that yields same items as |stream|.

    Verifies |stream| is complete before yielding a last chunk to consumer.

    Also wraps IOError produced by consumer into MappingError exceptions since
    otherwise Storage will retry fetch on unrelated local cache errors.
    """
    # Read one chunk ahead, keep it in |stored|.
    # That way a complete stream can be verified before pushing last chunk
    # to consumer.
    stored = None
    for chunk in self.stream:
      assert chunk is not None
      if stored is not None:
        self._inspect_chunk(stored, is_last=False)
        try:
          yield stored
        except IOError as exc:
          raise MappingError('Failed to store an item in cache: %s' % exc)
      stored = chunk
    if stored is not None:
      self._inspect_chunk(stored, is_last=True)
      try:
        yield stored
      except IOError as exc:
        raise MappingError('Failed to store an item in cache: %s' % exc)

  def _inspect_chunk(self, chunk, is_last):
    """Called for each fetched chunk before passing it to consumer."""
    self.current_size += len(chunk)
    if (is_last and (self.expected_size != UNKNOWN_FILE_SIZE) and
        (self.expected_size != self.current_size)):
      raise IOError('Incorrect file size: expected %d, got %d' % (
          self.expected_size, self.current_size))


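# Illustrative sketch (not part of the original file): FetchStreamVerifier
# re-yields a stream unchanged but raises IOError when the total size does not
# match the expectation.
def _example_verify_stream(chunks=('ab', 'cd'), expected_size=4):
  """Consumes |chunks| through a verifier, returning the joined content."""
  return ''.join(FetchStreamVerifier(iter(chunks), expected_size).run())

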
class StorageApi(object):
  """Interface for classes that implement low-level storage operations.

  StorageApi is oblivious to the compression and hashing scheme used. These
  details are handled in the higher level Storage class.

  Clients should generally not use StorageApi directly. Storage class is
  preferred since it implements compression and upload optimizations.
  """

  def get_fetch_url(self, digest):
    """Returns an URL that can be used to fetch an item with given digest.

    Arguments:
      digest: hex digest of item to fetch.

    Returns:
      An URL or None if the protocol doesn't support this.
    """
    raise NotImplementedError()

  def fetch(self, digest, offset=0):
    """Fetches an object and yields its content.

    Arguments:
      digest: hash digest of item to download.
      offset: offset (in bytes) from the start of the file to resume fetch from.

    Yields:
      Chunks of downloaded item (as str objects).
    """
    raise NotImplementedError()

  def push(self, item, push_state, content=None):
    """Uploads an |item| with content generated by |content| generator.

    |item| MUST go through 'contains' call to get |push_state| before it can
    be pushed to the storage.

    To be clear, here is one possible usage:
      all_items = [... all items to push as Item subclasses ...]
      for missing_item, push_state in storage_api.contains(all_items).items():
        storage_api.push(missing_item, push_state)

    When pushing to a namespace with compression, data that should be pushed
    and data provided by the item is not the same. In that case |content| is
    not None and it yields chunks of compressed data (using item.content() as
    a source of original uncompressed data). This is implemented by Storage
    class.

    Arguments:
      item: Item object that holds information about an item being pushed.
      push_state: push state object as returned by 'contains' call.
      content: a generator that yields chunks to push, item.content() if None.

    Returns:
      None.
    """
    raise NotImplementedError()

  def contains(self, items):
    """Checks for |items| on the server, prepares missing ones for upload.

    Arguments:
      items: list of Item objects to check for presence.

    Returns:
      A dict missing Item -> opaque push state object to be passed to 'push'.
      See doc string for 'push'.
    """
    raise NotImplementedError()


class _IsolateServerPushState(object):
  """Per-item state passed from IsolateServer.contains to IsolateServer.push.

  Note this needs to be a global class to support pickling.
  """

  def __init__(self, upload_url, finalize_url):
    self.upload_url = upload_url
    self.finalize_url = finalize_url
    self.uploaded = False
    self.finalized = False


class IsolateServer(StorageApi):
  """StorageApi implementation that downloads and uploads to Isolate Server.

  It uploads and downloads directly from Google Storage whenever appropriate.
  Works only within single namespace.
  """

  def __init__(self, base_url, namespace):
    super(IsolateServer, self).__init__()
    assert base_url.startswith('http'), base_url
    self.base_url = base_url.rstrip('/')
    self.namespace = namespace
    self._lock = threading.Lock()
    self._server_caps = None

  @staticmethod
  def _generate_handshake_request():
    """Returns a dict to be sent as handshake request body."""
    # TODO(vadimsh): Set 'pusher' and 'fetcher' according to intended usage.
    return {
        'client_app_version': __version__,
        'fetcher': True,
        'protocol_version': ISOLATE_PROTOCOL_VERSION,
        'pusher': True,
    }

  @staticmethod
  def _validate_handshake_response(caps):
    """Validates and normalizes handshake response."""
    logging.info('Protocol version: %s', caps['protocol_version'])
    logging.info('Server version: %s', caps['server_app_version'])
    if caps.get('error'):
      raise MappingError(caps['error'])
    if not caps['access_token']:
      raise ValueError('access_token is missing')
    return caps

  @property
  def _server_capabilities(self):
    """Performs handshake with the server if not yet done.

    Returns:
      Server capabilities dictionary as returned by /handshake endpoint.

    Raises:
      MappingError if server rejects the handshake.
    """
    # TODO(maruel): Make this request much earlier asynchronously while the
    # files are being enumerated.

    # TODO(vadimsh): Put |namespace| in the URL so that server can apply
    # namespace-level ACLs to this call.
    with self._lock:
      if self._server_caps is None:
        request_body = json.dumps(
            self._generate_handshake_request(), separators=(',', ':'))
        response = net.url_read(
            url=self.base_url + '/content-gs/handshake',
            data=request_body,
            content_type='application/json',
            method='POST')
        if response is None:
          raise MappingError('Failed to perform handshake.')
        try:
          caps = json.loads(response)
          if not isinstance(caps, dict):
            raise ValueError('Expecting JSON dict')
          self._server_caps = self._validate_handshake_response(caps)
        except (ValueError, KeyError, TypeError) as exc:
          # KeyError exception has very confusing str conversion: it's just a
          # missing key value and nothing else. So print exception class name
          # as well.
          raise MappingError('Invalid handshake response (%s): %s' % (
              exc.__class__.__name__, exc))
      return self._server_caps

  def get_fetch_url(self, digest):
    assert isinstance(digest, basestring)
    return '%s/content-gs/retrieve/%s/%s' % (
        self.base_url, self.namespace, digest)

  def fetch(self, digest, offset=0):
    source_url = self.get_fetch_url(digest)
    logging.debug('download_file(%s, %d)', source_url, offset)

    # Because the app engine DB is only eventually consistent, retry 404 errors
    # because the file might just not be visible yet (even though it has been
    # uploaded).
    connection = net.url_open(
        source_url,
        retry_404=True,
        read_timeout=DOWNLOAD_READ_TIMEOUT,
        headers={'Range': 'bytes=%d-' % offset} if offset else None)

    if not connection:
      raise IOError('Request failed - %s' % source_url)

    # If |offset| is used, verify server respects it by checking Content-Range.
    if offset:
      content_range = connection.get_header('Content-Range')
      if not content_range:
        raise IOError('Missing Content-Range header')

      # 'Content-Range' format is 'bytes <offset>-<last_byte_index>/<size>'.
      # According to a spec, <size> can be '*' meaning "Total size of the file
      # is not known in advance".
      try:
        match = re.match(r'bytes (\d+)-(\d+)/(\d+|\*)', content_range)
        if not match:
          raise ValueError()
        content_offset = int(match.group(1))
        last_byte_index = int(match.group(2))
        size = None if match.group(3) == '*' else int(match.group(3))
      except ValueError:
        raise IOError('Invalid Content-Range header: %s' % content_range)

      # Ensure returned offset equals requested one.
      if offset != content_offset:
        raise IOError('Expecting offset %d, got %d (Content-Range is %s)' % (
            offset, content_offset, content_range))

      # Ensure entire tail of the file is returned.
      if size is not None and last_byte_index + 1 != size:
        raise IOError('Incomplete response. Content-Range: %s' % content_range)

    return stream_read(connection, NET_IO_FILE_CHUNK)

  def push(self, item, push_state, content=None):
    assert isinstance(item, Item)
    assert item.digest is not None
    assert item.size is not None
    assert isinstance(push_state, _IsolateServerPushState)
    assert not push_state.finalized

    # Default to item.content().
    content = item.content() if content is None else content

    # Do not iterate byte by byte over 'str'. Push it all as a single chunk.
    if isinstance(content, basestring):
      assert not isinstance(content, unicode), 'Unicode string is not allowed'
      content = [content]

    # TODO(vadimsh): Do not read from |content| generator when retrying push.
    # If |content| is indeed a generator, it cannot be rewound back to the
    # beginning of the stream. A retry will find it exhausted. A possible
    # solution is to wrap the |content| generator with some sort of caching
    # restartable generator. It should be done alongside streaming support
    # implementation.

    # This push operation may be a retry after a failed finalization call
    # below; no need to reupload contents in that case.
    if not push_state.uploaded:
      # A cheesy way to avoid memcpy of a (possibly huge) file, until streaming
      # upload support is implemented.
      if isinstance(content, list) and len(content) == 1:
        content = content[0]
      else:
        content = ''.join(content)
      # PUT file to |upload_url|.
      response = net.url_read(
          url=push_state.upload_url,
          data=content,
          content_type='application/octet-stream',
          method='PUT')
      if response is None:
        raise IOError('Failed to upload a file %s to %s' % (
            item.digest, push_state.upload_url))
      push_state.uploaded = True
    else:
      logging.info(
          'A file %s already uploaded, retrying finalization only', item.digest)

    # Optionally notify the server that it's done.
    if push_state.finalize_url:
      # TODO(vadimsh): Calculate MD5 or CRC32C sum while uploading a file and
      # send it to isolated server. That way isolate server can verify that
      # the data safely reached Google Storage (GS provides MD5 and CRC32C of
      # stored files).
      response = net.url_read(
          url=push_state.finalize_url,
          data='',
          content_type='application/json',
          method='POST')
      if response is None:
        raise IOError('Failed to finalize an upload of %s' % item.digest)
      push_state.finalized = True

  def contains(self, items):
    logging.info('Checking existence of %d files...', len(items))

    # Ensure all items were initialized with 'prepare' call. Storage does that.
    assert all(i.digest is not None and i.size is not None for i in items)

    # Request body is a json encoded list of dicts.
    body = [
        {
            'h': item.digest,
            's': item.size,
            'i': int(item.high_priority),
        } for item in items
    ]

    query_url = '%s/content-gs/pre-upload/%s?token=%s' % (
        self.base_url,
        self.namespace,
        urllib.quote(self._server_capabilities['access_token']))
    response_body = net.url_read(
        url=query_url,
        data=json.dumps(body, separators=(',', ':')),
        content_type='application/json',
        method='POST')
    if response_body is None:
      raise MappingError('Failed to execute /pre-upload query')

    # Response body is a list of push_urls (or null if file is already present).
    try:
      response = json.loads(response_body)
      if not isinstance(response, list):
        raise ValueError('Expecting response with json-encoded list')
      if len(response) != len(items):
        raise ValueError(
            'Incorrect number of items in the list, expected %d, '
            'but got %d' % (len(items), len(response)))
    except ValueError as err:
      raise MappingError(
          'Invalid response from server: %s, body is %s' % (err, response_body))

    # Pick Items that are missing, attach _IsolateServerPushState to them.
    missing_items = {}
    for i, push_urls in enumerate(response):
      if push_urls:
        assert len(push_urls) == 2, str(push_urls)
        missing_items[items[i]] = _IsolateServerPushState(
            push_urls[0], push_urls[1])
    logging.info('Queried %d files, %d cache hit',
        len(items), len(items) - len(missing_items))
    return missing_items


class FileSystem(StorageApi):
  """StorageApi implementation that fetches data from the file system.

  The common use case is an NFS/CIFS file server that is mounted locally and
  used to fetch files onto a local partition.
  """

  # Used for push_state instead of None. That way the caller is forced to
  # call 'contains' before 'push'. Naively passing None in 'push' will not work.
  _DUMMY_PUSH_STATE = object()

maruel@chromium.orgb7e79a22013-09-13 01:24:56 +00001161 def __init__(self, base_path):
vadimsh@chromium.org35122be2013-09-19 02:48:00 +00001162 super(FileSystem, self).__init__()
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +00001163 self.base_path = base_path
1164
vadimsh@chromium.orgf24e5c32013-10-11 21:16:21 +00001165 def get_fetch_url(self, digest):
1166 return None
1167
Vadim Shtayuraf0cb97a2013-12-05 13:57:49 -08001168 def fetch(self, digest, offset=0):
vadimsh@chromium.orgbcb966b2013-10-01 18:14:18 +00001169 assert isinstance(digest, basestring)
Vadim Shtayuraf0cb97a2013-12-05 13:57:49 -08001170 return file_read(os.path.join(self.base_path, digest), offset=offset)
maruel@chromium.orge45728d2013-09-16 23:23:22 +00001171
Vadim Shtayurabcff74f2014-02-27 16:19:34 -08001172 def push(self, item, push_state, content=None):
vadimsh@chromium.orgbcb966b2013-10-01 18:14:18 +00001173 assert isinstance(item, Item)
Vadim Shtayurabcff74f2014-02-27 16:19:34 -08001174 assert item.digest is not None
1175 assert item.size is not None
1176 assert push_state is self._DUMMY_PUSH_STATE
1177 content = item.content() if content is None else content
1178 if isinstance(content, basestring):
1179 assert not isinstance(content, unicode), 'Unicode string is not allowed'
1180 content = [content]
vadimsh@chromium.org7b5dae32013-10-03 16:59:59 +00001181 file_write(os.path.join(self.base_path, item.digest), content)
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +00001182
vadimsh@chromium.orgbcb966b2013-10-01 18:14:18 +00001183 def contains(self, items):
Vadim Shtayurabcff74f2014-02-27 16:19:34 -08001184 assert all(i.digest is not None and i.size is not None for i in items)
1185 return dict(
1186 (item, self._DUMMY_PUSH_STATE) for item in items
vadimsh@chromium.orgbcb966b2013-10-01 18:14:18 +00001187 if not os.path.exists(os.path.join(self.base_path, item.digest))
Vadim Shtayurabcff74f2014-02-27 16:19:34 -08001188 )
vadimsh@chromium.org35122be2013-09-19 02:48:00 +00001189
1190
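
# A minimal usage sketch of the StorageApi contract using FileSystem (the path
# and items are hypothetical); 'contains' returns the push_state objects that
# 'push' requires:
#
#   storage_api = FileSystem('/mnt/isolate-hashtable')
#   for item, push_state in storage_api.contains(items).iteritems():
#     storage_api.push(item, push_state)
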
class LocalCache(object):
  """Local cache that stores objects fetched via Storage.

  It can be accessed concurrently from multiple threads, so it should protect
  its internal state with some lock.
  """
  cache_dir = None

  def __enter__(self):
    """Context manager interface."""
    return self

  def __exit__(self, _exc_type, _exc_value, _traceback):
    """Context manager interface."""
    return False

  def cached_set(self):
    """Returns a set of all cached digests (always a new object)."""
    raise NotImplementedError()

  def touch(self, digest, size):
    """Ensures item is not corrupted and updates its LRU position.

    Arguments:
      digest: hash digest of item to check.
      size: expected size of this item.

    Returns:
      True if item is in cache and not corrupted.
    """
    raise NotImplementedError()

  def evict(self, digest):
    """Removes item from cache if it's there."""
    raise NotImplementedError()

  def read(self, digest):
    """Returns contents of the cached item as a single str."""
    raise NotImplementedError()

  def write(self, digest, content):
    """Reads data from |content| generator and stores it in cache."""
    raise NotImplementedError()

  def hardlink(self, digest, dest, file_mode):
    """Ensures the file at |dest| has the same content as cached |digest|.

    If file_mode is provided, it is used to set the executable bit if
    applicable.
    """
    raise NotImplementedError()


class MemoryCache(LocalCache):
  """LocalCache implementation that stores everything in memory."""

  def __init__(self):
    super(MemoryCache, self).__init__()
    # Let's not assume dict is thread safe.
    self._lock = threading.Lock()
    self._contents = {}

  def cached_set(self):
    with self._lock:
      return set(self._contents)

  def touch(self, digest, size):
    with self._lock:
      return digest in self._contents

  def evict(self, digest):
    with self._lock:
      self._contents.pop(digest, None)

  def read(self, digest):
    with self._lock:
      return self._contents[digest]

  def write(self, digest, content):
    # Assemble the whole stream before taking the lock.
    data = ''.join(content)
    with self._lock:
      self._contents[digest] = data

  def hardlink(self, digest, dest, file_mode):
    """Since data is kept in memory, there is no filenode to hardlink."""
    file_write(dest, [self.read(digest)])
    if file_mode is not None:
      # Ignores all other bits.
      os.chmod(dest, file_mode & 0500)

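
# Illustrative sketch of the LocalCache contract using MemoryCache (the digest
# and content below are hypothetical):
#
#   with MemoryCache() as cache:
#     cache.write('deadbeef', ['hello ', 'world'])
#     assert cache.read('deadbeef') == 'hello world'
#     assert 'deadbeef' in cache.cached_set()
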
def get_hash_algo(_namespace):
  """Returns the hash algorithm class to use when uploading to |namespace|."""
  # TODO(vadimsh): Implement this at some point.
  return hashlib.sha1


def is_namespace_with_compression(namespace):
  """Returns True if the given |namespace| stores compressed objects."""
  return namespace.endswith(('-gzip', '-deflate'))


def get_storage_api(file_or_url, namespace):
  """Returns an object that implements the low-level StorageApi interface.

  It is used by Storage to work with a single isolate |namespace|. It should
  rarely be used directly by clients, see 'get_storage' for
  a better alternative.

  Arguments:
    file_or_url: a file path to use file system based storage, or URL of isolate
        service to use shared cloud based storage.
    namespace: isolate namespace to operate in, also defines hashing and
        compression scheme used, i.e. namespace names that end with '-gzip'
        store compressed data.

  Returns:
    Instance of a StorageApi subclass.
  """
  if file_path.is_url(file_or_url):
    return IsolateServer(file_or_url, namespace)
  else:
    return FileSystem(file_or_url)

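
# Dispatch sketch (the URL and path below are illustrative):
#
#   get_storage_api('https://isolate.example.com', 'default-gzip')
#   # -> IsolateServer instance talking to the service.
#   get_storage_api('/mnt/hashtable', 'default')
#   # -> FileSystem instance reading/writing local files.
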
def get_storage(file_or_url, namespace):
  """Returns a Storage instance that can upload and download from |namespace|.

  Arguments:
    file_or_url: a file path to use file system based storage, or URL of isolate
        service to use shared cloud based storage.
    namespace: isolate namespace to operate in, also defines hashing and
        compression scheme used, i.e. namespace names that end with '-gzip'
        store compressed data.

  Returns:
    Instance of Storage.
  """
  return Storage(
      get_storage_api(file_or_url, namespace),
      is_namespace_with_compression(namespace),
      get_hash_algo(namespace))

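
# Typical usage sketch (server URL, namespace and items are illustrative);
# Storage supports the context manager protocol, as used by upload_tree()
# below:
#
#   with get_storage('https://isolate.example.com', 'default-gzip') as storage:
#     storage.upload_items(items)
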
def expand_symlinks(indir, relfile):
  """Follows symlinks in |relfile|, but treats symlinks that point outside the
  build tree as if they were ordinary directories/files. Returns the final
  symlink-free target and a list of paths to symlinks encountered in the
  process.

  The rule about symlinks outside the build tree is for the benefit of the
  Chromium OS ebuild, which symlinks the output directory to an unrelated path
  in the chroot.

  Fails when a directory loop is detected, although in theory we could support
  that case.
  """
  is_directory = relfile.endswith(os.path.sep)
  done = indir
  todo = relfile.strip(os.path.sep)
  symlinks = []

  while todo:
    pre_symlink, symlink, post_symlink = file_path.split_at_symlink(
        done, todo)
    if not symlink:
      todo = file_path.fix_native_path_case(done, todo)
      done = os.path.join(done, todo)
      break
    symlink_path = os.path.join(done, pre_symlink, symlink)
    post_symlink = post_symlink.lstrip(os.path.sep)
    # readlink doesn't exist on Windows.
    # pylint: disable=E1101
    target = os.path.normpath(os.path.join(done, pre_symlink))
    symlink_target = os.readlink(symlink_path)
    if os.path.isabs(symlink_target):
      # Absolute paths are considered normal directories. The use case is
      # generally someone who puts the output directory on a separate drive.
      target = symlink_target
    else:
      # The symlink itself could be using the wrong path case.
      target = file_path.fix_native_path_case(target, symlink_target)

    if not os.path.exists(target):
      raise MappingError(
          'Symlink target doesn\'t exist: %s -> %s' % (symlink_path, target))
    target = file_path.get_native_path_case(target)
    if not file_path.path_starts_with(indir, target):
      done = symlink_path
      todo = post_symlink
      continue
    if file_path.path_starts_with(target, symlink_path):
      raise MappingError(
          'Can\'t map recursive symlink reference %s -> %s' %
          (symlink_path, target))
    logging.info('Found symlink: %s -> %s', symlink_path, target)
    symlinks.append(os.path.relpath(symlink_path, indir))
    # Treat the common prefix of the old and new paths as done, and start
    # scanning again.
    target = target.split(os.path.sep)
    symlink_path = symlink_path.split(os.path.sep)
    prefix_length = 0
    for target_piece, symlink_path_piece in zip(target, symlink_path):
      if target_piece == symlink_path_piece:
        prefix_length += 1
      else:
        break
    done = os.path.sep.join(target[:prefix_length])
    todo = os.path.join(
        os.path.sep.join(target[prefix_length:]), post_symlink)

  relfile = os.path.relpath(done, indir)
  relfile = relfile.rstrip(os.path.sep) + is_directory * os.path.sep
  return relfile, symlinks

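
# Illustrative example (the paths are hypothetical): if |indir| contains a
# symlink 'out' pointing at the in-tree directory 'build/out', then
# expand_symlinks(indir, 'out/foo.bin') would return
# ('build/out/foo.bin', ['out']).
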
def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
  """Expands a single input. It can result in multiple outputs.

  This function is recursive when relfile is a directory.

  Note: this code doesn't properly handle recursive symlinks like the one
  created with:
    ln -s .. foo
  """
  if os.path.isabs(relfile):
    raise MappingError('Can\'t map absolute path %s' % relfile)

  infile = file_path.normpath(os.path.join(indir, relfile))
  if not infile.startswith(indir):
    raise MappingError('Can\'t map file %s outside %s' % (infile, indir))

  filepath = os.path.join(indir, relfile)
  native_filepath = file_path.get_native_path_case(filepath)
  if filepath != native_filepath:
    # Special case './'.
    if filepath != native_filepath + '.' + os.path.sep:
      # Give up enforcing strict path case on OSX. Really, it's that sad. The
      # case where it happens is very specific and hard to reproduce:
      # get_native_path_case(
      #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
      # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
      #
      # Note that this is really something deep in OSX because running
      #  ls Foo.framework/Versions/A
      # will print out 'Resources', while file_path.get_native_path_case()
      # returns a lower case 'r'.
      #
      # So *something* is happening under the hood causing the command 'ls'
      # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree. We
      # have no idea why.
      if sys.platform != 'darwin':
        raise MappingError(
            'File path doesn\'t equal native file path\n%s != %s' %
            (filepath, native_filepath))

  symlinks = []
  if follow_symlinks:
    relfile, symlinks = expand_symlinks(indir, relfile)

  if relfile.endswith(os.path.sep):
    if not os.path.isdir(infile):
      raise MappingError(
          '%s is not a directory but ends with "%s"' % (infile, os.path.sep))

    # Special case './'.
    if relfile.startswith('.' + os.path.sep):
      relfile = relfile[2:]
    outfiles = symlinks
    try:
      for filename in os.listdir(infile):
        inner_relfile = os.path.join(relfile, filename)
        if blacklist and blacklist(inner_relfile):
          continue
        if os.path.isdir(os.path.join(indir, inner_relfile)):
          inner_relfile += os.path.sep
        outfiles.extend(
            expand_directory_and_symlink(indir, inner_relfile, blacklist,
                                         follow_symlinks))
      return outfiles
    except OSError as e:
      raise MappingError(
          'Unable to iterate over directory %s.\n%s' % (infile, e))
  else:
    # Always add individual files even if they were blacklisted.
    if os.path.isdir(infile):
      raise MappingError(
          'Input directory %s must have a trailing slash' % infile)

    if not os.path.isfile(infile):
      raise MappingError('Input file %s doesn\'t exist' % infile)

    return symlinks + [relfile]

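
# Illustrative call (the layout is hypothetical): for an |indir| containing
# 'data/a.txt' and 'data/sub/b.txt',
# expand_directory_and_symlink(indir, 'data/', None, False) would return
# ['data/a.txt', 'data/sub/b.txt'] (in directory listing order).
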
def process_input(filepath, prevdict, read_only, algo):
  """Processes an input file, a dependency, and returns metadata about it.

  Behaviors:
  - Retrieves the file mode, file size, file timestamp and the link
    destination if it is a symlink, and calculates the SHA-1 of the file's
    content if the path points to a file and not a symlink.

  Arguments:
    filepath: File to act on.
    prevdict: the previous dictionary. It is used to retrieve the cached sha-1
        to skip recalculating the hash. Optional.
    read_only: If 1 or 2, the file mode is manipulated. In practice, only one
        of 4 modes is saved: 0755 (rwx), 0644 (rw), 0555 (rx), 0444 (r). On
        windows, mode is not set since all files are 'executable' by
        default.
    algo: Hashing algorithm used.

  Returns:
    The necessary data to create an entry in the 'files' section of an
    .isolated file.
  """
  out = {}
  # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
  # if prevdict.get('T') == True:
  #   # The file's content is ignored. Skip the time and hard code mode.
  #   out['s'] = 0
  #   out['h'] = algo().hexdigest()
  #   out['T'] = True
  #   return out

  # Always check the file stat and check if it is a link. The timestamp is used
  # to know if the file's content/symlink destination should be looked into.
  # E.g. only reuse from prevdict if the timestamp hasn't changed.
  # There is the risk of the file's timestamp being reset to its last value
  # manually while its content changed. We don't protect against that use case.
  try:
    filestats = os.lstat(filepath)
  except OSError:
    # The file is not present.
    raise MappingError('%s is missing' % filepath)
  is_link = stat.S_ISLNK(filestats.st_mode)

  if sys.platform != 'win32':
    # The file mode is ignored on Windows since it's not really useful there.
    filemode = stat.S_IMODE(filestats.st_mode)
    # Remove write access for group and all access to 'others'.
    filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
    if read_only:
      filemode &= ~stat.S_IWUSR
    if filemode & stat.S_IXUSR:
      filemode |= stat.S_IXGRP
    else:
      filemode &= ~stat.S_IXGRP
    if not is_link:
      out['m'] = filemode

  # Used to skip recalculating the hash or link destination. Use the most
  # recent update time.
  # TODO(maruel): Save it in the .state file instead of .isolated so the
  # .isolated file is deterministic.
  out['t'] = int(round(filestats.st_mtime))

  if not is_link:
    out['s'] = filestats.st_size
    # If the timestamp wasn't updated and the file size is still the same,
    # carry on the sha-1.
    if (prevdict.get('t') == out['t'] and
        prevdict.get('s') == out['s']):
      # Reuse the previous hash if available.
      out['h'] = prevdict.get('h')
    if not out.get('h'):
      out['h'] = hash_file(filepath, algo)
  else:
    # If the timestamp wasn't updated, carry on the link destination.
    if prevdict.get('t') == out['t']:
      # Reuse the previous link destination if available.
      out['l'] = prevdict.get('l')
    if out.get('l') is None:
      # The link could be in an incorrect path case. In practice, this only
      # happens on OSX on case insensitive HFS.
      # TODO(maruel): It'd be better if it was only done once, in
      # expand_directory_and_symlink(), so it would not be necessary to do it
      # again here.
      symlink_value = os.readlink(filepath)  # pylint: disable=E1101
      filedir = file_path.get_native_path_case(os.path.dirname(filepath))
      native_dest = file_path.fix_native_path_case(filedir, symlink_value)
      out['l'] = os.path.relpath(native_dest, filedir)
  return out

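
# Illustrative shape of the returned metadata (all values are hypothetical):
#
#   process_input('/src/out/foo.bin', {}, False, hashlib.sha1)
#   # -> {'m': 0644, 't': 1394000000, 's': 1024, 'h': '8b1a...'}
#
# Passing the previous entry as |prevdict| skips rehashing when the timestamp
# and size are unchanged.
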
def save_isolated(isolated, data):
  """Writes one or multiple .isolated files.

  Note: this reference implementation does not create child .isolated files so
  it always returns an empty list.

  Returns the list of child isolated files that are included by |isolated|.
  """
  # Make sure the data is valid .isolated data by 'reloading' it.
  algo = SUPPORTED_ALGOS[data['algo']]
  load_isolated(json.dumps(data), algo)
  tools.write_json(isolated, data, True)
  return []


def upload_tree(base_url, indir, infiles, namespace):
  """Uploads the given tree to the given url.

  Arguments:
    base_url: The base url, it is assumed that |base_url|/has/ can be used to
        query if an element was already uploaded, and |base_url|/store/
        can be used to upload a new element.
    indir: Root directory the infiles are based in.
    infiles: dict of files to upload from |indir| to |base_url|.
    namespace: The namespace to use on the server.
  """
  logging.info('upload_tree(indir=%s, files=%d)', indir, len(infiles))

  # Convert |indir| + |infiles| into a list of FileItem objects.
  # Filter out symlinks, since they are not represented by items on isolate
  # server side.
  items = [
      FileItem(
          path=os.path.join(indir, filepath),
          digest=metadata['h'],
          size=metadata['s'],
          high_priority=metadata.get('priority') == '0')
      for filepath, metadata in infiles.iteritems()
      if 'l' not in metadata
  ]

  with get_storage(base_url, namespace) as storage:
    storage.upload_items(items)
  return 0


def load_isolated(content, algo):
  """Verifies the .isolated file is valid and returns the deserialized json
  data.

  Arguments:
  - content: raw serialized content to load.
  - algo: hashlib algorithm class. Used to confirm the algorithm matches the
          algorithm used on the Isolate Server.
  """
  try:
    data = json.loads(content)
  except ValueError:
    raise ConfigError('Failed to parse: %s...' % content[:100])

  if not isinstance(data, dict):
    raise ConfigError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise ConfigError('Expected string, got %r' % value)
  try:
    version = tuple(map(int, value.split('.')))
  except ValueError:
    raise ConfigError('Expected valid version, got %r' % value)

  expected_version = tuple(map(int, ISOLATED_FILE_VERSION.split('.')))
  # Major version must match.
  if version[0] != expected_version[0]:
    raise ConfigError(
        'Expected compatible \'%s\' version, got %r' %
        (ISOLATED_FILE_VERSION, value))

  if algo is None:
    # TODO(maruel): Remove the default around Jan 2014.
    # Default to the algorithm used in the .isolated file itself; falls back
    # to 'sha-1' if unspecified.
    algo = SUPPORTED_ALGOS_REVERSE[data.get('algo', 'sha-1')]

  for key, value in data.iteritems():
    if key == 'algo':
      if not isinstance(value, basestring):
        raise ConfigError('Expected string, got %r' % value)
      if value not in SUPPORTED_ALGOS:
        raise ConfigError(
            'Expected one of \'%s\', got %r' %
            (', '.join(sorted(SUPPORTED_ALGOS)), value))
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise ConfigError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      if not isinstance(value, list):
        raise ConfigError('Expected list, got %r' % value)
      if not value:
        raise ConfigError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise ConfigError('Expected string, got %r' % subvalue)

    elif key == 'files':
      if not isinstance(value, dict):
        raise ConfigError('Expected dict, got %r' % value)
      for subkey, subvalue in value.iteritems():
        if not isinstance(subkey, basestring):
          raise ConfigError('Expected string, got %r' % subkey)
        if not isinstance(subvalue, dict):
          raise ConfigError('Expected dict, got %r' % subvalue)
        for subsubkey, subsubvalue in subvalue.iteritems():
          if subsubkey == 'l':
            if not isinstance(subsubvalue, basestring):
              raise ConfigError('Expected string, got %r' % subsubvalue)
          elif subsubkey == 'm':
            if not isinstance(subsubvalue, int):
              raise ConfigError('Expected int, got %r' % subsubvalue)
          elif subsubkey == 'h':
            if not is_valid_hash(subsubvalue, algo):
              raise ConfigError('Expected sha-1, got %r' % subsubvalue)
          elif subsubkey == 's':
            if not isinstance(subsubvalue, (int, long)):
              raise ConfigError('Expected int or long, got %r' % subsubvalue)
          else:
            raise ConfigError('Unknown subsubkey %s' % subsubkey)
        if bool('h' in subvalue) == bool('l' in subvalue):
          raise ConfigError(
              'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
              subvalue)
        if bool('h' in subvalue) != bool('s' in subvalue):
          raise ConfigError(
              'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
              subvalue)
        if bool('s' in subvalue) == bool('l' in subvalue):
          raise ConfigError(
              'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
              subvalue)
        if bool('l' in subvalue) and bool('m' in subvalue):
          raise ConfigError(
              'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
              subvalue)

    elif key == 'includes':
      if not isinstance(value, list):
        raise ConfigError('Expected list, got %r' % value)
      if not value:
        raise ConfigError('Expected non-empty includes list')
      for subvalue in value:
        if not is_valid_hash(subvalue, algo):
          raise ConfigError('Expected sha-1, got %r' % subvalue)

    elif key == 'os':
      if version >= (1, 4):
        raise ConfigError('Key \'os\' is not allowed starting version 1.4')

    elif key == 'read_only':
      if value not in (0, 1, 2):
        raise ConfigError('Expected 0, 1 or 2, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise ConfigError('Expected string, got %r' % value)

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise ConfigError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are
  # always in the native path format, someone could want to download an
  # .isolated tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data

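
# A minimal document that passes load_isolated() (the digest is hypothetical
# and must be a valid hex digest of the expected length):
#
#   {
#     'algo': 'sha-1',
#     'files': {'foo.txt': {'h': '<40 hex chars>', 's': 12, 'm': 0644}},
#     'version': '1.4',
#   }
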
class IsolatedFile(object):
  """Represents a single parsed .isolated file."""
  def __init__(self, obj_hash, algo):
    """|obj_hash| is really the sha-1 of the file."""
    logging.debug('IsolatedFile(%s)' % obj_hash)
    self.obj_hash = obj_hash
    self.algo = algo
    # Set once all the left-side of the tree is parsed. 'Tree' here means the
    # .isolate and all the .isolated files recursively included by it with
    # 'includes' key. The order of each sha-1 in 'includes', each representing
    # a .isolated file in the hash table, is important, as the later ones are
    # not processed until the first ones are retrieved and read.
    self.can_fetch = False

    # Raw data.
    self.data = {}
    # An IsolatedFile instance, one per object in self.data['includes'].
    self.children = []

    # Set once the .isolated file is loaded.
    self._is_parsed = False
    # Set once the files are fetched.
    self.files_fetched = False

  def load(self, content):
    """Verifies the .isolated file is valid and loads this object with the json
    data.
    """
    logging.debug('IsolatedFile.load(%s)' % self.obj_hash)
    assert not self._is_parsed
    self.data = load_isolated(content, self.algo)
    self.children = [
        IsolatedFile(i, self.algo) for i in self.data.get('includes', [])
    ]
    self._is_parsed = True

  def fetch_files(self, fetch_queue, files):
    """Adds files in this .isolated file not present in the |files| dictionary.

    Preemptively request files.

    Note that |files| is modified by this function.
    """
    assert self.can_fetch
    if not self._is_parsed or self.files_fetched:
      return
    logging.debug('fetch_files(%s)' % self.obj_hash)
    for filepath, properties in self.data.get('files', {}).iteritems():
      # Root isolated has priority on the files being mapped. In particular,
      # overridden files must not be fetched.
      if filepath not in files:
        files[filepath] = properties
        if 'h' in properties:
          # Preemptively request files.
          logging.debug('fetching %s' % filepath)
          fetch_queue.add(properties['h'], properties['s'], WorkerPool.MED)
    self.files_fetched = True


class Settings(object):
  """Results of a completely parsed .isolated file."""
  def __init__(self):
    self.command = []
    self.files = {}
    self.read_only = None
    self.relative_cwd = None
    # The main .isolated file, an IsolatedFile instance.
    self.root = None

  def load(self, fetch_queue, root_isolated_hash, algo):
    """Loads the .isolated and all the included .isolated asynchronously.

    It enables support for "included" .isolated files. They are processed in
    strict order but fetched asynchronously from the cache. This is important
    so that a file in an included .isolated file that is overridden by an
    embedding .isolated file is not fetched needlessly. The includes are
    fetched in one pass and the files are fetched as soon as all the ones on
    the left-side of the tree have been fetched.

    The prioritization is very important here for nested .isolated files.
    'includes' have the highest priority and the algorithm is optimized for
    both deep and wide trees. A deep one is a long link of .isolated files
    referenced one at a time by one item in 'includes'. A wide one has a large
    number of 'includes' in a single .isolated file. 'left' is defined as an
    included .isolated file earlier in the 'includes' list. So the order of
    the elements in 'includes' is important.
    """
    self.root = IsolatedFile(root_isolated_hash, algo)

    # Isolated files being retrieved now: hash -> IsolatedFile instance.
    pending = {}
    # Set of hashes of already retrieved items to refuse recursive includes.
    seen = set()

    def retrieve(isolated_file):
      h = isolated_file.obj_hash
      if h in seen:
        raise ConfigError('IsolatedFile %s is retrieved recursively' % h)
      assert h not in pending
      seen.add(h)
      pending[h] = isolated_file
      fetch_queue.add(h, priority=WorkerPool.HIGH)

    retrieve(self.root)

    while pending:
      item_hash = fetch_queue.wait(pending)
      item = pending.pop(item_hash)
      item.load(fetch_queue.cache.read(item_hash))
      if item_hash == root_isolated_hash:
        # It's the root item.
        item.can_fetch = True

      for new_child in item.children:
        retrieve(new_child)

      # Traverse the whole tree to see if files can now be fetched.
      self._traverse_tree(fetch_queue, self.root)

    def check(n):
      return all(check(x) for x in n.children) and n.files_fetched
    assert check(self.root)

    self.relative_cwd = self.relative_cwd or ''

  def _traverse_tree(self, fetch_queue, node):
    if node.can_fetch:
      if not node.files_fetched:
        self._update_self(fetch_queue, node)
      will_break = False
      for i in node.children:
        if not i.can_fetch:
          if will_break:
            break
          # Automatically mark the first one as fetchable.
          i.can_fetch = True
          will_break = True
        self._traverse_tree(fetch_queue, i)

  def _update_self(self, fetch_queue, node):
    node.fetch_files(fetch_queue, self.files)
    # Grabs properties.
    if not self.command and node.data.get('command'):
      # Ensure paths are correctly separated on windows.
      self.command = node.data['command']
      if self.command:
        self.command[0] = self.command[0].replace('/', os.path.sep)
        self.command = tools.fix_python_path(self.command)
    if self.read_only is None and node.data.get('read_only') is not None:
      self.read_only = node.data['read_only']
    if (self.relative_cwd is None and
        node.data.get('relative_cwd') is not None):
      self.relative_cwd = node.data['relative_cwd']

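
# Sketch of the resulting priority (the hashes are hypothetical): for a root
# .isolated with 'includes': [A, B], the root's own 'files' entries win, then
# A's, then B's; later duplicates are never fetched.
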
def fetch_isolated(
    isolated_hash, storage, cache, algo, outdir, require_command):
  """Aggressively downloads the .isolated file(s), then downloads all the
  files.

  Arguments:
    isolated_hash: hash of the root *.isolated file.
    storage: Storage class that communicates with isolate storage.
    cache: LocalCache class that knows how to store and map files locally.
    algo: hash algorithm to use.
    outdir: Output directory to map file tree to.
    require_command: Ensure *.isolated specifies a command to run.

  Returns:
    Settings object that holds details about loaded *.isolated file.
  """
  with cache:
    fetch_queue = FetchQueue(storage, cache)
    settings = Settings()

    with tools.Profiler('GetIsolateds'):
      # Optionally support local files by manually adding them to cache.
      if not is_valid_hash(isolated_hash, algo):
        isolated_hash = fetch_queue.inject_local_file(isolated_hash, algo)

      # Load all *.isolated and start loading rest of the files.
      settings.load(fetch_queue, isolated_hash, algo)
      if require_command and not settings.command:
        # TODO(vadimsh): All fetch operations are already enqueued and there's
        # no easy way to cancel them.
        raise ConfigError('No command to run')

    with tools.Profiler('GetRest'):
      # Create file system hierarchy.
      if not os.path.isdir(outdir):
        os.makedirs(outdir)
      create_directories(outdir, settings.files)
      create_symlinks(outdir, settings.files.iteritems())

      # Ensure working directory exists.
      cwd = os.path.normpath(os.path.join(outdir, settings.relative_cwd))
      if not os.path.isdir(cwd):
        os.makedirs(cwd)

      # Multimap: digest -> list of pairs (path, props).
      remaining = {}
      for filepath, props in settings.files.iteritems():
        if 'h' in props:
          remaining.setdefault(props['h'], []).append((filepath, props))

      # Now block on the remaining files to be downloaded and mapped.
      logging.info('Retrieving remaining files (%d of them)...',
          fetch_queue.pending_count)
      last_update = time.time()
      with threading_utils.DeadlockDetector(DEADLOCK_TIMEOUT) as detector:
        while remaining:
          detector.ping()

          # Wait for any item to finish fetching to cache.
          digest = fetch_queue.wait(remaining)

          # Link corresponding files to a fetched item in cache.
          for filepath, props in remaining.pop(digest):
            cache.hardlink(
                digest, os.path.join(outdir, filepath), props.get('m'))

          # Report progress.
          duration = time.time() - last_update
          if duration > DELAY_BETWEEN_UPDATES_IN_SECS:
            msg = '%d files remaining...' % len(remaining)
            print msg
            logging.info(msg)
            last_update = time.time()

      # Cache could evict some items we just tried to fetch, it's a fatal
      # error.
      if not fetch_queue.verify_all_cached():
        raise MappingError('Cache is too small to hold all requested files')
  return settings


def directory_to_metadata(root, algo, blacklist):
  """Returns the FileItem list and .isolated metadata for a directory."""
  root = file_path.get_native_path_case(root)
  metadata = dict(
      (relpath, process_input(os.path.join(root, relpath), {}, False, algo))
      for relpath in expand_directory_and_symlink(root, './', blacklist, True)
  )
  for v in metadata.itervalues():
    v.pop('t')
  items = [
      FileItem(
          path=os.path.join(root, relpath),
          digest=meta['h'],
          size=meta['s'],
          high_priority=relpath.endswith('.isolated'))
      for relpath, meta in metadata.iteritems() if 'h' in meta
  ]
  return items, metadata


def archive_files_to_storage(storage, algo, files, blacklist):
  """Stores every entry and returns the relevant data.

  Arguments:
    storage: a Storage object that communicates with the remote object store.
    algo: a hashlib class to hash content. Usually hashlib.sha1.
    files: list of file paths to upload. If a directory is specified, a
        .isolated file is created and its hash is returned.
    blacklist: function that returns True if a file should be omitted.
  """
  assert all(isinstance(i, unicode) for i in files), files
  if len(files) != len(set(map(os.path.abspath, files))):
    raise Error('Duplicate entries found.')

  results = []
  # The temporary directory is only created as needed.
  tempdir = None
  try:
    # TODO(maruel): Yield the files to a worker thread.
    items_to_upload = []
    for f in files:
      try:
        filepath = os.path.abspath(f)
        if os.path.isdir(filepath):
          # Uploading a whole directory.
          items, metadata = directory_to_metadata(filepath, algo, blacklist)

          # Create the .isolated file.
          if not tempdir:
            tempdir = tempfile.mkdtemp(prefix='isolateserver')
          handle, isolated = tempfile.mkstemp(dir=tempdir, suffix='.isolated')
          os.close(handle)
          data = {
              'algo': SUPPORTED_ALGOS_REVERSE[algo],
              'files': metadata,
              'version': ISOLATED_FILE_VERSION,
          }
          save_isolated(isolated, data)
          h = hash_file(isolated, algo)
          items_to_upload.extend(items)
          items_to_upload.append(
              FileItem(
                  path=isolated,
                  digest=h,
                  size=os.stat(isolated).st_size,
                  high_priority=True))
          results.append((h, f))

        elif os.path.isfile(filepath):
          h = hash_file(filepath, algo)
          items_to_upload.append(
              FileItem(
                  path=filepath,
                  digest=h,
                  size=os.stat(filepath).st_size,
                  high_priority=f.endswith('.isolated')))
          results.append((h, f))
        else:
          raise Error('%s is neither a file nor a directory.' % f)
      except OSError:
        raise Error('Failed to process %s.' % f)
    # Technically we would care about which files were uploaded but in
    # practice we don't.
    _uploaded_files = storage.upload_items(items_to_upload)
    return results
  finally:
    if tempdir:
      shutil.rmtree(tempdir)

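
# The returned value is a list of (hash, path) pairs, e.g. (digests are
# hypothetical):
#
#   [('2d6d1c...', u'out/data/'), ('f3a0b5...', u'out/bin/app')]
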
def archive(out, namespace, files, blacklist):
  if files == ['-']:
    files = sys.stdin.readlines()

  if not files:
    raise Error('Nothing to upload')

  files = [f.decode('utf-8') for f in files]
  algo = get_hash_algo(namespace)
  blacklist = tools.gen_blacklist(blacklist)
  with get_storage(out, namespace) as storage:
    results = archive_files_to_storage(storage, algo, files, blacklist)
  print('\n'.join('%s %s' % (r[0], r[1]) for r in results))


@subcommand.usage('<file1..fileN> or - to read from stdin')
def CMDarchive(parser, args):
  """Archives data to the server.

  If a directory is specified, a .isolated file is created and the whole
  directory is uploaded. Then this .isolated file can be included in another
  one to run commands.

  The command outputs each file that was processed with its content hash. For
  directories, the .isolated generated for the directory is listed as the
  directory entry itself.
  """
  add_isolate_server_options(parser, False)
  parser.add_option(
      '--blacklist',
      action='append', default=list(DEFAULT_BLACKLIST),
      help='List of regexp to use as blacklist filter when uploading '
           'directories')
  options, files = parser.parse_args(args)
  process_isolate_server_options(parser, options)
  try:
    archive(options.isolate_server, options.namespace, files, options.blacklist)
  except Error as e:
    parser.error(e.args[0])
  return 0


def CMDdownload(parser, args):
  """Downloads data from the server.

  It can either download individual files or a complete tree from a .isolated
  file.
  """
  add_isolate_server_options(parser, True)
  parser.add_option(
      '-i', '--isolated', metavar='HASH',
      help='hash of an isolated file, .isolated file content is discarded, use '
           '--file if you need it')
  parser.add_option(
      '-f', '--file', metavar='HASH DEST', default=[], action='append', nargs=2,
      help='hash and destination of a file, can be used multiple times')
  parser.add_option(
      '-t', '--target', metavar='DIR', default=os.getcwd(),
      help='destination directory')
  options, args = parser.parse_args(args)
  process_isolate_server_options(parser, options)
  if args:
    parser.error('Unsupported arguments: %s' % args)
  if bool(options.isolated) == bool(options.file):
    parser.error('Use one of --isolated or --file, and only one.')

  options.target = os.path.abspath(options.target)

  remote = options.isolate_server or options.indir
  with get_storage(remote, options.namespace) as storage:
    # Fetching individual files.
    if options.file:
      channel = threading_utils.TaskChannel()
      pending = {}
      for digest, dest in options.file:
        pending[digest] = dest
        storage.async_fetch(
            channel,
            WorkerPool.MED,
            digest,
            UNKNOWN_FILE_SIZE,
            functools.partial(file_write, os.path.join(options.target, dest)))
      while pending:
        fetched = channel.pull()
        dest = pending.pop(fetched)
        logging.info('%s: %s', fetched, dest)

    # Fetching whole isolated tree.
    if options.isolated:
      settings = fetch_isolated(
          isolated_hash=options.isolated,
          storage=storage,
          cache=MemoryCache(),
          algo=get_hash_algo(options.namespace),
          outdir=options.target,
          require_command=False)
      rel = os.path.join(options.target, settings.relative_cwd)
      print('To run this test please run from the directory %s:' % rel)
      print(' ' + ' '.join(settings.command))

  return 0


@subcommand.usage('<file1..fileN> or - to read from stdin')
def CMDhashtable(parser, args):
  """Archives data to a hashtable on the file system.

  If a directory is specified, a .isolated file is created and the whole
  directory is uploaded. Then this .isolated file can be included in another
  one to run commands.

  The command outputs each file that was processed with its content hash. For
  directories, the .isolated generated for the directory is listed as the
  directory entry itself.
  """
  add_outdir_options(parser)
  parser.add_option(
      '--blacklist',
      action='append', default=list(DEFAULT_BLACKLIST),
      help='List of regexp to use as blacklist filter when uploading '
           'directories')
  options, files = parser.parse_args(args)
  process_outdir_options(parser, options, os.getcwd())
  try:
    # Do not compress files when archiving to the file system.
    archive(options.outdir, 'default', files, options.blacklist)
  except Error as e:
    parser.error(e.args[0])
  return 0


def add_isolate_server_options(parser, add_indir):
  """Adds --isolate-server and --namespace options to parser.

  Includes --indir if desired.
  """
  parser.add_option(
      '-I', '--isolate-server',
      metavar='URL', default=os.environ.get('ISOLATE_SERVER', ''),
      help='URL of the Isolate Server to use. Defaults to the environment '
           'variable ISOLATE_SERVER if set. No need to specify https://, this '
           'is assumed.')
  parser.add_option(
      '--namespace', default='default-gzip',
      help='The namespace to use on the Isolate Server, default: %default')
  if add_indir:
    parser.add_option(
        '--indir', metavar='DIR',
        help='Directory used to store the hashtable instead of using an '
             'isolate server.')


def process_isolate_server_options(parser, options):
  """Processes the --isolate-server and --indir options and aborts if neither
  is specified.
  """
  has_indir = hasattr(options, 'indir')
  if not options.isolate_server:
    if not has_indir:
      parser.error('--isolate-server is required.')
    elif not options.indir:
      parser.error('Use one of --indir or --isolate-server.')
  else:
    if has_indir and options.indir:
      parser.error('Use only one of --indir or --isolate-server.')

  if options.isolate_server:
    parts = urlparse.urlparse(options.isolate_server, 'https')
    if parts.query:
      parser.error('--isolate-server doesn\'t support query parameter.')
    if parts.fragment:
      parser.error('--isolate-server doesn\'t support fragment in the url.')
    # urlparse('foo.com') will result in netloc='', path='foo.com', which is
    # not what is desired here.
    new = list(parts)
    if not new[1] and new[2]:
      new[1] = new[2].rstrip('/')
      new[2] = ''
    new[2] = new[2].rstrip('/')
    options.isolate_server = urlparse.urlunparse(new)
    return

  if file_path.is_url(options.indir):
    parser.error('Can\'t use a URL for --indir.')
  options.indir = unicode(options.indir).replace('/', os.path.sep)
  options.indir = os.path.abspath(
      os.path.normpath(os.path.join(os.getcwd(), options.indir)))
  if not os.path.isdir(options.indir):
    parser.error('Path given to --indir must exist.')

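
# Illustrative effect of the URL normalization above (values are hypothetical):
#
#   'foo.example.com'          -> 'https://foo.example.com'
#   'https://foo.example.com/' -> 'https://foo.example.com'
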
def add_outdir_options(parser):
  """Adds --outdir, which is orthogonal to --isolate-server.

  Note: On upload, separate commands are used between 'archive' and
  'hashtable'. On 'download', the same command can download from either an
  isolate server or a file system.
  """
  parser.add_option(
      '-o', '--outdir', metavar='DIR',
      help='Directory used to recreate the tree.')


def process_outdir_options(parser, options, cwd):
  if not options.outdir:
    parser.error('--outdir is required.')
  if file_path.is_url(options.outdir):
    parser.error('Can\'t use a URL for --outdir.')
  options.outdir = unicode(options.outdir).replace('/', os.path.sep)
  # outdir doesn't need native path case since tracing is never done from
  # there.
  options.outdir = os.path.abspath(
      os.path.normpath(os.path.join(cwd, options.outdir)))
  # In theory, we'd create the directory outdir right away. Defer doing it in
  # case there are errors in the command line.


class OptionParserIsolateServer(tools.OptionParserWithLogging):
  def __init__(self, **kwargs):
    tools.OptionParserWithLogging.__init__(
        self,
        version=__version__,
        prog=os.path.basename(sys.modules[__name__].__file__),
        **kwargs)
    auth.add_auth_options(self)

  def parse_args(self, *args, **kwargs):
    options, args = tools.OptionParserWithLogging.parse_args(
        self, *args, **kwargs)
    auth.process_auth_options(self, options)
    return options, args


def main(args):
  dispatcher = subcommand.CommandDispatcher(__name__)
  try:
    return dispatcher.execute(OptionParserIsolateServer(), args)
  except Exception as e:
    tools.report_error(e)
    return 1


if __name__ == '__main__':
  fix_encoding.fix_encoding()
  tools.disable_buffering()
  colorama.init()
  sys.exit(main(sys.argv[1:]))