Blame - isolateserver.py - chromium.googlesource.com/infra/luci/client-py

blob: 1e8030df609c4d536ca03dd46d7e64db167404a6 [file] [log] [blame]

maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	1	#!/usr/bin/env python
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	2	# Copyright 2013 The Chromium Authors. All rights reserved.
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	"""Archives a set of files to a server."""
				7
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	8	__version__ = '0.2'
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	9
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	10	import binascii
				11	import hashlib
vadimsh@chromium.org	53f8d5a	2013-06-19 13:03:55 +0000	[diff] [blame]	12	import itertools
maruel@chromium.org	4160164	2013-09-18 19:40:46 +0000	[diff] [blame]	13	import json
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	14	import logging
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	15	import os
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	16	import random
				17	import re
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	18	import sys
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	19	import threading
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	20	import time
maruel@chromium.org	e82112e	2013-04-24 14:41:55 +0000	[diff] [blame]	21	import urllib
csharp@chromium.org	59c7bcf	2012-11-21 21:13:18 +0000	[diff] [blame]	22	import zlib
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	23
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	24	from third_party import colorama
				25	from third_party.depot_tools import fix_encoding
				26	from third_party.depot_tools import subcommand
				27
vadimsh@chromium.org	6b70621	2013-08-28 15:03:46 +0000	[diff] [blame]	28	from utils import net
vadimsh@chromium.org	b074b16	2013-08-22 17:55:46 +0000	[diff] [blame]	29	from utils import threading_utils
vadimsh@chromium.org	a432647	2013-08-24 02:05:41 +0000	[diff] [blame]	30	from utils import tools
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	31
				32
# Default server.
# TODO(maruel): Chromium-specific.
ISOLATE_SERVER = 'https://isolateserver-dev.appspot.com/'


# Size threshold for uploading directly to the blobstore: content strictly
# larger than this many bytes is sent through a generated blobstore url.
MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 1024

# The number of files to check on the isolate server per /contains query.
# All files are sorted by likelihood of a change in the file content
# (currently file size is used to estimate this: the larger the file, the
# larger the possibility it has changed). Then the first
# ITEMS_PER_CONTAINS_QUERIES[0] files are taken and sent to '/contains', then
# the next ITEMS_PER_CONTAINS_QUERIES[1], and so on; the last entry is reused
# for all remaining batches. The numbers here are a trade-off: the more items
# per request, the lower the effect of HTTP round trip latency and TCP-level
# chattiness. On the other hand, larger values cause longer lookups,
# increasing the initial latency to start uploading, which is especially an
# issue for large files. These values are optimized for the "a few thousand
# files to look up with a minimal number of large files missing" case.
ITEMS_PER_CONTAINS_QUERIES = [20, 20, 50, 50, 50, 100]


# A list of already compressed extension types that should not receive any
# compression before being uploaded. Extensions are listed in lower case and
# without the leading dot.
ALREADY_COMPRESSED_TYPES = [
    '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
    'wav', 'zip'
]


# The file size to be used when we don't know the correct file size,
# generally used for .isolated files.
UNKNOWN_FILE_SIZE = None


# The size (in bytes) of each chunk to read when downloading and unzipping
# files.
ZIPPED_FILE_CHUNK = 16 * 1024


# Chunk size (in bytes) to use when doing disk I/O.
DISK_FILE_CHUNK = 1024 * 1024


# Read timeout in seconds for downloads from isolate storage. If there's no
# response from the server within this timeout, the whole download is aborted.
DOWNLOAD_READ_TIMEOUT = 60


# The delay (in seconds) to wait between logging statements when retrieving
# the required files. This is intended to let the user (or buildbot) know that
# the program is still running.
DELAY_BETWEEN_UPDATES_IN_SECS = 30
				85
				86
class ConfigError(ValueError):
  """Raised when a .isolated file cannot be loaded."""
				90
				91
				92	class MappingError(OSError):
				93	"""Failed to recreate the tree."""
				94	pass
				95
				96
def randomness():
  """Returns the current time as a string, a low-entropy seed for MIME encoding.

  Kept as a separate function so unit tests can mock it out.
  """
  now = time.time()
  return str(now)
				103
				104
def encode_multipart_formdata(fields, files,
                              mime_mapper=lambda _: 'application/octet-stream'):
  """Encodes a Multipart form data object.

  Unicode keys, filenames and values are encoded to UTF-8 before being
  written to the body.

  Args:
    fields: a sequence of (name, value) elements for regular form fields.
    files: a sequence of (name, filename, value) elements for data to be
           uploaded as files.
    mime_mapper: function to return the mime type from the filename.

  Returns:
    content_type: for httplib.HTTP instance
    body: for httplib.HTTP instance
  """
  boundary = hashlib.md5(randomness()).hexdigest()
  body_list = []
  for (key, value) in fields:
    # Each unicode component is encoded in place. Assigning the encoded key to
    # |value| would replace the field's payload with its own name.
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"' % key)
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  for (key, filename, value) in files:
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(filename, unicode):
      filename = filename.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
    body_list.append('Content-Type: %s' % mime_mapper(filename))
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  if body_list:
    # The second to last element is the final boundary line; the trailing '--'
    # turns it into the closing delimiter.
    body_list[-2] += '--'
  body = '\r\n'.join(body_list)
  content_type = 'multipart/form-data; boundary=%s' % boundary
  return content_type, body
				152
				153
def is_valid_hash(value, algo):
  """Tells whether |value| is a hex digest of the length produced by |algo|.

  |algo| is a hashlib-style constructor; the expected length is twice its
  digest_size since each byte is written as two hex characters.
  """
  hex_length = algo().digest_size * 2
  pattern = r'^[a-fA-F0-9]{%d}$' % hex_length
  return re.match(pattern, value) is not None
				158
				159
				160	def hash_file(filepath, algo):
				161	"""Calculates the hash of a file without reading it all in memory at once.
				162
				163	\|algo\| should be one of hashlib hashing algorithm.
				164	"""
				165	digest = algo()
maruel@chromium.org	037758d	2012-12-10 17:59:46 +0000	[diff] [blame]	166	with open(filepath, 'rb') as f:
				167	while True:
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	168	chunk = f.read(DISK_FILE_CHUNK)
maruel@chromium.org	037758d	2012-12-10 17:59:46 +0000	[diff] [blame]	169	if not chunk:
				170	break
				171	digest.update(chunk)
				172	return digest.hexdigest()
				173
				174
def file_read(filepath, chunk_size=DISK_FILE_CHUNK):
  """Generator yielding the content of |filepath| in pieces of at most
  |chunk_size| bytes. The file is opened in binary mode.
  """
  with open(filepath, 'rb') as stream:
    piece = stream.read(chunk_size)
    while piece:
      yield piece
      piece = stream.read(chunk_size)
				183
				184
def file_write(filepath, content_generator):
  """Writes file content as generated by content_generator.

  Creates the intermediary directory as needed. A |filepath| without any
  directory component is written relative to the current working directory.

  Arguments:
    filepath: path of the file to (over)write, opened in binary mode.
    content_generator: iterable of data chunks to write in order.

  Returns the number of bytes written.

  Meant to be mocked out in unit tests.
  """
  filedir = os.path.dirname(filepath)
  # dirname() is '' for a bare file name and os.makedirs('') raises OSError, so
  # only create the directory when there is one to create.
  if filedir and not os.path.isdir(filedir):
    os.makedirs(filedir)
  total = 0
  with open(filepath, 'wb') as f:
    for d in content_generator:
      total += len(d)
      f.write(d)
  return total
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	203
				204
def zip_compress(content_generator, level=7):
  """Generator that zlib-compresses the chunks read from |content_generator|.

  Only non-empty compressed chunks are yielded; the compressor is flushed once
  the input is exhausted so the stream is complete.
  """
  deflater = zlib.compressobj(level)
  for piece in content_generator:
    packed = deflater.compress(piece)
    if not packed:
      # The compressor buffered the input without emitting anything yet.
      continue
    yield packed
  remainder = deflater.flush(zlib.Z_FINISH)
  if remainder:
    yield remainder
				215
				216
				217	def get_zip_compression_level(filename):
				218	"""Given a filename calculates the ideal zip compression level to use."""
				219	file_ext = os.path.splitext(filename)[1].lower()
				220	# TODO(csharp): Profile to find what compression level works best.
				221	return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7
				222
				223
def create_directories(base_directory, files):
  """Creates under |base_directory| every directory needed to hold |files|.

  |files| are relative file paths. Directories are created with os.mkdir(),
  parents first, so an already existing directory raises OSError.
  """
  logging.debug('create_directories(%s, %d)', base_directory, len(files))
  # Collect each file's directory along with all of its ancestors.
  needed = set()
  for f in files:
    parent = os.path.dirname(f)
    while parent:
      needed.add(parent)
      parent = os.path.dirname(parent)
  # Sorting guarantees that a parent is created before its children.
  for relpath in sorted(needed):
    os.mkdir(os.path.join(base_directory, relpath))
				236
				237
				238	def create_links(base_directory, files):
				239	"""Creates any links needed by the given set of files."""
				240	for filepath, properties in files:
				241	if 'l' not in properties:
				242	continue
				243	if sys.platform == 'win32':
				244	# TODO(maruel): Create junctions or empty text files similar to what
				245	# cygwin do?
				246	logging.warning('Ignoring symlink %s', filepath)
				247	continue
				248	outfile = os.path.join(base_directory, filepath)
				249	# symlink doesn't exist on Windows. So the 'link' property should
				250	# never be specified for windows .isolated file.
				251	os.symlink(properties['l'], outfile) # pylint: disable=E1101
				252	if 'm' in properties:
				253	lchmod = getattr(os, 'lchmod', None)
				254	if lchmod:
				255	lchmod(outfile, properties['m'])
				256
				257
				258	def setup_commands(base_directory, cwd, cmd):
				259	"""Correctly adjusts and then returns the required working directory
				260	and command needed to run the test.
				261	"""
				262	assert not os.path.isabs(cwd), 'The cwd must be a relative path, got %s' % cwd
				263	cwd = os.path.join(base_directory, cwd)
				264	if not os.path.isdir(cwd):
				265	os.makedirs(cwd)
				266
				267	# Ensure paths are correctly separated on windows.
				268	cmd[0] = cmd[0].replace('/', os.path.sep)
				269	cmd = tools.fix_python_path(cmd)
				270
				271	return cwd, cmd
				272
				273
				274	def generate_remaining_files(files):
				275	"""Generates a dictionary of all the remaining files to be downloaded."""
				276	remaining = {}
				277	for filepath, props in files:
				278	if 'h' in props:
				279	remaining.setdefault(props['h'], []).append((filepath, props))
				280
				281	return remaining
				282
				283
def is_valid_file(filepath, size):
  """Tells whether the file at |filepath| looks valid.

  The only check done is comparing the size on disk against |size|; when
  |size| is UNKNOWN_FILE_SIZE the file is accepted without being looked at.
  """
  if size == UNKNOWN_FILE_SIZE:
    return True
  actual_size = os.stat(filepath).st_size
  if size == actual_size:
    return True
  logging.warning(
      'Found invalid item %s; %d != %d',
      os.path.basename(filepath), actual_size, size)
  return False
				298
				299
def try_remove(filepath):
  """Deletes |filepath|, silently ignoring any OSError (e.g. missing file)."""
  try:
    os.remove(filepath)
  except OSError:
    # Best effort only: the caller doesn't care whether the file was there.
    return
				306
				307
def url_read(url, **kwargs):
  """Wrapper around net.url_read() that raises instead of returning None.

  Raises MappingError when net.url_read() returns None, which is treated as
  the server being unreachable.
  """
  response = net.url_read(url, **kwargs)
  if response is not None:
    return response
  # If we get no response from the server, assume it is down and raise an
  # exception.
  raise MappingError('Unable to connect to server %s' % url)
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	315
				316
class StorageApi(object):
  """Base class for objects able to download and upload files.

  Subclasses implement fetch(), push() and check_missing_files(); the batching
  and asynchronous helpers defined here are built on top of those.
  """

  def __init__(self):
    # Pool used by async_push(); must be provided through set_pool().
    self._pool = None

  def set_pool(self, pool):
    """Sets WorkerPool that can be used for parallel uploads."""
    self._pool = pool

  def fetch(self, item, expected_size):
    """Fetches an object and yields its content."""
    raise NotImplementedError()

  def push(self, item, expected_size, content_generator, push_urls=None):
    """Uploads content generated by |content_generator| as |item|."""
    raise NotImplementedError()

  def check_missing_files(self, files):
    """Checks for existence of given |files| on the server.

    Arguments:
      files: list of pairs (file name, metadata dict).

    Returns:
      A list of files missing on server as a list of triplets
      (file name, metadata dict, push_urls object to pass to push).
    """
    raise NotImplementedError()

  def get_missing_files(self, files):
    """Yields files that are missing from the server.

    Issues multiple parallel queries via check_missing_files method calls.

    Arguments:
      files: a dictionary file name -> metadata dict.

    Yields:
      Triplets (file name, metadata dict, push_urls object to pass to push).
    """
    # TODO(maruel, vadimsh): Reuse self._pool here.
    with threading_utils.ThreadPool(
        1, 16, 0, prefix='get_missing_files') as pool:
      for batch in self.batch_files_for_check(files):
        pool.add_task(0, self.check_missing_files, batch)
      # Each result is the list returned by one check_missing_files() call.
      for batch_result in pool.iter_results():
        for missing in batch_result:
          yield missing

  def async_push(self, priority, item, expected_size,
                 content_generator, push_urls=None):
    """Starts asynchronous push to the server in a parallel thread."""
    # TODO(vadimsh): Implement streaming uploads. Before it's done, assemble
    # content right here. It will block until all file is zipped.
    payload = ''.join(content_generator)
    self._pool.add_task(
        priority, self.push, item, expected_size, [payload], push_urls)

  @staticmethod
  def batch_files_for_check(files):
    """Splits list of files to check for existence on the server into batches.

    Each batch corresponds to a single 'exists?' query to the server via a call
    to check_missing_files method. Files without a size ('s' key) are skipped
    and the others are processed largest first. Batch sizes follow
    ITEMS_PER_CONTAINS_QUERIES, its last value being reused once exhausted.

    Arguments:
      files: a dictionary file name -> metadata dict.

    Yields:
      Batches of files to query for existence in a single operation,
      each batch is a list of pairs: (file name, metadata dict).
    """
    last_limit_index = len(ITEMS_PER_CONTAINS_QUERIES) - 1
    sized = [(name, meta) for name, meta in files.iteritems() if 's' in meta]
    sized.sort(key=lambda pair: -pair[1]['s'])

    batch = []
    batch_index = 0
    for pair in sized:
      batch.append(pair)
      limit = ITEMS_PER_CONTAINS_QUERIES[min(batch_index, last_limit_index)]
      if len(batch) == limit:
        yield batch
        batch = []
        batch_index += 1
    if batch:
      yield batch
				402
				403
				404	class IsolateServer(StorageApi):
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	405	"""Client class to download or upload to Isolate Server."""
maruel@chromium.org	3e42ce8	2013-09-12 18:36:59 +0000	[diff] [blame]	406	def __init__(self, base_url, namespace):
vadimsh@chromium.org	35122be	2013-09-19 02:48:00 +0000	[diff] [blame^]	407	super(IsolateServer, self).__init__()
maruel@chromium.org	3e42ce8	2013-09-12 18:36:59 +0000	[diff] [blame]	408	assert base_url.startswith('http'), base_url
				409	self.content_url = base_url.rstrip('/') + '/content/'
				410	self.namespace = namespace
vadimsh@chromium.org	35122be	2013-09-19 02:48:00 +0000	[diff] [blame^]	411	self.algo = get_hash_algo(namespace)
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	412	self._token = None
				413	self._lock = threading.Lock()
				414
				415	@property
				416	def token(self):
maruel@chromium.org	3e42ce8	2013-09-12 18:36:59 +0000	[diff] [blame]	417	# TODO(maruel): Make this request much earlier asynchronously while the
				418	# files are being enumerated.
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	419	with self._lock:
				420	if not self._token:
				421	self._token = urllib.quote(url_read(self.content_url + 'get_token'))
				422	return self._token
				423
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	424	def fetch(self, item, expected_size):
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame]	425	"""Fetches an object and yields its content."""
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	426	assert isinstance(item, basestring)
				427	assert (
				428	isinstance(expected_size, (int, long)) or
				429	expected_size == UNKNOWN_FILE_SIZE)
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame]	430	zipped_url = '%sretrieve/%s/%s' % (self.content_url, self.namespace, item)
				431	logging.debug('download_file(%s)', zipped_url)
				432
				433	# Because the app engine DB is only eventually consistent, retry 404 errors
				434	# because the file might just not be visible yet (even though it has been
				435	# uploaded).
				436	connection = net.url_open(
				437	zipped_url, retry_404=True, read_timeout=DOWNLOAD_READ_TIMEOUT)
				438	if not connection:
				439	raise IOError('Unable to open connection to %s' % zipped_url)
				440
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	441	# TODO(maruel): Must only decompress when needed.
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame]	442	decompressor = zlib.decompressobj()
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	443	try:
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame]	444	compressed_size = 0
				445	decompressed_size = 0
				446	while True:
				447	chunk = connection.read(ZIPPED_FILE_CHUNK)
				448	if not chunk:
				449	break
				450	compressed_size += len(chunk)
				451	decompressed = decompressor.decompress(chunk)
				452	decompressed_size += len(decompressed)
				453	yield decompressed
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	454
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	455	# Ensure that all the data was properly decompressed.
				456	uncompressed_data = decompressor.flush()
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame]	457	if uncompressed_data:
				458	raise IOError('Decompression failed')
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	459	if (expected_size != UNKNOWN_FILE_SIZE and
				460	decompressed_size != expected_size):
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame]	461	raise IOError('File incorrect size after download of %s. Got %s and '
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	462	'expected %s' % (item, decompressed_size, expected_size))
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	463	except zlib.error as e:
				464	msg = 'Corrupted zlib for item %s. Processed %d of %s bytes.\n%s' % (
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame]	465	item, compressed_size, connection.content_length, e)
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	466	logging.warning(msg)
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	467
				468	# Testing seems to show that if a few machines are trying to download
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame]	469	# the same blob, they can cause each other to fail. So if we hit a zip
				470	# error, this is the most likely cause (it only downloads some of the
				471	# data). Randomly sleep for between 5 and 25 seconds to try and spread
				472	# out the downloads.
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	473	sleep_duration = (random.random() * 20) + 5
				474	time.sleep(sleep_duration)
				475	raise IOError(msg)
maruel@chromium.org	c2bfef4	2013-08-30 21:46:26 +0000	[diff] [blame]	476
vadimsh@chromium.org	35122be	2013-09-19 02:48:00 +0000	[diff] [blame^]	477	def push(self, item, expected_size, content_generator, push_urls=None):
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	478	"""Uploads content generated by \|content_generator\| as \|item\| to the remote
				479	isolate server.
				480	"""
				481	assert isinstance(item, basestring)
				482	assert isinstance(expected_size, int) or expected_size == UNKNOWN_FILE_SIZE
				483	item = str(item)
				484	# TODO(maruel): Support large files. This would require streaming support.
				485	content = ''.join(content_generator)
maruel@chromium.org	3e42ce8	2013-09-12 18:36:59 +0000	[diff] [blame]	486	if len(content) > MIN_SIZE_FOR_DIRECT_BLOBSTORE:
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	487	return self._upload_hash_content_to_blobstore(item, content)
maruel@chromium.org	d1e20c9	2013-09-17 20:54:26 +0000	[diff] [blame]	488
				489	url = '%sstore/%s/%s?token=%s' % (
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	490	self.content_url, self.namespace, item, self.token)
maruel@chromium.org	d1e20c9	2013-09-17 20:54:26 +0000	[diff] [blame]	491	return url_read(
				492	url, data=content, content_type='application/octet-stream')
				493
vadimsh@chromium.org	35122be	2013-09-19 02:48:00 +0000	[diff] [blame^]	494	def check_missing_files(self, files):
				495	"""Checks for existence of given \|files\| on the server."""
				496	logging.info('Checking existence of %d files...', len(files))
				497
				498	body = ''.join(
				499	(binascii.unhexlify(metadata['h']) for (_, metadata) in files))
				500	assert (len(body) % self.algo().digest_size) == 0, repr(body)
				501
				502	query_url = '%scontains/%s?token=%s' % (
				503	self.content_url, self.namespace, self.token)
				504	response = url_read(
				505	query_url, data=body, content_type='application/octet-stream')
				506	if len(files) != len(response):
				507	raise MappingError(
				508	'Got an incorrect number of responses from the server. Expected %d, '
				509	'but got %d' % (len(files), len(response)))
				510
				511	# This implementation of IsolateServer doesn't use push_urls field,
				512	# set it to None.
				513	missing_files = [
				514	files[i] + (None,) for i, flag in enumerate(response) if flag == '\x00'
				515	]
				516	logging.info('Queried %d files, %d cache hit',
				517	len(files), len(files) - len(missing_files))
				518	return missing_files
				519
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	520	def _upload_hash_content_to_blobstore(self, item, content):
maruel@chromium.org	d1e20c9	2013-09-17 20:54:26 +0000	[diff] [blame]	521	"""Uploads the content directly to the blobstore via a generated url."""
				522	# TODO(maruel): Support large files. This would require streaming support.
				523	gen_url = '%sgenerate_blobstore_url/%s/%s' % (
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	524	self.content_url, self.namespace, item)
maruel@chromium.org	d1e20c9	2013-09-17 20:54:26 +0000	[diff] [blame]	525	# Token is guaranteed to be already quoted but it is unnecessary here, and
				526	# only here.
				527	data = [('token', urllib.unquote(self.token))]
				528	content_type, body = encode_multipart_formdata(
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	529	data, [('content', item, content)])
maruel@chromium.org	d1e20c9	2013-09-17 20:54:26 +0000	[diff] [blame]	530	last_url = gen_url
				531	for _ in net.retry_loop(max_attempts=net.URL_OPEN_MAX_ATTEMPTS):
				532	# Retry HTTP 50x here but not 404.
				533	upload_url = net.url_read(gen_url, data=data)
				534	if not upload_url:
				535	raise MappingError('Unable to connect to server %s' % gen_url)
				536	last_url = upload_url
				537
				538	# Do not retry this request on HTTP 50x. Regenerate an upload url each
				539	# time since uploading "consumes" the upload url.
				540	result = net.url_read(
				541	upload_url, data=body, content_type=content_type, retry_50x=False)
				542	if result is not None:
				543	return result
				544	raise MappingError('Unable to connect to server %s' % last_url)
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	545
				546
class FileSystem(StorageApi):
  """Fetches data from the file system.

  The common use case is a NFS/CIFS file server that is mounted locally that is
  used to fetch the file on a local partition.
  """

  def __init__(self, base_path):
    super(FileSystem, self).__init__()
    self.base_path = base_path

  def fetch(self, item, expected_size):
    """Returns a generator yielding the content of |item|.

    Raises IOError when |expected_size| is known and doesn't match the size of
    the file found under base_path.
    """
    assert isinstance(item, basestring)
    # Sizes may be long as well as int, as accepted by IsolateServer.fetch().
    assert (
        isinstance(expected_size, (int, long)) or
        expected_size == UNKNOWN_FILE_SIZE)
    source = os.path.join(self.base_path, item)
    if (expected_size != UNKNOWN_FILE_SIZE and
        not is_valid_file(source, expected_size)):
      raise IOError('Invalid file %s' % item)
    return file_read(source)

  def push(self, item, expected_size, content_generator, push_urls=None):
    """Writes the content generated by |content_generator| as |item|.

    When |expected_size| is known and differs from the number of bytes
    written, the file is deleted and IOError is raised. |push_urls| is unused.
    """
    assert isinstance(item, basestring)
    assert (
        isinstance(expected_size, (int, long)) or
        expected_size == UNKNOWN_FILE_SIZE)
    dest = os.path.join(self.base_path, item)
    total = file_write(dest, content_generator)
    if expected_size != UNKNOWN_FILE_SIZE and total != expected_size:
      # Don't leave a truncated or oversized file behind.
      os.remove(dest)
      raise IOError(
          'Invalid file %s, %d != %d' % (item, total, expected_size))

  def check_missing_files(self, files):
    """Returns (file name, metadata dict, None) for each of |files| whose hash
    ('h' key) has no corresponding entry under base_path.
    """
    return [
        (filename, metadata, None)
        for filename, metadata in files
        if not os.path.exists(os.path.join(self.base_path, metadata['h']))
    ]
				582
				583
				584	def get_hash_algo(_namespace):
				585	"""Return hash algorithm class to use when uploading to given \|namespace\|."""
				586	# TODO(vadimsh): Implement this at some point.
				587	return hashlib.sha1
				588
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	589
				590	def get_storage_api(file_or_url, namespace):
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	591	"""Returns an object that implements .fetch() and .push()."""
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	592	if re.match(r'^https?://.+$', file_or_url):
				593	return IsolateServer(file_or_url, namespace)
				594	else:
				595	return FileSystem(file_or_url)
				596
				597
class WorkerPool(threading_utils.AutoRetryThreadPool):
  """Auto-retrying thread pool preconfigured for remote operations: tasks are
  retried on IOError.
  """
  # Initial and maximum number of worker threads.
  INITIAL_WORKERS = 2
  MAX_WORKERS = 16
  # Passed right after the exception list; presumably the number of retries
  # per task -- confirm against AutoRetryThreadPool.
  RETRIES = 5

  def __init__(self):
    # NOTE(review): the meaning of the positional arguments is defined by
    # AutoRetryThreadPool, which is not visible here.
    retried_exceptions = [IOError]
    super(WorkerPool, self).__init__(
        retried_exceptions,
        self.RETRIES,
        self.INITIAL_WORKERS,
        self.MAX_WORKERS,
        0,
        'remote')
maruel@chromium.org	dedbf49	2013-09-12 20:42:11 +0000	[diff] [blame]	615
maruel@chromium.org	dedbf49	2013-09-12 20:42:11 +0000	[diff] [blame]	616
def upload_tree(base_url, indir, infiles, namespace):
    """Uploads the given tree to the given url.

    Arguments:
      base_url:  The base url, it is assumed that |base_url|/has/ can be used
                 to query if an element was already uploaded, and
                 |base_url|/store/ can be used to upload a new element.
      indir:     Root directory the infiles are based in.
      infiles:   dict mapping each file path, relative to |indir|, to its
                 metadata dict. 'h' is required for files that get uploaded;
                 's' and 'priority' are optional.
      namespace: The namespace to use on the server.

    Returns:
      0, always.
    """
    logging.info(
        'upload tree(base_url=%s, indir=%s, files=%d)',
        base_url, indir, len(infiles))

    # Create a pool of workers to zip and upload any files missing from
    # the server.
    cpus = max(threading_utils.num_processors(), 2)
    # (filename, metadata) of every file that was queued for upload.
    uploaded = []

    with WorkerPool() as upload_pool:
        remote = get_storage_api(base_url, namespace)
        remote.set_pool(upload_pool)

        def zip_and_trigger_push(filename, metadata, push_urls):
            """Reads the file, zips it and triggers the push to the storage."""
            # TODO(csharp): Fix crbug.com/150823 and enable the touched logic
            # again.
            path = os.path.join(indir, filename)
            content_generator = zip_compress(
                file_read(path, ZIPPED_FILE_CHUNK),
                get_zip_compression_level(path))
            if metadata.get('priority', '1') == '0':
                priority = WorkerPool.HIGH
            else:
                priority = WorkerPool.MED
            # The size of the zipped content is not known up front.
            return remote.async_push(
                priority, metadata['h'], UNKNOWN_FILE_SIZE,
                content_generator, push_urls)

        with threading_utils.ThreadPool(2, cpus, 0, 'zip') as zip_pool:
            missing = remote.get_missing_files(infiles)
            for filename, metadata, push_urls in missing:
                zip_pool.add_task(
                    0, zip_and_trigger_push, filename, metadata, push_urls)
                uploaded.append((filename, metadata))
            logging.info('Waiting for all files to finish zipping')
            zip_pool.join()

        logging.info('All files zipped.')
        upload_pool.join()

    logging.info('All files are uploaded')

    def percent(part, whole):
        # An empty |infiles| makes both totals zero; report 0 instead of
        # raising ZeroDivisionError.
        return part * 100. / whole if whole else 0

    total = len(infiles)
    total_size = sum(
        metadata.get('s', 0) for metadata in infiles.itervalues())
    logging.info('Total: %6d, %9.1fkb', total, total_size / 1024.)

    # Files the server already had are the ones that were not queued.
    cache_hit = set(infiles) - set(name for name, _ in uploaded)
    cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
    logging.info(
        'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
        len(cache_hit),
        cache_hit_size / 1024.,
        percent(len(cache_hit), total),
        percent(cache_hit_size, total_size))

    cache_miss = uploaded
    cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
    logging.info(
        'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
        len(cache_miss),
        cache_miss_size / 1024.,
        percent(len(cache_miss), total),
        percent(cache_miss_size, total_size))
    return 0
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	690
				691
class MemoryCache(object):
    """In-memory stand-in usable everywhere the Cache class is.

    Nothing is downloaded to a cache: the content of every fetched item is
    kept in memory so it can be stored in the target directory directly.
    """

    def __init__(self, target_directory, pool, remote):
        self.remote = remote
        self.pool = pool
        self.target_directory = target_directory
        # Maps item -> its whole content.
        self._contents = {}
        # Guards self._contents; _store() is handed to |pool| as a task.
        self._lock = threading.Lock()

    def retrieve(self, priority, item, size):
        """Queues the fetch of |item| on the pool with the given priority."""
        self.pool.add_task(priority, self._store, item, size)

    def wait_for(self, items):
        """Blocks until at least one of |items| is retrieved.

        Returns the first item found: one whose content is already held if
        there is any, otherwise the first pool result that is in |items|.
        """
        with self._lock:
            # Flush items already present.
            for candidate in items:
                if candidate in self._contents:
                    return candidate

        # Pool results that were not asked for are dropped.
        downloaded = self.pool.get_one_result()
        while downloaded not in items:
            downloaded = self.pool.get_one_result()
        return downloaded

    def path(self, item):
        """Returns the location of |item| inside the target directory."""
        return os.path.join(self.target_directory, item)

    def read(self, item):
        """Returns the stored content of |item|; KeyError if not stored."""
        return self._contents[item]

    def _store(self, item, size):
        """Fetches |item| from the remote, keeps its content, returns |item|."""
        content = ''.join(self.remote.fetch(item, size))
        with self._lock:
            self._contents[item] = content
        return item

    def __enter__(self):
        return self

    def __exit__(self, _exc_type, _exec_value, _traceback):
        # Never swallow the exception raised inside the with block.
        return False
				743
				744
				745	def load_isolated(content, os_flavor, algo):
				746	"""Verifies the .isolated file is valid and loads this object with the json
				747	data.
				748	"""
				749	try:
				750	data = json.loads(content)
				751	except ValueError:
				752	raise ConfigError('Failed to parse: %s...' % content[:100])
				753
				754	if not isinstance(data, dict):
				755	raise ConfigError('Expected dict, got %r' % data)
				756
				757	for key, value in data.iteritems():
				758	if key == 'command':
				759	if not isinstance(value, list):
				760	raise ConfigError('Expected list, got %r' % value)
				761	if not value:
				762	raise ConfigError('Expected non-empty command')
				763	for subvalue in value:
				764	if not isinstance(subvalue, basestring):
				765	raise ConfigError('Expected string, got %r' % subvalue)
				766
				767	elif key == 'files':
				768	if not isinstance(value, dict):
				769	raise ConfigError('Expected dict, got %r' % value)
				770	for subkey, subvalue in value.iteritems():
				771	if not isinstance(subkey, basestring):
				772	raise ConfigError('Expected string, got %r' % subkey)
				773	if not isinstance(subvalue, dict):
				774	raise ConfigError('Expected dict, got %r' % subvalue)
				775	for subsubkey, subsubvalue in subvalue.iteritems():
				776	if subsubkey == 'l':
				777	if not isinstance(subsubvalue, basestring):
				778	raise ConfigError('Expected string, got %r' % subsubvalue)
				779	elif subsubkey == 'm':
				780	if not isinstance(subsubvalue, int):
				781	raise ConfigError('Expected int, got %r' % subsubvalue)
				782	elif subsubkey == 'h':
				783	if not is_valid_hash(subsubvalue, algo):
				784	raise ConfigError('Expected sha-1, got %r' % subsubvalue)
				785	elif subsubkey == 's':
				786	if not isinstance(subsubvalue, int):
				787	raise ConfigError('Expected int, got %r' % subsubvalue)
				788	else:
				789	raise ConfigError('Unknown subsubkey %s' % subsubkey)
				790	if bool('h' in subvalue) and bool('l' in subvalue):
				791	raise ConfigError(
				792	'Did not expect both \'h\' (sha-1) and \'l\' (link), got: %r' %
				793	subvalue)
				794
				795	elif key == 'includes':
				796	if not isinstance(value, list):
				797	raise ConfigError('Expected list, got %r' % value)
				798	if not value:
				799	raise ConfigError('Expected non-empty includes list')
				800	for subvalue in value:
				801	if not is_valid_hash(subvalue, algo):
				802	raise ConfigError('Expected sha-1, got %r' % subvalue)
				803
				804	elif key == 'read_only':
				805	if not isinstance(value, bool):
				806	raise ConfigError('Expected bool, got %r' % value)
				807
				808	elif key == 'relative_cwd':
				809	if not isinstance(value, basestring):
				810	raise ConfigError('Expected string, got %r' % value)
				811
				812	elif key == 'os':
				813	if os_flavor and value != os_flavor:
				814	raise ConfigError(
				815	'Expected \'os\' to be \'%s\' but got \'%s\'' %
				816	(os_flavor, value))
				817
				818	else:
				819	raise ConfigError('Unknown key %s' % key)
				820
				821	return data
				822
				823
				824	class IsolatedFile(object):
				825	"""Represents a single parsed .isolated file."""
				826	def __init__(self, obj_hash, algo):
				827	"""\|obj_hash\| is really the sha-1 of the file."""
				828	logging.debug('IsolatedFile(%s)' % obj_hash)
				829	self.obj_hash = obj_hash
				830	self.algo = algo
				831	# Set once all the left-side of the tree is parsed. 'Tree' here means the
				832	# .isolate and all the .isolated files recursively included by it with
				833	# 'includes' key. The order of each sha-1 in 'includes', each representing a
				834	# .isolated file in the hash table, is important, as the later ones are not
				835	# processed until the firsts are retrieved and read.
				836	self.can_fetch = False
				837
				838	# Raw data.
				839	self.data = {}
				840	# A IsolatedFile instance, one per object in self.includes.
				841	self.children = []
				842
				843	# Set once the .isolated file is loaded.
				844	self._is_parsed = False
				845	# Set once the files are fetched.
				846	self.files_fetched = False
				847
				848	def load(self, content):
				849	"""Verifies the .isolated file is valid and loads this object with the json
				850	data.
				851	"""
				852	logging.debug('IsolatedFile.load(%s)' % self.obj_hash)
				853	assert not self._is_parsed
				854	self.data = load_isolated(content, None, self.algo)
				855	self.children = [
				856	IsolatedFile(i, self.algo) for i in self.data.get('includes', [])
				857	]
				858	self._is_parsed = True
				859
				860	def fetch_files(self, cache, files):
				861	"""Adds files in this .isolated file not present in \|files\| dictionary.
				862
				863	Preemptively request files.
				864
				865	Note that \|files\| is modified by this function.
				866	"""
				867	assert self.can_fetch
				868	if not self._is_parsed or self.files_fetched:
				869	return
				870	logging.debug('fetch_files(%s)' % self.obj_hash)
				871	for filepath, properties in self.data.get('files', {}).iteritems():
				872	# Root isolated has priority on the files being mapped. In particular,
				873	# overriden files must not be fetched.
				874	if filepath not in files:
				875	files[filepath] = properties
				876	if 'h' in properties:
				877	# Preemptively request files.
				878	logging.debug('fetching %s' % filepath)
				879	cache.retrieve(
				880	WorkerPool.MED,
				881	properties['h'],
				882	properties['s'])
				883	self.files_fetched = True
				884
				885
				886	class Settings(object):
				887	"""Results of a completely parsed .isolated file."""
				888	def __init__(self):
				889	self.command = []
				890	self.files = {}
				891	self.read_only = None
				892	self.relative_cwd = None
				893	# The main .isolated file, a IsolatedFile instance.
				894	self.root = None
				895
				896	def load(self, cache, root_isolated_hash, algo):
				897	"""Loads the .isolated and all the included .isolated asynchronously.
				898
				899	It enables support for "included" .isolated files. They are processed in
				900	strict order but fetched asynchronously from the cache. This is important so
				901	that a file in an included .isolated file that is overridden by an embedding
				902	.isolated file is not fetched needlessly. The includes are fetched in one
				903	pass and the files are fetched as soon as all the ones on the left-side
				904	of the tree were fetched.
				905
				906	The prioritization is very important here for nested .isolated files.
				907	'includes' have the highest priority and the algorithm is optimized for both
				908	deep and wide trees. A deep one is a long link of .isolated files referenced
				909	one at a time by one item in 'includes'. A wide one has a large number of
				910	'includes' in a single .isolated file. 'left' is defined as an included
				911	.isolated file earlier in the 'includes' list. So the order of the elements
				912	in 'includes' is important.
				913	"""
				914	self.root = IsolatedFile(root_isolated_hash, algo)
				915
				916	# Isolated files being retrieved now: hash -> IsolatedFile instance.
				917	pending = {}
				918	# Set of hashes of already retrieved items to refuse recursive includes.
				919	seen = set()
				920
				921	def retrieve(isolated_file):
				922	h = isolated_file.obj_hash
				923	if h in seen:
				924	raise ConfigError('IsolatedFile %s is retrieved recursively' % h)
				925	assert h not in pending
				926	seen.add(h)
				927	pending[h] = isolated_file
				928	cache.retrieve(WorkerPool.HIGH, h, UNKNOWN_FILE_SIZE)
				929
				930	retrieve(self.root)
				931
				932	while pending:
				933	item_hash = cache.wait_for(pending)
				934	item = pending.pop(item_hash)
				935	item.load(cache.read(item_hash))
				936	if item_hash == root_isolated_hash:
				937	# It's the root item.
				938	item.can_fetch = True
				939
				940	for new_child in item.children:
				941	retrieve(new_child)
				942
				943	# Traverse the whole tree to see if files can now be fetched.
				944	self._traverse_tree(cache, self.root)
				945
				946	def check(n):
				947	return all(check(x) for x in n.children) and n.files_fetched
				948	assert check(self.root)
				949
				950	self.relative_cwd = self.relative_cwd or ''
				951	self.read_only = self.read_only or False
				952
				953	def _traverse_tree(self, cache, node):
				954	if node.can_fetch:
				955	if not node.files_fetched:
				956	self._update_self(cache, node)
				957	will_break = False
				958	for i in node.children:
				959	if not i.can_fetch:
				960	if will_break:
				961	break
				962	# Automatically mark the first one as fetcheable.
				963	i.can_fetch = True
				964	will_break = True
				965	self._traverse_tree(cache, i)
				966
				967	def _update_self(self, cache, node):
				968	node.fetch_files(cache, self.files)
				969	# Grabs properties.
				970	if not self.command and node.data.get('command'):
				971	self.command = node.data['command']
				972	if self.read_only is None and node.data.get('read_only') is not None:
				973	self.read_only = node.data['read_only']
				974	if (self.relative_cwd is None and
				975	node.data.get('relative_cwd') is not None):
				976	self.relative_cwd = node.data['relative_cwd']
				977
				978
@subcommand.usage('<file1..fileN> or - to read from stdin')
def CMDarchive(parser, args):
    """Archives data to the server."""
    # Uploads the files named in |args|, or the files listed one per line on
    # stdin when |args| is exactly ['-']. On success, prints one
    # '<hash> <file>' line per file and returns what upload_tree() returned.
    options, files = parser.parse_args(args)

    if files == ['-']:
        # Lines read from stdin keep their terminator, and os.stat() would
        # then fail on 'name\n'. Strip it and skip blank lines.
        stripped = (line.rstrip('\r\n') for line in sys.stdin)
        files = [f for f in stripped if f]

    if not files:
        parser.error('Nothing to upload')
    if not options.isolate_server:
        parser.error('Nowhere to send. Please specify --isolate-server')

    # Load the necessary metadata. This is going to be rewritten eventually to
    # be more efficient.
    # Ask the shared helper for the algorithm instead of hard-coding it, so
    # this command follows whatever the namespace requires.
    algo = get_hash_algo(options.namespace)
    infiles = dict(
        (
            f,
            {
                's': os.stat(f).st_size,
                'h': hash_file(f, algo),
            },
        )
        for f in files)

    with tools.Profiler('Archive'):
        ret = upload_tree(
            base_url=options.isolate_server,
            indir=os.getcwd(),
            infiles=infiles,
            namespace=options.namespace)
    if not ret:
        print('\n'.join(
            '%s %s' % (infiles[f]['h'], f) for f in sorted(infiles)))
    return ret
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	1014
				1015
				1016	def CMDdownload(parser, args):
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	1017	"""Download data from the server.
				1018
				1019	It can download individual files.
				1020	"""
				1021	parser.add_option(
				1022	'-f', '--file', metavar='HASH DEST', default=[], action='append', nargs=2,
				1023	help='hash and destination of a file, can be used multiple times')
				1024	parser.add_option(
				1025	'-t', '--target', metavar='DIR', default=os.getcwd(),
				1026	help='destination directory')
				1027	options, args = parser.parse_args(args)
				1028	if args:
				1029	parser.error('Unsupported arguments: %s' % args)
				1030	if not options.file:
				1031	parser.error('Use one of --file is required.')
				1032
				1033	options.target = os.path.abspath(options.target)
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	1034	remote = get_storage_api(options.isolate_server, options.namespace)
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	1035	for h, dest in options.file:
				1036	logging.info('%s: %s', h, dest)
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	1037	file_write(
				1038	os.path.join(options.target, dest),
				1039	remote.fetch(h, UNKNOWN_FILE_SIZE))
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	1040	return 0
				1041
				1042
				1043	class OptionParserIsolateServer(tools.OptionParserWithLogging):
				1044	def __init__(self, **kwargs):
				1045	tools.OptionParserWithLogging.__init__(self, **kwargs)
				1046	self.add_option(
				1047	'-I', '--isolate-server',
				1048	default=ISOLATE_SERVER,
				1049	metavar='URL',
				1050	help='Isolate server where data is stored. default: %default')
				1051	self.add_option(
				1052	'--namespace', default='default-gzip',
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	1053	help='The namespace to use on the server, default: %default')
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	1054
				1055	def parse_args(self, args, *kwargs):
				1056	options, args = tools.OptionParserWithLogging.parse_args(
				1057	self, args, *kwargs)
				1058	options.isolate_server = options.isolate_server.rstrip('/')
				1059	if not options.isolate_server:
				1060	self.error('--isolate-server is required.')
				1061	return options, args
				1062
				1063
				1064	def main(args):
				1065	dispatcher = subcommand.CommandDispatcher(__name__)
				1066	try:
				1067	return dispatcher.execute(
				1068	OptionParserIsolateServer(version=__version__), args)
maruel@chromium.org	dedbf49	2013-09-12 20:42:11 +0000	[diff] [blame]	1069	except (ConfigError, MappingError) as e:
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	1070	sys.stderr.write('\nError: ')
				1071	sys.stderr.write(str(e))
				1072	sys.stderr.write('\n')
				1073	return 1
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	1074
				1075
				1076	if __name__ == '__main__':
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	1077	fix_encoding.fix_encoding()
				1078	tools.disable_buffering()
				1079	colorama.init()
maruel@chromium.org	cb3c3d5	2013-03-14 18:55:30 +0000	[diff] [blame]	1080	sys.exit(main(sys.argv[1:]))