Blame - isolateserver.py - chromium.googlesource.com/infra/luci/client-py

blob: 0f7c5ff613ec46702a02051f880e284c77452828 [file] [log] [blame]

maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	1	#!/usr/bin/env python
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	2	# Copyright 2013 The Chromium Authors. All rights reserved.
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	"""Archives a set of files to a server."""
				7
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	8	__version__ = '0.2'
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	9
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	10	import binascii
vadimsh@chromium.org	53f8d5a	2013-06-19 13:03:55 +0000	[diff] [blame]	11	import cStringIO
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	12	import hashlib
vadimsh@chromium.org	53f8d5a	2013-06-19 13:03:55 +0000	[diff] [blame]	13	import itertools
maruel@chromium.org	4160164	2013-09-18 19:40:46 +0000	[diff] [blame^]	14	import json
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	15	import logging
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	16	import os
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	17	import random
				18	import re
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	19	import sys
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	20	import threading
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	21	import time
maruel@chromium.org	e82112e	2013-04-24 14:41:55 +0000	[diff] [blame]	22	import urllib
csharp@chromium.org	59c7bcf	2012-11-21 21:13:18 +0000	[diff] [blame]	23	import zlib
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	24
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	25	from third_party import colorama
				26	from third_party.depot_tools import fix_encoding
				27	from third_party.depot_tools import subcommand
				28
vadimsh@chromium.org	6b70621	2013-08-28 15:03:46 +0000	[diff] [blame]	29	from utils import net
vadimsh@chromium.org	b074b16	2013-08-22 17:55:46 +0000	[diff] [blame]	30	from utils import threading_utils
vadimsh@chromium.org	a432647	2013-08-24 02:05:41 +0000	[diff] [blame]	31	from utils import tools
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	32
				33
# Default server.
# TODO(maruel): Chromium-specific.
ISOLATE_SERVER = 'https://isolateserver-dev.appspot.com/'


# The minimum size (in bytes) of files to upload directly to the blobstore.
MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 1024

# The number of files to check against the isolate server per /contains query.
# All files are sorted by likelihood of a change in the file content
# (currently file size is used to estimate this: the larger the file, the
# larger the possibility it has changed). Then the first
# ITEMS_PER_CONTAINS_QUERIES[0] files are taken and sent to '/contains', then
# the next ITEMS_PER_CONTAINS_QUERIES[1], and so on. The numbers here are a
# trade-off: the more items per request, the lower the effect of HTTP round
# trip latency and TCP-level chattiness. On the other hand, larger values cause
# longer lookups, increasing the initial latency to start uploading, which is
# especially an issue for large files. These values are optimized for the "few
# thousand files to look up with a minimal number of large files missing" case.
ITEMS_PER_CONTAINS_QUERIES = [20, 20, 50, 50, 50, 100]


# A list of already compressed extension types that should not receive any
# compression before being uploaded. Entries are lower case and do not include
# the leading dot.
ALREADY_COMPRESSED_TYPES = [
    '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
    'wav', 'zip'
]


# The file size to be used when we don't know the correct file size,
# generally used for .isolated files.
UNKNOWN_FILE_SIZE = None


# The size (in bytes) of each chunk to read when downloading and unzipping
# files.
ZIPPED_FILE_CHUNK = 16 * 1024


# Chunk size (in bytes) to use when doing disk I/O.
DISK_FILE_CHUNK = 1024 * 1024


# Read timeout in seconds for downloads from isolate storage. If there's no
# response from the server within this timeout, the whole download is aborted.
DOWNLOAD_READ_TIMEOUT = 60


# The delay (in seconds) to wait between logging statements when retrieving
# the required files. This is intended to let the user (or buildbot) know that
# the program is still running.
DELAY_BETWEEN_UPDATES_IN_SECS = 30
				86
				87
class ConfigError(ValueError):
  """Raised when a .isolated file cannot be loaded."""
				91
				92
				93	class MappingError(OSError):
				94	"""Failed to recreate the tree."""
				95	pass
				96
				97
def randomness():
  """Returns a low-entropy random-ish string used for MIME encoding.

  The value is simply the current time rendered as a string. It is kept as a
  separate function so that unit tests can replace it.
  """
  now = time.time()
  return str(now)
				104
				105
def encode_multipart_formdata(fields, files,
    mime_mapper=lambda _: 'application/octet-stream'):
  """Encodes a Multipart form data object.

  Unicode keys, filenames and values are encoded to UTF-8 so that the body can
  be joined as a byte string.

  Args:
    fields: a sequence (name, value) elements for
       regular form fields.
    files: a sequence of (name, filename, value) elements for data to be
       uploaded as files.
    mime_mapper: function to return the mime type from the filename.
  Returns:
    content_type: for httplib.HTTP instance
    body: for httplib.HTTP instance
  """
  boundary = hashlib.md5(randomness()).hexdigest()
  body_list = []
  for (key, value) in fields:
    # Encode each element into itself; encoding the key must never overwrite
    # the value.
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"' % key)
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  for (key, filename, value) in files:
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(filename, unicode):
      filename = filename.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
    body_list.append('Content-Type: %s' % mime_mapper(filename))
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  if body_list:
    # Turn the last boundary line into the closing delimiter.
    body_list[-2] += '--'
  body = '\r\n'.join(body_list)
  content_type = 'multipart/form-data; boundary=%s' % boundary
  return content_type, body
				153
				154
def is_valid_hash(value, algo):
  """Returns if the value is a valid hash for the corresponding algorithm.

  |value| must consist solely of hexadecimal digits and be exactly as long as
  the hex digest produced by |algo| (a hashlib-style constructor).
  """
  size = 2 * algo().digest_size
  # Use \Z rather than $: $ also matches just before a trailing newline, which
  # would accept 'abc...\n' as a valid hash.
  return bool(re.match(r'^[a-fA-F0-9]{%d}\Z' % size, value))
				159
				160
				161	def hash_file(filepath, algo):
				162	"""Calculates the hash of a file without reading it all in memory at once.
				163
				164	\|algo\| should be one of hashlib hashing algorithm.
				165	"""
				166	digest = algo()
maruel@chromium.org	037758d	2012-12-10 17:59:46 +0000	[diff] [blame]	167	with open(filepath, 'rb') as f:
				168	while True:
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	169	chunk = f.read(DISK_FILE_CHUNK)
maruel@chromium.org	037758d	2012-12-10 17:59:46 +0000	[diff] [blame]	170	if not chunk:
				171	break
				172	digest.update(chunk)
				173	return digest.hexdigest()
				174
				175
def file_read(filepath):
  """Generates the content of a file, DISK_FILE_CHUNK bytes at a time."""
  with open(filepath, 'rb') as stream:
    block = stream.read(DISK_FILE_CHUNK)
    while block:
      yield block
      block = stream.read(DISK_FILE_CHUNK)
				184
				185
def file_write(filepath, content_generator):
  """Writes every chunk produced by |content_generator| into |filepath|.

  The parent directory is created first if it does not exist yet.

  Returns the total number of bytes written.

  Meant to be mocked out in unit tests.
  """
  parent = os.path.dirname(filepath)
  if not os.path.isdir(parent):
    os.makedirs(parent)
  written = 0
  with open(filepath, 'wb') as out:
    for piece in content_generator:
      written += len(piece)
      out.write(piece)
  return written
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	204
				205
def create_directories(base_directory, files):
  """Creates, under |base_directory|, every directory the files live in.

  |files| is a sized collection of relative file paths. All ancestor
  directories of each file are created with os.mkdir(), parents first.
  """
  logging.debug('create_directories(%s, %d)', base_directory, len(files))
  # Collect each file's directory along with all of its ancestors.
  needed = set()
  for path in files:
    parent = os.path.dirname(path)
    while parent:
      needed.add(parent)
      parent = os.path.dirname(parent)
  # Sorting guarantees that a parent is created before its children.
  for relative_dir in sorted(needed):
    os.mkdir(os.path.join(base_directory, relative_dir))
				218
				219
				220	def create_links(base_directory, files):
				221	"""Creates any links needed by the given set of files."""
				222	for filepath, properties in files:
				223	if 'l' not in properties:
				224	continue
				225	if sys.platform == 'win32':
				226	# TODO(maruel): Create junctions or empty text files similar to what
				227	# cygwin do?
				228	logging.warning('Ignoring symlink %s', filepath)
				229	continue
				230	outfile = os.path.join(base_directory, filepath)
				231	# symlink doesn't exist on Windows. So the 'link' property should
				232	# never be specified for windows .isolated file.
				233	os.symlink(properties['l'], outfile) # pylint: disable=E1101
				234	if 'm' in properties:
				235	lchmod = getattr(os, 'lchmod', None)
				236	if lchmod:
				237	lchmod(outfile, properties['m'])
				238
				239
				240	def setup_commands(base_directory, cwd, cmd):
				241	"""Correctly adjusts and then returns the required working directory
				242	and command needed to run the test.
				243	"""
				244	assert not os.path.isabs(cwd), 'The cwd must be a relative path, got %s' % cwd
				245	cwd = os.path.join(base_directory, cwd)
				246	if not os.path.isdir(cwd):
				247	os.makedirs(cwd)
				248
				249	# Ensure paths are correctly separated on windows.
				250	cmd[0] = cmd[0].replace('/', os.path.sep)
				251	cmd = tools.fix_python_path(cmd)
				252
				253	return cwd, cmd
				254
				255
				256	def generate_remaining_files(files):
				257	"""Generates a dictionary of all the remaining files to be downloaded."""
				258	remaining = {}
				259	for filepath, props in files:
				260	if 'h' in props:
				261	remaining.setdefault(props['h'], []).append((filepath, props))
				262
				263	return remaining
				264
				265
def is_valid_file(filepath, size):
  """Tells whether the file at |filepath| looks valid.

  The only check done is comparing the size on disk with |size|. When |size|
  is UNKNOWN_FILE_SIZE the file is accepted without touching the disk.
  """
  if size == UNKNOWN_FILE_SIZE:
    return True
  actual_size = os.stat(filepath).st_size
  if size == actual_size:
    return True
  logging.warning(
      'Found invalid item %s; %d != %d',
      os.path.basename(filepath), actual_size, size)
  return False
				280
				281
def try_remove(filepath):
  """Deletes |filepath| on a best-effort basis.

  Any OSError raised by os.remove() (for example because the file does not
  exist) is deliberately ignored.
  """
  try:
    os.remove(filepath)
  except OSError:
    # Best effort only: a missing file is not an error for callers.
    pass
				288
				289
def url_read(url, **kwargs):
  """Wraps net.url_read() so that a missing response becomes an exception.

  Returns whatever net.url_read() returned, or raises MappingError when it
  returned None.
  """
  response = net.url_read(url, **kwargs)
  if response is not None:
    return response
  # If we get no response from the server, assume it is down and raise an
  # exception.
  raise MappingError('Unable to connect to server %s' % url)
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	297
				298
class IsolateServer(object):
  """Client class to download or upload to Isolate Server.

  All requests are issued against '<base_url>/content/' for the given
  namespace.
  """
  def __init__(self, base_url, namespace):
    assert base_url.startswith('http'), base_url
    self.content_url = base_url.rstrip('/') + '/content/'
    self.namespace = namespace
    # Lazily fetched, url-quoted token; guarded by self._lock.
    self._token = None
    self._lock = threading.Lock()

  @property
  def token(self):
    """Returns the url-quoted token, fetching it on first access."""
    # TODO(maruel): Make this request much earlier asynchronously while the
    # files are being enumerated.
    with self._lock:
      if not self._token:
        self._token = urllib.quote(url_read(self.content_url + 'get_token'))
      return self._token

  def fetch(self, item, expected_size):
    """Fetches an object and yields its content.

    Arguments:
      item: name of the object to retrieve.
      expected_size: expected size of the decompressed content, or
          UNKNOWN_FILE_SIZE to skip the size check.

    Yields:
      Decompressed chunks of the object.

    Raises:
      IOError: the connection could not be opened, the zlib stream is
          corrupted, or the decompressed size does not match |expected_size|.
    """
    assert isinstance(item, basestring)
    assert (
        isinstance(expected_size, (int, long)) or
        expected_size == UNKNOWN_FILE_SIZE)
    zipped_url = '%sretrieve/%s/%s' % (self.content_url, self.namespace, item)
    logging.debug('download_file(%s)', zipped_url)

    # Because the app engine DB is only eventually consistent, retry 404 errors
    # because the file might just not be visible yet (even though it has been
    # uploaded).
    connection = net.url_open(
        zipped_url, retry_404=True, read_timeout=DOWNLOAD_READ_TIMEOUT)
    if not connection:
      raise IOError('Unable to open connection to %s' % zipped_url)

    # TODO(maruel): Must only decompress when needed.
    decompressor = zlib.decompressobj()
    try:
      compressed_size = 0
      decompressed_size = 0
      while True:
        chunk = connection.read(ZIPPED_FILE_CHUNK)
        if not chunk:
          break
        compressed_size += len(chunk)
        decompressed = decompressor.decompress(chunk)
        decompressed_size += len(decompressed)
        yield decompressed

      # Ensure that all the data was properly decompressed.
      uncompressed_data = decompressor.flush()
      if uncompressed_data:
        raise IOError('Decompression failed')
      if (expected_size != UNKNOWN_FILE_SIZE and
          decompressed_size != expected_size):
        raise IOError('File incorrect size after download of %s. Got %s and '
                      'expected %s' % (item, decompressed_size, expected_size))
    except zlib.error as e:
      msg = 'Corrupted zlib for item %s. Processed %d of %s bytes.\n%s' % (
          item, compressed_size, connection.content_length, e)
      logging.warning(msg)

      # Testing seems to show that if a few machines are trying to download
      # the same blob, they can cause each other to fail. So if we hit a zip
      # error, this is the most likely cause (it only downloads some of the
      # data). Randomly sleep for between 5 and 25 seconds to try and spread
      # out the downloads.
      sleep_duration = (random.random() * 20) + 5
      time.sleep(sleep_duration)
      raise IOError(msg)

  def push(self, item, expected_size, content_generator):
    """Uploads content generated by |content_generator| as |item| to the remote
    isolate server.

    The whole content is joined in memory. Content larger than
    MIN_SIZE_FOR_DIRECT_BLOBSTORE goes through the blobstore upload path,
    anything else is posted to the 'store' endpoint.

    Returns the server response. Raises MappingError when the server cannot be
    reached.
    """
    assert isinstance(item, basestring)
    # Accept long as well as int, exactly like fetch(): large sizes can be
    # represented as long on Python 2.
    assert (
        isinstance(expected_size, (int, long)) or
        expected_size == UNKNOWN_FILE_SIZE)
    item = str(item)
    # TODO(maruel): Support large files. This would require streaming support.
    content = ''.join(content_generator)
    if len(content) > MIN_SIZE_FOR_DIRECT_BLOBSTORE:
      return self._upload_hash_content_to_blobstore(item, content)

    url = '%sstore/%s/%s?token=%s' % (
        self.content_url, self.namespace, item, self.token)
    return url_read(
        url, data=content, content_type='application/octet-stream')

  def _upload_hash_content_to_blobstore(self, item, content):
    """Uploads the content directly to the blobstore via a generated url.

    A fresh upload url is requested for every attempt. Returns the response of
    the upload request, or raises MappingError once the server cannot be
    reached or all attempts are exhausted.
    """
    # TODO(maruel): Support large files. This would require streaming support.
    gen_url = '%sgenerate_blobstore_url/%s/%s' % (
        self.content_url, self.namespace, item)
    # Token is guaranteed to be already quoted but it is unnecessary here, and
    # only here.
    data = [('token', urllib.unquote(self.token))]
    content_type, body = encode_multipart_formdata(
        data, [('content', item, content)])
    last_url = gen_url
    for _ in net.retry_loop(max_attempts=net.URL_OPEN_MAX_ATTEMPTS):
      # Retry HTTP 50x here but not 404.
      upload_url = net.url_read(gen_url, data=data)
      if not upload_url:
        raise MappingError('Unable to connect to server %s' % gen_url)
      last_url = upload_url

      # Do not retry this request on HTTP 50x. Regenerate an upload url each
      # time since uploading "consumes" the upload url.
      result = net.url_read(
          upload_url, data=body, content_type=content_type, retry_50x=False)
      if result is not None:
        return result
    raise MappingError('Unable to connect to server %s' % last_url)
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	412
				413
def check_files_exist_on_server(query_url, queries):
  """Asks the server which files of this batch it is missing.

  Arguments:
    query_url: url the binary hashes are posted to.
    queries: list of (path, metadata) pairs; metadata['h'] is the hex hash of
        a file that may need to be uploaded.
  Returns:
    missing_files: the entries of |queries| that are missing on the server.
  Raises:
    MappingError: the server did not return exactly one flag per query.
  """
  # TODO(maruel): Move inside IsolateServer.
  logging.info('Checking existence of %d files...', len(queries))
  binary_hashes = [binascii.unhexlify(props['h']) for _, props in queries]
  body = ''.join(binary_hashes)
  assert (len(body) % 20) == 0, repr(body)

  response = url_read(
      query_url, data=body, content_type='application/octet-stream')
  if len(queries) != len(response):
    raise MappingError(
        'Got an incorrect number of responses from the server. Expected %d, '
        'but got %d' % (len(queries), len(response)))

  # The response holds one flag per query, in order; chr(0) means "missing".
  missing_files = [
    query for query, flag in zip(queries, response) if flag == chr(0)
  ]
  logging.info('Queried %d files, %d cache hit',
      len(queries), len(queries) - len(missing_files))
  return missing_files
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	441
				442
class FileSystem(object):
  """Fetches data from the file system.

  The common use case is a NFS/CIFS file server that is mounted locally that is
  used to fetch the file on a local partition.
  """
  def __init__(self, base_path):
    self.base_path = base_path

  def fetch(self, item, expected_size):
    """Returns a generator over the content of |item| under base_path.

    Raises IOError if |expected_size| is known and does not match the size of
    the file on disk.
    """
    assert isinstance(item, basestring)
    # Accept long as well as int: large sizes can be represented as long on
    # Python 2.
    assert (
        isinstance(expected_size, (int, long)) or
        expected_size == UNKNOWN_FILE_SIZE)
    source = os.path.join(self.base_path, item)
    if (expected_size != UNKNOWN_FILE_SIZE and
        not is_valid_file(source, expected_size)):
      raise IOError('Invalid file %s' % item)
    return file_read(source)

  def push(self, item, expected_size, content_generator):
    """Writes |content_generator| as |item| under base_path.

    If |expected_size| is known and differs from the number of bytes written,
    the file is removed again and IOError is raised.
    """
    assert isinstance(item, basestring)
    assert (
        isinstance(expected_size, (int, long)) or
        expected_size == UNKNOWN_FILE_SIZE)
    dest = os.path.join(self.base_path, item)
    total = file_write(dest, content_generator)
    if expected_size != UNKNOWN_FILE_SIZE and total != expected_size:
      # Do not leave a truncated or oversized file behind.
      os.remove(dest)
      raise IOError(
          'Invalid file %s, %d != %d' % (item, total, expected_size))
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	470
				471
				472	def get_storage_api(file_or_url, namespace):
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	473	"""Returns an object that implements .fetch() and .push()."""
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	474	if re.match(r'^https?://.+$', file_or_url):
				475	return IsolateServer(file_or_url, namespace)
				476	else:
				477	return FileSystem(file_or_url)
				478
				479
class WorkerPool(threading_utils.AutoRetryThreadPool):
  """AutoRetryThreadPool preconfigured to retry tasks that raise IOError."""
  # Initial and maximum number of worker threads.
  INITIAL_WORKERS = 2
  MAX_WORKERS = 16
  RETRIES = 5

  def __init__(self):
    # Positional arguments expected by AutoRetryThreadPool.__init__().
    pool_args = (
        [IOError],
        self.RETRIES,
        self.INITIAL_WORKERS,
        self.MAX_WORKERS,
        0,
        'remote',
    )
    super(WorkerPool, self).__init__(*pool_args)
maruel@chromium.org	dedbf49	2013-09-12 20:42:11 +0000	[diff] [blame]	497
maruel@chromium.org	dedbf49	2013-09-12 20:42:11 +0000	[diff] [blame]	498
def compression_level(filename):
  """Given a filename calculates the ideal compression level to use.

  Returns 0 (no compression) when the extension is listed in
  ALREADY_COMPRESSED_TYPES, 7 otherwise. The comparison is case insensitive.
  """
  # os.path.splitext() keeps the leading dot ('.zip') while
  # ALREADY_COMPRESSED_TYPES lists bare extensions ('zip'); drop the dot so the
  # lookup can actually match.
  file_ext = os.path.splitext(filename)[1].lower()[1:]
  # TODO(csharp): Profile to find what compression level works best.
  return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7
				504
				505
def read_and_compress(filepath, level):
  """Returns the whole content of a file, zlib-compressed at |level|.

  The file is read ZIPPED_FILE_CHUNK bytes at a time; the compressed output is
  accumulated in memory and returned as a single string.
  """
  compressor = zlib.compressobj(level)
  pieces = []
  with open(filepath, 'rb') as src:
    chunk = src.read(ZIPPED_FILE_CHUNK)
    while chunk:
      pieces.append(compressor.compress(chunk))
      chunk = src.read(ZIPPED_FILE_CHUNK)
  # Flush whatever the compressor still buffers and finish the stream.
  pieces.append(compressor.flush(zlib.Z_FINISH))
  return ''.join(pieces)
csharp@chromium.org	59c7bcf	2012-11-21 21:13:18 +0000	[diff] [blame]	520
maruel@chromium.org	cb3c3d5	2013-03-14 18:55:30 +0000	[diff] [blame]	521
def zip_and_trigger_upload(infile, metadata, add_item):
  """Compresses |infile| and hands the result over to |add_item|.

  |add_item| is called with a priority, the 'h' entry of |metadata| and a
  one-element list holding the compressed data; its return value is returned.
  The priority is WorkerPool.HIGH when metadata['priority'] is '0',
  WorkerPool.MED otherwise.
  """
  # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
  # if not metadata['T']:
  # TODO(maruel): Use a generator?
  level = compression_level(infile)
  compressed = read_and_compress(infile, level)
  if metadata.get('priority', '1') == '0':
    priority = WorkerPool.HIGH
  else:
    priority = WorkerPool.MED
  return add_item(priority, metadata['h'], [compressed])
csharp@chromium.org	59c7bcf	2012-11-21 21:13:18 +0000	[diff] [blame]	531
				532
def batch_files_for_check(infiles):
  """Splits the files to look up on the server into batches.

  Each batch corresponds to a single 'exists?' query to the server. Only
  entries with an 's' (size) property are considered, largest first. The size
  of the Nth batch is ITEMS_PER_CONTAINS_QUERIES[N], the last value being
  reused once the list is exhausted.

  Yields:
    batches: each batch is a list of (relfile, metadata) tuples.
  """
  limit = ITEMS_PER_CONTAINS_QUERIES[0]
  last_index = len(ITEMS_PER_CONTAINS_QUERIES) - 1
  sized = [(path, meta) for path, meta in infiles.iteritems() if 's' in meta]
  sized.sort(key=lambda entry: -entry[1]['s'])
  batches_done = 0
  current = []
  for entry in sized:
    current.append(entry)
    if len(current) != limit:
      continue
    yield current
    current = []
    batches_done += 1
    limit = ITEMS_PER_CONTAINS_QUERIES[min(batches_done, last_index)]
  # Whatever is left forms a final, shorter batch.
  if current:
    yield current
				555
				556
				557	def get_files_to_upload(contains_hash_url, infiles):
				558	"""Yields files that are missing on the server."""
vadimsh@chromium.org	b074b16	2013-08-22 17:55:46 +0000	[diff] [blame]	559	with threading_utils.ThreadPool(1, 16, 0, prefix='get_files_to_upload') as tp:
vadimsh@chromium.org	53f8d5a	2013-06-19 13:03:55 +0000	[diff] [blame]	560	for files in batch_files_for_check(infiles):
vadimsh@chromium.org	b074b16	2013-08-22 17:55:46 +0000	[diff] [blame]	561	tp.add_task(0, check_files_exist_on_server, contains_hash_url, files)
				562	for missing_file in itertools.chain.from_iterable(tp.iter_results()):
vadimsh@chromium.org	53f8d5a	2013-06-19 13:03:55 +0000	[diff] [blame]	563	yield missing_file
maruel@chromium.org	35fc0c8	2013-01-17 15:14:14 +0000	[diff] [blame]	564
				565
def upload_tree(base_url, indir, infiles, namespace):
  """Uploads the given tree to the given url.

  The server is first queried for the files it already has; the files
  reported as missing are compressed on one thread pool and pushed to the
  server on another one. Cache hit/miss statistics are logged at the end.

  Arguments:
    base_url:  The base url of the server, as passed to get_storage_api().
    indir:     Root directory the infiles are based in.
    infiles:   dict of files to upload from |indir| to |base_url|. Keys are
               paths relative to |indir|, values are metadata dicts.
    namespace: The namespace to use on the server.

  Returns:
    0, always.
  """
  logging.info(
      'upload tree(base_url=%s, indir=%s, files=%d)',
      base_url, indir, len(infiles))

  # Create a pool of workers to zip and upload any files missing from
  # the server.
  num_threads = threading_utils.num_processors()
  remote = get_storage_api(base_url, namespace)
  # TODO(maruel): There's three separate thread pools here, it is not very
  # efficient. remote_uploader and get_files_to_upload() should share the same
  # pool and control priorities accordingly.
  uploaded = []
  with WorkerPool() as remote_uploader:
    # Starts the zip and upload process for files that are missing
    # from the server.
    # TODO(maruel): Move .contains() to the API.
    contains_hash_url = '%scontains/%s?token=%s' % (
        remote.content_url, namespace, remote.token)

    def add_item(priority, item, content_generator):
      # Called by zip_and_trigger_upload() once the content is compressed.
      remote_uploader.add_task(
          priority, remote.push, item, UNKNOWN_FILE_SIZE, content_generator)

    with threading_utils.ThreadPool(
        min(2, num_threads), num_threads, 0, 'zip') as zipping_pool:
      for relfile, metadata in get_files_to_upload(contains_hash_url, infiles):
        infile = os.path.join(indir, relfile)
        zipping_pool.add_task(
            0, zip_and_trigger_upload, infile, metadata, add_item)
        uploaded.append((relfile, metadata))

      logging.info('Waiting for all files to finish zipping')
      zipping_pool.join()
    logging.info('All files zipped.')
    remote_uploader.join()
  logging.info('All files are uploaded')

  def percent(part, whole):
    # An empty tree, or a tree made only of empty files, must not trigger a
    # division by zero while printing the statistics.
    return part * 100. / whole if whole else 0

  total = len(infiles)
  total_size = sum(metadata.get('s', 0) for metadata in infiles.itervalues())
  logging.info('Total: %6d, %9.1fkb', total, total_size / 1024.)

  # Anything that was not scheduled for upload was already on the server.
  cache_hit = set(infiles.iterkeys()) - set(x[0] for x in uploaded)
  cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
  logging.info(
      'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_hit),
      cache_hit_size / 1024.,
      percent(len(cache_hit), total),
      percent(cache_hit_size, total_size))

  cache_miss = uploaded
  cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
  logging.info(
      'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_miss),
      cache_miss_size / 1024.,
      percent(len(cache_miss), total),
      percent(cache_miss_size, total_size))
  return 0
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	636
				637
class MemoryCache(object):
  """In-memory stand-in usable everywhere the Cache class is.

  Nothing is written to a cache directory: the content of every fetched item
  is kept in memory so it can be stored in the target directory directly.
  """

  def __init__(self, target_directory, pool, remote):
    self.target_directory = target_directory
    self.pool = pool
    self.remote = remote
    # Guards self._contents, which is written from the pool's tasks.
    self._lock = threading.Lock()
    # item -> content.
    self._contents = {}

  def retrieve(self, priority, item, size):
    """Schedules the fetch of |item| on the pool."""
    self.pool.add_task(priority, self._store, item, size)

  def wait_for(self, items):
    """Blocks until at least one of |items| is retrieved.

    Returns the first item retrieved.
    """
    # Items that are already in memory are returned right away.
    with self._lock:
      already_here = [i for i in items if i in self._contents]
    if already_here:
      return already_here[0]

    # Otherwise consume the pool's results until one of |items| shows up.
    fetched = self.pool.get_one_result()
    while fetched not in items:
      fetched = self.pool.get_one_result()
    return fetched

  def path(self, item):
    """Returns the path of |item| inside the target directory."""
    return os.path.join(self.target_directory, item)

  def read(self, item):
    """Returns the content of |item|; it must have been retrieved already."""
    return self._contents[item]

  def _store(self, item, size):
    """Fetches |item| from the remote, keeps its content and returns |item|."""
    content = ''.join(self.remote.fetch(item, size))
    with self._lock:
      self._contents[item] = content
    return item

  def __enter__(self):
    return self

  def __exit__(self, _exc_type, _exec_value, _traceback):
    # Never swallows the exception.
    return False
				689
				690
				691	def load_isolated(content, os_flavor, algo):
				692	"""Verifies the .isolated file is valid and loads this object with the json
				693	data.
				694	"""
				695	try:
				696	data = json.loads(content)
				697	except ValueError:
				698	raise ConfigError('Failed to parse: %s...' % content[:100])
				699
				700	if not isinstance(data, dict):
				701	raise ConfigError('Expected dict, got %r' % data)
				702
				703	for key, value in data.iteritems():
				704	if key == 'command':
				705	if not isinstance(value, list):
				706	raise ConfigError('Expected list, got %r' % value)
				707	if not value:
				708	raise ConfigError('Expected non-empty command')
				709	for subvalue in value:
				710	if not isinstance(subvalue, basestring):
				711	raise ConfigError('Expected string, got %r' % subvalue)
				712
				713	elif key == 'files':
				714	if not isinstance(value, dict):
				715	raise ConfigError('Expected dict, got %r' % value)
				716	for subkey, subvalue in value.iteritems():
				717	if not isinstance(subkey, basestring):
				718	raise ConfigError('Expected string, got %r' % subkey)
				719	if not isinstance(subvalue, dict):
				720	raise ConfigError('Expected dict, got %r' % subvalue)
				721	for subsubkey, subsubvalue in subvalue.iteritems():
				722	if subsubkey == 'l':
				723	if not isinstance(subsubvalue, basestring):
				724	raise ConfigError('Expected string, got %r' % subsubvalue)
				725	elif subsubkey == 'm':
				726	if not isinstance(subsubvalue, int):
				727	raise ConfigError('Expected int, got %r' % subsubvalue)
				728	elif subsubkey == 'h':
				729	if not is_valid_hash(subsubvalue, algo):
				730	raise ConfigError('Expected sha-1, got %r' % subsubvalue)
				731	elif subsubkey == 's':
				732	if not isinstance(subsubvalue, int):
				733	raise ConfigError('Expected int, got %r' % subsubvalue)
				734	else:
				735	raise ConfigError('Unknown subsubkey %s' % subsubkey)
				736	if bool('h' in subvalue) and bool('l' in subvalue):
				737	raise ConfigError(
				738	'Did not expect both \'h\' (sha-1) and \'l\' (link), got: %r' %
				739	subvalue)
				740
				741	elif key == 'includes':
				742	if not isinstance(value, list):
				743	raise ConfigError('Expected list, got %r' % value)
				744	if not value:
				745	raise ConfigError('Expected non-empty includes list')
				746	for subvalue in value:
				747	if not is_valid_hash(subvalue, algo):
				748	raise ConfigError('Expected sha-1, got %r' % subvalue)
				749
				750	elif key == 'read_only':
				751	if not isinstance(value, bool):
				752	raise ConfigError('Expected bool, got %r' % value)
				753
				754	elif key == 'relative_cwd':
				755	if not isinstance(value, basestring):
				756	raise ConfigError('Expected string, got %r' % value)
				757
				758	elif key == 'os':
				759	if os_flavor and value != os_flavor:
				760	raise ConfigError(
				761	'Expected \'os\' to be \'%s\' but got \'%s\'' %
				762	(os_flavor, value))
				763
				764	else:
				765	raise ConfigError('Unknown key %s' % key)
				766
				767	return data
				768
				769
				770	class IsolatedFile(object):
				771	"""Represents a single parsed .isolated file."""
				772	def __init__(self, obj_hash, algo):
				773	"""\|obj_hash\| is really the sha-1 of the file."""
				774	logging.debug('IsolatedFile(%s)' % obj_hash)
				775	self.obj_hash = obj_hash
				776	self.algo = algo
				777	# Set once all the left-side of the tree is parsed. 'Tree' here means the
				778	# .isolate and all the .isolated files recursively included by it with
				779	# 'includes' key. The order of each sha-1 in 'includes', each representing a
				780	# .isolated file in the hash table, is important, as the later ones are not
				781	# processed until the firsts are retrieved and read.
				782	self.can_fetch = False
				783
				784	# Raw data.
				785	self.data = {}
				786	# A IsolatedFile instance, one per object in self.includes.
				787	self.children = []
				788
				789	# Set once the .isolated file is loaded.
				790	self._is_parsed = False
				791	# Set once the files are fetched.
				792	self.files_fetched = False
				793
				794	def load(self, content):
				795	"""Verifies the .isolated file is valid and loads this object with the json
				796	data.
				797	"""
				798	logging.debug('IsolatedFile.load(%s)' % self.obj_hash)
				799	assert not self._is_parsed
				800	self.data = load_isolated(content, None, self.algo)
				801	self.children = [
				802	IsolatedFile(i, self.algo) for i in self.data.get('includes', [])
				803	]
				804	self._is_parsed = True
				805
				806	def fetch_files(self, cache, files):
				807	"""Adds files in this .isolated file not present in \|files\| dictionary.
				808
				809	Preemptively request files.
				810
				811	Note that \|files\| is modified by this function.
				812	"""
				813	assert self.can_fetch
				814	if not self._is_parsed or self.files_fetched:
				815	return
				816	logging.debug('fetch_files(%s)' % self.obj_hash)
				817	for filepath, properties in self.data.get('files', {}).iteritems():
				818	# Root isolated has priority on the files being mapped. In particular,
				819	# overriden files must not be fetched.
				820	if filepath not in files:
				821	files[filepath] = properties
				822	if 'h' in properties:
				823	# Preemptively request files.
				824	logging.debug('fetching %s' % filepath)
				825	cache.retrieve(
				826	WorkerPool.MED,
				827	properties['h'],
				828	properties['s'])
				829	self.files_fetched = True
				830
				831
				832	class Settings(object):
				833	"""Results of a completely parsed .isolated file."""
				834	def __init__(self):
				835	self.command = []
				836	self.files = {}
				837	self.read_only = None
				838	self.relative_cwd = None
				839	# The main .isolated file, a IsolatedFile instance.
				840	self.root = None
				841
				842	def load(self, cache, root_isolated_hash, algo):
				843	"""Loads the .isolated and all the included .isolated asynchronously.
				844
				845	It enables support for "included" .isolated files. They are processed in
				846	strict order but fetched asynchronously from the cache. This is important so
				847	that a file in an included .isolated file that is overridden by an embedding
				848	.isolated file is not fetched needlessly. The includes are fetched in one
				849	pass and the files are fetched as soon as all the ones on the left-side
				850	of the tree were fetched.
				851
				852	The prioritization is very important here for nested .isolated files.
				853	'includes' have the highest priority and the algorithm is optimized for both
				854	deep and wide trees. A deep one is a long link of .isolated files referenced
				855	one at a time by one item in 'includes'. A wide one has a large number of
				856	'includes' in a single .isolated file. 'left' is defined as an included
				857	.isolated file earlier in the 'includes' list. So the order of the elements
				858	in 'includes' is important.
				859	"""
				860	self.root = IsolatedFile(root_isolated_hash, algo)
				861
				862	# Isolated files being retrieved now: hash -> IsolatedFile instance.
				863	pending = {}
				864	# Set of hashes of already retrieved items to refuse recursive includes.
				865	seen = set()
				866
				867	def retrieve(isolated_file):
				868	h = isolated_file.obj_hash
				869	if h in seen:
				870	raise ConfigError('IsolatedFile %s is retrieved recursively' % h)
				871	assert h not in pending
				872	seen.add(h)
				873	pending[h] = isolated_file
				874	cache.retrieve(WorkerPool.HIGH, h, UNKNOWN_FILE_SIZE)
				875
				876	retrieve(self.root)
				877
				878	while pending:
				879	item_hash = cache.wait_for(pending)
				880	item = pending.pop(item_hash)
				881	item.load(cache.read(item_hash))
				882	if item_hash == root_isolated_hash:
				883	# It's the root item.
				884	item.can_fetch = True
				885
				886	for new_child in item.children:
				887	retrieve(new_child)
				888
				889	# Traverse the whole tree to see if files can now be fetched.
				890	self._traverse_tree(cache, self.root)
				891
				892	def check(n):
				893	return all(check(x) for x in n.children) and n.files_fetched
				894	assert check(self.root)
				895
				896	self.relative_cwd = self.relative_cwd or ''
				897	self.read_only = self.read_only or False
				898
				899	def _traverse_tree(self, cache, node):
				900	if node.can_fetch:
				901	if not node.files_fetched:
				902	self._update_self(cache, node)
				903	will_break = False
				904	for i in node.children:
				905	if not i.can_fetch:
				906	if will_break:
				907	break
				908	# Automatically mark the first one as fetcheable.
				909	i.can_fetch = True
				910	will_break = True
				911	self._traverse_tree(cache, i)
				912
				913	def _update_self(self, cache, node):
				914	node.fetch_files(cache, self.files)
				915	# Grabs properties.
				916	if not self.command and node.data.get('command'):
				917	self.command = node.data['command']
				918	if self.read_only is None and node.data.get('read_only') is not None:
				919	self.read_only = node.data['read_only']
				920	if (self.relative_cwd is None and
				921	node.data.get('relative_cwd') is not None):
				922	self.relative_cwd = node.data['relative_cwd']
				923
				924
@subcommand.usage('<file1..fileN> or - to read from stdin')
def CMDarchive(parser, args):
  """Archives data to the server.

  The files to archive are the positional arguments or, when the only
  argument is '-', the file names read from stdin, one per line.

  On success, prints one '<hash> <file>' line per archived file, sorted by
  file name. Returns the value returned by upload_tree().
  """
  options, files = parser.parse_args(args)

  if files == ['-']:
    # readlines() keeps the line terminator, which is not part of the file
    # name. Empty lines are ignored.
    lines = (line.rstrip('\r\n') for line in sys.stdin.readlines())
    files = [line for line in lines if line]

  if not files:
    parser.error('Nothing to upload')
  if not options.isolate_server:
    parser.error('Nowhere to send. Please specify --isolate-server')

  # Load the necessary metadata. This is going to be rewritten eventually to be
  # more efficient.
  algo = hashlib.sha1
  infiles = dict(
      (
        f,
        {
          's': os.stat(f).st_size,
          'h': hash_file(f, algo),
        }
      )
      for f in files)

  with tools.Profiler('Archive'):
    ret = upload_tree(
        base_url=options.isolate_server,
        indir=os.getcwd(),
        infiles=infiles,
        namespace=options.namespace)
  if not ret:
    print('\n'.join('%s %s' % (infiles[f]['h'], f) for f in sorted(infiles)))
  return ret
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	960
				961
				962	def CMDdownload(parser, args):
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	963	"""Download data from the server.
				964
				965	It can download individual files.
				966	"""
				967	parser.add_option(
				968	'-f', '--file', metavar='HASH DEST', default=[], action='append', nargs=2,
				969	help='hash and destination of a file, can be used multiple times')
				970	parser.add_option(
				971	'-t', '--target', metavar='DIR', default=os.getcwd(),
				972	help='destination directory')
				973	options, args = parser.parse_args(args)
				974	if args:
				975	parser.error('Unsupported arguments: %s' % args)
				976	if not options.file:
				977	parser.error('Use one of --file is required.')
				978
				979	options.target = os.path.abspath(options.target)
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	980	remote = get_storage_api(options.isolate_server, options.namespace)
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	981	for h, dest in options.file:
				982	logging.info('%s: %s', h, dest)
maruel@chromium.org	8750e4b	2013-09-18 02:37:57 +0000	[diff] [blame]	983	file_write(
				984	os.path.join(options.target, dest),
				985	remote.fetch(h, UNKNOWN_FILE_SIZE))
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	986	return 0
				987
				988
				989	class OptionParserIsolateServer(tools.OptionParserWithLogging):
				990	def __init__(self, **kwargs):
				991	tools.OptionParserWithLogging.__init__(self, **kwargs)
				992	self.add_option(
				993	'-I', '--isolate-server',
				994	default=ISOLATE_SERVER,
				995	metavar='URL',
				996	help='Isolate server where data is stored. default: %default')
				997	self.add_option(
				998	'--namespace', default='default-gzip',
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	999	help='The namespace to use on the server, default: %default')
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	1000
				1001	def parse_args(self, args, *kwargs):
				1002	options, args = tools.OptionParserWithLogging.parse_args(
				1003	self, args, *kwargs)
				1004	options.isolate_server = options.isolate_server.rstrip('/')
				1005	if not options.isolate_server:
				1006	self.error('--isolate-server is required.')
				1007	return options, args
				1008
				1009
				1010	def main(args):
				1011	dispatcher = subcommand.CommandDispatcher(__name__)
				1012	try:
				1013	return dispatcher.execute(
				1014	OptionParserIsolateServer(version=__version__), args)
maruel@chromium.org	dedbf49	2013-09-12 20:42:11 +0000	[diff] [blame]	1015	except (ConfigError, MappingError) as e:
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	1016	sys.stderr.write('\nError: ')
				1017	sys.stderr.write(str(e))
				1018	sys.stderr.write('\n')
				1019	return 1
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	1020
				1021
				1022	if __name__ == '__main__':
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	1023	fix_encoding.fix_encoding()
				1024	tools.disable_buffering()
				1025	colorama.init()
maruel@chromium.org	cb3c3d5	2013-03-14 18:55:30 +0000	[diff] [blame]	1026	sys.exit(main(sys.argv[1:]))