blob: 3667bc9b53f18178655cfd3ef2af5bd84af3ea60 [file] [log] [blame]
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00001#!/usr/bin/env python
maruel@chromium.orgfb78d432013-08-28 21:22:40 +00002# Copyright 2013 The Chromium Authors. All rights reserved.
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00003# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Archives a set of files to a server."""
7
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +00008__version__ = '0.2'
maruel@chromium.orgfb78d432013-08-28 21:22:40 +00009
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000010import binascii
11import hashlib
vadimsh@chromium.org53f8d5a2013-06-19 13:03:55 +000012import itertools
maruel@chromium.org41601642013-09-18 19:40:46 +000013import json
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000014import logging
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000015import os
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +000016import random
17import re
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000018import sys
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +000019import threading
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000020import time
maruel@chromium.orge82112e2013-04-24 14:41:55 +000021import urllib
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +000022import zlib
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000023
maruel@chromium.orgfb78d432013-08-28 21:22:40 +000024from third_party import colorama
25from third_party.depot_tools import fix_encoding
26from third_party.depot_tools import subcommand
27
vadimsh@chromium.org6b706212013-08-28 15:03:46 +000028from utils import net
vadimsh@chromium.orgb074b162013-08-22 17:55:46 +000029from utils import threading_utils
vadimsh@chromium.orga4326472013-08-24 02:05:41 +000030from utils import tools
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000031
32
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000033# The minimum size of files to upload directly to the blobstore.
maruel@chromium.orgaef29f82012-12-12 15:00:42 +000034MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 1024
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000035
vadimsh@chromium.orgeea52422013-08-21 19:35:54 +000036# The number of files to check the isolate server per /contains query.
37# All files are sorted by likelihood of a change in the file content
38# (currently file size is used to estimate this: larger the file -> larger the
39# possibility it has changed). Then first ITEMS_PER_CONTAINS_QUERIES[0] files
40# are taken and send to '/contains', then next ITEMS_PER_CONTAINS_QUERIES[1],
41# and so on. Numbers here is a trade-off; the more per request, the lower the
42# effect of HTTP round trip latency and TCP-level chattiness. On the other hand,
43# larger values cause longer lookups, increasing the initial latency to start
44# uploading, which is especially an issue for large files. This value is
45# optimized for the "few thousands files to look up with minimal number of large
46# files missing" case.
47ITEMS_PER_CONTAINS_QUERIES = [20, 20, 50, 50, 50, 100]
csharp@chromium.org07fa7592013-01-11 18:19:30 +000048
maruel@chromium.org9958e4a2013-09-17 00:01:48 +000049
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +000050# A list of already compressed extension types that should not receive any
51# compression before being uploaded.
52ALREADY_COMPRESSED_TYPES = [
53 '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
54 'wav', 'zip'
55]
56
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000057
maruel@chromium.orgdedbf492013-09-12 20:42:11 +000058# The file size to be used when we don't know the correct file size,
59# generally used for .isolated files.
60UNKNOWN_FILE_SIZE = None
61
62
63# The size of each chunk to read when downloading and unzipping files.
64ZIPPED_FILE_CHUNK = 16 * 1024
65
66
maruel@chromium.org8750e4b2013-09-18 02:37:57 +000067# Chunk size to use when doing disk I/O.
68DISK_FILE_CHUNK = 1024 * 1024
69
70
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +000071# Read timeout in seconds for downloads from isolate storage. If there's no
72# response from the server within this timeout whole download will be aborted.
73DOWNLOAD_READ_TIMEOUT = 60
74
maruel@chromium.org4f2ebe42013-09-19 13:09:08 +000075# Maximum expected delay (in seconds) between successive file fetches
76# in run_tha_test. If it takes longer than that, a deadlock might be happening
77# and all stack frames for all threads are dumped to log.
78DEADLOCK_TIMEOUT = 5 * 60
79
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +000080
maruel@chromium.org41601642013-09-18 19:40:46 +000081# The delay (in seconds) to wait between logging statements when retrieving
82# the required files. This is intended to let the user (or buildbot) know that
83# the program is still running.
84DELAY_BETWEEN_UPDATES_IN_SECS = 30
85
86
maruel@chromium.org385d73d2013-09-19 18:33:21 +000087# Sadly, hashlib uses 'sha1' instead of the standard 'sha-1' so explicitly
88# specify the names here.
89SUPPORTED_ALGOS = {
90 'md5': hashlib.md5,
91 'sha-1': hashlib.sha1,
92 'sha-512': hashlib.sha512,
93}
94
95
96# Used for serialization.
97SUPPORTED_ALGOS_REVERSE = dict((v, k) for k, v in SUPPORTED_ALGOS.iteritems())
98
99
maruel@chromium.orgdedbf492013-09-12 20:42:11 +0000100class ConfigError(ValueError):
101 """Generic failure to load a .isolated file."""
102 pass
103
104
105class MappingError(OSError):
106 """Failed to recreate the tree."""
107 pass
108
109
def randomness():
  """Returns a low-entropy random string used for MIME boundary generation.

  Kept as a standalone function so unit tests can mock it out.
  """
  now = time.time()
  return str(now)
117
def encode_multipart_formdata(fields, files,
                              mime_mapper=lambda _: 'application/octet-stream'):
  """Encodes a Multipart form data object.

  Args:
    fields: a sequence (name, value) elements for
      regular form fields.
    files: a sequence of (name, filename, value) elements for data to be
      uploaded as files.
    mime_mapper: function to return the mime type from the filename.
  Returns:
    content_type: for httplib.HTTP instance
    body: for httplib.HTTP instance
  """
  boundary = hashlib.md5(randomness()).hexdigest()
  body_list = []
  for (key, value) in fields:
    # Bug fix: the encoded key/filename used to be assigned to |value|,
    # clobbering the field content and leaving the name unencoded.
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"' % key)
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  for (key, filename, value) in files:
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(filename, unicode):
      filename = filename.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
    body_list.append('Content-Type: %s' % mime_mapper(filename))
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  if body_list:
    # Terminate the final boundary with '--' per RFC 2046.
    body_list[-2] += '--'
  body = '\r\n'.join(body_list)
  content_type = 'multipart/form-data; boundary=%s' % boundary
  return content_type, body
165
166
def is_valid_hash(value, algo):
  """Returns True if |value| is a well-formed hex digest for hashlib |algo|."""
  expected_len = algo().digest_size * 2
  return re.match(r'^[a-fA-F0-9]{%d}$' % expected_len, value) is not None
171
172
def hash_file(filepath, algo):
  """Calculates the hash of a file without reading it all in memory at once.

  |algo| should be one of hashlib hashing algorithm.
  """
  hasher = algo()
  with open(filepath, 'rb') as stream:
    while True:
      block = stream.read(DISK_FILE_CHUNK)
      if not block:
        return hasher.hexdigest()
      hasher.update(block)
186
187
def file_read(filepath, chunk_size=DISK_FILE_CHUNK):
  """Yields file content in chunks of given |chunk_size|."""
  with open(filepath, 'rb') as stream:
    while True:
      piece = stream.read(chunk_size)
      if not piece:
        return
      yield piece
196
197
def file_write(filepath, content_generator):
  """Writes file content as generated by content_generator.

  Creates the intermediary directory as needed.

  Returns the number of bytes written.

  Meant to be mocked out in unit tests.
  """
  parent = os.path.dirname(filepath)
  if not os.path.isdir(parent):
    os.makedirs(parent)
  written = 0
  with open(filepath, 'wb') as out:
    for chunk in content_generator:
      out.write(chunk)
      written += len(chunk)
  return written
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +0000216
217
def zip_compress(content_generator, level=7):
  """Reads chunks from |content_generator| and yields zip compressed chunks."""
  deflater = zlib.compressobj(level)
  for piece in content_generator:
    squeezed = deflater.compress(piece)
    if squeezed:
      yield squeezed
  # Emit whatever is left in the compressor's internal buffer.
  leftover = deflater.flush(zlib.Z_FINISH)
  if leftover:
    yield leftover
228
229
def get_zip_compression_level(filename):
  """Given a filename calculates the ideal zip compression level to use.

  Returns 0 (store, no compression) for file types that are already
  compressed, 7 otherwise.
  """
  # os.path.splitext() keeps the leading dot ('.zip') while
  # ALREADY_COMPRESSED_TYPES lists bare extensions ('zip'), so the dot must be
  # stripped; otherwise the membership test never matches and already
  # compressed files are pointlessly recompressed.
  file_ext = os.path.splitext(filename)[1].lower().lstrip('.')
  # TODO(csharp): Profile to find what compression level works best.
  return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7
235
236
def create_directories(base_directory, files):
  """Creates the directory structure needed by the given list of files."""
  logging.debug('create_directories(%s, %d)', base_directory, len(files))
  # Collect every ancestor directory of every file path.
  to_create = set()
  for filepath in files:
    subdir = os.path.dirname(filepath)
    while subdir and subdir not in to_create:
      to_create.add(subdir)
      subdir = os.path.dirname(subdir)
  # Sorted order guarantees a parent is always created before its children.
  for subdir in sorted(to_create):
    os.mkdir(os.path.join(base_directory, subdir))
249
250
def create_links(base_directory, files):
  """Creates any links needed by the given set of files."""
  on_windows = sys.platform == 'win32'
  for filepath, properties in files:
    # Only entries carrying a 'l' (link) property need work.
    if 'l' not in properties:
      continue
    if on_windows:
      # TODO(maruel): Create junctions or empty text files similar to what
      # cygwin do?
      logging.warning('Ignoring symlink %s', filepath)
      continue
    outfile = os.path.join(base_directory, filepath)
    # symlink doesn't exist on Windows. So the 'link' property should
    # never be specified for windows .isolated file.
    os.symlink(properties['l'], outfile)  # pylint: disable=E1101
    if 'm' in properties:
      # lchmod is not available on every platform; apply mode only when it is.
      mode_setter = getattr(os, 'lchmod', None)
      if mode_setter:
        mode_setter(outfile, properties['m'])
269
270
def generate_remaining_files(files):
  """Generates a dictionary of all the remaining files to be downloaded.

  Maps each content hash to the list of (path, properties) entries needing it.
  """
  grouped = {}
  for filepath, props in files:
    if 'h' not in props:
      continue
    grouped.setdefault(props['h'], []).append((filepath, props))
  return grouped
279
280
def is_valid_file(filepath, size):
  """Returns True if the file on disk matches the expected |size|.

  A |size| of UNKNOWN_FILE_SIZE disables the check entirely.
  """
  if size == UNKNOWN_FILE_SIZE:
    return True
  actual = os.stat(filepath).st_size
  if actual == size:
    return True
  logging.warning(
      'Found invalid item %s; %d != %d',
      os.path.basename(filepath), actual, size)
  return False
295
296
def try_remove(filepath):
  """Best-effort file removal; a missing file is not an error."""
  try:
    os.remove(filepath)
  except OSError:
    # Either the file never existed or it was deleted concurrently;
    # both outcomes are acceptable.
    pass
303
304
def url_read(url, **kwargs):
  """Reads |url| via net.url_read(), raising MappingError on total failure.

  net.url_read() returns None once all its internal retries are exhausted;
  that is treated here as the server being unreachable.
  """
  result = net.url_read(url, **kwargs)
  if result is None:
    # If we get no response from the server, assume it is down and raise an
    # exception.
    raise MappingError('Unable to connect to server %s' % url)
  return result
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000312
313
class StorageApi(object):
  """Base class for object that can download and upload files."""

  def __init__(self):
    # WorkerPool used by async_push(); must be installed via set_pool()
    # before any asynchronous upload is started.
    self._pool = None

  def set_pool(self, pool):
    """Sets WorkerPool that can be used for parallel uploads."""
    self._pool = pool

  def fetch(self, item, expected_size):
    """Fetches an object and yields its content."""
    raise NotImplementedError()

  def push(self, item, expected_size, content_generator, push_urls=None):
    """Uploads content generated by |content_generator| as |item|."""
    raise NotImplementedError()

  def check_missing_files(self, files):
    """Checks for existence of given |files| on the server.

    Arguments:
      files: list of pairs (file name, metadata dict).

    Returns:
      A list of files missing on server as a list of triplets
      (file name, metadata dict, push_urls object to pass to push).
    """
    raise NotImplementedError()

  def get_missing_files(self, files):
    """Yields files that are missing from the server.

    Issues multiple parallel queries via check_missing_files method calls.

    Arguments:
      files: a dictionary file name -> metadata dict.

    Yields:
      Triplets (file name, metadata dict, push_urls object to pass to push).
    """
    # TODO(maruel, vadimsh): Reuse self._pool here.
    with threading_utils.ThreadPool(1, 16, 0, prefix='get_missing_files') as tp:
      for batch in self.batch_files_for_check(files):
        tp.add_task(0, self.check_missing_files, batch)
      # Each task returns a list of missing triplets; flatten them as they
      # complete so callers can start uploading before all queries finish.
      for missing in itertools.chain.from_iterable(tp.iter_results()):
        yield missing

  def async_push(self, priority, item, expected_size,
                 content_generator, push_urls=None):
    """Starts asynchronous push to the server in a parallel thread."""
    # TODO(vadimsh): Implement streaming uploads. Before it's done, assemble
    # content right here. It will block until all file is zipped.
    data = ''.join(content_generator)
    self._pool.add_task(
        priority, self.push, item, expected_size, [data], push_urls)

  @staticmethod
  def batch_files_for_check(files):
    """Splits list of files to check for existence on the server into batches.

    Each batch corresponds to a single 'exists?' query to the server via a call
    to check_missing_files method.

    Arguments:
      files: a dictionary file name -> metadata dict.

    Yields:
      Batches of files to query for existence in a single operation,
      each batch is a list of pairs: (file name, metadata dict).
    """
    batch_count = 0
    batch_size_limit = ITEMS_PER_CONTAINS_QUERIES[0]
    next_queries = []
    # Only files with a known size ('s') can be sorted by change likelihood.
    items = ((k, v) for k, v in files.iteritems() if 's' in v)
    # Largest files first: they are the most likely to be missing, so probing
    # them early (in the smaller leading batches) minimizes upload latency.
    for filename, metadata in sorted(items, key=lambda x: -x[1]['s']):
      next_queries.append((filename, metadata))
      if len(next_queries) == batch_size_limit:
        yield next_queries
        next_queries = []
        batch_count += 1
        # Batch sizes grow per ITEMS_PER_CONTAINS_QUERIES, clamped at its tail.
        batch_size_limit = ITEMS_PER_CONTAINS_QUERIES[
            min(batch_count, len(ITEMS_PER_CONTAINS_QUERIES) - 1)]
    if next_queries:
      yield next_queries
399
400
class IsolateServer(StorageApi):
  """Client class to download or upload to Isolate Server."""
  def __init__(self, base_url, namespace):
    super(IsolateServer, self).__init__()
    assert base_url.startswith('http'), base_url
    self.content_url = base_url.rstrip('/') + '/content/'
    self.namespace = namespace
    self.algo = get_hash_algo(namespace)
    # Lazily-fetched auth token, guarded by _lock (see token property).
    self._token = None
    self._lock = threading.Lock()

  @property
  def token(self):
    """Fetches (once) and caches the access token; thread-safe."""
    # TODO(maruel): Make this request much earlier asynchronously while the
    # files are being enumerated.
    with self._lock:
      if not self._token:
        self._token = urllib.quote(url_read(self.content_url + 'get_token'))
      return self._token

  def fetch(self, item, expected_size):
    """Fetches an object and yields its content."""
    assert isinstance(item, basestring)
    assert (
        isinstance(expected_size, (int, long)) or
        expected_size == UNKNOWN_FILE_SIZE)
    zipped_url = '%sretrieve/%s/%s' % (self.content_url, self.namespace, item)
    logging.debug('download_file(%s)', zipped_url)

    # Because the app engine DB is only eventually consistent, retry 404 errors
    # because the file might just not be visible yet (even though it has been
    # uploaded).
    connection = net.url_open(
        zipped_url, retry_404=True, read_timeout=DOWNLOAD_READ_TIMEOUT)
    if not connection:
      raise IOError('Unable to open connection to %s' % zipped_url)

    # TODO(maruel): Must only decompress when needed.
    decompressor = zlib.decompressobj()
    try:
      compressed_size = 0
      decompressed_size = 0
      # Stream-decompress: read compressed chunks and yield decompressed data
      # without ever holding the whole file in memory.
      while True:
        chunk = connection.read(ZIPPED_FILE_CHUNK)
        if not chunk:
          break
        compressed_size += len(chunk)
        decompressed = decompressor.decompress(chunk)
        decompressed_size += len(decompressed)
        yield decompressed

      # Ensure that all the data was properly decompressed.
      uncompressed_data = decompressor.flush()
      if uncompressed_data:
        raise IOError('Decompression failed')
      if (expected_size != UNKNOWN_FILE_SIZE and
          decompressed_size != expected_size):
        raise IOError('File incorrect size after download of %s. Got %s and '
                      'expected %s' % (item, decompressed_size, expected_size))
    except zlib.error as e:
      msg = 'Corrupted zlib for item %s. Processed %d of %s bytes.\n%s' % (
          item, compressed_size, connection.content_length, e)
      logging.warning(msg)

      # Testing seems to show that if a few machines are trying to download
      # the same blob, they can cause each other to fail. So if we hit a zip
      # error, this is the most likely cause (it only downloads some of the
      # data). Randomly sleep for between 5 and 25 seconds to try and spread
      # out the downloads.
      sleep_duration = (random.random() * 20) + 5
      time.sleep(sleep_duration)
      raise IOError(msg)

  def push(self, item, expected_size, content_generator, push_urls=None):
    """Uploads content generated by |content_generator| as |item| to the remote
    isolate server.
    """
    assert isinstance(item, basestring)
    assert isinstance(expected_size, int) or expected_size == UNKNOWN_FILE_SIZE
    item = str(item)
    # TODO(maruel): Support large files. This would require streaming support.
    content = ''.join(content_generator)
    # Large payloads bypass the regular store endpoint and go straight to the
    # blobstore via a generated upload URL.
    if len(content) > MIN_SIZE_FOR_DIRECT_BLOBSTORE:
      return self._upload_hash_content_to_blobstore(item, content)

    url = '%sstore/%s/%s?token=%s' % (
        self.content_url, self.namespace, item, self.token)
    return url_read(
        url, data=content, content_type='application/octet-stream')

  def check_missing_files(self, files):
    """Checks for existence of given |files| on the server."""
    logging.info('Checking existence of %d files...', len(files))

    # The /contains endpoint takes the raw concatenated binary digests.
    body = ''.join(
        (binascii.unhexlify(metadata['h']) for (_, metadata) in files))
    assert (len(body) % self.algo().digest_size) == 0, repr(body)

    query_url = '%scontains/%s?token=%s' % (
        self.content_url, self.namespace, self.token)
    response = url_read(
        query_url, data=body, content_type='application/octet-stream')
    # The server answers with one byte per queried file.
    if len(files) != len(response):
      raise MappingError(
          'Got an incorrect number of responses from the server. Expected %d, '
          'but got %d' % (len(files), len(response)))

    # This implementation of IsolateServer doesn't use push_urls field,
    # set it to None.
    missing_files = [
        files[i] + (None,) for i, flag in enumerate(response) if flag == '\x00'
    ]
    logging.info('Queried %d files, %d cache hit',
                 len(files), len(files) - len(missing_files))
    return missing_files

  def _upload_hash_content_to_blobstore(self, item, content):
    """Uploads the content directly to the blobstore via a generated url."""
    # TODO(maruel): Support large files. This would require streaming support.
    gen_url = '%sgenerate_blobstore_url/%s/%s' % (
        self.content_url, self.namespace, item)
    # Token is guaranteed to be already quoted but it is unnecessary here, and
    # only here.
    data = [('token', urllib.unquote(self.token))]
    content_type, body = encode_multipart_formdata(
        data, [('content', item, content)])
    last_url = gen_url
    for _ in net.retry_loop(max_attempts=net.URL_OPEN_MAX_ATTEMPTS):
      # Retry HTTP 50x here but not 404.
      upload_url = net.url_read(gen_url, data=data)
      if not upload_url:
        raise MappingError('Unable to connect to server %s' % gen_url)
      last_url = upload_url

      # Do not retry this request on HTTP 50x. Regenerate an upload url each
      # time since uploading "consumes" the upload url.
      result = net.url_read(
          upload_url, data=body, content_type=content_type, retry_50x=False)
      if result is not None:
        return result
    raise MappingError('Unable to connect to server %s' % last_url)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000542
543
class FileSystem(StorageApi):
  """Storage implementation backed by a directory on the file system.

  The common use case is a NFS/CIFS file server that is mounted locally that is
  used to fetch the file on a local partition.
  """
  def __init__(self, base_path):
    super(FileSystem, self).__init__()
    # Root directory all items live under.
    self.base_path = base_path

  def fetch(self, item, expected_size):
    """Yields the content of |item|, validating its size first when known."""
    assert isinstance(item, basestring)
    assert isinstance(expected_size, int) or expected_size == UNKNOWN_FILE_SIZE
    path = os.path.join(self.base_path, item)
    size_known = expected_size != UNKNOWN_FILE_SIZE
    if size_known and not is_valid_file(path, expected_size):
      raise IOError('Invalid file %s' % item)
    return file_read(path)

  def push(self, item, expected_size, content_generator, push_urls=None):
    """Writes the generated content to disk, rejecting size mismatches."""
    assert isinstance(item, basestring)
    assert isinstance(expected_size, int) or expected_size == UNKNOWN_FILE_SIZE
    destination = os.path.join(self.base_path, item)
    written = file_write(destination, content_generator)
    if expected_size == UNKNOWN_FILE_SIZE or written == expected_size:
      return
    # A truncated or oversized write would poison the storage; drop it.
    os.remove(destination)
    raise IOError(
        'Invalid file %s, %d != %d' % (item, written, expected_size))

  def check_missing_files(self, files):
    """Returns the subset of |files| whose hashed content is absent on disk."""
    missing = []
    for filename, metadata in files:
      if not os.path.exists(os.path.join(self.base_path, metadata['h'])):
        missing.append((filename, metadata, None))
    return missing
579
580
def get_hash_algo(_namespace):
  """Returns the hashlib constructor to use for the given |namespace|.

  Currently hard-coded to SHA-1 regardless of the namespace.
  """
  # TODO(vadimsh): Implement this at some point.
  return hashlib.sha1
585
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +0000586
def get_storage_api(file_or_url, namespace):
  """Returns an object that implements .fetch() and .push().

  Picks IsolateServer for http(s) URLs and FileSystem for local paths.
  """
  if re.match(r'^https?://.+$', file_or_url):
    return IsolateServer(file_or_url, namespace)
  return FileSystem(file_or_url)
593
594
class WorkerPool(threading_utils.AutoRetryThreadPool):
  """Thread pool that automatically retries on IOError and runs a preconfigured
  function.
  """
  # Initial and maximum number of worker threads.
  INITIAL_WORKERS = 2
  MAX_WORKERS = 16
  # Number of times a task raising IOError is retried before giving up.
  RETRIES = 5

  def __init__(self):
    # Positional arguments of AutoRetryThreadPool: retried exception types,
    # retry count, initial/max worker counts, queue size (0 = unbounded),
    # and the thread name prefix.
    super(WorkerPool, self).__init__(
        [IOError],
        self.RETRIES,
        self.INITIAL_WORKERS,
        self.MAX_WORKERS,
        0,
        'remote')
maruel@chromium.orgdedbf492013-09-12 20:42:11 +0000612
maruel@chromium.orgdedbf492013-09-12 20:42:11 +0000613
def upload_tree(base_url, indir, infiles, namespace):
  """Uploads the given tree to the given url.

  Arguments:
    base_url: The base url, it is assume that |base_url|/has/ can be used to
              query if an element was already uploaded, and |base_url|/store/
              can be used to upload a new element.
    indir: Root directory the infiles are based in.
    infiles: dict of files to upload files from |indir| to |base_url|.
    namespace: The namespace to use on the server.

  Returns:
    0 on success (always; failures raise).
  """
  logging.info('upload tree(base_url=%s, indir=%s, files=%d)' %
               (base_url, indir, len(infiles)))

  # Create a pool of workers to zip and upload any files missing from
  # the server.
  cpus = max(threading_utils.num_processors(), 2)
  uploaded = []

  with WorkerPool() as upload_pool:
    remote = get_storage_api(base_url, namespace)
    remote.set_pool(upload_pool)

    def zip_and_trigger_push(filename, metadata, push_urls):
      """Read the file, zips it and trigger push to the storage."""
      # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
      path = os.path.join(indir, filename)
      content_generator = zip_compress(file_read(path, ZIPPED_FILE_CHUNK),
                                       get_zip_compression_level(path))
      # A '0' priority marker in the metadata promotes the upload.
      if metadata.get('priority', '1') == '0':
        priority = WorkerPool.HIGH
      else:
        priority = WorkerPool.MED
      return remote.async_push(
          priority, metadata['h'], UNKNOWN_FILE_SIZE,
          content_generator, push_urls)

    # Separate pool for CPU-bound zipping; uploads run on upload_pool.
    with threading_utils.ThreadPool(2, cpus, 0, 'zip') as zip_pool:
      for filename, metadata, push_urls in remote.get_missing_files(infiles):
        zip_pool.add_task(0, zip_and_trigger_push,
                          filename, metadata, push_urls)
        uploaded.append((filename, metadata))
      logging.info('Waiting for all files to finish zipping')
      zip_pool.join()

    logging.info('All files zipped.')
    upload_pool.join()

  logging.info('All files are uploaded')

  # The remainder only computes and logs cache hit/miss statistics.
  total = len(infiles)
  total_size = sum(metadata.get('s', 0) for metadata in infiles.itervalues())
  logging.info(
      'Total: %6d, %9.1fkb',
      total,
      sum(m.get('s', 0) for m in infiles.itervalues()) / 1024.)
  cache_hit = set(infiles.iterkeys()) - set(x[0] for x in uploaded)
  cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
  logging.info(
      'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_hit),
      cache_hit_size / 1024.,
      len(cache_hit) * 100. / total,
      cache_hit_size * 100. / total_size if total_size else 0)
  cache_miss = uploaded
  cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
  logging.info(
      'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_miss),
      cache_miss_size / 1024.,
      len(cache_miss) * 100. / total,
      cache_miss_size * 100. / total_size if total_size else 0)
  return 0
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000687
688
class MemoryCache(object):
  """Cache-compatible store that keeps every retrieved file in memory.

  Drop-in replacement for the Cache class: instead of saving files into a
  cache directory, their content is held in a dict so it can be written to
  the target directory directly.
  """

  def __init__(self, remote):
    self.remote = remote
    self._pool = None
    self._lock = threading.Lock()
    self._contents = {}

  def set_pool(self, pool):
    self._pool = pool

  def retrieve(self, priority, item, size):
    """Gets the requested file."""
    self._pool.add_task(priority, self._on_content, item, size)

  def wait_for(self, items):
    """Starts a loop that waits for at least one of |items| to be retrieved.

    Returns the first item retrieved.
    """
    # Anything already fetched can be returned right away.
    with self._lock:
      for candidate in items:
        if candidate in self._contents:
          return candidate

    # Otherwise block on the worker pool until a wanted item lands.
    while True:
      fetched = self._pool.get_one_result()
      if fetched in items:
        return fetched

  def add(self, filepath, item):
    with self._lock:
      with open(filepath, 'rb') as f:
        self._contents[item] = f.read()

  def read(self, item):
    return self._contents[item]

  def store_to(self, item, dest):
    file_write(dest, [self._contents[item]])

  def _on_content(self, item, size):
    """Worker task: fetches |item| from the remote and caches its content."""
    data = ''.join(self.remote.fetch(item, size))
    with self._lock:
      self._contents[item] = data
    return item

  def __enter__(self):
    return self

  def __exit__(self, _exc_type, _exec_value, _traceback):
    with self._lock:
      self._contents = {}
    return False
749
750
def load_isolated(content, os_flavor, algo):
  """Verifies the .isolated file is valid and loads this object with the json
  data.

  Arguments:
  - content: raw serialized content to load.
  - os_flavor: OS to load this file on. Optional.
  - algo: hashlib algorithm class. Used to confirm the algorithm matches the
    algorithm used on the Isolate Server.
  """
  try:
    data = json.loads(content)
  except ValueError:
    raise ConfigError('Failed to parse: %s...' % content[:100])

  if not isinstance(data, dict):
    raise ConfigError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise ConfigError('Expected string, got %r' % value)
  if not re.match(r'^(\d+)\.(\d+)$', value):
    raise ConfigError('Expected a compatible version, got %r' % value)
  # Only major version 1 is understood by this parser.
  if value.split('.', 1)[0] != '1':
    raise ConfigError('Expected compatible \'1.x\' version, got %r' % value)

  if algo is None:
    # Default the algorithm used in the .isolated file itself, falls back to
    # 'sha-1' if unspecified.
    algo = SUPPORTED_ALGOS_REVERSE[data.get('algo', 'sha-1')]

  for key, value in data.iteritems():
    if key == 'algo':
      # The declared hash algorithm must agree with the one the caller (or
      # the defaulting logic above) selected.
      if not isinstance(value, basestring):
        raise ConfigError('Expected string, got %r' % value)
      if value not in SUPPORTED_ALGOS:
        raise ConfigError(
            'Expected one of \'%s\', got %r' %
            (', '.join(sorted(SUPPORTED_ALGOS)), value))
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise ConfigError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      # Command to run: a non-empty list of string arguments.
      if not isinstance(value, list):
        raise ConfigError('Expected list, got %r' % value)
      if not value:
        raise ConfigError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise ConfigError('Expected string, got %r' % subvalue)

    elif key == 'files':
      # Maps a file path to its properties:
      #   'h': content hash, 's': size, 'm': mode, 'l': link target.
      if not isinstance(value, dict):
        raise ConfigError('Expected dict, got %r' % value)
      for subkey, subvalue in value.iteritems():
        if not isinstance(subkey, basestring):
          raise ConfigError('Expected string, got %r' % subkey)
        if not isinstance(subvalue, dict):
          raise ConfigError('Expected dict, got %r' % subvalue)
        for subsubkey, subsubvalue in subvalue.iteritems():
          if subsubkey == 'l':
            if not isinstance(subsubvalue, basestring):
              raise ConfigError('Expected string, got %r' % subsubvalue)
          elif subsubkey == 'm':
            if not isinstance(subsubvalue, int):
              raise ConfigError('Expected int, got %r' % subsubvalue)
          elif subsubkey == 'h':
            if not is_valid_hash(subsubvalue, algo):
              raise ConfigError('Expected sha-1, got %r' % subsubvalue)
          elif subsubkey == 's':
            if not isinstance(subsubvalue, int):
              raise ConfigError('Expected int, got %r' % subsubvalue)
          else:
            raise ConfigError('Unknown subsubkey %s' % subsubkey)
        # Cross-field constraints: an entry is either a link ('l' only) or a
        # regular file ('h' plus 's', never combined with 'l' or, for a link,
        # 'm').
        if bool('h' in subvalue) == bool('l' in subvalue):
          raise ConfigError(
              'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
              subvalue)
        if bool('h' in subvalue) != bool('s' in subvalue):
          raise ConfigError(
              'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
              subvalue)
        if bool('s' in subvalue) == bool('l' in subvalue):
          raise ConfigError(
              'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
              subvalue)
        if bool('l' in subvalue) and bool('m' in subvalue):
          raise ConfigError(
              'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
              subvalue)

    elif key == 'includes':
      # Hashes of other .isolated files to merge in, processed in order.
      if not isinstance(value, list):
        raise ConfigError('Expected list, got %r' % value)
      if not value:
        raise ConfigError('Expected non-empty includes list')
      for subvalue in value:
        if not is_valid_hash(subvalue, algo):
          raise ConfigError('Expected sha-1, got %r' % subvalue)

    elif key == 'read_only':
      if not isinstance(value, bool):
        raise ConfigError('Expected bool, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise ConfigError('Expected string, got %r' % value)

    elif key == 'os':
      if os_flavor and value != os_flavor:
        raise ConfigError(
            'Expected \'os\' to be \'%s\' but got \'%s\'' %
            (os_flavor, value))

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise ConfigError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are always
  # in the the native path format, someone could want to download an .isolated
  # tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data
889
890
class IsolatedFile(object):
  """Represents a single parsed .isolated file."""

  def __init__(self, obj_hash, algo):
    """|obj_hash| is really the sha-1 of the file."""
    logging.debug('IsolatedFile(%s)' % obj_hash)
    self.obj_hash = obj_hash
    self.algo = algo
    # Set once all the left-side of the tree is parsed. 'Tree' here means the
    # .isolate and all the .isolated files recursively included by it with
    # 'includes' key. The order of each sha-1 in 'includes', each representing a
    # .isolated file in the hash table, is important, as the later ones are not
    # processed until the firsts are retrieved and read.
    self.can_fetch = False

    # Raw data.
    self.data = {}
    # A IsolatedFile instance, one per object in self.includes.
    self.children = []

    # Set once the .isolated file is loaded.
    self._is_parsed = False
    # Set once the files are fetched.
    self.files_fetched = False

  def load(self, os_flavor, content):
    """Verifies the .isolated file is valid and loads this object with the json
    data.
    """
    logging.debug('IsolatedFile.load(%s)' % self.obj_hash)
    assert not self._is_parsed
    self.data = load_isolated(content, os_flavor, self.algo)
    # Each 'includes' entry becomes a child node to be fetched later.
    self.children = [
        IsolatedFile(include, self.algo)
        for include in self.data.get('includes', [])
    ]
    self._is_parsed = True

  def fetch_files(self, cache, files):
    """Adds files in this .isolated file not present in |files| dictionary.

    Preemptively request files.

    Note that |files| is modified by this function.
    """
    assert self.can_fetch
    # Nothing to do until parsed; never do the work twice.
    if not self._is_parsed or self.files_fetched:
      return
    logging.debug('fetch_files(%s)' % self.obj_hash)
    for filepath, properties in self.data.get('files', {}).iteritems():
      # Root isolated has priority on the files being mapped. In particular,
      # overriden files must not be fetched.
      if filepath in files:
        continue
      files[filepath] = properties
      if 'h' in properties:
        # Preemptively request files.
        logging.debug('fetching %s' % filepath)
        cache.retrieve(WorkerPool.MED, properties['h'], properties['s'])
    self.files_fetched = True
951
952
class Settings(object):
  """Results of a completely parsed .isolated file."""
  def __init__(self):
    # Command to run, merged from the .isolated tree.
    self.command = []
    # Union of the 'files' maps of the whole tree.
    self.files = {}
    self.read_only = None
    self.relative_cwd = None
    # The main .isolated file, a IsolatedFile instance.
    self.root = None

  def load(self, cache, root_isolated_hash, os_flavor, algo):
    """Loads the .isolated and all the included .isolated asynchronously.

    It enables support for "included" .isolated files. They are processed in
    strict order but fetched asynchronously from the cache. This is important so
    that a file in an included .isolated file that is overridden by an embedding
    .isolated file is not fetched needlessly. The includes are fetched in one
    pass and the files are fetched as soon as all the ones on the left-side
    of the tree were fetched.

    The prioritization is very important here for nested .isolated files.
    'includes' have the highest priority and the algorithm is optimized for both
    deep and wide trees. A deep one is a long link of .isolated files referenced
    one at a time by one item in 'includes'. A wide one has a large number of
    'includes' in a single .isolated file. 'left' is defined as an included
    .isolated file earlier in the 'includes' list. So the order of the elements
    in 'includes' is important.
    """
    self.root = IsolatedFile(root_isolated_hash, algo)

    # Isolated files being retrieved now: hash -> IsolatedFile instance.
    pending = {}
    # Set of hashes of already retrieved items to refuse recursive includes.
    seen = set()

    def retrieve(isolated_file):
      # Queues |isolated_file|'s content for retrieval, refusing cycles.
      h = isolated_file.obj_hash
      if h in seen:
        raise ConfigError('IsolatedFile %s is retrieved recursively' % h)
      assert h not in pending
      seen.add(h)
      pending[h] = isolated_file
      cache.retrieve(WorkerPool.HIGH, h, UNKNOWN_FILE_SIZE)

    retrieve(self.root)

    while pending:
      # Block until any pending .isolated file is available, parse it, then
      # queue its own includes.
      item_hash = cache.wait_for(pending)
      item = pending.pop(item_hash)
      item.load(os_flavor, cache.read(item_hash))
      if item_hash == root_isolated_hash:
        # It's the root item.
        item.can_fetch = True

      for new_child in item.children:
        retrieve(new_child)

      # Traverse the whole tree to see if files can now be fetched.
      self._traverse_tree(cache, self.root)

    def check(n):
      # Sanity check: every node in the tree must have fetched its files.
      return all(check(x) for x in n.children) and n.files_fetched
    assert check(self.root)

    # Defaults for properties no .isolated file in the tree specified.
    self.relative_cwd = self.relative_cwd or ''
    self.read_only = self.read_only or False

  def _traverse_tree(self, cache, node):
    # Fetches files of each fetchable node and unlocks at most one new child
    # per pass, preserving the strict left-to-right 'includes' ordering.
    if node.can_fetch:
      if not node.files_fetched:
        self._update_self(cache, node)
      will_break = False
      for i in node.children:
        if not i.can_fetch:
          if will_break:
            break
          # Automatically mark the first one as fetcheable.
          i.can_fetch = True
          will_break = True
        self._traverse_tree(cache, i)

  def _update_self(self, cache, node):
    # Merges |node|'s files and properties into the aggregated Settings;
    # values already set by nodes processed earlier win.
    node.fetch_files(cache, self.files)
    # Grabs properties.
    if not self.command and node.data.get('command'):
      # Ensure paths are correctly separated on windows.
      self.command = node.data['command']
      if self.command:
        self.command[0] = self.command[0].replace('/', os.path.sep)
        self.command = tools.fix_python_path(self.command)
    if self.read_only is None and node.data.get('read_only') is not None:
      self.read_only = node.data['read_only']
    if (self.relative_cwd is None and
        node.data.get('relative_cwd') is not None):
      self.relative_cwd = node.data['relative_cwd']
1048
1049
def fetch_isolated(
    isolated_hash, cache, outdir, os_flavor, algo, require_command):
  """Aggressively downloads the .isolated file(s), then download all the files.

  Arguments:
  - isolated_hash: hash of the root .isolated file, or a path to a local file
    (it is then hashed with |algo| and inserted into |cache| first).
  - cache: object implementing set_pool()/add()/retrieve()/wait_for()/read()/
    store_to(), e.g. MemoryCache.
  - outdir: directory where the tree is recreated.
  - os_flavor: expected 'os' value in the .isolated files. Optional.
  - algo: hashlib algorithm class used for content hashes.
  - require_command: raises ConfigError if the tree defines no command.

  Returns the parsed Settings instance.
  """
  settings = Settings()
  with WorkerPool() as pool:
    with cache:
      cache.set_pool(pool)
      with tools.Profiler('GetIsolateds'):
        # Optionally support local files.
        if not is_valid_hash(isolated_hash, algo):
          # Adds it in the cache. While not strictly necessary, this
          # simplifies the rest.
          h = hash_file(isolated_hash, algo)
          cache.add(isolated_hash, h)
          isolated_hash = h
        settings.load(cache, isolated_hash, os_flavor, algo)

      if require_command and not settings.command:
        raise ConfigError('No command to run')

      with tools.Profiler('GetRest'):
        # Create the layout first, so content can be written out as soon as
        # it is retrieved.
        create_directories(outdir, settings.files)
        create_links(outdir, settings.files.iteritems())
        remaining = generate_remaining_files(settings.files.iteritems())

        cwd = os.path.normpath(os.path.join(outdir, settings.relative_cwd))
        if not os.path.isdir(cwd):
          os.makedirs(cwd)

        # Now block on the remaining files to be downloaded and mapped.
        logging.info('Retrieving remaining files')
        last_update = time.time()
        with threading_utils.DeadlockDetector(DEADLOCK_TIMEOUT) as detector:
          while remaining:
            detector.ping()
            obj = cache.wait_for(remaining)
            for filepath, properties in remaining.pop(obj):
              outfile = os.path.join(outdir, filepath)
              cache.store_to(obj, outfile)
              if 'm' in properties:
                # It's not set on Windows.
                os.chmod(outfile, properties['m'])

            # Throttle progress reporting to once per update interval.
            duration = time.time() - last_update
            if duration > DELAY_BETWEEN_UPDATES_IN_SECS:
              msg = '%d files remaining...' % len(remaining)
              print msg
              logging.info(msg)
              last_update = time.time()
  return settings
1101
1102
def download_isolated_tree(isolated_hash, target_directory, remote):
  """Downloads the dependencies to the given directory."""
  if not os.path.exists(target_directory):
    os.makedirs(target_directory)

  # Keep everything in memory; file contents are written straight to their
  # final destination instead of transiting through an on-disk cache.
  memory_cache = MemoryCache(remote)
  return fetch_isolated(
      isolated_hash, memory_cache, target_directory, None, remote.algo, False)
maruel@chromium.org4f2ebe42013-09-19 13:09:08 +00001111
1112
@subcommand.usage('<file1..fileN> or - to read from stdin')
def CMDarchive(parser, args):
  """Archives data to the server."""
  options, files = parser.parse_args(args)

  if files == ['-']:
    files = sys.stdin.readlines()

  if not files:
    parser.error('Nothing to upload')

  # Load the necessary metadata.
  # TODO(maruel): Use a worker pool to upload as the hashing is being done.
  infiles = {}
  for filepath in files:
    infiles[filepath] = {
        's': os.stat(filepath).st_size,
        'h': hash_file(filepath, get_hash_algo(options.namespace)),
    }

  with tools.Profiler('Archive'):
    ret = upload_tree(
        base_url=options.isolate_server,
        indir=os.getcwd(),
        infiles=infiles,
        namespace=options.namespace)
  if not ret:
    # On success, print '<hash> <path>' for each file so callers can script it.
    print('\n'.join('%s %s' % (infiles[f]['h'], f) for f in sorted(infiles)))
  return ret
maruel@chromium.orgfb78d432013-08-28 21:22:40 +00001145
1146
def CMDdownload(parser, args):
  """Download data from the server.

  It can either download individual files or a complete tree from a .isolated
  file.
  """
  parser.add_option(
      '-i', '--isolated', metavar='HASH',
      help='hash of an isolated file, .isolated file content is discarded, use '
           '--file if you need it')
  parser.add_option(
      '-f', '--file', metavar='HASH DEST', default=[], action='append', nargs=2,
      help='hash and destination of a file, can be used multiple times')
  parser.add_option(
      '-t', '--target', metavar='DIR', default=os.getcwd(),
      help='destination directory')
  options, args = parser.parse_args(args)
  if args:
    parser.error('Unsupported arguments: %s' % args)
  # Exactly one of the two modes must be selected.
  if bool(options.isolated) == bool(options.file):
    parser.error('Use one of --isolated or --file, and only one.')

  options.target = os.path.abspath(options.target)
  remote = get_storage_api(options.isolate_server, options.namespace)
  # Individual file mode: fetch each requested hash to its destination.
  for h, dest in options.file:
    logging.info('%s: %s', h, dest)
    file_write(
        os.path.join(options.target, dest),
        remote.fetch(h, UNKNOWN_FILE_SIZE))
  # Tree mode: materialize the whole .isolated tree under the target.
  if options.isolated:
    settings = download_isolated_tree(options.isolated, options.target, remote)
    rel = os.path.join(options.target, settings.relative_cwd)
    # |rel| is already absolute; the old code joined it with options.target a
    # second time, a no-op for absolute paths but needlessly confusing.
    print('To run this test please run from the directory %s:' % rel)
    print(' ' + ' '.join(settings.command))
  return 0
1183
1184
class OptionParserIsolateServer(tools.OptionParserWithLogging):
  """Option parser with the common --isolate-server/--namespace flags."""

  def __init__(self, **kwargs):
    tools.OptionParserWithLogging.__init__(self, **kwargs)
    self.add_option(
        '-I', '--isolate-server',
        metavar='URL', default='',
        help='Isolate server to use')
    self.add_option(
        '--namespace', default='default-gzip',
        help='The namespace to use on the server, default: %default')

  def parse_args(self, *args, **kwargs):
    options, remaining = tools.OptionParserWithLogging.parse_args(
        self, *args, **kwargs)
    # Normalize the URL and refuse to continue without one.
    options.isolate_server = options.isolate_server.rstrip('/')
    if not options.isolate_server:
      self.error('--isolate-server is required.')
    return options, remaining
1203
1204
def main(args):
  """Dispatches to the CMD* handler matching the first argument."""
  dispatcher = subcommand.CommandDispatcher(__name__)
  try:
    return dispatcher.execute(
        OptionParserIsolateServer(version=__version__), args)
  except (ConfigError, MappingError) as e:
    # Known configuration/mapping errors are reported tersely, no traceback.
    sys.stderr.write('\nError: %s\n' % e)
    return 1
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00001215
1216
if __name__ == '__main__':
  fix_encoding.fix_encoding()   # Make stdout/stderr encoding-safe.
  tools.disable_buffering()     # Emit output as soon as it is written.
  colorama.init()               # Enable ANSI color codes on Windows consoles.
  sys.exit(main(sys.argv[1:]))