#!/usr/bin/env python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Archives a set of files to a server."""

__version__ = '0.2'

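# Intended command-line usage, as a sketch (the script file name below is an
# assumption; the flags come from OptionParserIsolateServer, CMDarchive and
# CMDdownload defined in this file):
#   isolateserver_archive.py archive -I <isolate server url> file1 file2
#   isolateserver_archive.py download -I <isolate server url> \
#       -f <sha1 hash> <destination name> -t <target directory>
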
import binascii
import cStringIO
import hashlib
import itertools
import logging
import os
import Queue
import random
import re
import shutil
import sys
import threading
import time
import urllib
import zlib

from third_party import colorama
from third_party.depot_tools import fix_encoding
from third_party.depot_tools import subcommand

from utils import net
from utils import threading_utils
from utils import tools


# Default server.
# TODO(maruel): Chromium-specific.
ISOLATE_SERVER = 'https://isolateserver-dev.appspot.com/'


# The minimum size of files to upload directly to the blobstore.
MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 1024

# The number of files to check the isolate server per /contains query.
# All files are sorted by likelihood of a change in the file content
# (currently the file size is used to estimate this: the larger the file, the
# more likely it has changed). The first ITEMS_PER_CONTAINS_QUERIES[0] files
# are sent to '/contains', then the next ITEMS_PER_CONTAINS_QUERIES[1], and so
# on. The numbers here are a trade-off: the more items per request, the lower
# the impact of HTTP round trip latency and TCP-level chattiness. On the other
# hand, larger values cause longer lookups, increasing the initial latency
# before uploading starts, which is especially an issue for large files. This
# value is optimized for the "few thousand files to look up with a minimal
# number of large files missing" case.
ITEMS_PER_CONTAINS_QUERIES = [20, 20, 50, 50, 50, 100]

# A list of already compressed extension types that should not receive any
# compression before being uploaded.
ALREADY_COMPRESSED_TYPES = [
    '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
    'wav', 'zip'
]


# The file size to be used when we don't know the correct file size,
# generally used for .isolated files.
UNKNOWN_FILE_SIZE = None


# The size of each chunk to read when downloading and unzipping files.
ZIPPED_FILE_CHUNK = 16 * 1024

# Read timeout in seconds for downloads from isolate storage. If there's no
# response from the server within this timeout, the whole download is aborted.
DOWNLOAD_READ_TIMEOUT = 60


class ConfigError(ValueError):
  """Generic failure to load a .isolated file."""
  pass


class MappingError(OSError):
  """Failed to recreate the tree."""
  pass


def randomness():
  """Generates low-entropy randomness for MIME encoding.

  Exists so it can be mocked out in unit tests.
  """
  return str(time.time())


def encode_multipart_formdata(fields, files,
                              mime_mapper=lambda _: 'application/octet-stream'):
  """Encodes a Multipart form data object.

  Args:
    fields: a sequence of (name, value) elements for regular form fields.
    files: a sequence of (name, filename, value) elements for data to be
        uploaded as files.
    mime_mapper: function to return the mime type from the filename.
  Returns:
    content_type: for httplib.HTTP instance
    body: for httplib.HTTP instance
  """
  boundary = hashlib.md5(randomness()).hexdigest()
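  # Each part is delimited by the boundary; the very last delimiter built below
  # gets a trailing '--' to close the message, per the multipart/form-data
  # format.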
  body_list = []
  for (key, value) in fields:
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"' % key)
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  for (key, filename, value) in files:
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(filename, unicode):
      filename = filename.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
    body_list.append('Content-Type: %s' % mime_mapper(filename))
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  if body_list:
    body_list[-2] += '--'
  body = '\r\n'.join(body_list)
  content_type = 'multipart/form-data; boundary=%s' % boundary
  return content_type, body


def sha1_file(filepath):
  """Calculates the SHA-1 of a file without reading it all in memory at once."""
  digest = hashlib.sha1()
  with open(filepath, 'rb') as f:
    while True:
      # Read in 1mb chunks.
      chunk = f.read(1024*1024)
      if not chunk:
        break
      digest.update(chunk)
  return digest.hexdigest()


def file_write(filepath, content_generator):
  """Writes file content as generated by content_generator.

  Meant to be mocked out in unit tests.
  """
  filedir = os.path.dirname(filepath)
  if not os.path.isdir(filedir):
    os.makedirs(filedir)
  with open(filepath, 'wb') as f:
    for d in content_generator:
      f.write(d)


def valid_file(filepath, size):
172 """Determines if the given files appears valid.

  Currently it just checks the file's size.
  """
  if size == UNKNOWN_FILE_SIZE:
    return True
  actual_size = os.stat(filepath).st_size
  if size != actual_size:
    logging.warning(
        'Found invalid item %s; %d != %d',
        os.path.basename(filepath), actual_size, size)
    return False
  return True


def url_read(url, **kwargs):
  result = net.url_read(url, **kwargs)
  if result is None:
    # If we get no response from the server, assume it is down and raise an
    # exception.
    raise MappingError('Unable to connect to server %s' % url)
  return result


def upload_hash_content_to_blobstore(
    generate_upload_url, data, hash_key, content):
  """Uploads the given hash contents directly to the blobstore via a generated
  url.

  Arguments:
    generate_upload_url: The url to get the new upload url from.
    data: extra POST data.
    hash_key: sha1 of the uncompressed version of content.
    content: The contents to upload. Must fit in memory for now.
  """
  logging.debug('Generating url to directly upload file to blobstore')
  assert isinstance(hash_key, str), hash_key
  assert isinstance(content, str), (hash_key, content)
  # TODO(maruel): Support large files. This would require streaming support.
  content_type, body = encode_multipart_formdata(
      data, [('content', hash_key, content)])
  for _ in net.retry_loop(max_attempts=net.URL_OPEN_MAX_ATTEMPTS):
    # Retry HTTP 50x here.
    upload_url = net.url_read(generate_upload_url, data=data)
    if not upload_url:
      raise MappingError(
          'Unable to connect to server %s' % generate_upload_url)

    # Do not retry this request on HTTP 50x. Regenerate an upload url each time
    # since uploading "consumes" the upload url.
    result = net.url_read(
        upload_url, data=body, content_type=content_type, retry_50x=False)
    if result is not None:
      return result
  raise MappingError(
      'Unable to connect to server %s' % generate_upload_url)


class IsolateServer(object):
  """Client class to download or upload to Isolate Server."""
  def __init__(self, base_url, namespace):
    assert base_url.startswith('http'), base_url
    self.content_url = base_url.rstrip('/') + '/content/'
    self.namespace = namespace
    self._token = None
    self._lock = threading.Lock()

  @property
  def token(self):
    # TODO(maruel): Make this request much earlier asynchronously while the
    # files are being enumerated.
    with self._lock:
      if not self._token:
        self._token = urllib.quote(url_read(self.content_url + 'get_token'))
      return self._token

  def retrieve(self, item, dest):
    size = [0]
    try:
      zipped_url = '%sretrieve/%s/%s' % (self.content_url, self.namespace, item)
      logging.debug('download_file(%s)', zipped_url)

      # Because the app engine DB is only eventually consistent, retry
      # 404 errors because the file might just not be visible yet (even
      # though it has been uploaded).
      connection = net.url_open(
          zipped_url, retry_404=True, read_timeout=DOWNLOAD_READ_TIMEOUT)
      if not connection:
        raise IOError('Unable to open connection to %s' % zipped_url)

      content_length = connection.content_length
      decompressor = zlib.decompressobj()
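      # Stream the download: each chunk is decompressed as it arrives so the
      # whole file never has to be held in memory.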
      def generator():
        while True:
          chunk = connection.read(ZIPPED_FILE_CHUNK)
          if not chunk:
            break
          size[0] += len(chunk)
          yield decompressor.decompress(chunk)
      file_write(dest, generator())
      # Ensure that all the data was properly decompressed.
      uncompressed_data = decompressor.flush()
      assert not uncompressed_data
    except IOError as e:
      logging.error('Failed to download %s at %s.\n%s', item, dest, e)
      raise
    except zlib.error as e:
      msg = 'Corrupted zlib for item %s. Processed %d of %s bytes.\n%s' % (
          item, size[0], content_length, e)
      logging.error(msg)

      # Testing seems to show that if a few machines are trying to download
      # the same blob, they can cause each other to fail. So if we hit a
      # zip error, this is the most likely cause (it only downloads some of
      # the data). Randomly sleep for between 5 and 25 seconds to try and
      # spread out the downloads.
      # TODO(csharp): Switch from blobstorage to cloud storage and see if
      # that solves the issue.
      sleep_duration = (random.random() * 20) + 5
      time.sleep(sleep_duration)
      raise IOError(msg)

  def store(self, content, hash_key):
    # TODO(maruel): Detect failures.
    hash_key = str(hash_key)
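    # Content above the size threshold goes through the blobstore two-step
    # upload (fetch an upload url, then POST the multipart body); smaller
    # content is POSTed directly to the /store handler.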
    if len(content) > MIN_SIZE_FOR_DIRECT_BLOBSTORE:
      url = '%sgenerate_blobstore_url/%s/%s' % (
          self.content_url, self.namespace, hash_key)
      # token is guaranteed to be already quoted but it is unnecessary here, and
      # only here.
      data = [('token', urllib.unquote(self.token))]
      return upload_hash_content_to_blobstore(url, data, hash_key, content)
    else:
      url = '%sstore/%s/%s?token=%s' % (
          self.content_url, self.namespace, hash_key, self.token)
      return url_read(
          url, data=content, content_type='application/octet-stream')


def check_files_exist_on_server(query_url, queries):
  """Queries the server to see which files from this batch already exist there.

  Arguments:
    queries: The hash files to potentially upload to the server.
  Returns:
    missing_files: list of files that are missing on the server.
  """
  # TODO(maruel): Move inside IsolateServer.
  logging.info('Checking existence of %d files...', len(queries))
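  # The request body is the concatenation of the raw 20-byte SHA-1 digests; the
  # server answers with one byte per digest, chr(0) meaning "missing".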
  body = ''.join(
      (binascii.unhexlify(meta_data['h']) for (_, meta_data) in queries))
  assert (len(body) % 20) == 0, repr(body)

  response = url_read(
      query_url, data=body, content_type='application/octet-stream')
  if len(queries) != len(response):
    raise MappingError(
        'Got an incorrect number of responses from the server. Expected %d, '
        'but got %d' % (len(queries), len(response)))

  missing_files = [
      queries[i] for i, flag in enumerate(response) if flag == chr(0)
  ]
  logging.info('Queried %d files, %d cache hit',
               len(queries), len(queries) - len(missing_files))
  return missing_files


class FileSystem(object):
  """Fetches data from the file system.

  The common use case is an NFS/CIFS file server mounted locally and used to
  fetch files onto a local partition.
  """
  def __init__(self, base_path):
    self.base_path = base_path

  def retrieve(self, item, dest):
    source = os.path.join(self.base_path, item)
    if source == dest:
      logging.info('Source and destination are the same, no action required')
      return
    logging.debug('copy_file(%s, %s)', source, dest)
    shutil.copy(source, dest)

  def store(self, content, hash_key):
    raise NotImplementedError()


def get_storage_api(file_or_url, namespace):
  """Returns an object that implements .retrieve()."""
  if re.match(r'^https?://.+$', file_or_url):
    return IsolateServer(file_or_url, namespace)
  else:
    return FileSystem(file_or_url)


class RemoteOperation(object):
  """Priority based worker queue to operate on action items.

  It executes a function on the given task items. It is specialized to download
  files.

  When items have equal priority, it works in strict FIFO mode.
  """
  # Initial and maximum number of worker threads.
  INITIAL_WORKERS = 2
  MAX_WORKERS = 16
  # Priorities.
  LOW, MED, HIGH = (1<<8, 2<<8, 3<<8)
  INTERNAL_PRIORITY_BITS = (1<<8) - 1
  RETRIES = 5
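  # The low INTERNAL_PRIORITY_BITS of a task's priority are used to count
  # retries: each retry re-enqueues the item at priority + 1, so it sorts
  # slightly after fresh items of the same priority class.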

  def __init__(self, do_item):
    # Function to fetch a remote object or upload to a remote location.
    self._do_item = do_item
    # Contains tuple(priority, obj).
    self._done = Queue.PriorityQueue()
    self._pool = threading_utils.ThreadPool(
        self.INITIAL_WORKERS, self.MAX_WORKERS, 0, 'remote')

  def join(self):
    """Blocks until the queue is empty."""
    return self._pool.join()

  def close(self):
    """Terminates all worker threads."""
    self._pool.close()

  def add_item(self, priority, obj, dest, size):
    """Retrieves an object from the remote data store.

    The smaller |priority| gets fetched first.

    Thread-safe.
    """
    assert (priority & self.INTERNAL_PRIORITY_BITS) == 0
    return self._add_item(priority, obj, dest, size)

  def _add_item(self, priority, obj, dest, size):
    assert isinstance(obj, basestring), obj
    assert isinstance(dest, basestring), dest
    assert size is None or isinstance(size, int), size
    return self._pool.add_task(
        priority, self._task_executer, priority, obj, dest, size)

  def get_one_result(self):
    return self._pool.get_one_result()

  def _task_executer(self, priority, obj, dest, size):
    """Wraps self._do_item to trap and retry on IOError exceptions."""
    try:
      self._do_item(obj, dest)
      if size and not valid_file(dest, size):
        download_size = os.stat(dest).st_size
        os.remove(dest)
        raise IOError('File incorrect size after download of %s. Got %s and '
                      'expected %s' % (obj, download_size, size))
      # TODO(maruel): Technically, we'd want to have an output queue to be a
      # PriorityQueue.
      return obj
    except IOError as e:
      logging.debug('Caught IOError: %s', e)
      # Remove unfinished download.
      if os.path.exists(dest):
        os.remove(dest)
      # Retry a few times, lowering the priority.
      if (priority & self.INTERNAL_PRIORITY_BITS) < self.RETRIES:
        self._add_item(priority + 1, obj, dest, size)
        return
      raise


def compression_level(filename):
446 """Given a filename calculates the ideal compression level to use."""
447 file_ext = os.path.splitext(filename)[1].lower()
448 # TODO(csharp): Profile to find what compression level works best.
449 return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7


def read_and_compress(filepath, level):
453 """Reads a file and returns its content gzip compressed."""
  compressor = zlib.compressobj(level)
  compressed_data = cStringIO.StringIO()
  with open(filepath, 'rb') as f:
    while True:
      chunk = f.read(ZIPPED_FILE_CHUNK)
      if not chunk:
        break
      compressed_data.write(compressor.compress(chunk))
  compressed_data.write(compressor.flush(zlib.Z_FINISH))
  value = compressed_data.getvalue()
  compressed_data.close()
  return value


def zip_and_trigger_upload(infile, metadata, upload_function):
  # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
  # if not metadata['T']:
  compressed_data = read_and_compress(infile, compression_level(infile))
  priority = (
      RemoteOperation.HIGH if metadata.get('priority', '1') == '0'
      else RemoteOperation.MED)
  return upload_function(priority, compressed_data, metadata['h'], None)


def batch_files_for_check(infiles):
  """Splits list of files to check for existence on the server into batches.

  Each batch corresponds to a single 'exists?' query to the server.

  Yields:
    batches: list of batches, each batch is a list of files.
  """
  batch_count = 0
  batch_size_limit = ITEMS_PER_CONTAINS_QUERIES[0]
  next_queries = []
  items = ((k, v) for k, v in infiles.iteritems() if 's' in v)
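  # Iterate from the largest file to the smallest: bigger files are more likely
  # to have changed, so they land in the earliest (smallest) batches.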
  for relfile, metadata in sorted(items, key=lambda x: -x[1]['s']):
    next_queries.append((relfile, metadata))
    if len(next_queries) == batch_size_limit:
      yield next_queries
      next_queries = []
      batch_count += 1
      batch_size_limit = ITEMS_PER_CONTAINS_QUERIES[
          min(batch_count, len(ITEMS_PER_CONTAINS_QUERIES) - 1)]
  if next_queries:
    yield next_queries


def get_files_to_upload(contains_hash_url, infiles):
  """Yields files that are missing on the server."""
  with threading_utils.ThreadPool(1, 16, 0, prefix='get_files_to_upload') as tp:
    for files in batch_files_for_check(infiles):
      tp.add_task(0, check_files_exist_on_server, contains_hash_url, files)
    for missing_file in itertools.chain.from_iterable(tp.iter_results()):
      yield missing_file


def upload_sha1_tree(base_url, indir, infiles, namespace):
  """Uploads the given tree to the given url.

  Arguments:
    base_url: The base url; it is assumed that |base_url|/has/ can be used to
        query if an element was already uploaded, and |base_url|/store/
        can be used to upload a new element.
    indir: Root directory the infiles are based in.
    infiles: dict of files to upload from |indir| to |base_url|.
    namespace: The namespace to use on the server.
  """
  logging.info('upload tree(base_url=%s, indir=%s, files=%d)' %
               (base_url, indir, len(infiles)))

  # Create a pool of workers to zip and upload any files missing from
  # the server.
  num_threads = threading_utils.num_processors()
  zipping_pool = threading_utils.ThreadPool(min(2, num_threads),
                                            num_threads, 0, 'zip')
  remote = IsolateServer(base_url, namespace)
  remote_uploader = RemoteOperation(remote.store)
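  # Compression happens on the local, CPU-bound zipping pool; each compressed
  # payload is then handed to the RemoteOperation pool, which performs the
  # actual uploads via remote.store.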

  # Starts the zip and upload process for files that are missing
  # from the server.
  contains_hash_url = '%scontains/%s?token=%s' % (
      remote.content_url, namespace, remote.token)
  uploaded = []
  for relfile, metadata in get_files_to_upload(contains_hash_url, infiles):
    infile = os.path.join(indir, relfile)
    zipping_pool.add_task(0, zip_and_trigger_upload, infile, metadata,
                          remote_uploader.add_item)
    uploaded.append((relfile, metadata))

  logging.info('Waiting for all files to finish zipping')
  zipping_pool.join()
  zipping_pool.close()
  logging.info('All files zipped.')

  logging.info('Waiting for all files to finish uploading')
  # Will raise if any exception occurred.
  remote_uploader.join()
  remote_uploader.close()
  logging.info('All files are uploaded')

  total = len(infiles)
  total_size = sum(metadata.get('s', 0) for metadata in infiles.itervalues())
  logging.info(
      'Total: %6d, %9.1fkb',
      total,
      sum(m.get('s', 0) for m in infiles.itervalues()) / 1024.)
  cache_hit = set(infiles.iterkeys()) - set(x[0] for x in uploaded)
  cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
  logging.info(
      'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_hit),
      cache_hit_size / 1024.,
      len(cache_hit) * 100. / total,
      cache_hit_size * 100. / total_size if total_size else 0)
  cache_miss = uploaded
  cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
  logging.info(
      'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_miss),
      cache_miss_size / 1024.,
      len(cache_miss) * 100. / total,
      cache_miss_size * 100. / total_size if total_size else 0)
  return 0


@subcommand.usage('<file1..fileN> or - to read from stdin')
def CMDarchive(parser, args):
  """Archives data to the server."""
  options, files = parser.parse_args(args)

  if files == ['-']:
    files = sys.stdin.readlines()

  if not files:
    parser.error('Nothing to upload')
  if not options.isolate_server:
    parser.error('Nowhere to send. Please specify --isolate-server')

  # Load the necessary metadata. This is going to be rewritten eventually to be
  # more efficient.
  infiles = dict(
      (
        f,
        {
          's': os.stat(f).st_size,
          'h': sha1_file(f),
        }
      )
      for f in files)

  with tools.Profiler('Archive'):
    ret = upload_sha1_tree(
        base_url=options.isolate_server,
        indir=os.getcwd(),
        infiles=infiles,
        namespace=options.namespace)
  if not ret:
    print '\n'.join('%s %s' % (infiles[f]['h'], f) for f in sorted(infiles))
  return ret


def CMDdownload(parser, args):
  """Download data from the server.

  It can download individual files.
  """
  parser.add_option(
      '-f', '--file', metavar='HASH DEST', default=[], action='append', nargs=2,
      help='hash and destination of a file, can be used multiple times')
  parser.add_option(
      '-t', '--target', metavar='DIR', default=os.getcwd(),
      help='destination directory')
  options, args = parser.parse_args(args)
  if args:
    parser.error('Unsupported arguments: %s' % args)
  if not options.file:
    parser.error('At least one --file is required.')

  options.target = os.path.abspath(options.target)
  remote = IsolateServer(options.isolate_server, options.namespace)
  for h, dest in options.file:
    logging.info('%s: %s', h, dest)
    remote.retrieve(h, os.path.join(options.target, dest))
  return 0


class OptionParserIsolateServer(tools.OptionParserWithLogging):
  def __init__(self, **kwargs):
    tools.OptionParserWithLogging.__init__(self, **kwargs)
    self.add_option(
        '-I', '--isolate-server',
        default=ISOLATE_SERVER,
        metavar='URL',
        help='Isolate server where data is stored. default: %default')
    self.add_option(
        '--namespace', default='default-gzip',
        help='The namespace to use on the server, default: %default')

  def parse_args(self, *args, **kwargs):
    options, args = tools.OptionParserWithLogging.parse_args(
        self, *args, **kwargs)
    options.isolate_server = options.isolate_server.rstrip('/')
    if not options.isolate_server:
      self.error('--isolate-server is required.')
    return options, args


def main(args):
  dispatcher = subcommand.CommandDispatcher(__name__)
  try:
    return dispatcher.execute(
        OptionParserIsolateServer(version=__version__), args)
  except (ConfigError, MappingError) as e:
    sys.stderr.write('\nError: ')
    sys.stderr.write(str(e))
    sys.stderr.write('\n')
    return 1


if __name__ == '__main__':
  fix_encoding.fix_encoding()
  tools.disable_buffering()
  colorama.init()
  sys.exit(main(sys.argv[1:]))