Blame - isolateserver.py - chromium.googlesource.com/infra/luci/client-py

blob: bb54574736be2ddc61c66e0c57cff3f3312786f4 [file] [log] [blame]

maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	1	#!/usr/bin/env python
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	2	# Copyright 2013 The Chromium Authors. All rights reserved.
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	"""Archives a set of files to a server."""
				7
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	8	__version__ = '0.2'
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	9
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	10	import binascii
vadimsh@chromium.org	53f8d5a	2013-06-19 13:03:55 +0000	[diff] [blame]	11	import cStringIO
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	12	import hashlib
vadimsh@chromium.org	53f8d5a	2013-06-19 13:03:55 +0000	[diff] [blame]	13	import itertools
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	14	import logging
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	15	import os
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	16	import random
				17	import re
				18	import shutil
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	19	import sys
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	20	import threading
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	21	import time
maruel@chromium.org	e82112e	2013-04-24 14:41:55 +0000	[diff] [blame]	22	import urllib
csharp@chromium.org	59c7bcf	2012-11-21 21:13:18 +0000	[diff] [blame]	23	import zlib
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	24
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	25	from third_party import colorama
				26	from third_party.depot_tools import fix_encoding
				27	from third_party.depot_tools import subcommand
				28
vadimsh@chromium.org	6b70621	2013-08-28 15:03:46 +0000	[diff] [blame]	29	from utils import net
vadimsh@chromium.org	b074b16	2013-08-22 17:55:46 +0000	[diff] [blame]	30	from utils import threading_utils
vadimsh@chromium.org	a432647	2013-08-24 02:05:41 +0000	[diff] [blame]	31	from utils import tools
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	32
				33
# Default server.
# TODO(maruel): Chromium-specific.
ISOLATE_SERVER = 'https://isolateserver-dev.appspot.com/'


# The minimum size of files to upload directly to the blobstore. Content whose
# length is strictly greater than this value goes through a generated
# blobstore upload url; anything else is POSTed to the 'store' handler.
MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 1024

# The number of files to check on the isolate server per /contains query.
# All files are sorted by likelihood of a change in the file content
# (currently file size is used to estimate this: the larger the file, the
# larger the possibility it has changed). Then the first
# ITEMS_PER_CONTAINS_QUERIES[0] files are taken and sent to '/contains', then
# the next ITEMS_PER_CONTAINS_QUERIES[1], and so on; the last entry is reused
# for every subsequent batch. The numbers here are a trade-off: the more items
# per request, the lower the effect of HTTP round trip latency and TCP-level
# chattiness. On the other hand, larger values cause longer lookups,
# increasing the initial latency to start uploading, which is especially an
# issue for large files. This value is optimized for the "few thousand files
# to look up with a minimal number of large files missing" case.
ITEMS_PER_CONTAINS_QUERIES = [20, 20, 50, 50, 50, 100]

# A list of already compressed extension types that should not receive any
# compression before being uploaded. Entries are lower case and do not include
# the leading dot.
ALREADY_COMPRESSED_TYPES = [
  '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
  'wav', 'zip'
]


# The file size to be used when we don't know the correct file size,
# generally used for .isolated files. Size checks are skipped for this value.
UNKNOWN_FILE_SIZE = None


# The size in bytes of each chunk to read when downloading and unzipping files.
ZIPPED_FILE_CHUNK = 16 * 1024


# Read timeout in seconds for downloads from isolate storage. If there's no
# response from the server within this timeout, the whole download is aborted.
DOWNLOAD_READ_TIMEOUT = 60
				75
				76
class ConfigError(ValueError):
  """Raised when a .isolated file cannot be loaded."""
				80
				81
				82	class MappingError(OSError):
				83	"""Failed to recreate the tree."""
				84	pass
				85
				86
def randomness():
  """Returns the current time as a string, used as a low-entropy seed.

  The value seeds the MIME boundary; it lives in its own function so that unit
  tests can replace it with something deterministic.
  """
  now = time.time()
  return str(now)
				93
				94
def encode_multipart_formdata(fields, files,
                              mime_mapper=lambda _: 'application/octet-stream'):
  """Encodes a Multipart form data object.

  Args:
    fields: a sequence (name, value) elements for
      regular form fields.
    files: a sequence of (name, filename, value) elements for data to be
      uploaded as files.
    mime_mapper: function to return the mime type from the filename.
  Returns:
    A (content_type, body) tuple:
    content_type: for httplib.HTTP instance
    body: for httplib.HTTP instance
  """
  def _to_utf8(text):
    # Only unicode objects are encoded; byte strings pass through untouched.
    if isinstance(text, unicode):
      return text.encode('utf-8')
    return text

  boundary = hashlib.md5(randomness()).hexdigest()
  body_list = []
  for (key, value) in fields:
    # Encode each item into its own variable. Assigning the encoded key to
    # |value| would silently replace the field's payload with its name.
    key = _to_utf8(key)
    value = _to_utf8(value)
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"' % key)
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  for (key, filename, value) in files:
    key = _to_utf8(key)
    filename = _to_utf8(filename)
    value = _to_utf8(value)
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
    body_list.append('Content-Type: %s' % mime_mapper(filename))
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  if body_list:
    # The second to last entry is the final boundary line; the trailing '--'
    # turns it into the closing delimiter.
    body_list[-2] += '--'
  body = '\r\n'.join(body_list)
  content_type = 'multipart/form-data; boundary=%s' % boundary
  return content_type, body
				142
				143
def sha1_file(filepath):
  """Returns the hex SHA-1 digest of a file, hashing it 1mb at a time.

  The file is streamed so that it never has to fit in memory all at once.
  """
  hasher = hashlib.sha1()
  with open(filepath, 'rb') as stream:
    # iter() with a sentinel stops on the first empty read, i.e. at EOF.
    for block in iter(lambda: stream.read(1024 * 1024), b''):
      hasher.update(block)
  return hasher.hexdigest()
				155
				156
def file_write(filepath, content_generator):
  """Writes file content as generated by content_generator.

  Missing parent directories of |filepath| are created first.

  Meant to be mocked out in unit tests.

  Args:
    filepath: path of the file to create or overwrite.
    content_generator: iterable yielding the chunks to write, in order.
  """
  filedir = os.path.dirname(filepath)
  # dirname is '' for a bare file name; os.makedirs('') would raise, and the
  # current directory necessarily exists already.
  if filedir and not os.path.isdir(filedir):
    os.makedirs(filedir)
  with open(filepath, 'wb') as f:
    for d in content_generator:
      f.write(d)
				168
				169
def is_valid_file(filepath, size):
  """Returns True if the file at |filepath| looks valid.

  The only check done today is comparing the on-disk size against |size|. When
  |size| is UNKNOWN_FILE_SIZE the file is accepted without being stat'ed.
  """
  if size == UNKNOWN_FILE_SIZE:
    return True
  actual_size = os.stat(filepath).st_size
  if size == actual_size:
    return True
  logging.warning(
      'Found invalid item %s; %d != %d',
      os.path.basename(filepath), actual_size, size)
  return False
				184
				185
def try_remove(filepath):
  """Deletes |filepath|, silently ignoring any OSError (e.g. file missing)."""
  try:
    os.remove(filepath)
  except OSError:
    # Best effort only: a failed removal is deliberately not reported.
    return
				192
				193
def url_read(url, **kwargs):
  """Reads |url| through net.url_read() and returns the response.

  All keyword arguments are forwarded to net.url_read().

  Raises:
    MappingError: net.url_read() returned None, which is treated as the server
        being down.
  """
  response = net.url_read(url, **kwargs)
  if response is not None:
    return response
  raise MappingError('Unable to connect to server %s' % url)
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	201
				202
def upload_hash_content_to_blobstore(
    generate_upload_url, data, hash_key, content):
  """Uploads |content| straight to the blobstore through a generated url.

  Arguments:
    generate_upload_url: The url to get the new upload url from.
    data: extra POST data.
    hash_key: sha1 of the uncompressed version of content.
    content: The contents to upload. Must fit in memory for now.

  Returns:
    The response of the upload request.

  Raises:
    MappingError: no upload url could be generated, or every attempt failed.
  """
  logging.debug('Generating url to directly upload file to blobstore')
  assert isinstance(hash_key, str), hash_key
  assert isinstance(content, str), (hash_key, content)
  # TODO(maruel): Support large files. This would require streaming support.
  uploaded_files = [('content', hash_key, content)]
  content_type, body = encode_multipart_formdata(data, uploaded_files)
  failure = 'Unable to connect to server %s' % generate_upload_url
  for _ in net.retry_loop(max_attempts=net.URL_OPEN_MAX_ATTEMPTS):
    # HTTP 50x is retried for this request.
    upload_url = net.url_read(generate_upload_url, data=data)
    if not upload_url:
      raise MappingError(failure)

    # HTTP 50x is not retried for the upload itself: uploading "consumes" the
    # upload url, so a fresh one is generated on every pass of the loop.
    response = net.url_read(
        upload_url, data=body, content_type=content_type, retry_50x=False)
    if response is not None:
      return response
  raise MappingError(failure)
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	235
				236
class IsolateServer(object):
  """Client class to download or upload to Isolate Server.

  All requests go to '<base_url>/content/...' and are scoped to |namespace|.
  """
  def __init__(self, base_url, namespace):
    assert base_url.startswith('http'), base_url
    # Normalized so that handler names can be appended directly.
    self.content_url = base_url.rstrip('/') + '/content/'
    self.namespace = namespace
    # Lazily fetched by the |token| property; |_lock| guards that fetch.
    self._token = None
    self._lock = threading.Lock()

  @property
  def token(self):
    """Returns the url-quoted token, fetching it on first access.

    The token is read from the 'get_token' handler once and then cached. The
    lock ensures concurrent first accesses result in a single request.
    """
    # TODO(maruel): Make this request much earlier asynchronously while the
    # files are being enumerated.
    with self._lock:
      if not self._token:
        self._token = urllib.quote(url_read(self.content_url + 'get_token'))
      return self._token

  def fetch(self, item, size):
    """Fetches an object and yields its content.

    This is a generator: the compressed stream is read in ZIPPED_FILE_CHUNK
    pieces and each piece is yielded after zlib decompression.

    Arguments:
      item: name of the object to retrieve from the server.
      size: expected decompressed size, or UNKNOWN_FILE_SIZE to skip the
          size check.

    Raises:
      IOError: the connection could not be opened, the stream is corrupted,
          data was left over after decompression or the size doesn't match.
    """
    zipped_url = '%sretrieve/%s/%s' % (self.content_url, self.namespace, item)
    logging.debug('download_file(%s)', zipped_url)

    # Because the app engine DB is only eventually consistent, retry 404 errors
    # because the file might just not be visible yet (even though it has been
    # uploaded).
    connection = net.url_open(
        zipped_url, retry_404=True, read_timeout=DOWNLOAD_READ_TIMEOUT)
    if not connection:
      raise IOError('Unable to open connection to %s' % zipped_url)

    decompressor = zlib.decompressobj()
    try:
      # Both counters are only used for validation and error reporting.
      compressed_size = 0
      decompressed_size = 0
      while True:
        chunk = connection.read(ZIPPED_FILE_CHUNK)
        if not chunk:
          break
        compressed_size += len(chunk)
        decompressed = decompressor.decompress(chunk)
        decompressed_size += len(decompressed)
        yield decompressed

      # Ensure that all the data was properly decompressed.
      uncompressed_data = decompressor.flush()
      if uncompressed_data:
        raise IOError('Decompression failed')
      if size != UNKNOWN_FILE_SIZE and decompressed_size != size:
        raise IOError('File incorrect size after download of %s. Got %s and '
                      'expected %s' % (item, decompressed_size, size))
    except zlib.error as e:
      msg = 'Corrupted zlib for item %s. Processed %d of %s bytes.\n%s' % (
          item, compressed_size, connection.content_length, e)
      logging.error(msg)

      # Testing seems to show that if a few machines are trying to download
      # the same blob, they can cause each other to fail. So if we hit a zip
      # error, this is the most likely cause (it only downloads some of the
      # data). Randomly sleep for between 5 and 25 seconds to try and spread
      # out the downloads.
      sleep_duration = (random.random() * 20) + 5
      time.sleep(sleep_duration)
      # Converted to IOError, which is what retrieve() catches.
      raise IOError(msg)

  def retrieve(self, item, dest, size):
    """Fetches an object and save its content to |dest|.

    On IOError the partially written |dest| is removed, the failure is logged
    and the exception is re-raised.
    """
    try:
      file_write(dest, self.fetch(item, size))
    except IOError as e:
      # Remove unfinished download.
      try_remove(dest)
      logging.error('Failed to download %s at %s.\n%s', item, dest, e)
      raise

  def store(self, content, hash_key, _size):
    """Uploads |content| to the server under |hash_key|.

    Content longer than MIN_SIZE_FOR_DIRECT_BLOBSTORE is sent through a
    generated blobstore upload url; anything else is POSTed to the 'store'
    handler. |_size| is accepted but not used.

    Returns:
      The response of the upload request.
    """
    # TODO(maruel): Detect failures.
    hash_key = str(hash_key)
    if len(content) > MIN_SIZE_FOR_DIRECT_BLOBSTORE:
      url = '%sgenerate_blobstore_url/%s/%s' % (
          self.content_url, self.namespace, hash_key)
      # token is guaranteed to be already quoted but it is unnecessary here, and
      # only here.
      data = [('token', urllib.unquote(self.token))]
      return upload_hash_content_to_blobstore(url, data, hash_key, content)
    else:
      url = '%sstore/%s/%s?token=%s' % (
          self.content_url, self.namespace, hash_key, self.token)
      return url_read(
          url, data=content, content_type='application/octet-stream')
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	327
				328
def check_files_exist_on_server(query_url, queries):
  """Asks the server which files of this batch it doesn't have yet.

  The request body is the concatenation of the binary form of each item's 'h'
  digest. The response holds one flag per queried item, in the same order.

  Arguments:
    query_url: url of the server handler to query.
    queries: list of (file, metadata) items to potentially upload; each
        metadata dict holds the hex digest under 'h'.
  Returns:
    missing_files: list of files that are missing on the server.
  Raises:
    MappingError: the number of flags returned differs from the number of
        items queried.
  """
  # TODO(maruel): Move inside IsolateServer.
  logging.info('Checking existence of %d files...', len(queries))
  digests = [binascii.unhexlify(meta_data['h']) for _, meta_data in queries]
  body = ''.join(digests)
  assert (len(body) % 20) == 0, repr(body)

  response = url_read(
      query_url, data=body, content_type='application/octet-stream')
  if len(queries) != len(response):
    raise MappingError(
        'Got an incorrect number of responses from the server. Expected %d, '
        'but got %d' % (len(queries), len(response)))

  # A NUL flag marks an item the server doesn't have.
  absent = chr(0)
  missing_files = [
    query for query, flag in zip(queries, response) if flag == absent
  ]
  logging.info('Queried %d files, %d cache hit',
      len(queries), len(queries) - len(missing_files))
  return missing_files
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	356
				357
class FileSystem(object):
  """Fetches data from the file system.

  The common use case is a NFS/CIFS file server that is mounted locally that is
  used to fetch the file on a local partition.
  """
  def __init__(self, base_path):
    # Directory that every item name is resolved against.
    self.base_path = base_path

  def fetch(self, item, size):
    """Returns the content of |item| as a list holding a single string.

    Raises:
      IOError: |size| is known and doesn't match the size of the file.
    """
    source = os.path.join(self.base_path, item)
    if size != UNKNOWN_FILE_SIZE and not is_valid_file(source, size):
      raise IOError('Invalid file %s' % item)
    with open(source, 'rb') as f:
      return [f.read()]

  def retrieve(self, item, dest, size):
    """Copies |item| to |dest|.

    Nothing is done when the source path and |dest| are the same.

    Raises:
      IOError: |size| is known and doesn't match the size of the file.
    """
    source = os.path.join(self.base_path, item)
    if source == dest:
      logging.info('Source and destination are the same, no action required')
      return
    logging.debug('copy_file(%s, %s)', source, dest)
    if size != UNKNOWN_FILE_SIZE and not is_valid_file(source, size):
      raise IOError(
          'Invalid file %s, %d != %d' % (item, os.stat(source).st_size, size))
    shutil.copy(source, dest)

  def store(self, content, hash_key, _size=None):
    """Not supported for file system storage.

    |_size| is accepted, and ignored, so the signature is compatible with
    IsolateServer.store() and callers passing three arguments get the intended
    NotImplementedError instead of a TypeError.
    """
    raise NotImplementedError()
				387
				388
				389	def get_storage_api(file_or_url, namespace):
				390	"""Returns an object that implements .retrieve()."""
				391	if re.match(r'^https?://.+$', file_or_url):
				392	return IsolateServer(file_or_url, namespace)
				393	else:
				394	return FileSystem(file_or_url)
				395
				396
class RemoteOperation(object):
  """Priority based worker queue to operate on action items.

  Each queued item is handed to the function given at construction time. It is
  specialized to download files.

  When the priority of items is equals, works in strict FIFO mode.
  """
  # Initial and maximum number of worker threads.
  INITIAL_WORKERS = 2
  MAX_WORKERS = 16
  # Priorities. The low 8 bits are reserved to count retries.
  LOW = 1 << 8
  MED = 2 << 8
  HIGH = 3 << 8
  INTERNAL_PRIORITY_BITS = (1 << 8) - 1
  RETRIES = 5

  def __init__(self, do_item):
    # Function to fetch a remote object or upload to a remote location.
    self._do_item = do_item
    self._pool = threading_utils.ThreadPool(
        self.INITIAL_WORKERS, self.MAX_WORKERS, 0, 'remote')

  def join(self):
    """Blocks until the queue is empty."""
    return self._pool.join()

  def close(self):
    """Terminates all worker threads."""
    self._pool.close()

  def add_item(self, priority, obj, dest, size):
    """Retrieves an object from the remote data store.

    The smaller |priority| gets fetched first.

    Thread-safe.
    """
    # Callers must leave the retry counter bits cleared.
    assert (priority & self.INTERNAL_PRIORITY_BITS) == 0
    return self._add_item(priority, obj, dest, size)

  def get_one_result(self):
    """Returns one result from the underlying thread pool."""
    return self._pool.get_one_result()

  def _add_item(self, priority, obj, dest, size):
    """Validates the arguments and queues the item on the thread pool."""
    assert isinstance(obj, basestring), obj
    assert isinstance(dest, basestring), dest
    assert size is None or isinstance(size, int), size
    return self._pool.add_task(
        priority, self._task_executer, priority, obj, dest, size)

  def _task_executer(self, priority, obj, dest, size):
    """Runs self._do_item, re-queuing the item when it raises IOError.

    Returns |obj| on success and None when the item was queued again. The
    IOError is propagated once the retries are exhausted.
    """
    try:
      self._do_item(obj, dest, size)
    except IOError:
      attempts = priority & self.INTERNAL_PRIORITY_BITS
      if attempts >= self.RETRIES:
        raise
      # Retry a few times, lowering the priority.
      self._add_item(priority + 1, obj, dest, size)
      return None
    # TODO(maruel): Technically, we'd want to have an output queue to be a
    # PriorityQueue.
    return obj
				460
				461
def compression_level(filename):
  """Given a filename calculates the ideal compression level to use.

  Returns:
    0 (no compression) when the extension is listed in
    ALREADY_COMPRESSED_TYPES, 7 otherwise.
  """
  file_ext = os.path.splitext(filename)[1].lower()
  # os.path.splitext() keeps the leading dot ('.zip') while the entries in
  # ALREADY_COMPRESSED_TYPES have none ('zip'); drop it before the lookup or
  # nothing ever matches. A missing extension stays '' and is compressed.
  # TODO(csharp): Profile to find what compression level works best.
  return 0 if file_ext[1:] in ALREADY_COMPRESSED_TYPES else 7
				467
				468
def read_and_compress(filepath, level):
  """Reads a file and returns its content zlib compressed at |level|.

  The file is read ZIPPED_FILE_CHUNK bytes at a time; the compressed result is
  returned as a single string.
  """
  compressor = zlib.compressobj(level)
  pieces = []
  with open(filepath, 'rb') as source:
    # iter() with a sentinel stops on the first empty read, i.e. at EOF.
    for chunk in iter(lambda: source.read(ZIPPED_FILE_CHUNK), b''):
      pieces.append(compressor.compress(chunk))
  # Emit whatever the compressor still buffers and terminate the stream.
  pieces.append(compressor.flush(zlib.Z_FINISH))
  return b''.join(pieces)
csharp@chromium.org	59c7bcf	2012-11-21 21:13:18 +0000	[diff] [blame]	483
maruel@chromium.org	cb3c3d5	2013-03-14 18:55:30 +0000	[diff] [blame]	484
				485	def zip_and_trigger_upload(infile, metadata, upload_function):
				486	# TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
				487	# if not metadata['T']:
				488	compressed_data = read_and_compress(infile, compression_level(infile))
				489	priority = (
maruel@chromium.org	dedbf49	2013-09-12 20:42:11 +0000	[diff] [blame]	490	RemoteOperation.HIGH if metadata.get('priority', '1') == '0'
				491	else RemoteOperation.MED)
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame^]	492	return upload_function(
				493	priority, compressed_data, metadata['h'], UNKNOWN_FILE_SIZE)
csharp@chromium.org	59c7bcf	2012-11-21 21:13:18 +0000	[diff] [blame]	494
				495
def batch_files_for_check(infiles):
  """Groups the files to look up on the server into batches.

  Each batch corresponds to a single 'exists?' query to the server. Only
  entries whose metadata has a size ('s') are considered, largest first. The
  batch sizes follow ITEMS_PER_CONTAINS_QUERIES, its last value being reused
  once the list is exhausted.

  Yields:
    batches: list of batches, each batch is a list of files.
  """
  sized = [(k, v) for k, v in infiles.iteritems() if 's' in v]
  sized.sort(key=lambda entry: -entry[1]['s'])

  last_index = len(ITEMS_PER_CONTAINS_QUERIES) - 1
  batch_index = 0
  limit = ITEMS_PER_CONTAINS_QUERIES[0]
  pending = []
  for relfile, metadata in sized:
    pending.append((relfile, metadata))
    if len(pending) != limit:
      continue
    yield pending
    pending = []
    batch_index += 1
    limit = ITEMS_PER_CONTAINS_QUERIES[min(batch_index, last_index)]
  # Flush the last, partially filled batch.
  if pending:
    yield pending
				518
				519
				520	def get_files_to_upload(contains_hash_url, infiles):
				521	"""Yields files that are missing on the server."""
vadimsh@chromium.org	b074b16	2013-08-22 17:55:46 +0000	[diff] [blame]	522	with threading_utils.ThreadPool(1, 16, 0, prefix='get_files_to_upload') as tp:
vadimsh@chromium.org	53f8d5a	2013-06-19 13:03:55 +0000	[diff] [blame]	523	for files in batch_files_for_check(infiles):
vadimsh@chromium.org	b074b16	2013-08-22 17:55:46 +0000	[diff] [blame]	524	tp.add_task(0, check_files_exist_on_server, contains_hash_url, files)
				525	for missing_file in itertools.chain.from_iterable(tp.iter_results()):
vadimsh@chromium.org	53f8d5a	2013-06-19 13:03:55 +0000	[diff] [blame]	526	yield missing_file
maruel@chromium.org	35fc0c8	2013-01-17 15:14:14 +0000	[diff] [blame]	527
				528
def upload_sha1_tree(base_url, indir, infiles, namespace):
  """Uploads the given tree to the given url.

  The server is asked which entries of |infiles| it does not have yet. Each
  missing file is queued on a thread pool running zip_and_trigger_upload,
  which is handed remote_uploader.add_item as its callback. Once zipping and
  uploading are done, cache hit/miss statistics are logged.

  Arguments:
    base_url:  The base url of the server; it is passed to IsolateServer,
               whose |content_url| and |token| are used to build the
               'contains' query url.
    indir:     Root directory the infiles are based in.
    infiles:   dict of files to upload from |indir| to |base_url|. Keys are
               paths relative to |indir|, values are metadata dicts; the
               optional 's' entry is used as the size in the statistics.
    namespace: The namespace to use on the server.

  Returns:
    0 once everything is uploaded. remote_uploader.join() is expected to
    raise if any upload failed.
  """
  logging.info(
      'upload tree(base_url=%s, indir=%s, files=%d)',
      base_url, indir, len(infiles))

  # Create a pool of workers to zip and upload any files missing from
  # the server.
  num_threads = threading_utils.num_processors()
  zipping_pool = threading_utils.ThreadPool(
      min(2, num_threads), num_threads, 0, 'zip')
  remote = IsolateServer(base_url, namespace)
  remote_uploader = RemoteOperation(remote.store)

  # Starts the zip and upload process for files that are missing
  # from the server.
  contains_hash_url = '%scontains/%s?token=%s' % (
      remote.content_url, namespace, remote.token)
  uploaded = []
  for relfile, metadata in get_files_to_upload(contains_hash_url, infiles):
    infile = os.path.join(indir, relfile)
    zipping_pool.add_task(0, zip_and_trigger_upload, infile, metadata,
                          remote_uploader.add_item)
    uploaded.append((relfile, metadata))

  logging.info('Waiting for all files to finish zipping')
  zipping_pool.join()
  zipping_pool.close()
  logging.info('All files zipped.')

  logging.info('Waiting for all files to finish uploading')
  # Will raise if any exception occurred.
  remote_uploader.join()
  remote_uploader.close()
  logging.info('All files are uploaded')

  def percent(part, whole):
    # An empty |infiles| (or files that all lack a size) gives a zero
    # denominator; report 0% instead of raising ZeroDivisionError.
    return part * 100. / whole if whole else 0

  total = len(infiles)
  total_size = sum(m.get('s', 0) for m in infiles.itervalues())
  logging.info('Total: %6d, %9.1fkb', total, total_size / 1024.)

  # Anything that was not queued for upload was already on the server.
  cache_hit = set(infiles.iterkeys()) - set(x[0] for x in uploaded)
  cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
  logging.info(
      'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_hit),
      cache_hit_size / 1024.,
      percent(len(cache_hit), total),
      percent(cache_hit_size, total_size))

  cache_miss = uploaded
  cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
  logging.info(
      'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_miss),
      cache_miss_size / 1024.,
      percent(len(cache_miss), total),
      percent(cache_miss_size, total_size))
  return 0
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	596
				597
@subcommand.usage('<file1..fileN> or - to read from stdin')
def CMDarchive(parser, args):
  """Archives data to the server."""
  # |parser| must provide the isolate_server and namespace options. |args| is
  # the list of file paths to archive, or ['-'] to read one path per line from
  # stdin. Returns the value returned by upload_sha1_tree(); when that value
  # is falsy, a '<hash> <path>' line is printed for every archived file.
  options, files = parser.parse_args(args)

  if files == ['-']:
    # Lines read from stdin still carry their line terminator, which would
    # make os.stat() look for a file literally named 'path\n'. Strip it and
    # ignore blank lines.
    stripped = (line.rstrip('\r\n') for line in sys.stdin)
    files = [f for f in stripped if f]

  if not files:
    parser.error('Nothing to upload')
  if not options.isolate_server:
    parser.error('Nowhere to send. Please specify --isolate-server')

  # Load the necessary metadata. This is going to be rewritten eventually to be
  # more efficient.
  infiles = dict(
      (
        f,
        {
          's': os.stat(f).st_size,
          'h': sha1_file(f),
        }
      )
      for f in files)

  with tools.Profiler('Archive'):
    ret = upload_sha1_tree(
        base_url=options.isolate_server,
        indir=os.getcwd(),
        infiles=infiles,
        namespace=options.namespace)
  if not ret:
    # Single-argument parenthesized form behaves the same with the print
    # statement and the print function.
    print('\n'.join('%s %s' % (infiles[f]['h'], f) for f in sorted(infiles)))
  return ret
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	632
				633
				634	def CMDdownload(parser, args):
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	635	"""Download data from the server.
				636
				637	It can download individual files.
				638	"""
				639	parser.add_option(
				640	'-f', '--file', metavar='HASH DEST', default=[], action='append', nargs=2,
				641	help='hash and destination of a file, can be used multiple times')
				642	parser.add_option(
				643	'-t', '--target', metavar='DIR', default=os.getcwd(),
				644	help='destination directory')
				645	options, args = parser.parse_args(args)
				646	if args:
				647	parser.error('Unsupported arguments: %s' % args)
				648	if not options.file:
				649	parser.error('Use one of --file is required.')
				650
				651	options.target = os.path.abspath(options.target)
				652	remote = IsolateServer(options.isolate_server, options.namespace)
				653	for h, dest in options.file:
				654	logging.info('%s: %s', h, dest)
maruel@chromium.org	e45728d	2013-09-16 23:23:22 +0000	[diff] [blame^]	655	remote.retrieve(h, os.path.join(options.target, dest), UNKNOWN_FILE_SIZE)
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	656	return 0
				657
				658
				659	class OptionParserIsolateServer(tools.OptionParserWithLogging):
				660	def __init__(self, **kwargs):
				661	tools.OptionParserWithLogging.__init__(self, **kwargs)
				662	self.add_option(
				663	'-I', '--isolate-server',
				664	default=ISOLATE_SERVER,
				665	metavar='URL',
				666	help='Isolate server where data is stored. default: %default')
				667	self.add_option(
				668	'--namespace', default='default-gzip',
maruel@chromium.org	b7e79a2	2013-09-13 01:24:56 +0000	[diff] [blame]	669	help='The namespace to use on the server, default: %default')
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	670
				671	def parse_args(self, args, *kwargs):
				672	options, args = tools.OptionParserWithLogging.parse_args(
				673	self, args, *kwargs)
				674	options.isolate_server = options.isolate_server.rstrip('/')
				675	if not options.isolate_server:
				676	self.error('--isolate-server is required.')
				677	return options, args
				678
				679
				680	def main(args):
				681	dispatcher = subcommand.CommandDispatcher(__name__)
				682	try:
				683	return dispatcher.execute(
				684	OptionParserIsolateServer(version=__version__), args)
maruel@chromium.org	dedbf49	2013-09-12 20:42:11 +0000	[diff] [blame]	685	except (ConfigError, MappingError) as e:
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	686	sys.stderr.write('\nError: ')
				687	sys.stderr.write(str(e))
				688	sys.stderr.write('\n')
				689	return 1
maruel@chromium.org	c6f9006	2012-11-07 18:32:22 +0000	[diff] [blame]	690
				691
				692	if __name__ == '__main__':
maruel@chromium.org	fb78d43	2013-08-28 21:22:40 +0000	[diff] [blame]	693	fix_encoding.fix_encoding()
				694	tools.disable_buffering()
				695	colorama.init()
maruel@chromium.org	cb3c3d5	2013-03-14 18:55:30 +0000	[diff] [blame]	696	sys.exit(main(sys.argv[1:]))