blob: bb54574736be2ddc61c66e0c57cff3f3312786f4 [file] [log] [blame]
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00001#!/usr/bin/env python
maruel@chromium.orgfb78d432013-08-28 21:22:40 +00002# Copyright 2013 The Chromium Authors. All rights reserved.
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00003# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Archives a set of files to a server."""
7
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +00008__version__ = '0.2'
maruel@chromium.orgfb78d432013-08-28 21:22:40 +00009
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000010import binascii
vadimsh@chromium.org53f8d5a2013-06-19 13:03:55 +000011import cStringIO
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000012import hashlib
vadimsh@chromium.org53f8d5a2013-06-19 13:03:55 +000013import itertools
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000014import logging
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000015import os
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +000016import random
17import re
18import shutil
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000019import sys
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +000020import threading
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000021import time
maruel@chromium.orge82112e2013-04-24 14:41:55 +000022import urllib
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +000023import zlib
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000024
maruel@chromium.orgfb78d432013-08-28 21:22:40 +000025from third_party import colorama
26from third_party.depot_tools import fix_encoding
27from third_party.depot_tools import subcommand
28
vadimsh@chromium.org6b706212013-08-28 15:03:46 +000029from utils import net
vadimsh@chromium.orgb074b162013-08-22 17:55:46 +000030from utils import threading_utils
vadimsh@chromium.orga4326472013-08-24 02:05:41 +000031from utils import tools
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000032
33
# Default server.
# TODO(maruel): Chromium-specific.
ISOLATE_SERVER = 'https://isolateserver-dev.appspot.com/'


# The minimum size of files to upload directly to the blobstore.
MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 1024

# The number of files to check on the isolate server per /contains query.
# All files are sorted by likelihood of a change in the file content
# (currently file size is used to estimate this: the larger the file, the
# larger the possibility it has changed). Then the first
# ITEMS_PER_CONTAINS_QUERIES[0] files are taken and sent to '/contains', then
# the next ITEMS_PER_CONTAINS_QUERIES[1], and so on. The numbers here are a
# trade-off; the more per request, the lower the effect of HTTP round trip
# latency and TCP-level chattiness. On the other hand, larger values cause
# longer lookups, increasing the initial latency to start uploading, which is
# especially an issue for large files. This value is optimized for the "few
# thousand files to look up with a minimal number of large files missing" case.
ITEMS_PER_CONTAINS_QUERIES = [20, 20, 50, 50, 50, 100]

# A list of already compressed extension types that should not receive any
# compression before being uploaded. Entries are stored without a leading dot.
ALREADY_COMPRESSED_TYPES = [
    '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
    'wav', 'zip'
]


# The file size to be used when we don't know the correct file size,
# generally used for .isolated files.
UNKNOWN_FILE_SIZE = None


# The size of each chunk to read when downloading and unzipping files.
ZIPPED_FILE_CHUNK = 16 * 1024


# Read timeout in seconds for downloads from isolate storage. If there's no
# response from the server within this timeout, the whole download is aborted.
DOWNLOAD_READ_TIMEOUT = 60
75
76
class ConfigError(ValueError):
  """Raised when a .isolated file cannot be loaded."""
80
81
class MappingError(OSError):
  """Raised when the tree cannot be recreated."""
85
86
def randomness():
  """Returns the current time as a string, used as a MIME boundary seed.

  This is deliberately low-entropy. It lives in its own function so unit tests
  can replace it with a deterministic value.
  """
  now = time.time()
  return str(now)
93
94
def _to_utf8(text):
  """Returns |text| encoded as UTF-8 when it is unicode, else |text| as is."""
  if isinstance(text, unicode):
    return text.encode('utf-8')
  return text


def encode_multipart_formdata(fields, files,
                              mime_mapper=lambda _: 'application/octet-stream'):
  """Encodes a Multipart form data object.

  Unicode keys, file names and values are encoded as UTF-8 so the final body
  is a plain byte string that can safely carry binary file content.

  Args:
    fields: a sequence (name, value) elements for
      regular form fields.
    files: a sequence of (name, filename, value) elements for data to be
      uploaded as files.
    mime_mapper: function to return the mime type from the filename.
  Returns:
    content_type: for httplib.HTTP instance
    body: for httplib.HTTP instance
  """
  boundary = hashlib.md5(randomness()).hexdigest()
  delimiter = '--' + boundary
  body_list = []
  # NOTE(review): a delimiter line is emitted both before and after every
  # part. This layout is preserved on purpose; only the encoding is fixed.
  for (key, value) in fields:
    # Each item is encoded into its own variable. The previous code stored
    # the encoded key into |value|, overwriting the field content.
    key = _to_utf8(key)
    value = _to_utf8(value)
    body_list.append(delimiter)
    body_list.append('Content-Disposition: form-data; name="%s"' % key)
    body_list.append('')
    body_list.append(value)
    body_list.append(delimiter)
    body_list.append('')
  for (key, filename, value) in files:
    key = _to_utf8(key)
    filename = _to_utf8(filename)
    value = _to_utf8(value)
    body_list.append(delimiter)
    body_list.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
    body_list.append('Content-Type: %s' % mime_mapper(filename))
    body_list.append('')
    body_list.append(value)
    body_list.append(delimiter)
    body_list.append('')
  if body_list:
    # Turn the last delimiter into the closing delimiter ('--boundary--').
    body_list[-2] += '--'
  body = '\r\n'.join(body_list)
  content_type = 'multipart/form-data; boundary=%s' % boundary
  return content_type, body
142
143
def sha1_file(filepath):
  """Returns the hex SHA-1 of a file, hashing it in 1mb chunks.

  The file is never loaded in memory all at once.
  """
  hasher = hashlib.sha1()
  with open(filepath, 'rb') as stream:
    block = stream.read(1024 * 1024)
    # read() returns an empty string once the end of the file is reached.
    while block:
      hasher.update(block)
      block = stream.read(1024 * 1024)
  return hasher.hexdigest()
155
156
def file_write(filepath, content_generator):
  """Writes file content as generated by content_generator.

  Missing parent directories are created first. A |filepath| with no directory
  component is written relative to the current working directory.

  Meant to be mocked out in unit tests.

  Args:
    filepath: path of the file to create or overwrite.
    content_generator: iterable yielding the chunks to write, in order.
  """
  filedir = os.path.dirname(filepath)
  # dirname() is '' for a bare file name and os.makedirs('') raises OSError,
  # so only create the directory when there is one to create.
  if filedir and not os.path.isdir(filedir):
    os.makedirs(filedir)
  with open(filepath, 'wb') as f:
    for d in content_generator:
      f.write(d)
168
169
def is_valid_file(filepath, size):
  """Returns True if the file at |filepath| appears valid.

  The only check done is comparing the size on disk against |size|. When
  |size| is UNKNOWN_FILE_SIZE, the file is accepted without being looked at.
  """
  if size == UNKNOWN_FILE_SIZE:
    return True
  size_on_disk = os.stat(filepath).st_size
  if size_on_disk == size:
    return True
  logging.warning(
      'Found invalid item %s; %d != %d',
      os.path.basename(filepath), size_on_disk, size)
  return False
184
185
def try_remove(filepath):
  """Deletes |filepath|, silently ignoring any OSError (e.g. missing file)."""
  try:
    os.unlink(filepath)
  except OSError:
    # Best effort: a file that cannot be removed is not an error here.
    return
192
193
def url_read(url, **kwargs):
  """Reads |url| via net.url_read() and raises if there is no response.

  Raises:
    MappingError: net.url_read() returned None.
  """
  response = net.url_read(url, **kwargs)
  if response is not None:
    return response
  # If we get no response from the server, assume it is down and raise an
  # exception.
  raise MappingError('Unable to connect to server %s' % url)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000201
202
def upload_hash_content_to_blobstore(
    generate_upload_url, data, hash_key, content):
  """Uploads the given hash contents directly to the blobstore via a generated
  url.

  Arguments:
    generate_upload_url: The url to get the new upload url from.
    data: extra POST data.
    hash_key: sha1 of the uncompressed version of content.
    content: The contents to upload. Must fit in memory for now.

  Returns:
    The response of the upload request.

  Raises:
    MappingError: no upload url could be generated, or every attempt failed.
  """
  logging.debug('Generating url to directly upload file to blobstore')
  assert isinstance(hash_key, str), hash_key
  assert isinstance(content, str), (hash_key, content)
  # TODO(maruel): Support large files. This would require streaming support.
  file_parts = [('content', hash_key, content)]
  content_type, body = encode_multipart_formdata(data, file_parts)
  for _ in net.retry_loop(max_attempts=net.URL_OPEN_MAX_ATTEMPTS):
    # Retry HTTP 50x here.
    upload_url = net.url_read(generate_upload_url, data=data)
    if not upload_url:
      raise MappingError(
          'Unable to connect to server %s' % generate_upload_url)

    # Do not retry this request on HTTP 50x. Regenerate an upload url each time
    # since uploading "consumes" the upload url.
    result = net.url_read(
        upload_url, data=body, content_type=content_type, retry_50x=False)
    if result is None:
      continue
    return result
  raise MappingError(
      'Unable to connect to server %s' % generate_upload_url)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000235
236
class IsolateServer(object):
  """Client to download items from, or upload items to, an Isolate Server."""

  def __init__(self, base_url, namespace):
    assert base_url.startswith('http'), base_url
    self.content_url = '%s/content/' % base_url.rstrip('/')
    self.namespace = namespace
    # Fetched on first use by the |token| property, guarded by |_lock|.
    self._token = None
    self._lock = threading.Lock()

  @property
  def token(self):
    """URL-quoted token read from the server's get_token endpoint, cached."""
    # TODO(maruel): Make this request much earlier asynchronously while the
    # files are being enumerated.
    with self._lock:
      if self._token:
        return self._token
      raw_token = url_read(self.content_url + 'get_token')
      self._token = urllib.quote(raw_token)
      return self._token

  def fetch(self, item, size):
    """Fetches an object and yields its decompressed content chunk by chunk.

    Raises:
      IOError: the connection could not be opened, the zlib stream is
          corrupted, data is left over after decompression, or the size does
          not match |size| (unless |size| is UNKNOWN_FILE_SIZE).
    """
    zipped_url = '%sretrieve/%s/%s' % (self.content_url, self.namespace, item)
    logging.debug('download_file(%s)', zipped_url)

    # Because the app engine DB is only eventually consistent, retry 404 errors
    # because the file might just not be visible yet (even though it has been
    # uploaded).
    connection = net.url_open(
        zipped_url, retry_404=True, read_timeout=DOWNLOAD_READ_TIMEOUT)
    if not connection:
      raise IOError('Unable to open connection to %s' % zipped_url)

    decompressor = zlib.decompressobj()
    try:
      compressed_size = 0
      decompressed_size = 0
      chunk = connection.read(ZIPPED_FILE_CHUNK)
      while chunk:
        compressed_size += len(chunk)
        decompressed = decompressor.decompress(chunk)
        decompressed_size += len(decompressed)
        yield decompressed
        chunk = connection.read(ZIPPED_FILE_CHUNK)

      # Ensure that all the data was properly decompressed.
      if decompressor.flush():
        raise IOError('Decompression failed')
      if size != UNKNOWN_FILE_SIZE and decompressed_size != size:
        raise IOError('File incorrect size after download of %s. Got %s and '
                      'expected %s' % (item, decompressed_size, size))
    except zlib.error as e:
      msg = 'Corrupted zlib for item %s. Processed %d of %s bytes.\n%s' % (
          item, compressed_size, connection.content_length, e)
      logging.error(msg)

      # Testing seems to show that if a few machines are trying to download
      # the same blob, they can cause each other to fail. So if we hit a zip
      # error, this is the most likely cause (it only downloads some of the
      # data). Randomly sleep for between 5 and 25 seconds to try and spread
      # out the downloads.
      time.sleep((random.random() * 20) + 5)
      raise IOError(msg)

  def retrieve(self, item, dest, size):
    """Fetches an object and saves its content to |dest|.

    On IOError, the partially written |dest| is removed and the error is
    re-raised.
    """
    content = self.fetch(item, size)
    try:
      file_write(dest, content)
    except IOError as e:
      # Remove unfinished download.
      try_remove(dest)
      logging.error('Failed to download %s at %s.\n%s', item, dest, e)
      raise

  def store(self, content, hash_key, _size):
    """Uploads |content| under |hash_key|; |_size| is unused.

    Content larger than MIN_SIZE_FOR_DIRECT_BLOBSTORE goes through a generated
    blobstore url, anything else is posted to the store/ endpoint.
    """
    # TODO(maruel): Detect failures.
    hash_key = str(hash_key)
    if len(content) <= MIN_SIZE_FOR_DIRECT_BLOBSTORE:
      url = '%sstore/%s/%s?token=%s' % (
          self.content_url, self.namespace, hash_key, self.token)
      return url_read(
          url, data=content, content_type='application/octet-stream')

    url = '%sgenerate_blobstore_url/%s/%s' % (
        self.content_url, self.namespace, hash_key)
    # token is guaranteed to be already quoted but it is unnecessary here, and
    # only here.
    data = [('token', urllib.unquote(self.token))]
    return upload_hash_content_to_blobstore(url, data, hash_key, content)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000327
328
def check_files_exist_on_server(query_url, queries):
  """Asks the server which files of this batch it does not have yet.

  Arguments:
    query_url: url the concatenated binary digests are posted to.
    queries: list of (name, metadata) pairs, where metadata['h'] is the hex
        digest of the file to potentially upload.
  Returns:
    missing_files: list of the |queries| entries missing on the server.
  Raises:
    MappingError: the response length does not match the number of queries.
  """
  # TODO(maruel): Move inside IsolateServer.
  logging.info('Checking existence of %d files...', len(queries))
  digests = [binascii.unhexlify(meta_data['h']) for (_, meta_data) in queries]
  body = ''.join(digests)
  assert (len(body) % 20) == 0, repr(body)

  response = url_read(
      query_url, data=body, content_type='application/octet-stream')
  if len(response) != len(queries):
    raise MappingError(
        'Got an incorrect number of responses from the server. Expected %d, '
        'but got %d' % (len(queries), len(response)))

  # The response holds one character per query; chr(0) means "not present".
  missing_files = [
    query for query, flag in zip(queries, response) if flag == chr(0)
  ]
  logging.info('Queried %d files, %d cache hit',
      len(queries), len(queries) - len(missing_files))
  return missing_files
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000356
357
class FileSystem(object):
  """Fetches data from the file system.

  The common use case is a NFS/CIFS file server that is mounted locally that is
  used to fetch the file on a local partition.
  """
  def __init__(self, base_path):
    # Directory that |item| names are resolved against.
    self.base_path = base_path

  def fetch(self, item, size):
    """Returns the content of |item| as a one-element list.

    Raises:
      IOError: |size| is known and does not match the size on disk.
    """
    source = os.path.join(self.base_path, item)
    if size != UNKNOWN_FILE_SIZE and not is_valid_file(source, size):
      raise IOError('Invalid file %s' % item)
    with open(source, 'rb') as f:
      return [f.read()]

  def retrieve(self, item, dest, size):
    """Copies |item| to |dest|, unless both are the same path.

    Raises:
      IOError: |size| is known and does not match the size on disk.
    """
    source = os.path.join(self.base_path, item)
    if source == dest:
      logging.info('Source and destination are the same, no action required')
      return
    logging.debug('copy_file(%s, %s)', source, dest)
    if size != UNKNOWN_FILE_SIZE and not is_valid_file(source, size):
      raise IOError(
          'Invalid file %s, %d != %d' % (item, os.stat(source).st_size, size))
    shutil.copy(source, dest)

  def store(self, content, hash_key, _size=None):
    """Not supported for file system storage.

    |_size| is accepted (and ignored) so the signature matches
    IsolateServer.store() and the three-argument call made by RemoteOperation;
    callers then get NotImplementedError rather than a TypeError.
    """
    raise NotImplementedError()
387
388
def get_storage_api(file_or_url, namespace):
  """Returns an object that implements .retrieve().

  An http(s) url gives an IsolateServer, anything else is treated as a local
  path and gives a FileSystem.
  """
  is_url = re.match(r'^https?://.+$', file_or_url)
  if not is_url:
    return FileSystem(file_or_url)
  return IsolateServer(file_or_url, namespace)
395
396
class RemoteOperation(object):
  """Priority based worker queue to operate on action items.

  It executes a function with the given task items. It is specialized to
  download files.

  When the priorities of items are equal, it works in strict FIFO mode.
  """
  # Initial and maximum number of worker threads.
  INITIAL_WORKERS = 2
  MAX_WORKERS = 16
  # Priorities. The low 8 bits are reserved to count retries.
  LOW, MED, HIGH = (1<<8, 2<<8, 3<<8)
  INTERNAL_PRIORITY_BITS = (1<<8) - 1
  RETRIES = 5

  def __init__(self, do_item):
    # Function to fetch a remote object or upload to a remote location.
    self._do_item = do_item
    self._pool = threading_utils.ThreadPool(
        self.INITIAL_WORKERS, self.MAX_WORKERS, 0, 'remote')

  def join(self):
    """Blocks until the queue is empty."""
    return self._pool.join()

  def close(self):
    """Terminates all worker threads."""
    self._pool.close()

  def add_item(self, priority, obj, dest, size):
    """Retrieves an object from the remote data store.

    The smaller |priority| gets fetched first.

    Thread-safe.
    """
    # Callers must not use the bits reserved for the retry counter.
    assert (priority & self.INTERNAL_PRIORITY_BITS) == 0
    return self._add_item(priority, obj, dest, size)

  def get_one_result(self):
    """Returns one result from the underlying thread pool."""
    return self._pool.get_one_result()

  def _add_item(self, priority, obj, dest, size):
    """Queues a call to self._do_item(obj, dest, size) at |priority|."""
    assert isinstance(obj, basestring), obj
    assert isinstance(dest, basestring), dest
    # A size may be a long in Python 2 (e.g. large files on 32-bit builds).
    assert size is None or isinstance(size, (int, long)), size
    return self._pool.add_task(
        priority, self._task_executer, priority, obj, dest, size)

  def _task_executer(self, priority, obj, dest, size):
    """Wraps self._do_item to trap and retry on IOError exceptions.

    Returns |obj| on success, or None when the item was queued again. The
    IOError is re-raised once RETRIES retries have been used.
    """
    try:
      self._do_item(obj, dest, size)
      # TODO(maruel): Technically, we'd want to have an output queue to be a
      # PriorityQueue.
      return obj
    except IOError:
      # Retry a few times, lowering the priority. The retry count is kept in
      # the low bits of |priority|.
      if (priority & self.INTERNAL_PRIORITY_BITS) < self.RETRIES:
        self._add_item(priority + 1, obj, dest, size)
        return
      raise
459 raise
460
461
def compression_level(filename):
  """Given a filename calculates the ideal compression level to use.

  Returns 0 (no compression) when the extension is listed in
  ALREADY_COMPRESSED_TYPES, 7 otherwise.
  """
  # splitext() returns the extension with its leading '.', e.g. '.zip', while
  # ALREADY_COMPRESSED_TYPES lists bare extensions, e.g. 'zip'. Drop the dot,
  # otherwise the lookup never matches.
  file_ext = os.path.splitext(filename)[1].lower()[1:]
  # TODO(csharp): Profile to find what compression level works best.
  return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7
467
468
def read_and_compress(filepath, level):
  """Reads a file and returns its content compressed with zlib at |level|.

  The file is read ZIPPED_FILE_CHUNK bytes at a time; the compressed output is
  returned as a single string.
  """
  compressor = zlib.compressobj(level)
  pieces = []
  with open(filepath, 'rb') as stream:
    chunk = stream.read(ZIPPED_FILE_CHUNK)
    while chunk:
      pieces.append(compressor.compress(chunk))
      chunk = stream.read(ZIPPED_FILE_CHUNK)
  # Emit whatever the compressor still holds and terminate the stream.
  pieces.append(compressor.flush(zlib.Z_FINISH))
  return ''.join(pieces)
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000483
maruel@chromium.orgcb3c3d52013-03-14 18:55:30 +0000484
def zip_and_trigger_upload(infile, metadata, upload_function):
  """Compresses |infile| and hands the result to |upload_function|.

  |upload_function| is called with (priority, compressed data, metadata['h'],
  UNKNOWN_FILE_SIZE) and its return value is returned.
  """
  # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
  # if not metadata['T']:
  level = compression_level(infile)
  compressed_data = read_and_compress(infile, level)
  if metadata.get('priority', '1') == '0':
    priority = RemoteOperation.HIGH
  else:
    priority = RemoteOperation.MED
  return upload_function(
      priority, compressed_data, metadata['h'], UNKNOWN_FILE_SIZE)
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000494
495
def batch_files_for_check(infiles):
  """Splits list of files to check for existence on the server into batches.

  Each batch corresponds to a single 'exists?' query to the server. Only
  entries whose metadata has an 's' (size) key are considered, largest first.
  Batch sizes follow ITEMS_PER_CONTAINS_QUERIES, the last value being reused
  once the list is exhausted.

  Yields:
    batches: list of batches, each batch is a list of files.
  """
  sized = [(k, v) for k, v in infiles.iteritems() if 's' in v]
  sized.sort(key=lambda x: -x[1]['s'])

  last_limit_index = len(ITEMS_PER_CONTAINS_QUERIES) - 1
  batch_count = 0
  pending = []
  for relfile, metadata in sized:
    pending.append((relfile, metadata))
    limit = ITEMS_PER_CONTAINS_QUERIES[min(batch_count, last_limit_index)]
    if len(pending) == limit:
      yield pending
      pending = []
      batch_count += 1
  if pending:
    yield pending
518
519
def get_files_to_upload(contains_hash_url, infiles):
  """Yields files that are missing on the server.

  Every batch from batch_files_for_check() is queried through a thread pool,
  and the missing entries of each result are yielded one by one.
  """
  with threading_utils.ThreadPool(
      1, 16, 0, prefix='get_files_to_upload') as pool:
    for batch in batch_files_for_check(infiles):
      pool.add_task(0, check_files_exist_on_server, contains_hash_url, batch)
    for missing_batch in pool.iter_results():
      for missing_file in missing_batch:
        yield missing_file
maruel@chromium.org35fc0c82013-01-17 15:14:14 +0000527
528
def upload_sha1_tree(base_url, indir, infiles, namespace):
  """Uploads the given tree to the given url.

  Files already present on the server are skipped; the others are compressed
  and uploaded. Cache hit/miss statistics are logged at the end.

  Arguments:
    base_url:  The base url of the isolate server. The content/contains/
               endpoint is used to query if an element was already uploaded,
               and uploads go through IsolateServer.store().
    indir:     Root directory the infiles are based in.
    infiles:   dict of files to upload files from |indir| to |base_url|.
    namespace: The namespace to use on the server.

  Returns:
    0.
  """
  logging.info(
      'upload tree(base_url=%s, indir=%s, files=%d)',
      base_url, indir, len(infiles))

  # Create a pool of workers to zip and upload any files missing from
  # the server.
  num_threads = threading_utils.num_processors()
  zipping_pool = threading_utils.ThreadPool(min(2, num_threads),
                                            num_threads, 0, 'zip')
  remote = IsolateServer(base_url, namespace)
  remote_uploader = RemoteOperation(remote.store)

  # Starts the zip and upload process for files that are missing
  # from the server.
  contains_hash_url = '%scontains/%s?token=%s' % (
      remote.content_url, namespace, remote.token)
  uploaded = []
  for relfile, metadata in get_files_to_upload(contains_hash_url, infiles):
    infile = os.path.join(indir, relfile)
    zipping_pool.add_task(0, zip_and_trigger_upload, infile, metadata,
                          remote_uploader.add_item)
    uploaded.append((relfile, metadata))

  logging.info('Waiting for all files to finish zipping')
  zipping_pool.join()
  zipping_pool.close()
  logging.info('All files zipped.')

  logging.info('Waiting for all files to finish uploading')
  # Will raise if any exception occurred.
  remote_uploader.join()
  remote_uploader.close()
  logging.info('All files are uploaded')

  total = len(infiles)
  total_size = sum(m.get('s', 0) for m in infiles.itervalues())
  logging.info(
      'Total: %6d, %9.1fkb',
      total,
      total_size / 1024.)
  cache_hit = set(infiles.iterkeys()) - set(x[0] for x in uploaded)
  cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
  # Both |total| and |total_size| may be 0 (no files, or only empty files);
  # report 0% instead of dividing by zero.
  logging.info(
      'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_hit),
      cache_hit_size / 1024.,
      len(cache_hit) * 100. / total if total else 0,
      cache_hit_size * 100. / total_size if total_size else 0)
  cache_miss = uploaded
  cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
  logging.info(
      'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_miss),
      cache_miss_size / 1024.,
      len(cache_miss) * 100. / total if total else 0,
      cache_miss_size * 100. / total_size if total_size else 0)
  return 0
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000596
597
maruel@chromium.orgfb78d432013-08-28 21:22:40 +0000598@subcommand.usage('<file1..fileN> or - to read from stdin')
599def CMDarchive(parser, args):
600 """Archives data to the server."""
maruel@chromium.orgcb3c3d52013-03-14 18:55:30 +0000601 options, files = parser.parse_args(args)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000602
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000603 if files == ['-']:
604 files = sys.stdin.readlines()
605
606 if not files:
607 parser.error('Nothing to upload')
maruel@chromium.orgfb78d432013-08-28 21:22:40 +0000608 if not options.isolate_server:
609 parser.error('Nowhere to send. Please specify --isolate-server')
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000610
611 # Load the necessary metadata. This is going to be rewritten eventually to be
612 # more efficient.
613 infiles = dict(
614 (
615 f,
616 {
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000617 's': os.stat(f).st_size,
maruel@chromium.org037758d2012-12-10 17:59:46 +0000618 'h': sha1_file(f),
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000619 }
620 )
621 for f in files)
622
vadimsh@chromium.orga4326472013-08-24 02:05:41 +0000623 with tools.Profiler('Archive'):
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +0000624 ret = upload_sha1_tree(
maruel@chromium.orgfb78d432013-08-28 21:22:40 +0000625 base_url=options.isolate_server,
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000626 indir=os.getcwd(),
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000627 infiles=infiles,
628 namespace=options.namespace)
maruel@chromium.orgb7e79a22013-09-13 01:24:56 +0000629 if not ret:
630 print '\n'.join('%s %s' % (infiles[f]['h'], f) for f in sorted(infiles))
631 return ret
maruel@chromium.orgfb78d432013-08-28 21:22:40 +0000632
633
def CMDdownload(parser, args):
  """Download data from the server.

  It can download individual files.
  """
  parser.add_option(
      '-f', '--file', metavar='HASH DEST', default=[], action='append', nargs=2,
      help='hash and destination of a file, can be used multiple times')
  parser.add_option(
      '-t', '--target', metavar='DIR', default=os.getcwd(),
      help='destination directory')
  options, extra_args = parser.parse_args(args)
  if extra_args:
    parser.error('Unsupported arguments: %s' % extra_args)
  if not options.file:
    parser.error('Use one of --file is required.')

  options.target = os.path.abspath(options.target)
  remote = IsolateServer(options.isolate_server, options.namespace)
  for digest, relative_dest in options.file:
    logging.info('%s: %s', digest, relative_dest)
    # Destinations are relative to the target directory.
    full_dest = os.path.join(options.target, relative_dest)
    remote.retrieve(digest, full_dest, UNKNOWN_FILE_SIZE)
  return 0
657
658
class OptionParserIsolateServer(tools.OptionParserWithLogging):
  """Option parser that adds the --isolate-server and --namespace options."""

  def __init__(self, **kwargs):
    tools.OptionParserWithLogging.__init__(self, **kwargs)
    self.add_option(
        '-I', '--isolate-server',
        default=ISOLATE_SERVER,
        metavar='URL',
        help='Isolate server where data is stored. default: %default')
    self.add_option(
        '--namespace', default='default-gzip',
        help='The namespace to use on the server, default: %default')

  def parse_args(self, *args, **kwargs):
    """Parses the arguments and strips trailing '/' from the server url.

    Calls self.error() when the resulting server url is empty.
    """
    parsed = tools.OptionParserWithLogging.parse_args(self, *args, **kwargs)
    options, remaining = parsed
    server = options.isolate_server.rstrip('/')
    options.isolate_server = server
    if not server:
      self.error('--isolate-server is required.')
    return options, remaining
678
679
def main(args):
  """Dispatches |args| to the matching CMD* function.

  Returns the command's return value, or 1 after printing the error to stderr
  when a ConfigError or MappingError is raised.
  """
  dispatcher = subcommand.CommandDispatcher(__name__)
  try:
    parser = OptionParserIsolateServer(version=__version__)
    return dispatcher.execute(parser, args)
  except (ConfigError, MappingError) as e:
    for piece in ('\nError: ', str(e), '\n'):
      sys.stderr.write(piece)
    return 1
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000690
691
if __name__ == '__main__':
  # Process-wide setup helpers are called before dispatching; the process then
  # exits with whatever main() returns.
  fix_encoding.fix_encoding()
  tools.disable_buffering()
  colorama.init()
  sys.exit(main(sys.argv[1:]))