blob: 655f9d5b3450a37aa274d73f2ebfbcd6e1e8f96a [file] [log] [blame]
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00001#!/usr/bin/env python
2# Copyright (c) 2012 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Archives a set of files to a server."""
7
8import binascii
9import hashlib
10import logging
11import optparse
12import os
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +000013import cStringIO
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000014import sys
15import time
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +000016import zlib
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000017
18import run_isolated
csharp@chromium.org07fa7592013-01-11 18:19:30 +000019import run_test_cases
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000020
21
# The minimum size of files to upload directly to the blobstore. Content
# strictly larger than this many bytes goes through a generated blobstore
# upload url; anything else is POSTed to the regular store url.
MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 1024

# The number of files to check the isolate server for each query.
ITEMS_PER_CONTAINS_QUERY = 500

# A list of already compressed extension types that should not receive any
# compression before being uploaded. Entries are bare, lowercase extensions
# without the leading dot.
ALREADY_COMPRESSED_TYPES = [
  '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
  'wav', 'zip'
]
34
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000035
def randomness():
  """Returns the current time as a string, as low-entropy randomness.

  The value is used to derive the MIME boundary. It lives in its own function
  so unit tests can replace it with a deterministic value.
  """
  now = time.time()
  return str(now)
42
43
def encode_multipart_formdata(fields, files,
                              mime_mapper=lambda _: 'application/octet-stream'):
  """Encodes a Multipart form data object.

  Unicode names, filenames and values are encoded to UTF-8 byte strings before
  being written to the body.

  Args:
    fields: a sequence (name, value) elements for regular form fields.
    files: a sequence of (name, filename, value) elements for data to be
       uploaded as files.
    mime_mapper: function to return the mime type from the filename.

  Returns:
    content_type: for httplib.HTTP instance
    body: for httplib.HTTP instance
  """
  def to_utf8(text):
    # Only unicode objects need encoding; byte strings are passed through.
    return text.encode('utf-8') if isinstance(text, unicode) else text

  boundary = hashlib.md5(randomness()).hexdigest()
  delimiter = '--' + boundary
  body_list = []
  for (key, value) in fields:
    # Each item is encoded into its own variable. Assigning the encoded key to
    # |value| would replace the payload with the field name.
    key = to_utf8(key)
    value = to_utf8(value)
    body_list.append(delimiter)
    body_list.append('Content-Disposition: form-data; name="%s"' % key)
    body_list.append('')
    body_list.append(value)
    body_list.append(delimiter)
    body_list.append('')
  for (key, filename, value) in files:
    key = to_utf8(key)
    filename = to_utf8(filename)
    value = to_utf8(value)
    body_list.append(delimiter)
    body_list.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
    body_list.append('Content-Type: %s' % mime_mapper(filename))
    body_list.append('')
    body_list.append(value)
    body_list.append(delimiter)
    body_list.append('')
  # NOTE(review): every part is followed by its own delimiter line, so two
  # delimiter lines separate consecutive parts. This layout is left untouched
  # because what the receiving server expects is not visible from here.
  if body_list:
    # Turn the last delimiter into the closing one; the trailing '' entry
    # makes the body end with a CRLF.
    body_list[-2] += '--'
  body = '\r\n'.join(body_list)
  content_type = 'multipart/form-data; boundary=%s' % boundary
  return content_type, body
91
92
def sha1_file(filepath):
  """Returns the hex SHA-1 of a file, hashing it in 1mb chunks.

  The file is never read in memory all at once.
  """
  hasher = hashlib.sha1()
  with open(filepath, 'rb') as stream:
    # The sentinel form of iter() stops on the empty read at end of file.
    for block in iter(lambda: stream.read(1024 * 1024), b''):
      hasher.update(block)
  return hasher.hexdigest()
104
105
def url_open(url, *args, **kwargs):
  """Calls run_isolated.url_open() and raises when it returns nothing.

  All arguments are forwarded unchanged.

  Raises:
    run_isolated.MappingError: the underlying call returned a falsy result.
  """
  response = run_isolated.url_open(url, *args, **kwargs)
  if response:
    return response
  # If we get no response from the server, assume it is down and raise an
  # exception.
  raise run_isolated.MappingError('Unable to connect to server %s' % url)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000113
114
def upload_hash_content_to_blobstore(
    generate_upload_url, data, hash_key, content):
  """Uploads the given hash contents directly to the blobstore via a generated
  url.

  Arguments:
    generate_upload_url: The url to get the new upload url from.
    data: extra POST data; sent both when requesting the upload url and as
        form fields of the upload itself.
    hash_key: sha1 of the uncompressed version of content.
    content: The contents to upload. Must fit in memory for now.

  Returns:
    The result of url_open() on the generated upload url, or None when the
    server returned an empty upload url.
  """
  logging.debug('Generating url to directly upload file to blobstore')
  assert isinstance(hash_key, str), hash_key
  assert isinstance(content, str), (hash_key, content)
  target_url = url_open(generate_upload_url, data).read()

  if target_url:
    # TODO(maruel): Support large files.
    mime_header, payload = encode_multipart_formdata(
        data, [('hash_contents', hash_key, content)])
    return url_open(target_url, payload, content_type=mime_header)

  logging.error('Unable to generate upload url')
  return None
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000139
140
class UploadRemote(run_isolated.Remote):
  """A run_isolated.Remote that uploads items to a namespace on the server."""

  def __init__(self, namespace, base_url, token):
    # Both attributes are assigned before the base class constructor runs.
    self.namespace = str(namespace)
    self._token = token
    super(UploadRemote, self).__init__(base_url)

  def get_file_handler(self, base_url):
    """Returns a function(content, hash_key) that uploads a single item."""
    content_url = str(base_url).rstrip('/') + '/content/'

    def upload_file(content, hash_key):
      # TODO(maruel): Detect failures.
      hash_key = str(hash_key)
      if len(content) <= MIN_SIZE_FOR_DIRECT_BLOBSTORE:
        # Small enough to be POSTed straight to the store url.
        url = '%sstore/%s/%s?token=%s' % (
            content_url, self.namespace, hash_key, self._token)
        url_open(url, content, content_type='application/octet-stream')
        return
      # Larger content goes through a generated blobstore upload url.
      url = '%sgenerate_blobstore_url/%s/%s' % (
          content_url, self.namespace, hash_key)
      post_data = [('token', self._token)]
      upload_hash_content_to_blobstore(url, post_data, hash_key, content)

    return upload_file
163
164
def update_files_to_upload(query_url, queries, upload):
  """Queries the server to see which files from this batch already exist there.

  The binary hashes of all queries are concatenated and POSTed to query_url.
  The response holds one character per query; chr(0) means the item is missing
  on the server.

  Arguments:
    query_url: The url to send the concatenated hashes to.
    queries: The hash files to potential upload to the server, as a list of
        (name, metadata) pairs where metadata['h'] is the hex digest.
    upload: Any new files that need to be upload are sent to this function.

  Raises:
    run_isolated.MappingError: the response length does not match the number
        of queries.
  """
  raw_hashes = [binascii.unhexlify(metadata['h']) for (_, metadata) in queries]
  body = ''.join(raw_hashes)
  assert (len(body) % 20) == 0, repr(body)

  response = url_open(
      query_url, body, content_type='application/octet-stream').read()
  if len(queries) != len(response):
    raise run_isolated.MappingError(
        'Got an incorrect number of responses from the server. Expected %d, '
        'but got %d' % (len(queries), len(response)))

  missing_marker = chr(0)
  hit = 0
  # Both sequences have the same length at this point, so zip() pairs every
  # query with its answer.
  for query, answer in zip(queries, response):
    if answer == missing_marker:
      upload(query)
    else:
      hit += 1
  logging.info('Queried %d files, %d cache hit', len(queries), hit)
190
191
def compression_level(filename):
  """Given a filename calculates the ideal compression level to use.

  Returns:
    0 (no compression) when the file extension, compared case-insensitively,
    is listed in ALREADY_COMPRESSED_TYPES, 7 otherwise.
  """
  # os.path.splitext() keeps the leading dot ('.png') while
  # ALREADY_COMPRESSED_TYPES lists bare extensions ('png'). Drop the dot,
  # otherwise the lookup can never match.
  file_ext = os.path.splitext(filename)[1].lower()[1:]
  # TODO(csharp): Profile to find what compression level works best.
  return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7
197
198
def read_and_compress(filepath, level):
  """Reads a file and returns its content compressed with zlib.

  The file is read in chunks of run_isolated.ZIPPED_FILE_CHUNK bytes and fed
  to a single zlib compressor created with the given level.
  """
  compressor = zlib.compressobj(level)
  pieces = []
  with open(filepath, 'rb') as stream:
    # The sentinel form of iter() stops on the empty read at end of file.
    for chunk in iter(
        lambda: stream.read(run_isolated.ZIPPED_FILE_CHUNK), b''):
      pieces.append(compressor.compress(chunk))
  # Emit whatever the compressor still buffers and terminate the stream.
  pieces.append(compressor.flush(zlib.Z_FINISH))
  return b''.join(pieces)
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000213
maruel@chromium.orgcb3c3d52013-03-14 18:55:30 +0000214
def zip_and_trigger_upload(infile, metadata, upload_function):
  """Compresses infile and passes the result to upload_function.

  Arguments:
    infile: path of the file to compress.
    metadata: dict for the file; 'h' is forwarded to upload_function and a
        'priority' of '0' selects run_isolated.Remote.HIGH instead of MED.
    upload_function: called as
        upload_function(priority, compressed_data, metadata['h'], None).

  Returns:
    Whatever upload_function returns.
  """
  # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
  # if not metadata['T']:
  zipped = read_and_compress(infile, compression_level(infile))
  if metadata.get('priority', '1') == '0':
    priority = run_isolated.Remote.HIGH
  else:
    priority = run_isolated.Remote.MED
  return upload_function(priority, zipped, metadata['h'], None)
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000223
224
def process_items(contains_hash_url, infiles, zip_and_upload):
  """Generates the list of files that need to be uploaded and send them to
  zip_and_upload.

  Only entries whose metadata has an 's' (size) key are considered. They are
  processed largest first, in batches of ITEMS_PER_CONTAINS_QUERY items per
  server query.

  Some may already be on the server.
  """
  candidates = [
      (relfile, metadata) for relfile, metadata in infiles.iteritems()
      if 's' in metadata]
  # Largest files first.
  candidates.sort(key=lambda item: -item[1]['s'])
  for start in range(0, len(candidates), ITEMS_PER_CONTAINS_QUERY):
    batch = candidates[start:start + ITEMS_PER_CONTAINS_QUERY]
    update_files_to_upload(contains_hash_url, batch, zip_and_upload)
240
241
def upload_sha1_tree(base_url, indir, infiles, namespace):
  """Uploads the given tree to the given url.

  Arguments:
    base_url:  The base url. |base_url|/content/get_token is used to fetch a
               token, |base_url|/content/contains/ to query which elements
               were already uploaded, and the remaining uploads are done by
               UploadRemote.
    indir:     Root directory the infiles are based in.
    infiles:   dict of files to upload files from |indir| to |base_url|,
               mapping a relative path to its metadata dict ('h' hex digest,
               's' size).
    namespace: The namespace to use on the server.

  Returns:
    0, always. Failures are reported through exceptions.
  """
  logging.info('upload tree(base_url=%s, indir=%s, files=%d)',
               base_url, indir, len(infiles))
  assert base_url.startswith('http'), base_url
  base_url = base_url.rstrip('/')

  # TODO(maruel): Make this request much earlier asynchronously while the files
  # are being enumerated.
  token = url_open(base_url + '/content/get_token').read()

  # Create a pool of workers to zip and upload any files missing from
  # the server.
  num_threads = run_test_cases.num_processors()
  zipping_pool = run_isolated.ThreadPool(num_threads, num_threads, 0)
  remote_uploader = UploadRemote(namespace, base_url, token)

  # Starts the zip and upload process for a given query. The query is assumed
  # to be in the format (relfile, metadata).
  uploaded = []
  def zip_and_upload(query):
    relfile, metadata = query
    infile = os.path.join(indir, relfile)
    zipping_pool.add_task(0, zip_and_trigger_upload, infile, metadata,
                          remote_uploader.add_item)
    uploaded.append(query)

  # NOTE(review): the token is inserted in the query string verbatim; confirm
  # the server only issues URL-safe tokens.
  contains_hash_url = '%s/content/contains/%s?token=%s' % (
      base_url, namespace, token)
  process_items(contains_hash_url, infiles, zip_and_upload)

  logging.info('Waiting for all files to finish zipping')
  zipping_pool.join()
  logging.info('All files zipped.')

  logging.info('Waiting for all files to finish uploading')
  # Will raise if any exception occurred.
  remote_uploader.join()
  logging.info('All files are uploaded')

  def percent(part, whole):
    # An empty |infiles| gives a zero denominator; report 0 instead of
    # raising ZeroDivisionError once all the work is done.
    return part * 100. / whole if whole else 0

  total = len(infiles)
  total_size = sum(metadata.get('s', 0) for metadata in infiles.itervalues())
  logging.info(
      'Total: %6d, %9.1fkb',
      total,
      total_size / 1024.)
  # Everything that was not handed to zip_and_upload is counted as a hit.
  cache_hit = set(infiles.iterkeys()) - set(x[0] for x in uploaded)
  cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
  logging.info(
      'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_hit),
      cache_hit_size / 1024.,
      percent(len(cache_hit), total),
      percent(cache_hit_size, total_size))
  cache_miss = uploaded
  cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
  logging.info(
      'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_miss),
      cache_miss_size / 1024.,
      percent(len(cache_miss), total),
      percent(cache_miss_size, total_size))
  return 0
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000314
315
def main(args):
  """Parses the command line, hashes the listed files and uploads them.

  Arguments:
    args: command line arguments, without the program name. The positional
        arguments are the files to upload, or a single '-' to read one file
        path per line from stdin.

  Returns:
    The value returned by upload_sha1_tree(). Invalid command lines are
    reported through parser.error().
  """
  parser = optparse.OptionParser(
      usage='%prog [options] <file1..fileN> or - to read from stdin',
      description=sys.modules[__name__].__doc__)
  parser.add_option('-r', '--remote', help='Remote server to archive to')
  parser.add_option(
      '-v', '--verbose',
      action='count', default=0,
      help='Use multiple times to increase verbosity')
  parser.add_option('--namespace', default='default-gzip',
                    help='The namespace to use on the server.')

  options, files = parser.parse_args(args)

  # No -v logs errors only, one -v adds info, two or more enable debug.
  levels = [logging.ERROR, logging.INFO, logging.DEBUG]
  logging.basicConfig(
      level=levels[min(len(levels)-1, options.verbose)],
      format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
  if files == ['-']:
    # Lines read from stdin keep their line terminator, which would become
    # part of the path given to os.stat(). Strip it and skip empty lines.
    stripped = (line.rstrip('\r\n') for line in sys.stdin)
    files = [path for path in stripped if path]

  if not files:
    parser.error('Nothing to upload')
  if not options.remote:
    parser.error('Nowhere to send. Please specify --remote')

  # Load the necessary metadata. This is going to be rewritten eventually to be
  # more efficient.
  infiles = dict(
      (
        f,
        {
          's': os.stat(f).st_size,
          'h': sha1_file(f),
        }
      )
      for f in files)

  with run_isolated.Profiler('Archive'):
    return upload_sha1_tree(
        base_url=options.remote,
        indir=os.getcwd(),
        infiles=infiles,
        namespace=options.namespace)


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))