#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Archives a set of files to a server."""

import binascii
import cStringIO
import hashlib
import logging
import optparse
import os
import sys
import time
import urllib2
import zlib

import run_isolated
import run_test_cases


# The minimum size of files to upload directly to the blobstore.
MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 1024

# The number of files to check the isolate server for each query.
ITEMS_PER_CONTAINS_QUERY = 500

# A list of already compressed extension types that should not receive any
# compression before being uploaded.
ALREADY_COMPRESSED_TYPES = [
    '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
    'wav', 'zip'
]


def encode_multipart_formdata(fields, files,
                              mime_mapper=lambda _: 'application/octet-stream'):
  """Encodes a Multipart form data object.

  Args:
    fields: a sequence of (name, value) elements for regular form fields.
    files: a sequence of (name, filename, value) elements for data to be
           uploaded as files.
    mime_mapper: function to return the mime type from the filename.
  Returns:
    content_type: for httplib.HTTP instance
    body: for httplib.HTTP instance
  """
  boundary = hashlib.md5(str(time.time())).hexdigest()
  body_list = []
  for (key, value) in fields:
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"' % key)
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  for (key, filename, value) in files:
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(filename, unicode):
      filename = filename.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
    body_list.append('Content-Type: %s' % mime_mapper(filename))
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  if body_list:
    body_list[-2] += '--'
  body = '\r\n'.join(body_list)
  content_type = 'multipart/form-data; boundary=%s' % boundary
  return content_type, body


def gen_url_request(url, payload, content_type='application/octet-stream'):
  """Returns a POST request."""
  request = urllib2.Request(url, data=payload)
  if payload is not None:
    request.add_header('Content-Type', content_type)
    request.add_header('Content-Length', len(payload))
  return request
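# A rough sketch of how the two helpers above compose (the upload URL below is
# hypothetical; in this script the real one is generated by the server, see
# upload_hash_content_to_blobstore()):
#   content_type, body = encode_multipart_formdata(
#       [], [('hash_contents', 'deadbeef' * 5, 'some file content')])
#   urllib2.urlopen(gen_url_request(
#       'https://isolate.example.com/upload', body, content_type))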


def sha1_file(filepath):
  """Calculates the SHA-1 of a file without reading it all in memory at once."""
  digest = hashlib.sha1()
  with open(filepath, 'rb') as f:
    while True:
      # Read in 1mb chunks.
      chunk = f.read(1024*1024)
      if not chunk:
        break
      digest.update(chunk)
  return digest.hexdigest()
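# Note: the 40 character hex digest returned above is what the rest of this
# script uses as the file's key, both in server URLs and as the 'h' metadata
# field.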


def url_open(url, *args, **kwargs):
  """Calls run_isolated.url_open() and raises if the server is unreachable."""
  result = run_isolated.url_open(url, *args, **kwargs)
  if not result:
    # If we get no response from the server, assume it is down and raise an
    # exception.
    raise run_isolated.MappingError('Unable to connect to server %s' % url)
  return result


def upload_hash_content_to_blobstore(generate_upload_url, hash_key, content):
  """Uploads the given hash contents directly to the blobstore via a generated
  url.

  Arguments:
    generate_upload_url: The url to get the new upload url from.
    hash_key: The hash of the contents being uploaded.
    content: The contents to upload.
  """
  logging.debug('Generating url to directly upload file to blobstore')
  assert isinstance(hash_key, str), hash_key
  assert isinstance(content, str), (hash_key, content)
  upload_url = url_open(generate_upload_url).read()

  if not upload_url:
    logging.error('Unable to generate upload url')
    return

  content_type, body = encode_multipart_formdata(
      [], [('hash_contents', hash_key, content)])
  return url_open(upload_url, body, content_type=content_type)


class UploadRemote(run_isolated.Remote):
  def __init__(self, namespace, *args, **kwargs):
    super(UploadRemote, self).__init__(*args, **kwargs)
    self.namespace = str(namespace)

  def get_file_handler(self, base_url):
    base_url = str(base_url)
    def upload_file(content, hash_key):
      hash_key = str(hash_key)
      content_url = base_url.rstrip('/') + '/content/'
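      # Two server endpoints are used here: large files first ask the server
      # for a blobstore upload url via .../generate_blobstore_url/..., while
      # small files are POSTed directly to .../store/<namespace>/<hash>.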
      if len(content) > MIN_SIZE_FOR_DIRECT_BLOBSTORE:
        upload_hash_content_to_blobstore(
            content_url + 'generate_blobstore_url/' + self.namespace + '/' +
                hash_key,
            hash_key,
            content)
      else:
        url_open(
            content_url + 'store/' + self.namespace + '/' + hash_key,
            content,
            content_type='application/octet-stream')
    return upload_file


def update_files_to_upload(query_url, queries, upload):
  """Queries the server to see which files from this batch already exist there.

  Arguments:
    queries: The (relfile, metadata) pairs that may need to be uploaded.
    upload: Any new files that need to be uploaded are sent to this function.
  """
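  # The 'contains' request body is the raw concatenation of the 20 byte binary
  # SHA-1 digests; the response holds one byte per queried digest, where
  # chr(0) means the file is not on the server yet and must be uploaded.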
  body = ''.join(
      (binascii.unhexlify(meta_data['h']) for (_, meta_data) in queries))
  assert (len(body) % 20) == 0, repr(body)

  response = url_open(
      query_url, body, content_type='application/octet-stream').read()
  if len(queries) != len(response):
    raise run_isolated.MappingError(
        'Got an incorrect number of responses from the server. Expected %d, '
        'but got %d' % (len(queries), len(response)))

  hit = 0
  for i in range(len(response)):
    if response[i] == chr(0):
      upload(queries[i])
    else:
      hit += 1
  logging.info('Queried %d files, %d cache hit', len(queries), hit)


def compression_level(filename):
  """Given a filename calculates the ideal compression level to use."""
  # splitext() keeps the leading dot, so strip it before the lookup.
  file_ext = os.path.splitext(filename)[1].lower().lstrip('.')
  # TODO(csharp): Profile to find what compression level works best.
  return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7
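# For example, compression_level('archive.zip') is 0 (already compressed)
# while compression_level('test_results.txt') is 7.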


def zip_and_trigger_upload(infile, metadata, upload_function):
  """Reads |infile|, zlib-compresses it and passes it to |upload_function|."""
  compressor = zlib.compressobj(compression_level(infile))
  hash_data = cStringIO.StringIO()
  with open(infile, 'rb') as f:
    # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
    while True:  # and not metadata['T']:
      chunk = f.read(run_isolated.ZIPPED_FILE_CHUNK)
      if not chunk:
        break
      hash_data.write(compressor.compress(chunk))

  hash_data.write(compressor.flush(zlib.Z_FINISH))
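  # Items whose metadata marks them as priority '0' jump to the front of the
  # upload queue; everything else is uploaded at medium priority.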
  priority = (
      run_isolated.Remote.HIGH if metadata.get('priority', '1') == '0'
      else run_isolated.Remote.MED)
  upload_function(priority, hash_data.getvalue(), metadata['h'], None)
  hash_data.close()


def process_items(contains_hash_url, infiles, zip_and_upload):
  """Generates the list of files that need to be uploaded and sends them to
  zip_and_upload.

  Some may already be on the server.
  """
  next_queries = []
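  # The largest files are processed first (presumably so the slowest zips and
  # uploads start as early as possible); entries without a size ('s') are
  # skipped.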
  items = ((k, v) for k, v in infiles.iteritems() if 's' in v)
  for relfile, metadata in sorted(items, key=lambda x: -x[1]['s']):
    next_queries.append((relfile, metadata))
    if len(next_queries) == ITEMS_PER_CONTAINS_QUERY:
      update_files_to_upload(contains_hash_url, next_queries, zip_and_upload)
      next_queries = []
  if next_queries:
    update_files_to_upload(contains_hash_url, next_queries, zip_and_upload)


def upload_sha1_tree(base_url, indir, infiles, namespace):
  """Uploads the given tree to the given url.

  Arguments:
    base_url: The base url, it is assumed that |base_url|/has/ can be used to
              query if an element was already uploaded, and |base_url|/store/
              can be used to upload a new element.
    indir: Root directory the infiles are based in.
    infiles: dict of files to upload from |indir| to the server.
    namespace: The namespace to use on the server.
  """
  logging.info('upload tree(base_url=%s, indir=%s, files=%d)' %
               (base_url, indir, len(infiles)))

  # Create a pool of workers to zip and upload any files missing from
  # the server.
  num_threads = run_test_cases.num_processors()
  zipping_pool = run_isolated.ThreadPool(num_threads, num_threads, 0)
  remote_uploader = UploadRemote(namespace, base_url)

  # Starts the zip and upload process for a given query. The query is assumed
  # to be in the format (relfile, metadata).
  uploaded = []
  def zip_and_upload(query):
    relfile, metadata = query
    infile = os.path.join(indir, relfile)
    zipping_pool.add_task(0, zip_and_trigger_upload, infile, metadata,
                          remote_uploader.add_item)
    uploaded.append(query)

  contains_hash_url = '%s/content/contains/%s' % (
      base_url.rstrip('/'), namespace)
  process_items(contains_hash_url, infiles, zip_and_upload)

  logging.info('Waiting for all files to finish zipping')
  zipping_pool.join()
  logging.info('All files zipped.')

  logging.info('Waiting for all files to finish uploading')
  # Will raise if any exception occurred.
  remote_uploader.join()
  logging.info('All files are uploaded')

  total = len(infiles)
  total_size = sum(metadata.get('s', 0) for metadata in infiles.itervalues())
  logging.info(
      'Total: %6d, %9.1fkb',
      total,
      total_size / 1024.)
  cache_hit = set(infiles.iterkeys()) - set(x[0] for x in uploaded)
  cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
  logging.info(
      'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_hit),
      cache_hit_size / 1024.,
      len(cache_hit) * 100. / total,
      cache_hit_size * 100. / total_size if total_size else 0)
  cache_miss = uploaded
  cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
  logging.info(
      'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
      len(cache_miss),
      cache_miss_size / 1024.,
      len(cache_miss) * 100. / total,
      cache_miss_size * 100. / total_size if total_size else 0)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000299
300
301def main():
302 parser = optparse.OptionParser(
303 usage='%prog [options] <file1..fileN> or - to read from stdin',
304 description=sys.modules[__name__].__doc__)
305 # TODO(maruel): Support both NFS and isolateserver.
306 parser.add_option('-o', '--outdir', help='Remote server to archive to')
307 parser.add_option(
308 '-v', '--verbose',
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000309 action='count', default=0,
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000310 help='Use multiple times to increase verbosity')
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000311 parser.add_option('--namespace', default='default-gzip',
312 help='The namespace to use on the server.')
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000313
314 options, files = parser.parse_args()
315
316 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
317 logging.basicConfig(
318 level=levels[min(len(levels)-1, options.verbose)],
319 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
320 if files == ['-']:
321 files = sys.stdin.readlines()
322
323 if not files:
324 parser.error('Nothing to upload')
325 if not options.outdir:
326 parser.error('Nowhere to send. Please specify --outdir')
327
328 # Load the necessary metadata. This is going to be rewritten eventually to be
329 # more efficient.
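  # Each entry maps a file path to its metadata: 's' is the size in bytes and
  # 'h' is the SHA-1 hex digest used as the key on the server.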
  infiles = dict(
      (
        f,
        {
          's': os.stat(f).st_size,
          'h': sha1_file(f),
        }
      )
      for f in files)

  with run_isolated.Profiler('Archive'):
    upload_sha1_tree(
        base_url=options.outdir,
        indir=os.getcwd(),
        infiles=infiles,
        namespace=options.namespace)
  return 0


if __name__ == '__main__':
  sys.exit(main())