blob: b1fcc849581fb4bdbad5b37d6c8d02f78fefbe3c [file] [log] [blame]
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00001#!/usr/bin/env python
2# Copyright (c) 2012 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Archives a set of files to a server."""
7
8import binascii
9import hashlib
10import logging
11import optparse
12import os
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +000013import cStringIO
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000014import sys
15import time
16import urllib2
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +000017import zlib
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000018
19import run_isolated
csharp@chromium.org07fa7592013-01-11 18:19:30 +000020import run_test_cases
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000021
22
23# The maximum number of upload attempts to try when uploading a single file.
24MAX_UPLOAD_ATTEMPTS = 5
25
26# The minimum size of files to upload directly to the blobstore.
maruel@chromium.orgaef29f82012-12-12 15:00:42 +000027MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 1024
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000028
csharp@chromium.org07fa7592013-01-11 18:19:30 +000029# The number of files to check the isolate server for each query.
csharp@chromium.org20a888c2013-01-15 15:06:55 +000030ITEMS_PER_CONTAINS_QUERY = 500
csharp@chromium.org07fa7592013-01-11 18:19:30 +000031
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +000032# A list of already compressed extension types that should not receive any
33# compression before being uploaded.
34ALREADY_COMPRESSED_TYPES = [
35 '7z', 'avi', 'cur', 'gif', 'h264', 'jar', 'jpeg', 'jpg', 'pdf', 'png',
36 'wav', 'zip'
37]
38
maruel@chromium.orgc6f90062012-11-07 18:32:22 +000039
def encode_multipart_formdata(fields, files,
                              mime_mapper=lambda _: 'application/octet-stream'):
  """Encodes a Multipart form data object.

  Args:
    fields: a sequence (name, value) elements for
      regular form fields.
    files: a sequence of (name, filename, value) elements for data to be
      uploaded as files.
    mime_mapper: function to return the mime type from the filename.
  Returns:
    content_type: for httplib.HTTP instance
    body: for httplib.HTTP instance
  """
  # The boundary only needs to be unique within this request body.
  boundary = hashlib.md5(str(time.time())).hexdigest()
  body_list = []
  for (key, value) in fields:
    # Encode unicode values to UTF-8 bytes. The previous code mistakenly
    # assigned the encoded key to |value|, corrupting the field's value.
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"' % key)
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  for (key, filename, value) in files:
    # Same UTF-8 encoding; the previous code assigned the encoded key and
    # filename to |value|, which both dropped the encoding and clobbered the
    # file contents.
    if isinstance(key, unicode):
      key = key.encode('utf-8')
    if isinstance(filename, unicode):
      filename = filename.encode('utf-8')
    if isinstance(value, unicode):
      value = value.encode('utf-8')
    body_list.append('--' + boundary)
    body_list.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
    body_list.append('Content-Type: %s' % mime_mapper(filename))
    body_list.append('')
    body_list.append(value)
    body_list.append('--' + boundary)
    body_list.append('')
  if body_list:
    # Terminate the final boundary with '--' per RFC 2046.
    body_list[-2] += '--'
  body = '\r\n'.join(body_list)
  content_type = 'multipart/form-data; boundary=%s' % boundary
  return content_type, body
87
88
def gen_url_request(url, payload, content_type='application/octet-stream'):
  """Builds a urllib2 POST request for |url| carrying |payload|."""
  request = urllib2.Request(url, data=payload)
  if payload is not None:
    # Only describe the body when one is actually attached.
    for header, header_value in (('Content-Type', content_type),
                                 ('Content-Length', len(payload))):
      request.add_header(header, header_value)
  return request
96
97
def sha1_file(filepath):
  """Calculates the SHA-1 of a file without reading it all in memory at once."""
  hasher = hashlib.sha1()
  with open(filepath, 'rb') as stream:
    # Stream the file through the digest in 1 MiB chunks.
    chunk = stream.read(1024 * 1024)
    while chunk:
      hasher.update(chunk)
      chunk = stream.read(1024 * 1024)
  return hasher.hexdigest()
109
110
def url_open(url, data, content_type='application/octet-stream'):
  """Opens the given url with the given data, repeating up to
  MAX_UPLOAD_ATTEMPTS times if it encounters an error.

  Arguments:
    url: The url to open.
    data: The data to send to the url, or None for a body-less request.
    content_type: MIME type of |data|, sent as the Content-Type header.

  Returns:
    The response from the url, or it raises an exception if it failed to get
    a response.
  """
  request = gen_url_request(url, data, content_type)
  last_error = None
  for attempt in range(MAX_UPLOAD_ATTEMPTS):
    try:
      return urllib2.urlopen(request)
    except urllib2.URLError as e:
      last_error = e
      logging.warning('Unable to connect to %s, error msg: %s', url, e)
      # Linear back-off: wait a little longer before each retry.
      time.sleep(0.5 + attempt)

  # If we get no response from the server after MAX_UPLOAD_ATTEMPTS, assume it
  # is down and raise an exception.
  raise run_isolated.MappingError(
      'Unable to connect to server, %s, to see which files are present: %s' %
      (url, last_error))
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000138
139
def upload_hash_content_to_blobstore(generate_upload_url, hash_key, content):
  """Uploads |content| directly to the blobstore via a generated url.

  Arguments:
    generate_upload_url: The url to get the new upload url from.
    hash_key: hash of |content|, used as the uploaded filename.
    content: The raw bytes to upload.
  """
  logging.debug('Generating url to directly upload file to blobstore')
  assert isinstance(hash_key, str), hash_key
  assert isinstance(content, str), (hash_key, content)
  upload_url = url_open(generate_upload_url, None).read()
  if not upload_url:
    # Best effort: nothing more can be done without an upload url.
    logging.error('Unable to generate upload url')
    return
  content_type, body = encode_multipart_formdata(
      [], [('hash_contents', hash_key, content)])
  url_open(upload_url, body, content_type)
160
161
class UploadRemote(run_isolated.Remote):
  """Worker pool that pushes file contents to the isolate server."""

  def __init__(self, namespace, *args, **kwargs):
    super(UploadRemote, self).__init__(*args, **kwargs)
    # Coerced to str to match the str-only handling in get_file_handler().
    self.namespace = str(namespace)

  def get_file_handler(self, base_url):
    """Returns a closure uploading one (content, hash_key) item to |base_url|."""
    base_url = str(base_url)

    def upload_file(content, hash_key):
      hash_key = str(hash_key)
      content_url = '%s/content/' % base_url.rstrip('/')
      if len(content) > MIN_SIZE_FOR_DIRECT_BLOBSTORE:
        # Large payloads are sent straight to the blobstore through a
        # generated one-time url.
        upload_hash_content_to_blobstore(
            '%sgenerate_blobstore_url/%s/%s' % (
                content_url, self.namespace, hash_key),
            hash_key,
            content)
      else:
        url_open(
            '%sstore/%s/%s' % (content_url, self.namespace, hash_key),
            content)
    return upload_file
182
183
def update_files_to_upload(query_url, queries, upload):
  """Queries the server to see which files from this batch already exist there.

  Arguments:
    query_url: The url to send the "contains" query to.
    queries: The (relfile, metadata) pairs that may need to be uploaded.
    upload: Any new files that need to be uploaded are sent to this function.
  """
  # The query body is the concatenation of the raw 20-byte SHA-1 digests.
  body = ''.join(
      (binascii.unhexlify(meta_data['h']) for (_, meta_data) in queries))
  assert (len(body) % 20) == 0, repr(body)

  response = url_open(query_url, body).read()
  if len(queries) != len(response):
    raise run_isolated.MappingError(
        'Got an incorrect number of responses from the server. Expected %d, '
        'but got %d' % (len(queries), len(response)))

  hit = 0
  # The response carries one byte per query; chr(0) means the file is missing
  # from the server and must be uploaded.
  for query, result in zip(queries, response):
    if result == chr(0):
      upload(query)
    else:
      hit += 1
  logging.info('Queried %d files, %d cache hit', len(queries), hit)
208
209
def compression_level(filename):
  """Given a filename calculates the ideal compression level to use."""
  # os.path.splitext() keeps the leading dot ('.zip') while the entries in
  # ALREADY_COMPRESSED_TYPES do not, so strip it before the membership test.
  # Without the strip the lookup never matched and already-compressed files
  # were recompressed at level 7.
  file_ext = os.path.splitext(filename)[1].lstrip('.').lower()
  # TODO(csharp): Profile to find what compression level works best.
  return 0 if file_ext in ALREADY_COMPRESSED_TYPES else 7
215
216
def zip_and_trigger_upload(infile, metadata, upload_function):
  """Deflates |infile| and queues the compressed bytes for upload.

  The compressed payload is handed to |upload_function| along with the file's
  hash from |metadata|.
  """
  compressor = zlib.compressobj(compression_level(infile))
  compressed = cStringIO.StringIO()
  with open(infile, 'rb') as stream:
    # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again
    # (the loop condition used to also test `not metadata['T']`).
    while True:
      data = stream.read(run_isolated.ZIPPED_FILE_CHUNK)
      if not data:
        break
      compressed.write(compressor.compress(data))
  compressed.write(compressor.flush(zlib.Z_FINISH))

  # Items explicitly marked priority '0' jump the upload queue.
  if metadata.get('priority', '1') == '0':
    priority = run_isolated.Remote.HIGH
  else:
    priority = run_isolated.Remote.MED
  upload_function(priority, compressed.getvalue(), metadata['h'], None)
  compressed.close()
235
236
237def upload_sha1_tree(base_url, indir, infiles, namespace):
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000238 """Uploads the given tree to the given url.
239
240 Arguments:
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000241 base_url: The base url, it is assume that |base_url|/has/ can be used to
242 query if an element was already uploaded, and |base_url|/store/
243 can be used to upload a new element.
244 indir: Root directory the infiles are based in.
245 infiles: dict of files to map from |indir| to |outdir|.
246 namespace: The namespace to use on the server.
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000247 """
248 logging.info('upload tree(base_url=%s, indir=%s, files=%d)' %
249 (base_url, indir, len(infiles)))
250
csharp@chromium.org07fa7592013-01-11 18:19:30 +0000251 # Create a pool of workers to zip and upload any files missing from
252 # the server.
253 zipping_pool = run_isolated.ThreadPool(
254 num_threads=run_test_cases.num_processors())
255 remote_uploader = UploadRemote(namespace, base_url)
256
257 # Starts the zip and upload process for a given query. The query is assumed
258 # to be in the format (relfile, metadata).
csharp@chromium.org20a888c2013-01-15 15:06:55 +0000259 uploaded = []
csharp@chromium.org07fa7592013-01-11 18:19:30 +0000260 def zip_and_upload(query):
261 relfile, metadata = query
262 infile = os.path.join(indir, relfile)
263 zipping_pool.add_task(zip_and_trigger_upload, infile, metadata,
264 remote_uploader.add_item)
csharp@chromium.org20a888c2013-01-15 15:06:55 +0000265 uploaded.append(query)
csharp@chromium.org07fa7592013-01-11 18:19:30 +0000266
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000267 # Generate the list of files that need to be uploaded (since some may already
csharp@chromium.org07fa7592013-01-11 18:19:30 +0000268 # be on the server).
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000269 base_url = base_url.rstrip('/')
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000270 contains_hash_url = base_url + '/content/contains/' + namespace
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000271 next_queries = []
272 for relfile, metadata in infiles.iteritems():
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000273 if 'l' in metadata:
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000274 # Skip links when uploading.
275 continue
276
277 next_queries.append((relfile, metadata))
csharp@chromium.org07fa7592013-01-11 18:19:30 +0000278 if len(next_queries) == ITEMS_PER_CONTAINS_QUERY:
279 update_files_to_upload(contains_hash_url, next_queries, zip_and_upload)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000280 next_queries = []
281
282 if next_queries:
csharp@chromium.org07fa7592013-01-11 18:19:30 +0000283 update_files_to_upload(contains_hash_url, next_queries, zip_and_upload)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000284
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000285 logging.info('Waiting for all files to finish zipping')
286 zipping_pool.join()
287 logging.info('All files zipped.')
288
289 logging.info('Waiting for all files to finish uploading')
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000290 remote_uploader.join()
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000291 logging.info('All files are uploaded')
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000292
293 exception = remote_uploader.next_exception()
294 if exception:
295 raise exception[0], exception[1], exception[2]
296 total = len(infiles)
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000297 total_size = sum(metadata.get('s', 0) for metadata in infiles.itervalues())
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000298 logging.info(
299 'Total: %6d, %9.1fkb',
300 total,
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000301 sum(m.get('s', 0) for m in infiles.itervalues()) / 1024.)
csharp@chromium.org20a888c2013-01-15 15:06:55 +0000302 cache_hit = set(infiles.iterkeys()) - set(x[0] for x in uploaded)
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000303 cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000304 logging.info(
305 'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
306 len(cache_hit),
307 cache_hit_size / 1024.,
308 len(cache_hit) * 100. / total,
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000309 cache_hit_size * 100. / total_size if total_size else 0)
csharp@chromium.org20a888c2013-01-15 15:06:55 +0000310 cache_miss = uploaded
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000311 cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000312 logging.info(
313 'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
314 len(cache_miss),
315 cache_miss_size / 1024.,
316 len(cache_miss) * 100. / total,
csharp@chromium.org59c7bcf2012-11-21 21:13:18 +0000317 cache_miss_size * 100. / total_size if total_size else 0)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000318
319
def main():
  """Command line entry point: archives the listed files to --outdir."""
  parser = optparse.OptionParser(
      usage='%prog [options] <file1..fileN> or - to read from stdin',
      description=sys.modules[__name__].__doc__)
  # TODO(maruel): Support both NFS and isolateserver.
  parser.add_option('-o', '--outdir', help='Remote server to archive to')
  parser.add_option(
      '-v', '--verbose',
      action='count', default=0,
      help='Use multiple times to increase verbosity')
  parser.add_option('--namespace', default='default-gzip',
                    help='The namespace to use on the server.')

  options, files = parser.parse_args()

  levels = [logging.ERROR, logging.INFO, logging.DEBUG]
  logging.basicConfig(
      level=levels[min(len(levels)-1, options.verbose)],
      format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
  if files == ['-']:
    # One filename per line. Strip the line terminator that readlines() keeps,
    # otherwise os.stat() below is handed 'name\n' and fails.
    files = [l.rstrip('\r\n') for l in sys.stdin]

  if not files:
    parser.error('Nothing to upload')
  if not options.outdir:
    parser.error('Nowhere to send. Please specify --outdir')

  # Load the necessary metadata. This is going to be rewritten eventually to be
  # more efficient.
  infiles = dict(
      (
        f,
        {
          's': os.stat(f).st_size,
          'h': sha1_file(f),
        }
      )
      for f in files)

  with run_isolated.Profiler('Archive'):
    upload_sha1_tree(
        base_url=options.outdir,
        indir=os.getcwd(),
        infiles=infiles,
        namespace=options.namespace)
  return 0
366
367
368if __name__ == '__main__':
369 sys.exit(main())