blob: 144945c7e07716b458687095a57f888cd907908d [file] [log] [blame]
maruel@chromium.orgc6f90062012-11-07 18:32:22 +00001#!/usr/bin/env python
2# Copyright (c) 2012 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Archives a set of files to a server."""
7
8import binascii
9import hashlib
10import logging
11import optparse
12import os
13import sys
14import time
15import urllib2
16
17import run_isolated
18
19
# The maximum number of upload attempts to try when uploading a single file.
# url_open() gives up and raises once this many attempts have failed.
MAX_UPLOAD_ATTEMPTS = 5

# The minimum size of files to upload directly to the blobstore. Content whose
# length is strictly greater than this value goes through a generated
# blobstore upload url; anything else is posted to the 'store' handler.
# NOTE(review): presumably expressed in bytes, since it is compared against
# len(content) -- confirm.
MIN_SIZE_FOR_DIRECT_BLOBSTORE = 20 * 8
25
26
27def encode_multipart_formdata(fields, files,
28 mime_mapper=lambda _: 'application/octet-stream'):
29 """Encodes a Multipart form data object.
30
31 Args:
32 fields: a sequence (name, value) elements for
33 regular form fields.
34 files: a sequence of (name, filename, value) elements for data to be
35 uploaded as files.
36 mime_mapper: function to return the mime type from the filename.
37 Returns:
38 content_type: for httplib.HTTP instance
39 body: for httplib.HTTP instance
40 """
41 boundary = hashlib.md5(str(time.time())).hexdigest()
42 body_list = []
43 for (key, value) in fields:
44 if isinstance(key, unicode):
45 value = key.encode('utf-8')
46 if isinstance(value, unicode):
47 value = value.encode('utf-8')
48 body_list.append('--' + boundary)
49 body_list.append('Content-Disposition: form-data; name="%s"' % key)
50 body_list.append('')
51 body_list.append(value)
52 body_list.append('--' + boundary)
53 body_list.append('')
54 for (key, filename, value) in files:
55 if isinstance(key, unicode):
56 value = key.encode('utf-8')
57 if isinstance(filename, unicode):
58 value = filename.encode('utf-8')
59 if isinstance(value, unicode):
60 value = value.encode('utf-8')
61 body_list.append('--' + boundary)
62 body_list.append('Content-Disposition: form-data; name="%s"; '
63 'filename="%s"' % (key, filename))
64 body_list.append('Content-Type: %s' % mime_mapper(filename))
65 body_list.append('')
66 body_list.append(value)
67 body_list.append('--' + boundary)
68 body_list.append('')
69 if body_list:
70 body_list[-2] += '--'
71 body = '\r\n'.join(body_list)
72 content_type = 'multipart/form-data; boundary=%s' % boundary
73 return content_type, body
74
75
76def gen_url_request(url, payload, content_type='application/octet-stream'):
77 """Returns a POST request."""
78 request = urllib2.Request(url, data=payload)
79 if payload is not None:
80 request.add_header('Content-Type', content_type)
81 request.add_header('Content-Length', len(payload))
82 return request
83
84
def url_open(url, data, content_type='application/octet-stream'):
  """Opens the given url with the given data, repeating up to
  MAX_UPLOAD_ATTEMPTS times if it encounters an error.

  Arguments:
    url: The url to open.
    data: The data to send to the url, or None to send no body.
    content_type: The Content-Type header sent along with |data|.

  Returns:
    The response from the url.

  Raises:
    run_isolated.MappingError if every attempt failed with urllib2.URLError.
  """
  request = gen_url_request(url, data, content_type)
  last_error = None
  for attempt in range(MAX_UPLOAD_ATTEMPTS):
    try:
      return urllib2.urlopen(request)
    except urllib2.URLError as e:
      last_error = e
      logging.warning('Unable to connect to %s, error msg: %s', url, e)
      # Back off a little longer after each failure, but do not stall the
      # caller once the last attempt has failed: nothing is retried after it.
      if attempt + 1 < MAX_UPLOAD_ATTEMPTS:
        time.sleep(0.5 + attempt)

  # If we get no response from the server after all the attempts, assume it
  # is down and raise an exception.
  raise run_isolated.MappingError(
      'Unable to connect to server, %s, to see which files are presents: %s' %
        (url, last_error))
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000112
113
def upload_hash_content_to_blobstore(generate_upload_url, content):
  """Uploads |content| directly to the blobstore via a generated url.

  Arguments:
    generate_upload_url: The url to get the new upload url from.
    content: The contents to upload.

  When the server hands back an empty upload url, an error is logged and
  nothing is uploaded.
  """
  logging.debug('Generating url to directly upload file to blobstore')
  blobstore_url = url_open(generate_upload_url, None).read()

  if blobstore_url:
    # The content travels as the single file part of a multipart form.
    form_files = [('hash_contents', 'hash_content', content)]
    content_type, body = encode_multipart_formdata([], form_files)
    url_open(blobstore_url, body, content_type)
    return

  logging.error('Unable to generate upload url')
132
133
class UploadRemote(run_isolated.Remote):
  """run_isolated.Remote whose file handler pushes content to the server."""

  @staticmethod
  def get_file_handler(base_url):
    """Returns the function uploading one item below |base_url|/content/."""
    def upload_file(content, hash_key):
      content_url = base_url.rstrip('/') + '/content/'
      namespace = 'default'
      if len(content) <= MIN_SIZE_FOR_DIRECT_BLOBSTORE:
        # Small enough to be posted straight to the 'store' handler.
        store_url = content_url + 'store/' + namespace + '/' + hash_key
        url_open(store_url, content)
        return
      # Anything bigger goes through a generated blobstore upload url.
      generate_url = (
          content_url + 'generate_blobstore_url/' + namespace + '/' +
          hash_key)
      upload_hash_content_to_blobstore(generate_url, content)
    return upload_file
148
149
def update_files_to_upload(query_url, queries, files_to_upload):
  """Asks the server which files of this batch it does not have yet.

  Arguments:
    query_url: The url the concatenated binary digests are posted to.
    queries: The (path, metadata) items to potentially upload to the server;
             metadata['h'] holds the hex digest.
    files_to_upload: Any item of |queries| that needs to be uploaded is
                     appended to this list.

  Raises:
    run_isolated.MappingError if the response length does not match the
    number of queries.
  """
  digests = [binascii.unhexlify(entry['h']) for (_, entry) in queries]
  body = ''.join(digests)
  assert (len(body) % 20) == 0, repr(body)

  response = url_open(query_url, body).read()
  if len(response) != len(queries):
    raise run_isolated.MappingError(
        'Got an incorrect number of responses from the server. Expected %d, '
        'but got %d' % (len(queries), len(response)))

  # One response byte per query, in order; a NUL byte marks a missing file.
  missing = [
      query for query, answer in zip(queries, response) if answer == chr(0)
  ]
  files_to_upload.extend(missing)
  hit = len(queries) - len(missing)
  logging.info('Queried %d files, %d cache hit', len(queries), hit)
175
176
177def upload_sha1_tree(base_url, indir, infiles):
178 """Uploads the given tree to the given url.
179
180 Arguments:
181 base_url: The base url, it is assume that |base_url|/has/ can be used to
182 query if an element was already uploaded, and |base_url|/store/
183 can be used to upload a new element.
184 indir: Root directory the infiles are based in.
185 infiles: dict of files to map from |indir| to |outdir|.
186 """
187 logging.info('upload tree(base_url=%s, indir=%s, files=%d)' %
188 (base_url, indir, len(infiles)))
189
190 # Generate the list of files that need to be uploaded (since some may already
191 # be on the server.
192 base_url = base_url.rstrip('/')
193 contains_hash_url = base_url + '/content/contains/default'
194 to_upload = []
195 next_queries = []
196 for relfile, metadata in infiles.iteritems():
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000197 if 'l' in metadata:
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000198 # Skip links when uploading.
199 continue
200
201 next_queries.append((relfile, metadata))
202 if len(next_queries) == 1000:
203 update_files_to_upload(contains_hash_url, next_queries, to_upload)
204 next_queries = []
205
206 if next_queries:
207 update_files_to_upload(contains_hash_url, next_queries, to_upload)
208
209
210 # Upload the required files.
211 remote_uploader = UploadRemote(base_url)
212 for relfile, metadata in to_upload:
213 # TODO(csharp): Fix crbug.com/150823 and enable the touched logic again.
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000214 # if metadata.get('T') == True:
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000215 # hash_data = ''
216 infile = os.path.join(indir, relfile)
217 with open(infile, 'rb') as f:
218 hash_data = f.read()
219 priority = (run_isolated.Remote.HIGH if metadata.get('priority', '1') == '0'
220 else run_isolated.Remote.MED)
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000221 remote_uploader.add_item(priority, hash_data, metadata['h'], None)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000222 remote_uploader.join()
223
224 exception = remote_uploader.next_exception()
225 if exception:
226 raise exception[0], exception[1], exception[2]
227 total = len(infiles)
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000228 total_size = sum(metadata.get('s', 0) for metadata in infiles.itervalues())
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000229 logging.info(
230 'Total: %6d, %9.1fkb',
231 total,
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000232 sum(m.get('s', 0) for m in infiles.itervalues()) / 1024.)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000233 cache_hit = set(infiles.iterkeys()) - set(x[0] for x in to_upload)
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000234 cache_hit_size = sum(infiles[i].get('s', 0) for i in cache_hit)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000235 logging.info(
236 'cache hit: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
237 len(cache_hit),
238 cache_hit_size / 1024.,
239 len(cache_hit) * 100. / total,
240 cache_hit_size * 100. / total_size)
241 cache_miss = to_upload
maruel@chromium.orge5c17132012-11-21 18:18:46 +0000242 cache_miss_size = sum(infiles[i[0]].get('s', 0) for i in cache_miss)
maruel@chromium.orgc6f90062012-11-07 18:32:22 +0000243 logging.info(
244 'cache miss: %6d, %9.1fkb, %6.2f%% files, %6.2f%% size',
245 len(cache_miss),
246 cache_miss_size / 1024.,
247 len(cache_miss) * 100. / total,
248 cache_miss_size * 100. / total_size)
249
250
def _sha1_of_file(path):
  """Returns the hex sha-1 digest of the raw bytes of the file at |path|."""
  # Binary mode: the upload code reads the file with 'rb', so the digest must
  # be computed over the very same bytes (text mode alters them on Windows).
  with open(path, 'rb') as f:
    return hashlib.sha1(f.read()).hexdigest()


def main():
  """Parses the command line and archives the listed files to --outdir.

  The files are given as arguments, or one per line on stdin when the only
  argument is '-'.

  Returns:
    0 on success. Invalid usage exits through parser.error().
  """
  parser = optparse.OptionParser(
      usage='%prog [options] <file1..fileN> or - to read from stdin',
      description=sys.modules[__name__].__doc__)
  # TODO(maruel): Support both NFS and isolateserver.
  parser.add_option('-o', '--outdir', help='Remote server to archive to')
  parser.add_option(
      '-v', '--verbose',
      action='count',
      # Without a default the option is None when -v is absent, which cannot
      # be used to index |levels| below.
      default=0,
      help='Use multiple times to increase verbosity')

  options, files = parser.parse_args()

  levels = [logging.ERROR, logging.INFO, logging.DEBUG]
  logging.basicConfig(
      level=levels[min(len(levels)-1, options.verbose)],
      format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
  if files == ['-']:
    # Drop the line terminators (they are not part of the file names) and
    # ignore blank lines.
    files = [line.rstrip('\r\n') for line in sys.stdin]
    files = [f for f in files if f]

  if not files:
    parser.error('Nothing to upload')
  if not options.outdir:
    parser.error('Nowhere to send. Please specify --outdir')

  # Load the necessary metadata. This is going to be rewritten eventually to be
  # more efficient.
  infiles = dict(
      (
        f,
        {
          's': os.stat(f).st_size,
          'h': _sha1_of_file(f),
        }
      )
      for f in files)

  with run_isolated.Profiler('Archive'):
    upload_sha1_tree(
        base_url=options.outdir,
        indir=os.getcwd(),
        infiles=infiles)
  return 0
294
295
# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == '__main__':
  sys.exit(main())