blob: c5047b0f94f0d7d4d6c8bbd71bd524ed39f01c35 [file] [log] [blame]
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -04001# Copyright 2014 The Swarming Authors. All rights reserved.
2# Use of this source code is governed under the Apache License, Version 2.0 that
3# can be found in the LICENSE file.
4
5"""Understands .isolated files and can do local operations on them."""
6
7import hashlib
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -04008import json
Marc-Antoine Ruel92257792014-08-28 20:51:08 -04009import logging
10import os
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040011import re
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040012import stat
13import sys
14
15from utils import file_path
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -040016from utils import tools
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040017
18
# Version written to, and expected from, .isolated files.
ISOLATED_FILE_VERSION = '1.4'


# Number of bytes read per call when streaming a file from disk (1 MiB).
DISK_FILE_CHUNK = 1 << 20


# hashlib spells it 'sha1' while the .isolated format uses the standard
# 'sha-1' spelling, so the mapping from format names to hashlib constructors is
# spelled out explicitly.
SUPPORTED_ALGOS = {
  'md5': hashlib.md5,
  'sha-1': hashlib.sha1,
  'sha-512': hashlib.sha512,
}


# Inverse of SUPPORTED_ALGOS (hashlib constructor -> format name), used when
# serializing.
SUPPORTED_ALGOS_REVERSE = dict(
    (algo, name) for name, algo in SUPPORTED_ALGOS.items())
38
39
class IsolatedError(ValueError):
  """Raised when a .isolated file cannot be parsed or fails validation."""
43
44
class MappingError(OSError):
  """Raised when the input tree cannot be recreated (bad or missing paths)."""
48
49
def is_valid_hash(value, algo):
  """Returns True if |value| is a valid hex digest for the hash class |algo|.

  Arguments:
    value: candidate digest. Anything that is not a text string is reported as
           invalid instead of raising.
    algo: hashlib-style constructor; its digest_size defines the expected
          length (two hex characters per digest byte).
  """
  size = 2 * algo().digest_size
  try:
    # \Z anchors at the very end of the string. '$' would also match right
    # before a trailing newline and so accept '<digest>\n'.
    return bool(re.match(r'[a-fA-F0-9]{%d}\Z' % size, value))
  except TypeError:
    # |value| is not something a text pattern can be matched against (e.g. an
    # int or None coming straight out of a JSON document).
    return False
54
55
def get_hash_algo(_namespace):
  """Returns the hash algorithm class used when uploading to |_namespace|.

  The namespace is currently ignored: SHA-1 is returned unconditionally.
  """
  # TODO(vadimsh): Implement this at some point.
  algo = hashlib.sha1
  return algo
60
61
def is_namespace_with_compression(namespace):
  """Returns True if objects stored in |namespace| are compressed.

  A namespace is considered compressed when its name ends with '-gzip' or
  '-deflate'.
  """
  compressed_suffixes = ('-gzip', '-deflate')
  return namespace.endswith(compressed_suffixes)
65
66
def hash_file(filepath, algo):
  """Returns the hex digest of the content of |filepath|.

  The file is streamed in DISK_FILE_CHUNK sized pieces so it is never held in
  memory all at once.

  |algo| should be one of hashlib hashing algorithm.
  """
  hasher = algo()
  with open(filepath, 'rb') as stream:
    # iter() with a sentinel keeps calling read() until it returns the empty
    # byte string, i.e. until end of file.
    for block in iter(lambda: stream.read(DISK_FILE_CHUNK), b''):
      hasher.update(block)
  return hasher.hexdigest()
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040080
81
class IsolatedFile(object):
  """Represents a single parsed .isolated file.

  An instance starts out empty; load() fills in |data| and creates one (not yet
  loaded) child instance per entry of the 'includes' list.
  """

  def __init__(self, obj_hash, algo):
    """Creates an empty, not yet loaded, instance.

    Arguments:
      obj_hash: digest identifying the .isolated file.
      algo: hashlib algorithm class, forwarded to load_isolated() and to the
            children created by load().
    """
    self.obj_hash = obj_hash
    self.algo = algo

    # Raw data, as returned by load_isolated().
    self.data = {}
    # A IsolatedFile instance, one per object in self.data['includes'].
    self.children = []

    # Set once the .isolated file is loaded.
    self._is_loaded = False

  def __repr__(self):
    return 'IsolatedFile(%s, loaded: %s)' % (self.obj_hash, self._is_loaded)

  def load(self, content):
    """Verifies the .isolated file is valid and loads this object with the json
    data.

    |content| is the raw serialized .isolated content. Must be called at most
    once per instance. Whatever load_isolated() raises is propagated, in which
    case the instance is left not loaded.
    """
    # Let the logging module do the formatting so it is skipped entirely when
    # debug logging is disabled.
    logging.debug('IsolatedFile.load(%s)', self.obj_hash)
    assert not self._is_loaded
    self.data = load_isolated(content, self.algo)
    self.children = [
      IsolatedFile(i, self.algo) for i in self.data.get('includes', [])
    ]
    self._is_loaded = True

  @property
  def is_loaded(self):
    """Returns True if 'load' was already called."""
    return self._is_loaded
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400117
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400118
def walk_includes(isolated):
  """Yields every IsolatedFile reachable from |isolated|, in pre-order.

  The root comes first, then each child's whole subtree, children being taken
  left to right. A node's |children| list is only looked at after the node was
  yielded, so nodes that are not loaded yet are considered childless.
  """
  yield isolated
  # Stack of iterators over the children lists still being walked; the top of
  # the stack belongs to the most recently yielded node.
  pending = [iter(isolated.children)]
  while pending:
    try:
      node = next(pending[-1])
    except StopIteration:
      pending.pop()
      continue
    yield node
    pending.append(iter(node.children))
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400129
130
def expand_symlinks(indir, relfile):
  """Follows symlinks in |relfile|, but treating symlinks that point outside the
  build tree as if they were ordinary directories/files. Returns the final
  symlink-free target and a list of paths to symlinks encountered in the
  process.

  The rule about symlinks outside the build tree is for the benefit of the
  Chromium OS ebuild, which symlinks the output directory to an unrelated path
  in the chroot.

  Fails when a directory loop is detected, although in theory we could support
  that case.

  Arguments:
    indir: root directory that |relfile| is relative to.
    relfile: path relative to |indir|; a trailing os.path.sep marks a directory
             and is preserved on the returned path.

  Returns:
    Tuple (relfile, symlinks), both expressed relative to |indir|.

  Raises:
    MappingError if a symlink target doesn't exist or if a symlink points to
    one of its own parent directories.
  """
  is_directory = relfile.endswith(os.path.sep)
  # |done| is the part of the path already resolved, |todo| the part still to
  # be scanned for symlinks.
  done = indir
  todo = relfile.strip(os.path.sep)
  symlinks = []

  while todo:
    # NOTE(review): split_at_symlink() presumably splits |todo| around the
    # first component that is a symlink; confirm against utils/file_path.
    pre_symlink, symlink, post_symlink = file_path.split_at_symlink(
        done, todo)
    if not symlink:
      # No symlink left in |todo|: fix the path case and finish.
      todo = file_path.fix_native_path_case(done, todo)
      done = os.path.join(done, todo)
      break
    symlink_path = os.path.join(done, pre_symlink, symlink)
    post_symlink = post_symlink.lstrip(os.path.sep)
    # readlink doesn't exist on Windows.
    # pylint: disable=E1101
    # Directory containing the symlink; relative link values are resolved
    # against it below.
    target = os.path.normpath(os.path.join(done, pre_symlink))
    symlink_target = os.readlink(symlink_path)
    if os.path.isabs(symlink_target):
      # Absolute paths are considered normal directories. The use case is
      # generally someone who puts the output directory on a separate drive.
      target = symlink_target
    else:
      # The symlink itself could be using the wrong path case.
      target = file_path.fix_native_path_case(target, symlink_target)

    if not os.path.exists(target):
      raise MappingError(
          'Symlink target doesn\'t exist: %s -> %s' % (symlink_path, target))
    target = file_path.get_native_path_case(target)
    if not file_path.path_starts_with(indir, target):
      # The target is outside |indir|: keep the symlink as if it were a plain
      # path component and carry on with what follows it.
      done = symlink_path
      todo = post_symlink
      continue
    if file_path.path_starts_with(target, symlink_path):
      # The symlink points at one of its own parent directories.
      raise MappingError(
          'Can\'t map recursive symlink reference %s -> %s' %
          (symlink_path, target))
    logging.info('Found symlink: %s -> %s', symlink_path, target)
    symlinks.append(os.path.relpath(symlink_path, indir))
    # Treat the common prefix of the old and new paths as done, and start
    # scanning again.
    target = target.split(os.path.sep)
    symlink_path = symlink_path.split(os.path.sep)
    prefix_length = 0
    for target_piece, symlink_path_piece in zip(target, symlink_path):
      if target_piece == symlink_path_piece:
        prefix_length += 1
      else:
        break
    done = os.path.sep.join(target[:prefix_length])
    todo = os.path.join(
        os.path.sep.join(target[prefix_length:]), post_symlink)

  relfile = os.path.relpath(done, indir)
  # bool * str: the separator is appended back only when the input had one.
  relfile = relfile.rstrip(os.path.sep) + is_directory * os.path.sep
  return relfile, symlinks
201
202
def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
  """Expands a single input. It can result in multiple outputs.

  This function is recursive when relfile is a directory.

  Note: this code doesn't properly handle recursive symlink like one created
  with:
    ln -s .. foo

  Arguments:
    indir: root directory; every result is relative to it.
    relfile: path relative to |indir|. Directories must end with os.path.sep.
    blacklist: optional callable taking a relative path and returning True
               when the item must be skipped. It is only applied to directory
               content, never to |relfile| itself.
    follow_symlinks: if true, symlinks are resolved via expand_symlinks() and
                     are themselves part of the result.

  Returns:
    List of paths relative to |indir|.

  Raises:
    MappingError if |relfile| is absolute, escapes |indir|, doesn't exist, has
    an unexpected path case or is a directory without a trailing separator.
  """
  if os.path.isabs(relfile):
    raise MappingError('Can\'t map absolute path %s' % relfile)

  infile = file_path.normpath(os.path.join(indir, relfile))
  # The containment test must be done on a path component boundary: a plain
  # startswith(indir) would accept the sibling '/foo/barbaz' as being inside
  # '/foo/bar'.
  indir_prefix = indir.rstrip(os.path.sep) + os.path.sep
  if infile != indir and not infile.startswith(indir_prefix):
    raise MappingError('Can\'t map file %s outside %s' % (infile, indir))

  filepath = os.path.join(indir, relfile)
  native_filepath = file_path.get_native_path_case(filepath)
  # While it'd be nice to enforce path casing on Windows, it's impractical.
  # Also give up enforcing strict path case on OSX. Really, it's that sad.
  # The case where it happens is very specific and hard to reproduce:
  # get_native_path_case(
  #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
  # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
  #
  # Note that this is really something deep in OSX because running
  # ls Foo.framework/Versions/A
  # will print out 'Resources', while file_path.get_native_path_case()
  # returns a lower case 'r'.
  #
  # So *something* is happening under the hood resulting in the command 'ls'
  # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree.  We
  # have no idea why.
  if (filepath != native_filepath and
      # Special case './'.
      filepath != native_filepath + '.' + os.path.sep and
      sys.platform not in ('darwin', 'win32')):
    raise MappingError(
        'File path doesn\'t equal native file path\n%s != %s' %
        (filepath, native_filepath))

  symlinks = []
  if follow_symlinks:
    relfile, symlinks = expand_symlinks(indir, relfile)

  if relfile.endswith(os.path.sep):
    if not os.path.isdir(infile):
      raise MappingError(
          '%s is not a directory but ends with "%s"' % (infile, os.path.sep))

    # Special case './'.
    if relfile.startswith('.' + os.path.sep):
      relfile = relfile[2:]
    # The symlinks found on the way come first, followed by the directory
    # content.
    outfiles = symlinks
    try:
      for filename in os.listdir(infile):
        inner_relfile = os.path.join(relfile, filename)
        if blacklist and blacklist(inner_relfile):
          continue
        if os.path.isdir(os.path.join(indir, inner_relfile)):
          inner_relfile += os.path.sep
        outfiles.extend(
            expand_directory_and_symlink(indir, inner_relfile, blacklist,
                                         follow_symlinks))
      return outfiles
    except OSError as e:
      # MappingError derives from OSError, so a failure raised while recursing
      # is also wrapped here and gains one line of context per directory level.
      raise MappingError(
          'Unable to iterate over directory %s.\n%s' % (infile, e))
  else:
    # Always add individual files even if they were blacklisted.
    if os.path.isdir(infile):
      raise MappingError(
          'Input directory %s must have a trailing slash' % infile)

    if not os.path.isfile(infile):
      raise MappingError('Input file %s doesn\'t exist' % infile)

    return symlinks + [relfile]
281
282
def expand_directories_and_symlinks(
    indir, infiles, blacklist, follow_symlinks, ignore_broken_items):
  """Expands every entry of |infiles| into the list of files it stands for.

  Directories and symlinks are expanded, the blacklist is applied and files
  are verified to exist, all of it being delegated to
  expand_directory_and_symlink().

  Files are specified in os native path separator.

  When |ignore_broken_items| is true, an entry that fails with MappingError is
  logged and skipped instead of aborting the whole expansion.
  """
  expanded = []
  for item in infiles:
    try:
      found = expand_directory_and_symlink(
          indir, item, blacklist, follow_symlinks)
    except MappingError as err:
      if ignore_broken_items:
        logging.info('warning: %s', err)
        continue
      raise
    expanded.extend(found)
  return expanded
301
302
def file_to_metadata(filepath, prevdict, read_only, algo):
  """Processes an input file, a dependency, and return meta data about it.

  Behaviors:
  - Retrieves the file mode, file size, file timestamp, file link
    destination if it is a file link and calculates the hash of the file's
    content if the path points to a file and not a symlink.

  Arguments:
    filepath: File to act on.
    prevdict: the previous dictionary. It is used to retrieve the cached hash
              to skip recalculating it. Optional: None or an empty dict mean
              nothing is cached.
    read_only: If 1 or 2, the file mode is manipulated. In practice, only save
               one of 4 modes: 0755 (rwx), 0644 (rw), 0555 (rx), 0444 (r). On
               windows, mode is not set since all files are 'executable' by
               default.
    algo:      Hashing algorithm used.

  Returns:
    The necessary dict to create a entry in the 'files' section of an .isolated
    file. Keys: 't' (timestamp), then 's' (size), 'h' (hash) and, except on
    Windows, 'm' (mode) for a file, or 'l' (link destination) for a symlink.

  Raises:
    MappingError if |filepath| cannot be stat'ed.
  """
  if prevdict is None:
    # Documented as optional; treat None like "nothing cached".
    prevdict = {}
  out = {}
  # Always check the file stat and check if it is a link. The timestamp is used
  # to know if the file's content/symlink destination should be looked into.
  # E.g. only reuse from prevdict if the timestamp hasn't changed.
  # There is the risk of the file's timestamp being reset to its last value
  # manually while its content changed. We don't protect against that use case.
  try:
    filestats = os.lstat(filepath)
  except OSError:
    # The file is not present.
    raise MappingError('%s is missing' % filepath)
  is_link = stat.S_ISLNK(filestats.st_mode)

  if sys.platform != 'win32':
    # Ignore file mode on Windows since it's not really useful there.
    filemode = stat.S_IMODE(filestats.st_mode)
    # Remove write access for group and all access to 'others'.
    filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
    if read_only:
      filemode &= ~stat.S_IWUSR
    # Make the group executable bit mirror the user one.
    if filemode & stat.S_IXUSR:
      filemode |= stat.S_IXGRP
    else:
      filemode &= ~stat.S_IXGRP
    if not is_link:
      out['m'] = filemode

  # Used to skip recalculating the hash or link destination. Use the most recent
  # update time.
  out['t'] = int(round(filestats.st_mtime))

  if not is_link:
    out['s'] = filestats.st_size
    # If the timestamp wasn't updated and the file size is still the same, carry
    # on the hash.
    if (prevdict.get('t') == out['t'] and
        prevdict.get('s') == out['s']):
      # Reuse the previous hash if available.
      out['h'] = prevdict.get('h')
    if not out.get('h'):
      out['h'] = hash_file(filepath, algo)
  else:
    # If the timestamp wasn't updated, carry on the link destination.
    if prevdict.get('t') == out['t']:
      # Reuse the previous link destination if available.
      out['l'] = prevdict.get('l')
    if out.get('l') is None:
      # The link could be in an incorrect path case. In practice, this only
      # happen on OSX on case insensitive HFS.
      # TODO(maruel): It'd be better if it was only done once, in
      # expand_directory_and_symlink(), so it would not be necessary to do again
      # here.
      symlink_value = os.readlink(filepath)  # pylint: disable=E1101
      filedir = file_path.get_native_path_case(os.path.dirname(filepath))
      native_dest = file_path.fix_native_path_case(filedir, symlink_value)
      out['l'] = os.path.relpath(native_dest, filedir)
  return out
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400382
383
def save_isolated(isolated, data):
  """Validates |data| and writes it as the .isolated file |isolated|.

  Note: this reference implementation does not create child .isolated file so it
  always returns an empty list.

  Returns the list of child isolated files that are included by |isolated|.
  """
  hash_class = SUPPORTED_ALGOS[data['algo']]
  # Round-trip the data through the loader so that invalid .isolated data is
  # rejected before anything is written.
  serialized = json.dumps(data)
  load_isolated(serialized, hash_class)
  tools.write_json(isolated, data, True)
  return []
397
398
def _check_algo_name(value):
  """Raises IsolatedError unless |value| names a supported hash algorithm."""
  if not isinstance(value, basestring):
    raise IsolatedError('Expected string, got %r' % value)
  if value not in SUPPORTED_ALGOS:
    raise IsolatedError(
        'Expected one of \'%s\', got %r' %
        (', '.join(sorted(SUPPORTED_ALGOS)), value))


def _check_hash(value, algo):
  """Raises IsolatedError unless |value| is a valid hex digest for |algo|."""
  # The type is checked first so that a non-string JSON value is reported as
  # an IsolatedError instead of blowing up while being matched.
  if not isinstance(value, basestring) or not is_valid_hash(value, algo):
    raise IsolatedError('Expected sha-1, got %r' % value)


def _check_files(files, algo):
  """Raises IsolatedError unless |files| is a valid 'files' section.

  Each entry maps a path to a dict of properties: 'h' (hash) and 's' (size)
  for a file, optionally with 'm' (mode), or 'l' (link destination) alone for
  a symlink.
  """
  if not isinstance(files, dict):
    raise IsolatedError('Expected dict, got %r' % files)
  for path, props in files.iteritems():
    if not isinstance(path, basestring):
      raise IsolatedError('Expected string, got %r' % path)
    if not isinstance(props, dict):
      raise IsolatedError('Expected dict, got %r' % props)
    for prop, propvalue in props.iteritems():
      if prop == 'l':
        if not isinstance(propvalue, basestring):
          raise IsolatedError('Expected string, got %r' % propvalue)
      elif prop == 'm':
        if not isinstance(propvalue, int):
          raise IsolatedError('Expected int, got %r' % propvalue)
      elif prop == 'h':
        _check_hash(propvalue, algo)
      elif prop == 's':
        if not isinstance(propvalue, (int, long)):
          raise IsolatedError('Expected int or long, got %r' % propvalue)
      else:
        raise IsolatedError('Unknown subsubkey %s' % prop)

    has_hash = 'h' in props
    has_link = 'l' in props
    has_size = 's' in props
    if has_hash == has_link:
      raise IsolatedError(
          'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
          props)
    if has_hash != has_size:
      raise IsolatedError(
          'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
          props)
    if has_size == has_link:
      raise IsolatedError(
          'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
          props)
    if has_link and 'm' in props:
      raise IsolatedError(
          'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
          props)


def load_isolated(content, algo):
  """Verifies the .isolated file is valid and loads this object with the json
  data.

  Arguments:
  - content: raw serialized content to load.
  - algo: hashlib algorithm class. Used to confirm the algorithm matches the
          algorithm used on the Isolate Server. When None, the algorithm named
          in the file itself is used, defaulting to 'sha-1'.

  Returns:
    The decoded dict, with the paths in 'files' (keys and 'l' values) and
    'relative_cwd' converted to the native path separator.

  Raises:
    IsolatedError if the content is not valid json or fails validation.
  """
  try:
    data = json.loads(content)
  except ValueError:
    raise IsolatedError('Failed to parse: %s...' % content[:100])

  if not isinstance(data, dict):
    raise IsolatedError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise IsolatedError('Expected string, got %r' % value)
  try:
    version = tuple(map(int, value.split('.')))
  except ValueError:
    raise IsolatedError('Expected valid version, got %r' % value)

  expected_version = tuple(map(int, ISOLATED_FILE_VERSION.split('.')))
  # Major version must match.
  if version[0] != expected_version[0]:
    raise IsolatedError(
        'Expected compatible \'%s\' version, got %r' %
        (ISOLATED_FILE_VERSION, value))

  if algo is None:
    # TODO(maruel): Remove the default around Jan 2014.
    # Default the algorithm used in the .isolated file itself, falls back to
    # 'sha-1' if unspecified.
    algo_name = data.get('algo', 'sha-1')
    _check_algo_name(algo_name)
    # The lookup goes name -> hash class, hence SUPPORTED_ALGOS and not
    # SUPPORTED_ALGOS_REVERSE, which is keyed by hash class.
    algo = SUPPORTED_ALGOS[algo_name]

  for key, value in data.iteritems():
    if key == 'algo':
      _check_algo_name(value)
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise IsolatedError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise IsolatedError('Expected string, got %r' % subvalue)

    elif key == 'files':
      _check_files(value, algo)

    elif key == 'includes':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty includes list')
      for subvalue in value:
        _check_hash(subvalue, algo)

    elif key == 'os':
      if version >= (1, 4):
        raise IsolatedError('Key \'os\' is not allowed starting version 1.4')

    elif key == 'read_only':
      if value not in (0, 1, 2):
        raise IsolatedError('Expected 0, 1 or 2, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise IsolatedError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are always
  # in the native path format, someone could want to download an .isolated
  # tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data