blob: 38647b0449b181ac6bfc8c5dabdfadf49ed0395f [file] [log] [blame]
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -04001# Copyright 2014 The Swarming Authors. All rights reserved.
2# Use of this source code is governed under the Apache License, Version 2.0 that
3# can be found in the LICENSE file.
4
5"""Understands .isolated files and can do local operations on them."""
6
7import hashlib
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -04008import json
Marc-Antoine Ruel92257792014-08-28 20:51:08 -04009import logging
10import os
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040011import re
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040012import stat
13import sys
14
15from utils import file_path
maruel12e30012015-10-09 11:55:35 -070016from utils import fs
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -040017from utils import tools
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040018
19
# Version stored and expected in .isolated files.
ISOLATED_FILE_VERSION = '1.4'


# Chunk size, in bytes, to use when doing disk I/O.
DISK_FILE_CHUNK = 1024 * 1024


# Maps the algorithm name as serialized in the 'algo' key of a .isolated file
# to the hashlib constructor. Sadly, hashlib uses 'sha1' instead of the
# standard 'sha-1' so the names are explicitly specified here.
SUPPORTED_ALGOS = {
  'md5': hashlib.md5,
  'sha-1': hashlib.sha1,
  'sha-512': hashlib.sha512,
}


# Inverse of SUPPORTED_ALGOS (hashlib constructor -> serialized name). Used for
# serialization.
# items() is used instead of iteritems() because this statement runs at import
# time; items() yields the same pairs on both Python 2 and Python 3.
SUPPORTED_ALGOS_REVERSE = dict((v, k) for k, v in SUPPORTED_ALGOS.items())
39
40
class IsolatedError(ValueError):
  """Raised when the content of a .isolated file cannot be loaded."""
44
45
class MappingError(OSError):
  """Raised when the tree cannot be recreated."""
49
50
def is_valid_hash(value, algo):
  """Returns True if |value| is a valid hex digest for the algorithm |algo|.

  Arguments:
    value: candidate digest. Anything that is not a string is rejected.
    algo: hashlib algorithm constructor (e.g. hashlib.sha1); its digest_size
          determines the expected number of hexadecimal characters.

  Returns:
    True if |value| consists of exactly 2 * digest_size hexadecimal characters,
    False otherwise.
  """
  size = 2 * algo().digest_size
  try:
    # Anchor with \Z rather than $: '$' also matches right before a trailing
    # newline, which would accept '<digest>\n' as a valid hash.
    return bool(re.match(r'^[a-fA-F0-9]{%d}\Z' % size, value))
  except TypeError:
    # |value| is not a string (e.g. a number or None decoded from json).
    return False
55
56
def get_hash_algo(_namespace):
  """Returns the hash algorithm class to use when uploading to |_namespace|.

  The namespace is currently ignored: hashlib.sha1 is always returned.
  """
  # TODO(vadimsh): Implement this at some point.
  algo = hashlib.sha1
  return algo
61
62
def is_namespace_with_compression(namespace):
  """Tells whether objects in |namespace| are stored compressed.

  A namespace is considered compressed when its name ends with '-gzip' or
  '-deflate'.
  """
  compressed_suffixes = ('-gzip', '-deflate')
  return any(namespace.endswith(suffix) for suffix in compressed_suffixes)
66
67
def hash_file(filepath, algo):
  """Returns the hex digest of the content of the file at |filepath|.

  The file is read DISK_FILE_CHUNK bytes at a time so it is never held
  entirely in memory.

  |algo| should be one of hashlib hashing algorithm.
  """
  hasher = algo()
  with fs.open(filepath, 'rb') as stream:
    data = stream.read(DISK_FILE_CHUNK)
    # An empty read means the end of the file was reached.
    while data:
      hasher.update(data)
      data = stream.read(DISK_FILE_CHUNK)
  return hasher.hexdigest()
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040081
82
class IsolatedFile(object):
  """In-memory representation of one parsed .isolated file."""

  def __init__(self, obj_hash, algo):
    """Creates an instance that is not loaded yet.

    |obj_hash| is really the sha-1 of the file. |algo| is the hashlib algorithm
    class handed to load_isolated() when load() is called.
    """
    # Set once the .isolated file is loaded.
    self._is_loaded = False

    self.obj_hash = obj_hash
    self.algo = algo

    # One IsolatedFile instance per hash listed under the 'includes' key of
    # the loaded data.
    self.children = []
    # Raw data.
    self.data = {}

  def __repr__(self):
    fields = (self.obj_hash, self._is_loaded)
    return 'IsolatedFile(%s, loaded: %s)' % fields

  def load(self, content):
    """Validates |content| as a .isolated file and stores the decoded json.

    A not yet loaded child IsolatedFile is created for every included hash.
    Must be called at most once per instance.
    """
    logging.debug('IsolatedFile.load(%s)' % self.obj_hash)
    assert not self._is_loaded
    self.data = load_isolated(content, self.algo)
    included = []
    for child_hash in self.data.get('includes', []):
      included.append(IsolatedFile(child_hash, self.algo))
    self.children = included
    self._is_loaded = True

  @property
  def is_loaded(self):
    """True once load() has completed."""
    return self._is_loaded
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400118
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400119
def walk_includes(isolated):
  """Yields every IsolatedFile reachable through the include graph.

  The traversal is depth first: |isolated| itself comes first, followed by each
  child and its descendants, left to right. The children of a node are only
  looked up after that node has been yielded. Not yet loaded nodes are
  considered childless.
  """
  yield isolated
  # Stack of iterators over the children lists still being visited; the top of
  # the stack belongs to the most recently yielded node.
  pending = [iter(isolated.children)]
  while pending:
    try:
      node = next(pending[-1])
    except StopIteration:
      pending.pop()
      continue
    yield node
    pending.append(iter(node.children))
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400130
131
@tools.profile
def expand_symlinks(indir, relfile):
  """Follows symlinks in |relfile|, but treating symlinks that point outside the
  build tree as if they were ordinary directories/files. Returns the final
  symlink-free target and a list of paths to symlinks encountered in the
  process.

  The rule about symlinks outside the build tree is for the benefit of the
  Chromium OS ebuild, which symlinks the output directory to an unrelated path
  in the chroot.

  Fails when a directory loop is detected, although in theory we could support
  that case.

  Arguments:
    indir: root directory; |relfile| and the returned paths are relative to it.
    relfile: path relative to |indir|. A trailing os.path.sep marks it as a
             directory and is put back on the returned path.

  Returns:
    Tuple (relfile, symlinks): the resolved path relative to |indir| and the
    list of symlinks that were followed, each relative to |indir|. Symlinks
    whose target is outside |indir| are not listed.

  Raises:
    MappingError: a symlink target doesn't exist or a recursive symlink
                  reference was found.
  """
  is_directory = relfile.endswith(os.path.sep)
  # |done| is the part of the path already processed; |todo| is what remains to
  # be scanned for symlinks, relative to |done|.
  done = indir
  todo = relfile.strip(os.path.sep)
  symlinks = []

  while todo:
    # NOTE(review): presumably splits |todo| at its first symlink component and
    # returns an empty |symlink| when there is none - confirm against
    # file_path.split_at_symlink().
    pre_symlink, symlink, post_symlink = file_path.split_at_symlink(done, todo)
    if not symlink:
      # No symlink left: the remainder is taken as is, modulo path case.
      todo = file_path.fix_native_path_case(done, todo)
      done = os.path.join(done, todo)
      break
    symlink_path = os.path.join(done, pre_symlink, symlink)
    post_symlink = post_symlink.lstrip(os.path.sep)
    # readlink doesn't exist on Windows.
    # pylint: disable=E1101
    # Directory containing the symlink; used as the base to resolve a relative
    # link value.
    target = os.path.normpath(os.path.join(done, pre_symlink))
    symlink_target = os.readlink(symlink_path)
    if os.path.isabs(symlink_target):
      # Absolute path are considered a normal directories. The use case is
      # generally someone who puts the output directory on a separate drive.
      target = symlink_target
    else:
      # The symlink itself could be using the wrong path case.
      target = file_path.fix_native_path_case(target, symlink_target)

    if not os.path.exists(target):
      raise MappingError(
          'Symlink target doesn\'t exist: %s -> %s' % (symlink_path, target))
    target = file_path.get_native_path_case(target)
    if not file_path.path_starts_with(indir, target):
      # The target is outside |indir|: do not follow it, keep the symlink
      # itself as part of the processed path and keep scanning what follows.
      done = symlink_path
      todo = post_symlink
      continue
    if file_path.path_starts_with(target, symlink_path):
      # NOTE(review): presumably true when the symlink lives under its own
      # target, i.e. following it would loop - confirm against
      # file_path.path_starts_with().
      raise MappingError(
          'Can\'t map recursive symlink reference %s -> %s' %
          (symlink_path, target))
    logging.info('Found symlink: %s -> %s', symlink_path, target)
    symlinks.append(os.path.relpath(symlink_path, indir))
    # Treat the common prefix of the old and new paths as done, and start
    # scanning again.
    target = target.split(os.path.sep)
    symlink_path = symlink_path.split(os.path.sep)
    prefix_length = 0
    for target_piece, symlink_path_piece in zip(target, symlink_path):
      if target_piece == symlink_path_piece:
        prefix_length += 1
      else:
        break
    done = os.path.sep.join(target[:prefix_length])
    todo = os.path.join(
        os.path.sep.join(target[prefix_length:]), post_symlink)

  relfile = os.path.relpath(done, indir)
  # bool * str: appends os.path.sep only when the input had a trailing one.
  relfile = relfile.rstrip(os.path.sep) + is_directory * os.path.sep
  return relfile, symlinks
202
203
@tools.profile
def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
  """Expands a single input. It can result in multiple outputs.

  This function is recursive when relfile is a directory.

  Note: this code doesn't properly handle recursive symlink like one created
  with:
    ln -s .. foo

  Arguments:
    indir: root directory that every mapped file must be located in.
    relfile: path relative to |indir|. It must end with os.path.sep if and only
             if it is a directory.
    blacklist: optional callable taking a path relative to |indir| and
               returning True if the item must be skipped. Only applied to the
               items found while listing a directory, never to |relfile|
               itself.
    follow_symlinks: if true, symlinks are resolved with expand_symlinks() and
                     the symlinks found are included in the result.

  Returns:
    List of paths relative to |indir|: the symlinks encountered followed by the
    file itself or, for a directory, by its recursively expanded content.

  Raises:
    MappingError: the path is absolute, is outside |indir|, doesn't exist,
                  doesn't use the native path case, or the presence of a
                  trailing separator doesn't match the type of the item.
  """
  if os.path.isabs(relfile):
    raise MappingError('Can\'t map absolute path %s' % relfile)

  infile = file_path.normpath(os.path.join(indir, relfile))
  # Compare on a path component boundary. A plain infile.startswith(indir)
  # would accept a sibling whose name merely starts with the name of |indir|,
  # e.g. '/a/bar-other/x' for indir '/a/bar'.
  indir_prefix = indir.rstrip(os.path.sep) + os.path.sep
  if not (infile.rstrip(os.path.sep) + os.path.sep).startswith(indir_prefix):
    raise MappingError('Can\'t map file %s outside %s' % (infile, indir))

  filepath = os.path.join(indir, relfile)
  native_filepath = file_path.get_native_path_case(filepath)
  if filepath != native_filepath:
    # Special case './'.
    if filepath != native_filepath + '.' + os.path.sep:
      # While it'd be nice to enforce path casing on Windows, it's impractical.
      # Also give up enforcing strict path case on OSX. Really, it's that sad.
      # The case where it happens is very specific and hard to reproduce:
      # get_native_path_case(
      #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
      # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
      #
      # Note that this is really something deep in OSX because running
      # ls Foo.framework/Versions/A
      # will print out 'Resources', while file_path.get_native_path_case()
      # returns a lower case 'r'.
      #
      # So *something* is happening under the hood resulting in the command 'ls'
      # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree.  We
      # have no idea why.
      if sys.platform not in ('darwin', 'win32'):
        raise MappingError(
            'File path doesn\'t equal native file path\n%s != %s' %
            (filepath, native_filepath))

  symlinks = []
  if follow_symlinks:
    try:
      relfile, symlinks = expand_symlinks(indir, relfile)
    except OSError:
      # The file doesn't exist, it will throw below.
      # NOTE(review): MappingError derives from OSError, so the MappingError
      # raised by expand_symlinks() (e.g. recursive symlink) is also swallowed
      # here. Left as is since callers may rely on it - confirm intent.
      pass

  if relfile.endswith(os.path.sep):
    if not os.path.isdir(infile):
      raise MappingError(
          '%s is not a directory but ends with "%s"' % (infile, os.path.sep))

    # Special case './'.
    if relfile.startswith('.' + os.path.sep):
      relfile = relfile[2:]
    # The symlinks come first, then the content of the directory.
    outfiles = symlinks
    try:
      for filename in fs.listdir(infile):
        inner_relfile = os.path.join(relfile, filename)
        if blacklist and blacklist(inner_relfile):
          continue
        if os.path.isdir(os.path.join(indir, inner_relfile)):
          inner_relfile += os.path.sep
        outfiles.extend(
            expand_directory_and_symlink(indir, inner_relfile, blacklist,
                                         follow_symlinks))
      return outfiles
    except OSError as e:
      # NOTE(review): this also catches the MappingError raised by the
      # recursive calls above and wraps its message once per directory level.
      raise MappingError(
          'Unable to iterate over directory %s.\n%s' % (infile, e))
  else:
    # Always add individual files even if they were blacklisted.
    if os.path.isdir(infile):
      raise MappingError(
          'Input directory %s must have a trailing slash' % infile)

    if not os.path.isfile(infile):
      raise MappingError('Input file %s doesn\'t exist' % infile)

    return symlinks + [relfile]
287
288
def expand_directories_and_symlinks(
    indir, infiles, blacklist, follow_symlinks, ignore_broken_items):
  """Expands every item of |infiles| into the list of files it maps to.

  Directories and symlinks are expanded, the blacklist is applied and the
  existence of the files is verified, one item at a time, through
  expand_directory_and_symlink(). When |ignore_broken_items| is true, an item
  that fails with MappingError is logged and skipped instead of aborting.

  Files are specified in os native path separator.
  """
  expanded = []
  for item in infiles:
    try:
      found = expand_directory_and_symlink(
          indir, item, blacklist, follow_symlinks)
    except MappingError as exc:
      if ignore_broken_items:
        logging.info('warning: %s', exc)
        continue
      raise
    expanded.extend(found)
  return expanded
307
308
@tools.profile
def file_to_metadata(filepath, prevdict, read_only, algo):
  """Processes an input file, a dependency, and return meta data about it.

  Behaviors:
  - Retrieves the file mode, file size, file timestamp, file link
    destination if it is a file link and calculate the hash of the file's
    content if the path points to a file and not a symlink.

  Arguments:
    filepath: File to act on.
    prevdict: the previous dictionary. It is used to retrieve the cached hash
              (or link destination) to skip recalculating it. Optional; None is
              handled like an empty dict.
    read_only: If 1 or 2, the user write bit is removed from the saved file
               mode. Independently of this value, the write bit for the group
               and all the bits for 'others' are always removed, and the group
               execute bit is kept only if both the user execute bit and the
               group read bit are set. On windows, mode is not set since all
               files are 'executable' by default.
    algo: Hashing algorithm used.

  Returns:
    The necessary dict to create a entry in the 'files' section of an .isolated
    file:
    - 't': modification time, rounded to an integer.
    - 'm': file mode; only for non-symlinks and not on Windows.
    - 's' and 'h': size and content hash; only for non-symlinks.
    - 'l': link destination relative to the directory of the symlink; only for
           symlinks.

  Raises:
    MappingError: |filepath| cannot be stat'ed.
  """
  # TODO(maruel): None is not a valid value.
  assert read_only in (None, 0, 1, 2), read_only
  # |prevdict| is documented as optional; without this a None value would
  # crash on the .get() calls below.
  prevdict = prevdict or {}
  out = {}
  # Always check the file stat and check if it is a link. The timestamp is used
  # to know if the file's content/symlink destination should be looked into.
  # E.g. only reuse from prevdict if the timestamp hasn't changed.
  # There is the risk of the file's timestamp being reset to its last value
  # manually while its content changed. We don't protect against that use case.
  try:
    filestats = os.lstat(filepath)
  except OSError:
    # The file is not present.
    raise MappingError('%s is missing' % filepath)
  is_link = stat.S_ISLNK(filestats.st_mode)

  if sys.platform != 'win32':
    # Ignore file mode on Windows since it's not really useful there.
    filemode = stat.S_IMODE(filestats.st_mode)
    # Remove write access for group and all access to 'others'.
    filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
    if read_only:
      filemode &= ~stat.S_IWUSR
    if filemode & (stat.S_IXUSR|stat.S_IRGRP) == (stat.S_IXUSR|stat.S_IRGRP):
      # Only keep x group bit if both x user bit and group read bit are set.
      filemode |= stat.S_IXGRP
    else:
      filemode &= ~stat.S_IXGRP
    if not is_link:
      out['m'] = filemode

  # Used to skip recalculating the hash or link destination. Use the most recent
  # update time.
  out['t'] = int(round(filestats.st_mtime))

  if not is_link:
    out['s'] = filestats.st_size
    # If the timestamp wasn't updated and the file size is still the same, carry
    # on the hash.
    if (prevdict.get('t') == out['t'] and
        prevdict.get('s') == out['s']):
      # Reuse the previous hash if available.
      out['h'] = prevdict.get('h')
    if not out.get('h'):
      out['h'] = hash_file(filepath, algo)
  else:
    # If the timestamp wasn't updated, carry on the link destination.
    if prevdict.get('t') == out['t']:
      # Reuse the previous link destination if available.
      out['l'] = prevdict.get('l')
    if out.get('l') is None:
      # The link could be in an incorrect path case. In practice, this only
      # happen on OSX on case insensitive HFS.
      # TODO(maruel): It'd be better if it was only done once, in
      # expand_directory_and_symlink(), so it would not be necessary to do again
      # here.
      symlink_value = os.readlink(filepath)  # pylint: disable=E1101
      filedir = file_path.get_native_path_case(os.path.dirname(filepath))
      native_dest = file_path.fix_native_path_case(filedir, symlink_value)
      out['l'] = os.path.relpath(native_dest, filedir)
  return out
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400392
393
def save_isolated(isolated, data):
  """Validates |data| then writes it as json to the file |isolated|.

  Note: this reference implementation does not create child .isolated file so it
  always returns an empty list.

  Returns the list of child isolated files that are included by |isolated|.
  """
  hash_algo = SUPPORTED_ALGOS[data['algo']]
  # 'Reload' the serialized data to make sure it is valid .isolated data before
  # anything is written.
  serialized = json.dumps(data)
  load_isolated(serialized, hash_algo)
  tools.write_json(isolated, data, True)
  children = []
  return children
407
408
def load_isolated(content, algo):
  """Verifies the .isolated file is valid and returns the decoded json data.

  Arguments:
  - content: raw serialized content to load.
  - algo: hashlib algorithm class. Used to validate the hashes and to confirm
          the algorithm matches the algorithm used on the Isolate Server. If
          None, the algorithm named by the 'algo' key of the file is used,
          defaulting to 'sha-1' when the key is absent.

  Returns:
    The decoded dict. The path separators found in the keys of 'files', in the
    'l' link destinations and in 'relative_cwd' are converted to os.path.sep.

  Raises:
    IsolatedError: the content is not valid json, is not a dict, has an
                   incompatible major version, or one of its keys or values
                   is invalid.
  """
  try:
    data = json.loads(content)
  except ValueError:
    raise IsolatedError('Failed to parse: %s...' % content[:100])

  if not isinstance(data, dict):
    raise IsolatedError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise IsolatedError('Expected string, got %r' % value)
  try:
    version = tuple(map(int, value.split('.')))
  except ValueError:
    raise IsolatedError('Expected valid version, got %r' % value)

  expected_version = tuple(map(int, ISOLATED_FILE_VERSION.split('.')))
  # Major version must match.
  if version[0] != expected_version[0]:
    raise IsolatedError(
        'Expected compatible \'%s\' version, got %r' %
        (ISOLATED_FILE_VERSION, value))

  if algo is None:
    # TODO(maruel): Remove the default around Jan 2014.
    # Default the algorithm used in the .isolated file itself, falls back to
    # 'sha-1' if unspecified.
    # The name is validated here and looked up in SUPPORTED_ALGOS (name ->
    # class). SUPPORTED_ALGOS_REVERSE is keyed by class, so looking a name up
    # in it always raised KeyError.
    algo_name = data.get('algo', 'sha-1')
    if not isinstance(algo_name, basestring):
      raise IsolatedError('Expected string, got %r' % algo_name)
    if algo_name not in SUPPORTED_ALGOS:
      raise IsolatedError(
          'Expected one of \'%s\', got %r' %
          (', '.join(sorted(SUPPORTED_ALGOS)), algo_name))
    algo = SUPPORTED_ALGOS[algo_name]

  for key, value in data.iteritems():
    if key == 'algo':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)
      if value not in SUPPORTED_ALGOS:
        raise IsolatedError(
            'Expected one of \'%s\', got %r' %
            (', '.join(sorted(SUPPORTED_ALGOS)), value))
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise IsolatedError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise IsolatedError('Expected string, got %r' % subvalue)

    elif key == 'files':
      if not isinstance(value, dict):
        raise IsolatedError('Expected dict, got %r' % value)
      for subkey, subvalue in value.iteritems():
        if not isinstance(subkey, basestring):
          raise IsolatedError('Expected string, got %r' % subkey)
        if not isinstance(subvalue, dict):
          raise IsolatedError('Expected dict, got %r' % subvalue)
        for subsubkey, subsubvalue in subvalue.iteritems():
          if subsubkey == 'l':
            if not isinstance(subsubvalue, basestring):
              raise IsolatedError('Expected string, got %r' % subsubvalue)
          elif subsubkey == 'm':
            if not isinstance(subsubvalue, int):
              raise IsolatedError('Expected int, got %r' % subsubvalue)
          elif subsubkey == 'h':
            # The type is checked first so that a non-string value is reported
            # as an IsolatedError instead of a TypeError from the regexp match.
            if (not isinstance(subsubvalue, basestring) or
                not is_valid_hash(subsubvalue, algo)):
              raise IsolatedError('Expected sha-1, got %r' % subsubvalue)
          elif subsubkey == 's':
            if not isinstance(subsubvalue, (int, long)):
              raise IsolatedError('Expected int or long, got %r' % subsubvalue)
          else:
            raise IsolatedError('Unknown subsubkey %s' % subsubkey)
        # An entry is either a file ('h' and 's', optionally 'm') or a symlink
        # ('l' only).
        if bool('h' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
              subvalue)
        if bool('h' in subvalue) != bool('s' in subvalue):
          raise IsolatedError(
              'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
              subvalue)
        if bool('s' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
              subvalue)
        if bool('l' in subvalue) and bool('m' in subvalue):
          raise IsolatedError(
              'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
              subvalue)

    elif key == 'includes':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty includes list')
      for subvalue in value:
        # Same type guard as for 'h' above.
        if (not isinstance(subvalue, basestring) or
            not is_valid_hash(subvalue, algo)):
          raise IsolatedError('Expected sha-1, got %r' % subvalue)

    elif key == 'os':
      if version >= (1, 4):
        raise IsolatedError('Key \'os\' is not allowed starting version 1.4')

    elif key == 'read_only':
      if value not in (0, 1, 2):
        raise IsolatedError('Expected 0, 1 or 2, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise IsolatedError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are always
  # in the the native path format, someone could want to download an .isolated
  # tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data