blob: 48c2dd8112902f5e48e68a742a8958c3119c2521 [file] [log] [blame]
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -04001# Copyright 2014 The Swarming Authors. All rights reserved.
2# Use of this source code is governed under the Apache License, Version 2.0 that
3# can be found in the LICENSE file.
4
5"""Understands .isolated files and can do local operations on them."""
6
7import hashlib
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -04008import json
Marc-Antoine Ruel92257792014-08-28 20:51:08 -04009import logging
10import os
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040011import re
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040012import stat
13import sys
14
15from utils import file_path
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -040016from utils import tools
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040017
18
# Version written to, and expected from, .isolated files.
ISOLATED_FILE_VERSION = '1.4'


# Number of bytes requested per read when streaming a file from disk.
DISK_FILE_CHUNK = 1024 * 1024


# Maps the algorithm name to the hashlib constructor. hashlib spells it 'sha1'
# whereas the standard name is 'sha-1', so the names are listed explicitly.
SUPPORTED_ALGOS = {
  'md5': hashlib.md5,
  'sha-1': hashlib.sha1,
  'sha-512': hashlib.sha512,
}


# Inverse of SUPPORTED_ALGOS: hashlib constructor -> name. Used for
# serialization.
SUPPORTED_ALGOS_REVERSE = {
  constructor: name for name, constructor in SUPPORTED_ALGOS.items()
}
38
39
class IsolatedError(ValueError):
  """Raised when a .isolated file cannot be parsed or fails validation."""
43
44
class MappingError(OSError):
  """Raised when the file tree cannot be recreated."""
48
49
def is_valid_hash(value, algo):
  """Returns True if |value| is a valid hex digest for the given algorithm.

  Arguments:
    value: candidate digest. Anything that is not a string is rejected.
    algo: hashlib algorithm class, e.g. hashlib.sha1.
  """
  size = 2 * algo().digest_size
  try:
    # Anchor with \Z rather than $: '$' also matches right before a trailing
    # newline, which would accept '<digest>\n' as a valid hash.
    return bool(re.match(r'^[a-fA-F0-9]{%d}\Z' % size, value))
  except TypeError:
    # |value| is not a string (None, int, list, ...), so it can't be a hash.
    return False
54
55
def get_hash_algo(_namespace):
  """Returns the hashlib algorithm class to use for uploads to |_namespace|.

  The namespace is currently ignored: SHA-1 is always selected.
  """
  # TODO(vadimsh): Implement this at some point.
  selected = hashlib.sha1
  return selected
60
61
def is_namespace_with_compression(namespace):
  """Tells whether objects stored in |namespace| are compressed.

  This is the case when the namespace name ends with '-gzip' or '-deflate'.
  """
  compressed_suffixes = ('-gzip', '-deflate')
  return namespace.endswith(compressed_suffixes)
65
66
def hash_file(filepath, algo):
  """Returns the hex digest of the content of the file at |filepath|.

  The file is read in pieces of DISK_FILE_CHUNK bytes so it is never held in
  memory all at once.

  |algo| should be one of hashlib hashing algorithm.
  """
  hasher = algo()
  with open(filepath, 'rb') as stream:
    # iter() with a sentinel stops on the first empty read, i.e. at EOF.
    for block in iter(lambda: stream.read(DISK_FILE_CHUNK), b''):
      hasher.update(block)
  return hasher.hexdigest()
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040080
81
class IsolatedFile(object):
  """Represents a single parsed .isolated file.

  Attributes:
    obj_hash: hash identifying the .isolated file.
    algo: hashlib algorithm class, handed to load_isolated() by load().
    data: dict returned by load_isolated(); empty until load() is called.
    children: one IsolatedFile per entry of data['includes']; empty until
        load() is called.
  """

  def __init__(self, obj_hash, algo):
    """|obj_hash| is really the sha-1 of the file."""
    self.obj_hash = obj_hash
    self.algo = algo

    # Raw data.
    self.data = {}
    # A IsolatedFile instance, one per hash listed in self.data['includes'].
    self.children = []

    # Set once the .isolated file is loaded.
    self._is_loaded = False

  def __repr__(self):
    return 'IsolatedFile(%s, loaded: %s)' % (self.obj_hash, self._is_loaded)

  def load(self, content):
    """Verifies the .isolated file is valid and loads this object with the json
    data.

    |content| is the raw serialized .isolated file. Must be called at most once
    per instance. The children are created unloaded.
    """
    # Let logging do the formatting: it is skipped when debug logging is off
    # and, unlike the '%' operator, does not choke on a tuple argument.
    logging.debug('IsolatedFile.load(%s)', self.obj_hash)
    assert not self._is_loaded
    self.data = load_isolated(content, self.algo)
    self.children = [
      IsolatedFile(i, self.algo) for i in self.data.get('includes', [])
    ]
    self._is_loaded = True

  @property
  def is_loaded(self):
    """Returns True if 'load' was already called."""
    return self._is_loaded
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400117
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400118
def walk_includes(isolated):
  """Yields |isolated| and every IsolatedFile reachable through its children.

  The walk is depth first and pre-order: a node comes out before its children,
  and children come out left to right. A node that is not loaded yet has no
  children, so it is a leaf.
  """
  # Explicit stack of iterators instead of recursion; the top of the stack is
  # the list of siblings currently being walked.
  pending = [iter((isolated,))]
  while pending:
    try:
      node = next(pending[-1])
    except StopIteration:
      pending.pop()
      continue
    yield node
    # The children are only looked up once the generator is resumed.
    pending.append(iter(node.children))
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400129
130
@tools.profile
def expand_symlinks(indir, relfile):
  """Resolves the symlinks found along |relfile|, relative to |indir|.

  Symlinks pointing outside the build tree are treated as if they were ordinary
  directories/files. This rule is for the benefit of the Chromium OS ebuild,
  which symlinks the output directory to an unrelated path in the chroot.

  Fails when a directory loop is detected, although in theory we could support
  that case.

  Returns:
    Tuple (symlink-free relative path, list of the relative paths of the
    symlinks encountered in the process).

  Raises:
    MappingError: a symlink target doesn't exist or a symlink refers to one of
        its own parent directories.
  """
  sep = os.path.sep
  wants_directory = relfile.endswith(sep)
  walked = indir
  remaining = relfile.strip(sep)
  found_links = []

  while remaining:
    before, link_name, after = file_path.split_at_symlink(walked, remaining)
    if not link_name:
      # No symlink in what is left: take it as is and stop.
      remaining = file_path.fix_native_path_case(walked, remaining)
      walked = os.path.join(walked, remaining)
      break

    link_path = os.path.join(walked, before, link_name)
    after = after.lstrip(sep)
    link_dir = os.path.normpath(os.path.join(walked, before))
    # readlink doesn't exist on Windows.
    # pylint: disable=E1101
    raw_destination = os.readlink(link_path)
    if os.path.isabs(raw_destination):
      # Absolute path are considered a normal directories. The use case is
      # generally someone who puts the output directory on a separate drive.
      destination = raw_destination
    else:
      # The symlink itself could be using the wrong path case.
      destination = file_path.fix_native_path_case(link_dir, raw_destination)

    if not os.path.exists(destination):
      raise MappingError(
          'Symlink target doesn\'t exist: %s -> %s' % (link_path, destination))
    destination = file_path.get_native_path_case(destination)
    if not file_path.path_starts_with(indir, destination):
      # Points outside |indir|: step over the symlink like a plain entry.
      walked = link_path
      remaining = after
      continue
    if file_path.path_starts_with(destination, link_path):
      raise MappingError(
          'Can\'t map recursive symlink reference %s -> %s' %
          (link_path, destination))
    logging.info('Found symlink: %s -> %s', link_path, destination)
    found_links.append(os.path.relpath(link_path, indir))

    # Treat the common prefix of the old and new paths as done, and start
    # scanning again.
    destination_parts = destination.split(sep)
    link_parts = link_path.split(sep)
    limit = min(len(destination_parts), len(link_parts))
    shared = 0
    while shared < limit and destination_parts[shared] == link_parts[shared]:
      shared += 1
    walked = sep.join(destination_parts[:shared])
    remaining = os.path.join(sep.join(destination_parts[shared:]), after)

  relfile = os.path.relpath(walked, indir)
  # Put back the trailing separator if the input had one.
  relfile = relfile.rstrip(sep) + wants_directory * sep
  return relfile, found_links
201
202
@tools.profile
def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
  """Expands a single input. It can result in multiple outputs.

  This function is recursive when relfile is a directory.

  Note: this code doesn't properly handle recursive symlink like one created
  with:
    ln -s .. foo

  Arguments:
    indir: directory that |relfile| is relative to.
    relfile: relative path to expand. It must end with os.path.sep when it is
        a directory.
    blacklist: optional callable taking a relative path and returning True when
        the path must be skipped. Only applied to the content of directories.
    follow_symlinks: when true, symlinks are resolved with expand_symlinks().

  Returns:
    List of relative paths: the symlinks encountered, followed by the file
    itself or by the expanded content of the directory.

  Raises:
    MappingError: |relfile| is absolute, lands outside |indir|, doesn't exist,
        doesn't use the native path case, or the directory can't be listed.
  """
  if os.path.isabs(relfile):
    raise MappingError('Can\'t map absolute path %s' % relfile)

  infile = file_path.normpath(os.path.join(indir, relfile))
  # Compare on a path component boundary. A bare infile.startswith(indir) would
  # accept a sibling sharing the prefix, e.g. '/a/bc/x' for indir '/a/b' when
  # relfile is '../bc/x'. An empty |indir| is not checked, as before.
  root = indir.rstrip(os.path.sep)
  if indir and infile != root and not infile.startswith(root + os.path.sep):
    raise MappingError('Can\'t map file %s outside %s' % (infile, indir))

  filepath = os.path.join(indir, relfile)
  native_filepath = file_path.get_native_path_case(filepath)
  if filepath != native_filepath:
    # Special case './'.
    if filepath != native_filepath + '.' + os.path.sep:
      # While it'd be nice to enforce path casing on Windows, it's impractical.
      # Also give up enforcing strict path case on OSX. Really, it's that sad.
      # The case where it happens is very specific and hard to reproduce:
      # get_native_path_case(
      #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
      # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
      #
      # Note that this is really something deep in OSX because running
      # ls Foo.framework/Versions/A
      # will print out 'Resources', while file_path.get_native_path_case()
      # returns a lower case 'r'.
      #
      # So *something* is happening under the hood resulting in the command 'ls'
      # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree.  We
      # have no idea why.
      if sys.platform not in ('darwin', 'win32'):
        raise MappingError(
            'File path doesn\'t equal native file path\n%s != %s' %
            (filepath, native_filepath))

  symlinks = []
  if follow_symlinks:
    try:
      relfile, symlinks = expand_symlinks(indir, relfile)
    except OSError:
      # The file doesn't exist, it will throw below.
      pass

  if relfile.endswith(os.path.sep):
    if not os.path.isdir(infile):
      raise MappingError(
          '%s is not a directory but ends with "%s"' % (infile, os.path.sep))

    # Special case './'.
    if relfile.startswith('.' + os.path.sep):
      relfile = relfile[2:]
    # The symlinks come first, then the content of the directory.
    outfiles = symlinks
    try:
      for filename in file_path.listdir(infile):
        inner_relfile = os.path.join(relfile, filename)
        if blacklist and blacklist(inner_relfile):
          continue
        if os.path.isdir(os.path.join(indir, inner_relfile)):
          inner_relfile += os.path.sep
        outfiles.extend(
            expand_directory_and_symlink(indir, inner_relfile, blacklist,
                                         follow_symlinks))
      return outfiles
    except OSError as e:
      raise MappingError(
          'Unable to iterate over directory %s.\n%s' % (infile, e))
  else:
    # Always add individual files even if they were blacklisted.
    if os.path.isdir(infile):
      raise MappingError(
          'Input directory %s must have a trailing slash' % infile)

    if not os.path.isfile(infile):
      raise MappingError('Input file %s doesn\'t exist' % infile)

    return symlinks + [relfile]
286
287
def expand_directories_and_symlinks(
    indir, infiles, blacklist, follow_symlinks, ignore_broken_items):
  """Expands every item of |infiles| with expand_directory_and_symlink().

  Directories and symlinks are expanded, the blacklist is applied and the files
  are verified to exist.

  Files are specified in os native path separator.

  When |ignore_broken_items| is true, an item that raises MappingError is
  logged and skipped instead of aborting the whole expansion.
  """
  expanded = []
  for item in infiles:
    try:
      found = expand_directory_and_symlink(
          indir, item, blacklist, follow_symlinks)
    except MappingError as e:
      if ignore_broken_items:
        logging.info('warning: %s', e)
        continue
      raise
    expanded.extend(found)
  return expanded
306
307
@tools.profile
def file_to_metadata(filepath, prevdict, read_only, algo):
  """Processes an input file, a dependency, and return meta data about it.

  Behaviors:
  - Retrieves the file mode, file size, file timestamp, file link
    destination if it is a file link and calculate the hash of the file's
    content if the path points to a file and not a symlink.

  Arguments:
    filepath: File to act on.
    prevdict: the previous dictionary. It is used to retrieve the cached hash
              to skip recalculating it. Optional: may be None or empty.
    read_only: If 1 or 2, the file mode is manipulated. In practice, only save
               one of 4 modes: 0755 (rwx), 0644 (rw), 0555 (rx), 0444 (r). On
               windows, mode is not set since all files are 'executable' by
               default.
    algo:      Hashing algorithm used.

  Returns:
    The necessary dict to create a entry in the 'files' section of an .isolated
    file. Keys: 't' (timestamp) always; 'm' (mode, not on Windows), 's' (size)
    and 'h' (hash) for a file; 'l' (link destination) for a symlink.

  Raises:
    MappingError: |filepath| is missing.
  """
  # TODO(maruel): None is not a valid value.
  assert read_only in (None, 0, 1, 2), read_only
  # |prevdict| is documented as optional; without this, None would crash on
  # the .get() calls below.
  if prevdict is None:
    prevdict = {}
  out = {}
  # Always check the file stat and check if it is a link. The timestamp is used
  # to know if the file's content/symlink destination should be looked into.
  # E.g. only reuse from prevdict if the timestamp hasn't changed.
  # There is the risk of the file's timestamp being reset to its last value
  # manually while its content changed. We don't protect against that use case.
  try:
    filestats = os.lstat(filepath)
  except OSError:
    # The file is not present.
    raise MappingError('%s is missing' % filepath)
  is_link = stat.S_ISLNK(filestats.st_mode)

  if sys.platform != 'win32':
    # Ignore file mode on Windows since it's not really useful there.
    filemode = stat.S_IMODE(filestats.st_mode)
    # Remove write access for group and all access to 'others'.
    filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
    if read_only:
      filemode &= ~stat.S_IWUSR
    if filemode & (stat.S_IXUSR|stat.S_IRGRP) == (stat.S_IXUSR|stat.S_IRGRP):
      # Only keep x group bit if both x user bit and group read bit are set.
      filemode |= stat.S_IXGRP
    else:
      filemode &= ~stat.S_IXGRP
    if not is_link:
      out['m'] = filemode

  # Used to skip recalculating the hash or link destination. Use the most recent
  # update time.
  out['t'] = int(round(filestats.st_mtime))

  if not is_link:
    out['s'] = filestats.st_size
    # If the timestamp wasn't updated and the file size is still the same, carry
    # on the hash.
    if (prevdict.get('t') == out['t'] and
        prevdict.get('s') == out['s']):
      # Reuse the previous hash if available.
      out['h'] = prevdict.get('h')
    if not out.get('h'):
      out['h'] = hash_file(filepath, algo)
  else:
    # If the timestamp wasn't updated, carry on the link destination.
    if prevdict.get('t') == out['t']:
      # Reuse the previous link destination if available.
      out['l'] = prevdict.get('l')
    if out.get('l') is None:
      # The link could be in an incorrect path case. In practice, this only
      # happen on OSX on case insensitive HFS.
      # TODO(maruel): It'd be better if it was only done once, in
      # expand_directory_and_symlink(), so it would not be necessary to do again
      # here.
      symlink_value = os.readlink(filepath)  # pylint: disable=E1101
      filedir = file_path.get_native_path_case(os.path.dirname(filepath))
      native_dest = file_path.fix_native_path_case(filedir, symlink_value)
      out['l'] = os.path.relpath(native_dest, filedir)
  return out
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400391
392
def save_isolated(isolated, data):
  """Validates |data| then writes it as the .isolated file |isolated|.

  Note: this reference implementation does not create child .isolated file so it
  always returns an empty list.

  Returns the list of child isolated files that are included by |isolated|.
  """
  # Run the serialized data through the loader first, so invalid .isolated data
  # is rejected before anything is written.
  hash_algo = SUPPORTED_ALGOS[data['algo']]
  serialized = json.dumps(data)
  load_isolated(serialized, hash_algo)
  tools.write_json(isolated, data, True)
  children = []
  return children
406
407
def load_isolated(content, algo):
  """Verifies the .isolated file is valid and returns the decoded json data.

  Arguments:
  - content: raw serialized content to load.
  - algo: hashlib algorithm class. Used to confirm the algorithm matches the
          algorithm used on the Isolate Server. When None, the algorithm named
          by the 'algo' key of the file is used, 'sha-1' if unspecified.

  Returns:
    The decoded dict. The keys of 'files', the link destinations and
    'relative_cwd' are converted to use os.path.sep.

  Raises:
    IsolatedError: |content| is not valid json, is not a dict, has an
        incompatible version or fails validation.
  """
  try:
    data = json.loads(content)
  except ValueError:
    raise IsolatedError('Failed to parse: %s...' % content[:100])

  if not isinstance(data, dict):
    raise IsolatedError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise IsolatedError('Expected string, got %r' % value)
  try:
    version = tuple(map(int, value.split('.')))
  except ValueError:
    raise IsolatedError('Expected valid version, got %r' % value)

  expected_version = tuple(map(int, ISOLATED_FILE_VERSION.split('.')))
  # Major version must match.
  if version[0] != expected_version[0]:
    raise IsolatedError(
        'Expected compatible \'%s\' version, got %r' %
        (ISOLATED_FILE_VERSION, value))

  if algo is None:
    # TODO(maruel): Remove the default around Jan 2014.
    # Default the algorithm used in the .isolated file itself, falls back to
    # 'sha-1' if unspecified.
    algo_name = data.get('algo', 'sha-1')
    try:
      # The name -> class map is needed here. SUPPORTED_ALGOS_REVERSE is keyed
      # by class, so looking a name up in it always failed with KeyError.
      algo = SUPPORTED_ALGOS[algo_name]
    except (KeyError, TypeError):
      # Unknown name, or an unhashable value such as a list.
      raise IsolatedError(
          'Expected one of \'%s\', got %r' %
          (', '.join(sorted(SUPPORTED_ALGOS)), algo_name))

  for key, value in data.iteritems():
    if key == 'algo':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)
      if value not in SUPPORTED_ALGOS:
        raise IsolatedError(
            'Expected one of \'%s\', got %r' %
            (', '.join(sorted(SUPPORTED_ALGOS)), value))
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise IsolatedError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise IsolatedError('Expected string, got %r' % subvalue)

    elif key == 'files':
      if not isinstance(value, dict):
        raise IsolatedError('Expected dict, got %r' % value)
      for subkey, subvalue in value.iteritems():
        if not isinstance(subkey, basestring):
          raise IsolatedError('Expected string, got %r' % subkey)
        if not isinstance(subvalue, dict):
          raise IsolatedError('Expected dict, got %r' % subvalue)
        for subsubkey, subsubvalue in subvalue.iteritems():
          if subsubkey == 'l':
            if not isinstance(subsubvalue, basestring):
              raise IsolatedError('Expected string, got %r' % subsubvalue)
          elif subsubkey == 'm':
            if not isinstance(subsubvalue, int):
              raise IsolatedError('Expected int, got %r' % subsubvalue)
          elif subsubkey == 'h':
            # Check the type first so a non-string hash is reported as an
            # IsolatedError rather than a TypeError from the regexp match.
            if (not isinstance(subsubvalue, basestring) or
                not is_valid_hash(subsubvalue, algo)):
              raise IsolatedError('Expected sha-1, got %r' % subsubvalue)
          elif subsubkey == 's':
            if not isinstance(subsubvalue, (int, long)):
              raise IsolatedError('Expected int or long, got %r' % subsubvalue)
          else:
            raise IsolatedError('Unknown subsubkey %s' % subsubkey)
        # An entry is either a file ('h' and 's', optionally 'm') or a
        # symlink ('l' alone).
        if bool('h' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
              subvalue)
        if bool('h' in subvalue) != bool('s' in subvalue):
          raise IsolatedError(
              'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
              subvalue)
        if bool('s' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
              subvalue)
        if bool('l' in subvalue) and bool('m' in subvalue):
          raise IsolatedError(
              'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
              subvalue)

    elif key == 'includes':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty includes list')
      for subvalue in value:
        if (not isinstance(subvalue, basestring) or
            not is_valid_hash(subvalue, algo)):
          raise IsolatedError('Expected sha-1, got %r' % subvalue)

    elif key == 'os':
      if version >= (1, 4):
        raise IsolatedError('Key \'os\' is not allowed starting version 1.4')

    elif key == 'read_only':
      if value not in (0, 1, 2):
        raise IsolatedError('Expected 0, 1 or 2, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise IsolatedError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are always
  # in the native path format, someone could want to download an .isolated
  # tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data