blob: 13ee0925348c63bab381a639bf0ae0c376c3e1c0 [file] [log] [blame]
# Copyright 2014 The Swarming Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0 that
# can be found in the LICENSE file.
4
5"""Understands .isolated files and can do local operations on them."""
6
7import hashlib
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -04008import json
Marc-Antoine Ruel92257792014-08-28 20:51:08 -04009import logging
10import os
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040011import re
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040012import stat
13import sys
14
15from utils import file_path
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -040016from utils import tools
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040017
18
# Version stored and expected in .isolated files. Only the major component has
# to match when a file is loaded; see load_isolated().
ISOLATED_FILE_VERSION = '1.4'


# Chunk size, in bytes, to use when doing disk I/O.
DISK_FILE_CHUNK = 1024 * 1024


# Sadly, hashlib uses 'sha1' instead of the standard 'sha-1' so explicitly
# specify the names here. Maps the name stored in .isolated files to the
# hashlib constructor.
SUPPORTED_ALGOS = {
  'md5': hashlib.md5,
  'sha-1': hashlib.sha1,
  'sha-512': hashlib.sha512,
}


# Used for serialization. Maps the hashlib constructor back to its name.
# items() is used instead of iteritems() so that this module-level statement
# does not prevent the module from being imported on Python 3; the resulting
# dict is the same.
SUPPORTED_ALGOS_REVERSE = dict((v, k) for k, v in SUPPORTED_ALGOS.items())
38
39
class IsolatedError(ValueError):
  """Raised when the content of a .isolated file cannot be loaded or is invalid.
  """
43
44
class MappingError(OSError):
  """Raised when the input tree cannot be mapped, i.e. recreated."""
48
49
def is_valid_hash(value, algo):
  """Returns True if |value| is a valid hex digest for the |algo| algorithm.

  Arguments:
    value: candidate digest, expected to be a string of hexadecimal characters.
    algo: hashlib-style constructor; calling it without argument must return an
          object exposing digest_size.

  A value is valid when it is made only of [a-fA-F0-9] characters and is exactly
  twice digest_size characters long. A value that cannot be matched against a
  text pattern at all (e.g. None or an int) is reported as invalid instead of
  raising TypeError.
  """
  size = 2 * algo().digest_size
  try:
    # \Z anchors at the very end of the string. '$' would also accept a digest
    # followed by a trailing newline.
    return bool(re.match(r'^[a-fA-F0-9]{%d}\Z' % size, value))
  except TypeError:
    return False
54
55
def get_hash_algo(_namespace):
  """Returns the hashlib constructor to use when uploading to |_namespace|.

  The namespace is currently ignored and hashlib.sha1 is always returned.
  """
  # TODO(vadimsh): Implement this at some point.
  default_algo = hashlib.sha1
  return default_algo
60
61
def is_namespace_with_compression(namespace):
  """Tells whether |namespace| holds compressed objects.

  A namespace is considered compressed when its name ends with '-gzip' or
  '-deflate'.
  """
  compressed_suffixes = ('-gzip', '-deflate')
  return any(namespace.endswith(suffix) for suffix in compressed_suffixes)
65
66
def hash_file(filepath, algo):
  """Returns the hex digest of the content of the file at |filepath|.

  The file is read in binary mode, DISK_FILE_CHUNK bytes at a time, so it is
  never held in memory all at once.

  |algo| should be one of hashlib hashing algorithm.
  """
  hasher = algo()
  with open(filepath, 'rb') as stream:
    # An empty read marks the end of the file.
    for block in iter(lambda: stream.read(DISK_FILE_CHUNK), b''):
      hasher.update(block)
  return hasher.hexdigest()
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040080
81
class IsolatedFile(object):
  """Represents a single parsed .isolated file.

  An instance starts empty; load() fills |data| with the parsed content and
  |children| with one not-yet-loaded IsolatedFile per entry of 'includes'.
  """

  def __init__(self, obj_hash, algo):
    """|obj_hash| is really the sha-1 of the file.

    |algo| is passed as-is to load_isolated() and to the children created by
    load().
    """
    self.obj_hash = obj_hash
    self.algo = algo

    # Raw data.
    self.data = {}
    # A IsolatedFile instance, one per object in self.includes.
    self.children = []

    # Set once the .isolated file is loaded.
    self._is_loaded = False

  def __repr__(self):
    return 'IsolatedFile(%s, loaded: %s)' % (self.obj_hash, self._is_loaded)

  def load(self, content):
    """Verifies the .isolated file is valid and loads this object with the json
    data.

    |content| is the raw serialized .isolated content. Must be called at most
    once per instance. Whatever load_isolated() raises is propagated, in which
    case the instance stays not loaded.
    """
    # Let logging do the formatting: it is skipped when debug is disabled and
    # it does not misbehave if obj_hash happens to be a tuple.
    logging.debug('IsolatedFile.load(%s)', self.obj_hash)
    assert not self._is_loaded
    self.data = load_isolated(content, self.algo)
    self.children = [
      IsolatedFile(i, self.algo) for i in self.data.get('includes', [])
    ]
    self._is_loaded = True

  @property
  def is_loaded(self):
    """Returns True if 'load' was already called."""
    return self._is_loaded
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400117
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400118
def walk_includes(isolated):
  """Yields |isolated| then every IsolatedFile reachable through 'children'.

  The traversal is depth first: a node is yielded before its children, and
  children are visited left to right. The children of a node are only looked up
  once the consumer asks for the item following that node, so a node that is
  not loaded yet at that point is considered childless.
  """
  yield isolated
  # Stack of iterators over the children lists currently being walked; the top
  # of the stack is the deepest one.
  pending = [iter(isolated.children)]
  while pending:
    try:
      node = next(pending[-1])
    except StopIteration:
      pending.pop()
      continue
    yield node
    pending.append(iter(node.children))
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400129
130
# Wrap listdir for profiling. Noop if profiling is disabled.
# Used by expand_directory_and_symlink() to enumerate directory entries.
os_listdir = tools.profile(os.listdir)
133
134
@tools.profile
def expand_symlinks(indir, relfile):
  """Follows symlinks in |relfile|, but treating symlinks that point outside the
  build tree as if they were ordinary directories/files. Returns the final
  symlink-free target and a list of paths to symlinks encountered in the
  process.

  The rule about symlinks outside the build tree is for the benefit of the
  Chromium OS ebuild, which symlinks the output directory to an unrelated path
  in the chroot.

  Fails when a directory loop is detected, although in theory we could support
  that case.

  Arguments:
    indir: root directory; |relfile| and the returned paths are relative to it.
    relfile: path relative to |indir|. A trailing os.path.sep marks it as a
             directory and is preserved on the returned path.

  Returns:
    Tuple (relfile, symlinks) where relfile is the resolved path relative to
    |indir| and symlinks is the list of followed symlinks, each relative to
    |indir|.

  Raises:
    MappingError if a symlink target doesn't exist or if a symlink points to one
    of its own parent directories.
  """
  # Remember the trailing separator; it is stripped here and added back at the
  # end.
  is_directory = relfile.endswith(os.path.sep)
  # |done| is the part of the path already processed, |todo| is what is left to
  # scan for symlinks.
  done = indir
  todo = relfile.strip(os.path.sep)
  symlinks = []

  while todo:
    # NOTE(review): presumably splits |todo| at its first symlink component into
    # (path before it, the symlink itself, the remainder) - confirm in
    # utils.file_path.
    pre_symlink, symlink, post_symlink = file_path.split_at_symlink(
        done, todo)
    if not symlink:
      # No symlink left in |todo|; everything is processed.
      todo = file_path.fix_native_path_case(done, todo)
      done = os.path.join(done, todo)
      break
    symlink_path = os.path.join(done, pre_symlink, symlink)
    post_symlink = post_symlink.lstrip(os.path.sep)
    # readlink doesn't exist on Windows.
    # pylint: disable=E1101
    target = os.path.normpath(os.path.join(done, pre_symlink))
    symlink_target = os.readlink(symlink_path)
    if os.path.isabs(symlink_target):
      # Absolute paths are considered normal directories. The use case is
      # generally someone who puts the output directory on a separate drive.
      target = symlink_target
    else:
      # The symlink itself could be using the wrong path case.
      target = file_path.fix_native_path_case(target, symlink_target)

    if not os.path.exists(target):
      raise MappingError(
          'Symlink target doesn\'t exist: %s -> %s' % (symlink_path, target))
    target = file_path.get_native_path_case(target)
    if not file_path.path_starts_with(indir, target):
      # The target is outside |indir|: do not follow the symlink, treat it as an
      # ordinary path component and keep scanning what comes after it.
      done = symlink_path
      todo = post_symlink
      continue
    if file_path.path_starts_with(target, symlink_path):
      # The symlink points to one of its own parents; this is the directory loop
      # case.
      raise MappingError(
          'Can\'t map recursive symlink reference %s -> %s' %
          (symlink_path, target))
    logging.info('Found symlink: %s -> %s', symlink_path, target)
    symlinks.append(os.path.relpath(symlink_path, indir))
    # Treat the common prefix of the old and new paths as done, and start
    # scanning again.
    target = target.split(os.path.sep)
    symlink_path = symlink_path.split(os.path.sep)
    prefix_length = 0
    for target_piece, symlink_path_piece in zip(target, symlink_path):
      if target_piece == symlink_path_piece:
        prefix_length += 1
      else:
        break
    done = os.path.sep.join(target[:prefix_length])
    todo = os.path.join(
        os.path.sep.join(target[prefix_length:]), post_symlink)

  relfile = os.path.relpath(done, indir)
  # is_directory is a bool, so the multiplication yields either os.path.sep or
  # an empty string.
  relfile = relfile.rstrip(os.path.sep) + is_directory * os.path.sep
  return relfile, symlinks
206
207
@tools.profile
def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
  """Expands a single input. It can result in multiple outputs.

  This function is recursive when relfile is a directory.

  Note: this code doesn't properly handle recursive symlink like one created
  with:
    ln -s .. foo

  Arguments:
    indir: root directory that |relfile| is relative to.
    relfile: relative path of the item to expand. It must end with os.path.sep
             if and only if it is a directory.
    blacklist: callable taking a relative path and returning a true value when
               the entry must be skipped while listing a directory. Can be a
               false value to disable filtering.
    follow_symlinks: when true, symlinks in |relfile| are resolved with
                     expand_symlinks() and listed in the output.

  Returns:
    List of paths relative to |indir|: the symlinks that were followed, then the
    file itself or every non-blacklisted file found under the directory.

  Raises:
    MappingError if the path is absolute, is outside |indir|, doesn't exist,
    doesn't have the native path case (except on OSX and Windows), has a
    trailing separator that doesn't match its type or if a directory can't be
    listed.
  """
  if os.path.isabs(relfile):
    raise MappingError('Can\'t map absolute path %s' % relfile)

  infile = file_path.normpath(os.path.join(indir, relfile))
  # NOTE(review): this is a plain string prefix test, so a sibling directory
  # whose name merely starts with |indir| (e.g. '/foo' vs '/foobar') passes it.
  # expand_symlinks() uses file_path.path_starts_with() for a similar check -
  # confirm whether it should be used here too.
  if not infile.startswith(indir):
    raise MappingError('Can\'t map file %s outside %s' % (infile, indir))

  filepath = os.path.join(indir, relfile)
  native_filepath = file_path.get_native_path_case(filepath)
  if filepath != native_filepath:
    # Special case './'.
    if filepath != native_filepath + '.' + os.path.sep:
      # While it'd be nice to enforce path casing on Windows, it's impractical.
      # Also give up enforcing strict path case on OSX. Really, it's that sad.
      # The case where it happens is very specific and hard to reproduce:
      # get_native_path_case(
      #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
      # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
      #
      # Note that this is really something deep in OSX because running
      #  ls Foo.framework/Versions/A
      # will print out 'Resources', while file_path.get_native_path_case()
      # returns a lower case 'r'.
      #
      # So *something* is happening under the hood resulting in the command 'ls'
      # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree.  We
      # have no idea why.
      if sys.platform not in ('darwin', 'win32'):
        raise MappingError(
            'File path doesn\'t equal native file path\n%s != %s' %
            (filepath, native_filepath))

  symlinks = []
  if follow_symlinks:
    # From here |relfile| is the resolved path, while |infile| is still the path
    # that was computed before the symlinks were resolved.
    relfile, symlinks = expand_symlinks(indir, relfile)

  if relfile.endswith(os.path.sep):
    if not os.path.isdir(infile):
      raise MappingError(
          '%s is not a directory but ends with "%s"' % (infile, os.path.sep))

    # Special case './'.
    if relfile.startswith('.' + os.path.sep):
      relfile = relfile[2:]
    # Not a copy: the entries found below are appended to the |symlinks| list.
    outfiles = symlinks
    try:
      for filename in os_listdir(infile):
        inner_relfile = os.path.join(relfile, filename)
        if blacklist and blacklist(inner_relfile):
          continue
        if os.path.isdir(os.path.join(indir, inner_relfile)):
          # Directories must have a trailing separator for the recursive call.
          inner_relfile += os.path.sep
        outfiles.extend(
            expand_directory_and_symlink(indir, inner_relfile, blacklist,
                                         follow_symlinks))
      return outfiles
    except OSError as e:
      # MappingError derives from OSError, so a MappingError raised by the
      # recursive call above is also caught here and wrapped again.
      raise MappingError(
          'Unable to iterate over directory %s.\n%s' % (infile, e))
  else:
    # Always add individual files even if they were blacklisted.
    if os.path.isdir(infile):
      raise MappingError(
          'Input directory %s must have a trailing slash' % infile)

    if not os.path.isfile(infile):
      raise MappingError('Input file %s doesn\'t exist' % infile)

    return symlinks + [relfile]
287
288
def expand_directories_and_symlinks(
    indir, infiles, blacklist, follow_symlinks, ignore_broken_items):
  """Expands every item of |infiles| into the list of files it represents.

  Each item is processed with expand_directory_and_symlink(), which expands
  directories and symlinks, applies |blacklist| and verifies the files exist.
  The results are concatenated in the order of |infiles|.

  When |ignore_broken_items| is true, an item that fails with MappingError is
  logged and skipped; otherwise the exception is propagated.

  Files are specified in os native path separator.
  """
  expanded = []
  for item in infiles:
    try:
      found = expand_directory_and_symlink(
          indir, item, blacklist, follow_symlinks)
    except MappingError as err:
      if ignore_broken_items:
        logging.info('warning: %s', err)
        continue
      raise
    expanded += found
  return expanded
307
308
@tools.profile
def file_to_metadata(filepath, prevdict, read_only, algo):
  """Processes an input file, a dependency, and return meta data about it.

  Behaviors:
  - Retrieves the file mode, file size, file timestamp, file link
    destination if it is a file link and calculate the hash of the file's
    content if the path points to a file and not a symlink.

  Arguments:
    filepath: File to act on.
    prevdict: the previous dictionary. It is used to retrieve the cached hash
              or link destination to skip recalculating it. Optional; None is
              handled like an empty dictionary.
    read_only: If true (1 or 2), the user write bit is removed from the saved
               mode. In all cases the group write bit and every 'others' bit
               are removed, and the group executable bit is made to match the
               user executable bit. On windows, mode is not set since all files
               are 'executable' by default.
    algo: Hashing algorithm used.

  Returns:
    The necessary dict to create a entry in the 'files' section of an .isolated
    file. 't' (timestamp) is always set. A regular file also gets 's' (size),
    'h' (hash) and, except on Windows, 'm' (mode). A symlink gets 'l' (link
    destination) instead.

  Raises:
    MappingError if |filepath| can't be stat'ed.
  """
  if prevdict is None:
    # The argument is documented as optional, accept None instead of failing on
    # the .get() calls below.
    prevdict = {}

  out = {}
  # Always check the file stat and check if it is a link. The timestamp is used
  # to know if the file's content/symlink destination should be looked into.
  # E.g. only reuse from prevdict if the timestamp hasn't changed.
  # There is the risk of the file's timestamp being reset to its last value
  # manually while its content changed. We don't protect against that use case.
  try:
    filestats = os.lstat(filepath)
  except OSError:
    # The file is not present.
    raise MappingError('%s is missing' % filepath)
  is_link = stat.S_ISLNK(filestats.st_mode)

  if sys.platform != 'win32':
    # Ignore file mode on Windows since it's not really useful there.
    filemode = stat.S_IMODE(filestats.st_mode)
    # Remove write access for group and all access to 'others'.
    filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
    if read_only:
      filemode &= ~stat.S_IWUSR
    # Make the group executable bit follow the user executable bit.
    if filemode & stat.S_IXUSR:
      filemode |= stat.S_IXGRP
    else:
      filemode &= ~stat.S_IXGRP
    if not is_link:
      out['m'] = filemode

  # Used to skip recalculating the hash or link destination. Use the most recent
  # update time.
  out['t'] = int(round(filestats.st_mtime))

  if not is_link:
    out['s'] = filestats.st_size
    # If the timestamp wasn't updated and the file size is still the same, carry
    # on the hash.
    if (prevdict.get('t') == out['t'] and
        prevdict.get('s') == out['s']):
      # Reuse the previous hash if available.
      out['h'] = prevdict.get('h')
    if not out.get('h'):
      out['h'] = hash_file(filepath, algo)
  else:
    # If the timestamp wasn't updated, carry on the link destination.
    if prevdict.get('t') == out['t']:
      # Reuse the previous link destination if available.
      out['l'] = prevdict.get('l')
    if out.get('l') is None:
      # The link could be in an incorrect path case. In practice, this only
      # happen on OSX on case insensitive HFS.
      # TODO(maruel): It'd be better if it was only done once, in
      # expand_directory_and_symlink(), so it would not be necessary to do again
      # here.
      symlink_value = os.readlink(filepath)  # pylint: disable=E1101
      filedir = file_path.get_native_path_case(os.path.dirname(filepath))
      native_dest = file_path.fix_native_path_case(filedir, symlink_value)
      out['l'] = os.path.relpath(native_dest, filedir)
  return out
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400389
390
def save_isolated(isolated, data):
  """Validates |data| then writes it as the .isolated file |isolated|.

  |data| must have an 'algo' entry naming one of SUPPORTED_ALGOS. It is
  serialized to json and passed through load_isolated() so that invalid content
  raises before anything is written.

  Note: this reference implementation does not create child .isolated file so it
  always returns an empty list.

  Returns the list of child isolated files that are included by |isolated|.
  """
  hasher = SUPPORTED_ALGOS[data['algo']]
  # Make sure the data is valid .isolated data by 'reloading' it.
  serialized = json.dumps(data)
  load_isolated(serialized, hasher)
  tools.write_json(isolated, data, True)
  return []
404
405
def load_isolated(content, algo):
  """Verifies the .isolated file is valid and loads this object with the json
  data.

  Arguments:
  - content: raw serialized content to load.
  - algo: hashlib algorithm class. Used to confirm the algorithm matches the
          algorithm used on the Isolate Server. When None, the algorithm named
          by the 'algo' key of the file is used, defaulting to 'sha-1'.

  Returns:
    The parsed dict. Paths in 'files' (keys and 'l' values) and 'relative_cwd'
    are converted to use os.path.sep.

  Raises:
    IsolatedError if the content is not valid json or doesn't respect the
    .isolated format.
  """
  try:
    data = json.loads(content)
  except ValueError:
    raise IsolatedError('Failed to parse: %s...' % content[:100])

  if not isinstance(data, dict):
    raise IsolatedError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise IsolatedError('Expected string, got %r' % value)
  try:
    version = tuple(map(int, value.split('.')))
  except ValueError:
    raise IsolatedError('Expected valid version, got %r' % value)

  expected_version = tuple(map(int, ISOLATED_FILE_VERSION.split('.')))
  # Major version must match.
  if version[0] != expected_version[0]:
    raise IsolatedError(
        'Expected compatible \'%s\' version, got %r' %
        (ISOLATED_FILE_VERSION, value))

  if algo is None:
    # TODO(maruel): Remove the default around Jan 2014.
    # Default the algorithm used in the .isolated file itself, falls back to
    # 'sha-1' if unspecified.
    # The name has to be looked up in SUPPORTED_ALGOS (name -> class);
    # SUPPORTED_ALGOS_REVERSE is keyed by class. Validate the name first so an
    # invalid value is reported as IsolatedError instead of KeyError or
    # TypeError.
    algo_name = data.get('algo', 'sha-1')
    if not isinstance(algo_name, basestring):
      raise IsolatedError('Expected string, got %r' % algo_name)
    if algo_name not in SUPPORTED_ALGOS:
      raise IsolatedError(
          'Expected one of \'%s\', got %r' %
          (', '.join(sorted(SUPPORTED_ALGOS)), algo_name))
    algo = SUPPORTED_ALGOS[algo_name]

  for key, value in data.iteritems():
    if key == 'algo':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)
      if value not in SUPPORTED_ALGOS:
        raise IsolatedError(
            'Expected one of \'%s\', got %r' %
            (', '.join(sorted(SUPPORTED_ALGOS)), value))
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise IsolatedError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise IsolatedError('Expected string, got %r' % subvalue)

    elif key == 'files':
      if not isinstance(value, dict):
        raise IsolatedError('Expected dict, got %r' % value)
      for subkey, subvalue in value.iteritems():
        if not isinstance(subkey, basestring):
          raise IsolatedError('Expected string, got %r' % subkey)
        if not isinstance(subvalue, dict):
          raise IsolatedError('Expected dict, got %r' % subvalue)
        for subsubkey, subsubvalue in subvalue.iteritems():
          if subsubkey == 'l':
            if not isinstance(subsubvalue, basestring):
              raise IsolatedError('Expected string, got %r' % subsubvalue)
          elif subsubkey == 'm':
            if not isinstance(subsubvalue, int):
              raise IsolatedError('Expected int, got %r' % subsubvalue)
          elif subsubkey == 'h':
            if not is_valid_hash(subsubvalue, algo):
              raise IsolatedError('Expected sha-1, got %r' % subsubvalue)
          elif subsubkey == 's':
            if not isinstance(subsubvalue, (int, long)):
              raise IsolatedError('Expected int or long, got %r' % subsubvalue)
          else:
            raise IsolatedError('Unknown subsubkey %s' % subsubkey)
        # An entry is either a file ('h' and 's', optionally 'm') or a symlink
        # ('l' only).
        if bool('h' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
              subvalue)
        if bool('h' in subvalue) != bool('s' in subvalue):
          raise IsolatedError(
              'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
              subvalue)
        if bool('s' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
              subvalue)
        if bool('l' in subvalue) and bool('m' in subvalue):
          raise IsolatedError(
              'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
              subvalue)

    elif key == 'includes':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty includes list')
      for subvalue in value:
        if not is_valid_hash(subvalue, algo):
          raise IsolatedError('Expected sha-1, got %r' % subvalue)

    elif key == 'os':
      if version >= (1, 4):
        raise IsolatedError('Key \'os\' is not allowed starting version 1.4')

    elif key == 'read_only':
      if not value in (0, 1, 2):
        raise IsolatedError('Expected 0, 1 or 2, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise IsolatedError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are always
  # in the native path format, someone could want to download an .isolated
  # tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data
548 return data