blob: a18dbd85f826a2d681584f369ef77d3cdfe19565 [file] [log] [blame]
# Copyright 2014 The Swarming Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0 that
# can be found in the LICENSE file.

"""Understands .isolated files and can do local operations on them."""
6
7import hashlib
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -04008import json
Marc-Antoine Ruel92257792014-08-28 20:51:08 -04009import logging
10import os
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040011import re
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040012import stat
13import sys
14
15from utils import file_path
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -040016from utils import tools
Marc-Antoine Ruel8bee66d2014-08-28 19:02:07 -040017
18
# Format version written to, and expected from, .isolated files.
ISOLATED_FILE_VERSION = '1.4'


# Number of bytes requested per read when doing disk I/O.
DISK_FILE_CHUNK = 1024 * 1024


# Maps the algorithm names used in .isolated files to hashlib constructors.
# hashlib spells it 'sha1' instead of the standard 'sha-1', so the names are
# listed explicitly here.
SUPPORTED_ALGOS = {
  'md5': hashlib.md5,
  'sha-1': hashlib.sha1,
  'sha-512': hashlib.sha512,
}


# Inverse mapping (hashlib constructor -> name). Used for serialization.
SUPPORTED_ALGOS_REVERSE = dict(
    (hasher, name) for name, hasher in SUPPORTED_ALGOS.items())
38
39
class IsolatedError(ValueError):
  """Raised when a .isolated file cannot be loaded."""
43
44
class MappingError(OSError):
  """Raised when the file tree cannot be recreated."""
48
49
def is_valid_hash(value, algo):
  """Returns True if |value| is a valid hex digest for the |algo| algorithm.

  Arguments:
    value: candidate digest. A value the regular expression engine cannot
           match against (e.g. an int or None) is rejected.
    algo: hashlib algorithm class (e.g. hashlib.sha1); its digest size sets the
          expected number of hexadecimal characters.

  Returns:
    True only when |value| is made of exactly 2 * digest_size hexadecimal
    characters, False otherwise.
  """
  size = 2 * algo().digest_size
  try:
    # \Z anchors at the very end of the string; '$' would also accept a digest
    # followed by a trailing newline.
    return bool(re.match(r'^[a-fA-F0-9]{%d}\Z' % size, value))
  except TypeError:
    # re.match() refuses non-string input; such a value is not a hash.
    return False
54
55
def get_hash_algo(_namespace):
  """Returns the hash algorithm class to use when uploading to |namespace|.

  The namespace is currently ignored: hashlib.sha1 is always returned.
  """
  # TODO(vadimsh): Implement this at some point.
  selected = hashlib.sha1
  return selected
60
61
def is_namespace_with_compression(namespace):
  """Tells whether the given |namespace| stores compressed objects.

  A namespace counts as compressed when its name ends with '-gzip' or
  '-deflate'.
  """
  compressed_suffixes = ('-gzip', '-deflate')
  return any(namespace.endswith(suffix) for suffix in compressed_suffixes)
65
66
def hash_file(filepath, algo):
  """Returns the hex digest of a file without reading it all in memory at once.

  The file is opened in binary mode and consumed DISK_FILE_CHUNK bytes at a
  time. |algo| should be one of hashlib hashing algorithm.
  """
  hasher = algo()
  with open(filepath, 'rb') as stream:
    # iter() with a sentinel stops on the first empty read, i.e. end of file.
    for block in iter(lambda: stream.read(DISK_FILE_CHUNK), b''):
      hasher.update(block)
  return hasher.hexdigest()
Marc-Antoine Ruel92257792014-08-28 20:51:08 -040080
81
class IsolatedFile(object):
  """Represents a single parsed .isolated file.

  Attributes:
    obj_hash: hash identifying this .isolated file.
    algo: hashlib algorithm class handed to load_isolated() by load().
    data: dict with the parsed content; empty until load() is called.
    children: one IsolatedFile per entry of data['includes']; empty until
              load() is called.
  """

  def __init__(self, obj_hash, algo):
    """|obj_hash| is really the sha-1 of the file."""
    self.obj_hash = obj_hash
    self.algo = algo

    # Raw data.
    self.data = {}
    # An IsolatedFile instance, one per item in self.data['includes'].
    self.children = []

    # Set once the .isolated file is loaded.
    self._is_loaded = False

  def __repr__(self):
    return 'IsolatedFile(%s, loaded: %s)' % (self.obj_hash, self._is_loaded)

  def load(self, content):
    """Verifies the .isolated file is valid and loads this object with the json
    data.

    |content| is the raw serialized content; it is parsed and validated by
    load_isolated() with self.algo. Must be called at most once per instance.
    """
    # Hand the argument to logging instead of pre-formatting the message, like
    # the other logging calls of this module do.
    logging.debug('IsolatedFile.load(%s)', self.obj_hash)
    assert not self._is_loaded
    self.data = load_isolated(content, self.algo)
    # The included files are created unloaded; they get their own load() call.
    self.children = [
      IsolatedFile(i, self.algo) for i in self.data.get('includes', [])
    ]
    self._is_loaded = True

  @property
  def is_loaded(self):
    """Returns True if 'load' was already called."""
    return self._is_loaded
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400117
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400118
def walk_includes(isolated):
  """Yields the IsolatedFile objects of an include graph, in pre-order.

  The root node comes first, then every child subtree, left to right. The
  children of a node are only looked up once the node itself was yielded, so
  not yet loaded nodes are considered childless.
  """
  exhausted = object()
  yield isolated
  # Explicit stack of child iterators; it plays the role of the recursion.
  pending = [iter(isolated.children)]
  while pending:
    node = next(pending[-1], exhausted)
    if node is exhausted:
      pending.pop()
      continue
    yield node
    pending.append(iter(node.children))
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400129
130
@tools.profile
def expand_symlinks(indir, relfile):
  """Follows the symlinks found along |relfile|, relative to |indir|.

  Symlinks that point outside the build tree are treated as if they were
  ordinary directories/files. That rule is for the benefit of the Chromium OS
  ebuild, which symlinks the output directory to an unrelated path in the
  chroot.

  Returns:
    A tuple (relfile, symlinks): the final symlink-free path relative to
    |indir|, which keeps a trailing os.path.sep if the input had one, and the
    list of paths, relative to |indir|, of the symlinks encountered in the
    process.

  Raises:
    MappingError: if a symlink target doesn't exist, or if a symlink is
    detected as a recursive reference (a directory loop). In theory that last
    case could be supported.
  """
  wants_trailing_sep = relfile.endswith(os.path.sep)
  resolved = indir
  remaining = relfile.strip(os.path.sep)
  found_links = []

  while remaining:
    before, link_name, after = file_path.split_at_symlink(resolved, remaining)
    if not link_name:
      # No symlink in what is left: fix its case, append it and stop.
      remaining = file_path.fix_native_path_case(resolved, remaining)
      resolved = os.path.join(resolved, remaining)
      break

    link_path = os.path.join(resolved, before, link_name)
    after = after.lstrip(os.path.sep)
    # readlink doesn't exist on Windows.
    # pylint: disable=E1101
    link_value = os.readlink(link_path)
    if os.path.isabs(link_value):
      # Absolute path are considered a normal directories. The use case is
      # generally someone who puts the output directory on a separate drive.
      dest = link_value
    else:
      # The symlink itself could be using the wrong path case.
      link_dir = os.path.normpath(os.path.join(resolved, before))
      dest = file_path.fix_native_path_case(link_dir, link_value)

    if not os.path.exists(dest):
      raise MappingError(
          'Symlink target doesn\'t exist: %s -> %s' % (link_path, dest))
    dest = file_path.get_native_path_case(dest)
    if not file_path.path_starts_with(indir, dest):
      # The target is outside |indir|: keep the symlink as a plain path item.
      resolved = link_path
      remaining = after
      continue
    if file_path.path_starts_with(dest, link_path):
      raise MappingError(
          'Can\'t map recursive symlink reference %s -> %s' %
          (link_path, dest))
    logging.info('Found symlink: %s -> %s', link_path, dest)
    found_links.append(os.path.relpath(link_path, indir))

    # Treat the common prefix of the old and new paths as done, and start
    # scanning again.
    dest_parts = dest.split(os.path.sep)
    link_parts = link_path.split(os.path.sep)
    shared = 0
    for dest_part, link_part in zip(dest_parts, link_parts):
      if dest_part != link_part:
        break
      shared += 1
    resolved = os.path.sep.join(dest_parts[:shared])
    remaining = os.path.join(os.path.sep.join(dest_parts[shared:]), after)

  relfile = os.path.relpath(resolved, indir)
  suffix = os.path.sep if wants_trailing_sep else ''
  return relfile.rstrip(os.path.sep) + suffix, found_links
201
202
@tools.profile
def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
  """Expands a single input. It can result in multiple outputs.

  This function is recursive when relfile is a directory.

  Note: this code doesn't properly handle recursive symlink like one created
  with:
    ln -s .. foo

  Arguments:
    indir: root directory; |relfile| must stay inside of it.
    relfile: path relative to |indir|. A directory must end with os.path.sep.
    blacklist: optional callable taking a relative path; directory entries for
               which it returns a true value are skipped.
    follow_symlinks: if true, symlinks in |relfile| are resolved with
                     expand_symlinks().

  Returns:
    The list of relative paths found: the symlinks encountered followed by the
    file itself, or by the expanded content when |relfile| is a directory.

  Raises:
    MappingError: if the path is absolute, is outside |indir|, doesn't have
    the native path case (except on OSX and Windows), doesn't exist, doesn't
    agree with its trailing separator, or if the directory can't be iterated.
  """
  if os.path.isabs(relfile):
    raise MappingError('Can\'t map absolute path %s' % relfile)

  infile = file_path.normpath(os.path.join(indir, relfile))
  # A bare startswith(indir) would also accept a sibling sharing the same
  # prefix, e.g. '/a/bc' for an indir of '/a/b'. Compare against indir followed
  # by a path separator instead.
  root_prefix = indir
  if root_prefix and not root_prefix.endswith(os.path.sep):
    root_prefix += os.path.sep
  if infile != indir and not infile.startswith(root_prefix):
    raise MappingError('Can\'t map file %s outside %s' % (infile, indir))

  filepath = os.path.join(indir, relfile)
  native_filepath = file_path.get_native_path_case(filepath)
  if filepath != native_filepath:
    # Special case './'.
    if filepath != native_filepath + '.' + os.path.sep:
      # While it'd be nice to enforce path casing on Windows, it's impractical.
      # Also give up enforcing strict path case on OSX. Really, it's that sad.
      # The case where it happens is very specific and hard to reproduce:
      # get_native_path_case(
      #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
      # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
      #
      # Note that this is really something deep in OSX because running
      # ls Foo.framework/Versions/A
      # will print out 'Resources', while file_path.get_native_path_case()
      # returns a lower case 'r'.
      #
      # So *something* is happening under the hood resulting in the command 'ls'
      # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree.  We
      # have no idea why.
      if sys.platform not in ('darwin', 'win32'):
        raise MappingError(
            'File path doesn\'t equal native file path\n%s != %s' %
            (filepath, native_filepath))

  symlinks = []
  if follow_symlinks:
    relfile, symlinks = expand_symlinks(indir, relfile)

  if relfile.endswith(os.path.sep):
    if not os.path.isdir(infile):
      raise MappingError(
          '%s is not a directory but ends with "%s"' % (infile, os.path.sep))

    # Special case './'.
    if relfile.startswith('.' + os.path.sep):
      relfile = relfile[2:]
    outfiles = symlinks
    try:
      for filename in file_path.listdir(infile):
        inner_relfile = os.path.join(relfile, filename)
        if blacklist and blacklist(inner_relfile):
          continue
        if os.path.isdir(os.path.join(indir, inner_relfile)):
          inner_relfile += os.path.sep
        outfiles.extend(
            expand_directory_and_symlink(indir, inner_relfile, blacklist,
                                         follow_symlinks))
      return outfiles
    except OSError as e:
      # MappingError derives from OSError, so an error raised by the recursive
      # call above is re-wrapped here as well.
      raise MappingError(
          'Unable to iterate over directory %s.\n%s' % (infile, e))
  else:
    # Always add individual files even if they were blacklisted.
    if os.path.isdir(infile):
      raise MappingError(
          'Input directory %s must have a trailing slash' % infile)

    if not os.path.isfile(infile):
      raise MappingError('Input file %s doesn\'t exist' % infile)

    return symlinks + [relfile]
282
283
def expand_directories_and_symlinks(
    indir, infiles, blacklist, follow_symlinks, ignore_broken_items):
  """Expands every item of |infiles| into the list of files it stands for.

  Directories and symlinks are expanded, the blacklist is applied and the
  files are verified to exist. When |ignore_broken_items| is set, an item that
  fails with MappingError is logged and skipped instead of aborting.

  Files are specified in os native path separator.
  """
  expanded = []
  for item in infiles:
    try:
      expanded.extend(
          expand_directory_and_symlink(
              indir, item, blacklist, follow_symlinks))
    except MappingError as error:
      if ignore_broken_items:
        logging.info('warning: %s', error)
      else:
        raise
  return expanded
302
303
@tools.profile
def file_to_metadata(filepath, prevdict, read_only, algo):
  """Processes an input file, a dependency, and return meta data about it.

  Behaviors:
  - Retrieves the file mode, file size, file timestamp, file link
    destination if it is a file link and calculate the hash of the file's
    content if the path points to a file and not a symlink.

  Arguments:
    filepath: File to act on.
    prevdict: the previous dictionary. It is used to retrieve the cached hash
              to skip recalculating the hash. Optional, None means no cache.
    read_only: If true (1 or 2), the owner write bit is also cleared from the
               file mode. In practice, only save one of 4 modes: 0755 (rwx),
               0644 (rw), 0555 (rx), 0444 (r). On windows, mode is not set
               since all files are 'executable' by default.
    algo: Hashing algorithm used.

  Returns:
    The necessary dict to create a entry in the 'files' section of an .isolated
    file.

  Raises:
    MappingError: if |filepath| can't be stat'ed.
  """
  # |prevdict| is documented as optional; treat None as an empty cache instead
  # of failing on prevdict.get().
  if prevdict is None:
    prevdict = {}

  out = {}
  # Always check the file stat and check if it is a link. The timestamp is used
  # to know if the file's content/symlink destination should be looked into.
  # E.g. only reuse from prevdict if the timestamp hasn't changed.
  # There is the risk of the file's timestamp being reset to its last value
  # manually while its content changed. We don't protect against that use case.
  try:
    filestats = os.lstat(filepath)
  except OSError:
    # The file is not present.
    raise MappingError('%s is missing' % filepath)
  is_link = stat.S_ISLNK(filestats.st_mode)

  if sys.platform != 'win32':
    # Ignore file mode on Windows since it's not really useful there.
    filemode = stat.S_IMODE(filestats.st_mode)
    # Remove write access for group and all access to 'others'.
    filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
    if read_only:
      filemode &= ~stat.S_IWUSR
    # The group execute bit mirrors the owner execute bit.
    if filemode & stat.S_IXUSR:
      filemode |= stat.S_IXGRP
    else:
      filemode &= ~stat.S_IXGRP
    # The mode is not stored for symlinks.
    if not is_link:
      out['m'] = filemode

  # Used to skip recalculating the hash or link destination. Use the most recent
  # update time.
  out['t'] = int(round(filestats.st_mtime))

  if not is_link:
    out['s'] = filestats.st_size
    # If the timestamp wasn't updated and the file size is still the same, carry
    # on the hash.
    if (prevdict.get('t') == out['t'] and
        prevdict.get('s') == out['s']):
      # Reuse the previous hash if available.
      out['h'] = prevdict.get('h')
    if not out.get('h'):
      out['h'] = hash_file(filepath, algo)
  else:
    # If the timestamp wasn't updated, carry on the link destination.
    if prevdict.get('t') == out['t']:
      # Reuse the previous link destination if available.
      out['l'] = prevdict.get('l')
    if out.get('l') is None:
      # The link could be in an incorrect path case. In practice, this only
      # happen on OSX on case insensitive HFS.
      # TODO(maruel): It'd be better if it was only done once, in
      # expand_directory_and_symlink(), so it would not be necessary to do again
      # here.
      symlink_value = os.readlink(filepath)  # pylint: disable=E1101
      filedir = file_path.get_native_path_case(os.path.dirname(filepath))
      native_dest = file_path.fix_native_path_case(filedir, symlink_value)
      out['l'] = os.path.relpath(native_dest, filedir)
  return out
Marc-Antoine Ruel52436aa2014-08-28 21:57:57 -0400384
385
def save_isolated(isolated, data):
  """Validates |data| and writes it to the .isolated file |isolated|.

  Note: this reference implementation does not create child .isolated file so
  the list of child isolated files included by |isolated|, which is the return
  value, is always empty.
  """
  hasher = SUPPORTED_ALGOS[data['algo']]
  # 'Reload' the serialized data so invalid .isolated data is rejected before
  # anything is written.
  serialized = json.dumps(data)
  load_isolated(serialized, hasher)
  tools.write_json(isolated, data, True)
  return []
399
400
def load_isolated(content, algo):
  """Verifies the .isolated file is valid and returns the parsed json data.

  Arguments:
  - content: raw serialized content to load.
  - algo: hashlib algorithm class. Used to confirm the algorithm matches the
          algorithm used on the Isolate Server. If None, the algorithm named
          by the 'algo' key of the file is used, falling back to 'sha-1'.

  Returns:
    The decoded dict. The keys of 'files', the symlink destinations ('l') and
    'relative_cwd' have their path separators converted to os.path.sep.

  Raises:
    IsolatedError: if the content can't be parsed, has an incompatible
    version, or contains an unknown key or an invalid value.
  """
  try:
    data = json.loads(content)
  except ValueError:
    raise IsolatedError('Failed to parse: %s...' % content[:100])

  if not isinstance(data, dict):
    raise IsolatedError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise IsolatedError('Expected string, got %r' % value)
  try:
    version = tuple(map(int, value.split('.')))
  except ValueError:
    raise IsolatedError('Expected valid version, got %r' % value)

  expected_version = tuple(map(int, ISOLATED_FILE_VERSION.split('.')))
  # Major version must match.
  if version[0] != expected_version[0]:
    raise IsolatedError(
        'Expected compatible \'%s\' version, got %r' %
        (ISOLATED_FILE_VERSION, value))

  if algo is None:
    # TODO(maruel): Remove the default around Jan 2014.
    # Default the algorithm used in the .isolated file itself, falls back to
    # 'sha-1' if unspecified.
    algo_name = data.get('algo', 'sha-1')
    if not isinstance(algo_name, basestring):
      raise IsolatedError('Expected string, got %r' % algo_name)
    if algo_name not in SUPPORTED_ALGOS:
      raise IsolatedError(
          'Expected one of \'%s\', got %r' %
          (', '.join(sorted(SUPPORTED_ALGOS)), algo_name))
    # SUPPORTED_ALGOS maps a name to its hashlib class. The reverse mapping is
    # keyed by the class, so it can't be indexed with a name.
    algo = SUPPORTED_ALGOS[algo_name]

  for key, value in data.iteritems():
    if key == 'algo':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)
      if value not in SUPPORTED_ALGOS:
        raise IsolatedError(
            'Expected one of \'%s\', got %r' %
            (', '.join(sorted(SUPPORTED_ALGOS)), value))
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise IsolatedError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise IsolatedError('Expected string, got %r' % subvalue)

    elif key == 'files':
      if not isinstance(value, dict):
        raise IsolatedError('Expected dict, got %r' % value)
      for subkey, subvalue in value.iteritems():
        if not isinstance(subkey, basestring):
          raise IsolatedError('Expected string, got %r' % subkey)
        if not isinstance(subvalue, dict):
          raise IsolatedError('Expected dict, got %r' % subvalue)
        for subsubkey, subsubvalue in subvalue.iteritems():
          if subsubkey == 'l':
            if not isinstance(subsubvalue, basestring):
              raise IsolatedError('Expected string, got %r' % subsubvalue)
          elif subsubkey == 'm':
            if not isinstance(subsubvalue, int):
              raise IsolatedError('Expected int, got %r' % subsubvalue)
          elif subsubkey == 'h':
            # Reject non-string values here so they are reported as an
            # IsolatedError whatever is_valid_hash() does with them.
            if (not isinstance(subsubvalue, basestring) or
                not is_valid_hash(subsubvalue, algo)):
              raise IsolatedError('Expected sha-1, got %r' % subsubvalue)
          elif subsubkey == 's':
            if not isinstance(subsubvalue, (int, long)):
              raise IsolatedError('Expected int or long, got %r' % subsubvalue)
          else:
            raise IsolatedError('Unknown subsubkey %s' % subsubkey)
        # An entry is either a file ('h' and 's', optionally 'm') or a symlink
        # ('l' alone).
        if bool('h' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
              subvalue)
        if bool('h' in subvalue) != bool('s' in subvalue):
          raise IsolatedError(
              'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
              subvalue)
        if bool('s' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
              subvalue)
        if bool('l' in subvalue) and bool('m' in subvalue):
          raise IsolatedError(
              'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
              subvalue)

    elif key == 'includes':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty includes list')
      for subvalue in value:
        if (not isinstance(subvalue, basestring) or
            not is_valid_hash(subvalue, algo)):
          raise IsolatedError('Expected sha-1, got %r' % subvalue)

    elif key == 'os':
      if version >= (1, 4):
        raise IsolatedError('Key \'os\' is not allowed starting version 1.4')

    elif key == 'read_only':
      if value not in (0, 1, 2):
        raise IsolatedError('Expected 0, 1 or 2, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise IsolatedError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are always
  # in the native path format, someone could want to download an .isolated
  # tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data