Marc-Antoine Ruel | 8bee66d | 2014-08-28 19:02:07 -0400 | [diff] [blame] | 1 | # Copyright 2014 The Swarming Authors. All rights reserved. |
| 2 | # Use of this source code is governed under the Apache License, Version 2.0 that |
| 3 | # can be found in the LICENSE file. |
| 4 | |
| 5 | """Understands .isolated files and can do local operations on them.""" |
| 6 | |
| 7 | import hashlib |
Marc-Antoine Ruel | 52436aa | 2014-08-28 21:57:57 -0400 | [diff] [blame^] | 8 | import json |
Marc-Antoine Ruel | 9225779 | 2014-08-28 20:51:08 -0400 | [diff] [blame] | 9 | import logging |
| 10 | import os |
Marc-Antoine Ruel | 8bee66d | 2014-08-28 19:02:07 -0400 | [diff] [blame] | 11 | import re |
Marc-Antoine Ruel | 9225779 | 2014-08-28 20:51:08 -0400 | [diff] [blame] | 12 | import stat |
| 13 | import sys |
| 14 | |
| 15 | from utils import file_path |
Marc-Antoine Ruel | 52436aa | 2014-08-28 21:57:57 -0400 | [diff] [blame^] | 16 | from utils import threading_utils |
| 17 | from utils import tools |
Marc-Antoine Ruel | 8bee66d | 2014-08-28 19:02:07 -0400 | [diff] [blame] | 18 | |
| 19 | |
# Version stored and expected in .isolated files.
ISOLATED_FILE_VERSION = '1.4'


# Chunk size to use when doing disk I/O.
DISK_FILE_CHUNK = 1024 * 1024


# The file size to be used when we don't know the correct file size,
# generally used for .isolated files.
UNKNOWN_FILE_SIZE = None


# Maximum expected delay (in seconds) between successive file fetches
# in run_tha_test. If it takes longer than that, a deadlock might be happening
# and all stack frames for all threads are dumped to log.
DEADLOCK_TIMEOUT = 5 * 60


# Sadly, hashlib uses 'sha1' instead of the standard 'sha-1' so explicitly
# specify the names here. Maps canonical algorithm name -> hashlib class.
SUPPORTED_ALGOS = {
  'md5': hashlib.md5,
  'sha-1': hashlib.sha1,
  'sha-512': hashlib.sha512,
}


# Used for serialization. Inverse of SUPPORTED_ALGOS: maps hashlib class ->
# canonical algorithm name, e.g. hashlib.sha1 -> 'sha-1'.
SUPPORTED_ALGOS_REVERSE = dict((v, k) for k, v in SUPPORTED_ALGOS.iteritems())
| 50 | |
| 51 | |
class IsolatedError(ValueError):
  """Raised when a .isolated file cannot be parsed or fails validation."""
| 55 | |
| 56 | |
class MappingError(OSError):
  """Raised when the file tree cannot be recreated on disk."""
| 60 | |
| 61 | |
def is_valid_hash(value, algo):
  """Returns True if |value| is a well-formed hex digest for |algo|.

  Arguments:
    value: string to validate.
    algo: hashlib algorithm class (e.g. hashlib.sha1); only its digest size
        is used.
  """
  size = 2 * algo().digest_size
  # Anchor with \Z instead of $: '$' also matches just before a trailing
  # newline, so a digest followed by '\n' would incorrectly be accepted.
  return bool(re.match(r'^[a-fA-F0-9]{%d}\Z' % size, value))
| 66 | |
| 67 | |
def get_hash_algo(_namespace):
  """Returns the hash algorithm class to use when uploading to |_namespace|.

  The namespace is currently ignored and sha-1 is always returned.
  """
  # TODO(vadimsh): Implement this at some point.
  return hashlib.sha1
| 72 | |
| 73 | |
def is_namespace_with_compression(namespace):
  """Returns True when objects stored in |namespace| are compressed."""
  compressed_suffixes = ('-gzip', '-deflate')
  return namespace.endswith(compressed_suffixes)
| 77 | |
| 78 | |
def hash_file(filepath, algo):
  """Returns the hex digest of the file at |filepath|.

  Reads the file in DISK_FILE_CHUNK sized chunks so arbitrarily large files
  never have to fit in memory. |algo| should be one of hashlib hashing
  algorithm.
  """
  digest = algo()
  with open(filepath, 'rb') as f:
    # iter() keeps calling f.read() until it returns the empty sentinel at EOF.
    for chunk in iter(lambda: f.read(DISK_FILE_CHUNK), b''):
      digest.update(chunk)
  return digest.hexdigest()
Marc-Antoine Ruel | 9225779 | 2014-08-28 20:51:08 -0400 | [diff] [blame] | 92 | |
| 93 | |
class WorkerPool(threading_utils.AutoRetryThreadPool):
  """Thread pool that runs a preconfigured function and retries on IOError."""
  # The pool starts with INITIAL_WORKERS threads and grows up to MAX_WORKERS.
  INITIAL_WORKERS = 2
  MAX_WORKERS = 16
  # Number of retries per task before giving up.
  RETRIES = 5

  def __init__(self):
    super(WorkerPool, self).__init__(
        [IOError], self.RETRIES, self.INITIAL_WORKERS, self.MAX_WORKERS, 0,
        'remote')
| 111 | |
| 112 | |
class LocalCache(object):
  """Interface for a local store of objects fetched via Storage.

  Implementations may be accessed concurrently from multiple threads, so they
  should guard their internal state with some lock.
  """
  # Directory backing the cache, if any.
  cache_dir = None

  def __enter__(self):
    """Context manager interface."""
    return self

  def __exit__(self, _exc_type, _exec_value, _traceback):
    """Context manager interface. Never swallows exceptions."""
    return False

  def cached_set(self):
    """Returns a set of all cached digests (always a new object)."""
    raise NotImplementedError()

  def touch(self, digest, size):
    """Ensures item is not corrupted and updates its LRU position.

    Arguments:
      digest: hash digest of item to check.
      size: expected size of this item.

    Returns:
      True if item is in cache and not corrupted.
    """
    raise NotImplementedError()

  def evict(self, digest):
    """Removes item from cache if it's there."""
    raise NotImplementedError()

  def read(self, digest):
    """Returns contents of the cached item as a single str."""
    raise NotImplementedError()

  def write(self, digest, content):
    """Reads data from |content| generator and stores it in cache."""
    raise NotImplementedError()

  def hardlink(self, digest, dest, file_mode):
    """Ensures file at |dest| has same content as cached |digest|.

    If file_mode is provided, it is used to set the executable bit if
    applicable.
    """
    raise NotImplementedError()
| 164 | |
| 165 | |
class IsolatedFile(object):
  """A single parsed .isolated file."""

  def __init__(self, obj_hash, algo):
    """|obj_hash| is really the sha-1 of the file."""
    logging.debug('IsolatedFile(%s)' % obj_hash)
    self.obj_hash = obj_hash
    self.algo = algo
    # Becomes True once everything on the left-side of the tree was parsed.
    # 'Tree' here means the .isolate and all the .isolated files recursively
    # included by it with the 'includes' key. The order of the sha-1s in
    # 'includes' matters: later ones are not processed until the earlier ones
    # were retrieved and read.
    self.can_fetch = False

    # Raw decoded json data.
    self.data = {}
    # One IsolatedFile instance per entry in self.data['includes'].
    self.children = []

    # Set once load() has parsed the .isolated content.
    self._is_parsed = False
    # Set once fetch_files() has enqueued the files.
    self.files_fetched = False

  def load(self, content):
    """Verifies the .isolated file is valid and loads this object with the json
    data.
    """
    logging.debug('IsolatedFile.load(%s)' % self.obj_hash)
    assert not self._is_parsed
    self.data = load_isolated(content, self.algo)
    includes = self.data.get('includes', [])
    self.children = [IsolatedFile(child_hash, self.algo)
                     for child_hash in includes]
    self._is_parsed = True

  def fetch_files(self, fetch_queue, files):
    """Adds files in this .isolated file not present in |files| dictionary.

    Preemptively request files.

    Note that |files| is modified by this function.
    """
    assert self.can_fetch
    if not self._is_parsed or self.files_fetched:
      return
    logging.debug('fetch_files(%s)' % self.obj_hash)
    for filepath, properties in self.data.get('files', {}).iteritems():
      # Root isolated has priority on the files being mapped; overridden
      # entries must not be fetched.
      if filepath in files:
        continue
      files[filepath] = properties
      if 'h' in properties:
        # Preemptively request files.
        logging.debug('fetching %s' % filepath)
        fetch_queue.add(properties['h'], properties['s'], WorkerPool.MED)
    self.files_fetched = True
| 223 | |
| 224 | |
class Settings(object):
  """Results of a completely parsed .isolated file."""
  def __init__(self):
    # Command to run, as a list of arguments, aggregated from the tree.
    self.command = []
    # Aggregated file mapping: relative path -> properties dict.
    self.files = {}
    # None until a node in the tree specifies it; then 0, 1 or 2.
    self.read_only = None
    # None until a node specifies it; defaulted to '' at the end of load().
    self.relative_cwd = None
    # The main .isolated file, a IsolatedFile instance.
    self.root = None

  def load(self, fetch_queue, root_isolated_hash, algo):
    """Loads the .isolated and all the included .isolated asynchronously.

    It enables support for "included" .isolated files. They are processed in
    strict order but fetched asynchronously from the cache. This is important so
    that a file in an included .isolated file that is overridden by an embedding
    .isolated file is not fetched needlessly. The includes are fetched in one
    pass and the files are fetched as soon as all the ones on the left-side
    of the tree were fetched.

    The prioritization is very important here for nested .isolated files.
    'includes' have the highest priority and the algorithm is optimized for both
    deep and wide trees. A deep one is a long link of .isolated files referenced
    one at a time by one item in 'includes'. A wide one has a large number of
    'includes' in a single .isolated file. 'left' is defined as an included
    .isolated file earlier in the 'includes' list. So the order of the elements
    in 'includes' is important.
    """
    self.root = IsolatedFile(root_isolated_hash, algo)

    # Isolated files being retrieved now: hash -> IsolatedFile instance.
    pending = {}
    # Set of hashes of already retrieved items to refuse recursive includes.
    seen = set()

    def retrieve(isolated_file):
      # Schedules the fetch of one .isolated file at the highest priority.
      h = isolated_file.obj_hash
      if h in seen:
        raise IsolatedError('IsolatedFile %s is retrieved recursively' % h)
      assert h not in pending
      seen.add(h)
      pending[h] = isolated_file
      fetch_queue.add(h, priority=WorkerPool.HIGH)

    retrieve(self.root)

    # Process .isolated files as they arrive; each one may enqueue more
    # includes, so loop until nothing is in flight anymore.
    while pending:
      item_hash = fetch_queue.wait(pending)
      item = pending.pop(item_hash)
      item.load(fetch_queue.cache.read(item_hash))
      if item_hash == root_isolated_hash:
        # It's the root item.
        item.can_fetch = True

      for new_child in item.children:
        retrieve(new_child)

      # Traverse the whole tree to see if files can now be fetched.
      self._traverse_tree(fetch_queue, self.root)

    def check(n):
      # Sanity check: every node in the tree must have had its files fetched.
      return all(check(x) for x in n.children) and n.files_fetched
    assert check(self.root)

    self.relative_cwd = self.relative_cwd or ''

  def _traverse_tree(self, fetch_queue, node):
    # Depth-first walk that fetches files of already-fetchable nodes and marks
    # at most one additional child per node as fetchable, preserving the strict
    # left-to-right ordering of 'includes'.
    if node.can_fetch:
      if not node.files_fetched:
        self._update_self(fetch_queue, node)
      will_break = False
      for i in node.children:
        if not i.can_fetch:
          if will_break:
            break
          # Automatically mark the first one as fetcheable.
          i.can_fetch = True
          will_break = True
        self._traverse_tree(fetch_queue, i)

  def _update_self(self, fetch_queue, node):
    # Enqueues this node's files and folds its properties into the aggregated
    # settings; for each property, the first node in the tree to set it wins.
    node.fetch_files(fetch_queue, self.files)
    # Grabs properties.
    if not self.command and node.data.get('command'):
      # Ensure paths are correctly separated on windows.
      self.command = node.data['command']
      if self.command:
        self.command[0] = self.command[0].replace('/', os.path.sep)
        self.command = tools.fix_python_path(self.command)
    if self.read_only is None and node.data.get('read_only') is not None:
      self.read_only = node.data['read_only']
    if (self.relative_cwd is None and
        node.data.get('relative_cwd') is not None):
      self.relative_cwd = node.data['relative_cwd']
| 319 | |
| 320 | |
def expand_symlinks(indir, relfile):
  """Follows symlinks in |relfile|, but treating symlinks that point outside the
  build tree as if they were ordinary directories/files. Returns the final
  symlink-free target and a list of paths to symlinks encountered in the
  process.

  The rule about symlinks outside the build tree is for the benefit of the
  Chromium OS ebuild, which symlinks the output directory to an unrelated path
  in the chroot.

  Fails when a directory loop is detected, although in theory we could support
  that case.
  """
  # Preserve whether the input denoted a directory (trailing separator).
  is_directory = relfile.endswith(os.path.sep)
  # |done| is the symlink-free prefix already resolved; |todo| the remainder.
  done = indir
  todo = relfile.strip(os.path.sep)
  symlinks = []

  while todo:
    pre_symlink, symlink, post_symlink = file_path.split_at_symlink(
        done, todo)
    if not symlink:
      # No symlink left in |todo|: normalize the case and finish.
      todo = file_path.fix_native_path_case(done, todo)
      done = os.path.join(done, todo)
      break
    symlink_path = os.path.join(done, pre_symlink, symlink)
    post_symlink = post_symlink.lstrip(os.path.sep)
    # readlink doesn't exist on Windows.
    # pylint: disable=E1101
    target = os.path.normpath(os.path.join(done, pre_symlink))
    symlink_target = os.readlink(symlink_path)
    if os.path.isabs(symlink_target):
      # Absolute path are considered a normal directories. The use case is
      # generally someone who puts the output directory on a separate drive.
      target = symlink_target
    else:
      # The symlink itself could be using the wrong path case.
      target = file_path.fix_native_path_case(target, symlink_target)

    if not os.path.exists(target):
      raise MappingError(
          'Symlink target doesn\'t exist: %s -> %s' % (symlink_path, target))
    target = file_path.get_native_path_case(target)
    if not file_path.path_starts_with(indir, target):
      # Target is outside the build tree: treat the symlink as an ordinary
      # directory/file and keep scanning past it.
      done = symlink_path
      todo = post_symlink
      continue
    if file_path.path_starts_with(target, symlink_path):
      raise MappingError(
          'Can\'t map recursive symlink reference %s -> %s' %
          (symlink_path, target))
    logging.info('Found symlink: %s -> %s', symlink_path, target)
    symlinks.append(os.path.relpath(symlink_path, indir))
    # Treat the common prefix of the old and new paths as done, and start
    # scanning again.
    target = target.split(os.path.sep)
    symlink_path = symlink_path.split(os.path.sep)
    prefix_length = 0
    for target_piece, symlink_path_piece in zip(target, symlink_path):
      if target_piece == symlink_path_piece:
        prefix_length += 1
      else:
        break
    done = os.path.sep.join(target[:prefix_length])
    todo = os.path.join(
        os.path.sep.join(target[prefix_length:]), post_symlink)

  # Re-append the trailing separator if the input denoted a directory.
  relfile = os.path.relpath(done, indir)
  relfile = relfile.rstrip(os.path.sep) + is_directory * os.path.sep
  return relfile, symlinks
| 391 | |
| 392 | |
def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
  """Expands a single input. It can result in multiple outputs.

  This function is recursive when relfile is a directory.

  Note: this code doesn't properly handle recursive symlink like one created
  with:
    ln -s .. foo

  Arguments:
    indir: root directory all returned paths are relative to.
    blacklist: optional callable taking a relative path; truthy return value
        excludes that path (files listed explicitly are still included).
    follow_symlinks: if True, symlinks are resolved via expand_symlinks().

  Returns:
    List of relative paths (symlinks encountered plus files found).

  Raises:
    MappingError on any invalid or unmappable input.
  """
  if os.path.isabs(relfile):
    raise MappingError('Can\'t map absolute path %s' % relfile)

  # Refuse paths escaping |indir| (e.g. via '..').
  infile = file_path.normpath(os.path.join(indir, relfile))
  if not infile.startswith(indir):
    raise MappingError('Can\'t map file %s outside %s' % (infile, indir))

  filepath = os.path.join(indir, relfile)
  native_filepath = file_path.get_native_path_case(filepath)
  if filepath != native_filepath:
    # Special case './'.
    if filepath != native_filepath + '.' + os.path.sep:
      # While it'd be nice to enforce path casing on Windows, it's impractical.
      # Also give up enforcing strict path case on OSX. Really, it's that sad.
      # The case where it happens is very specific and hard to reproduce:
      # get_native_path_case(
      #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
      # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
      #
      # Note that this is really something deep in OSX because running
      #  ls Foo.framework/Versions/A
      # will print out 'Resources', while file_path.get_native_path_case()
      # returns a lower case 'r'.
      #
      # So *something* is happening under the hood resulting in the command 'ls'
      # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree.  We
      # have no idea why.
      if sys.platform not in ('darwin', 'win32'):
        raise MappingError(
            'File path doesn\'t equal native file path\n%s != %s' %
            (filepath, native_filepath))

  symlinks = []
  if follow_symlinks:
    relfile, symlinks = expand_symlinks(indir, relfile)

  # A trailing separator means a directory: recurse into its entries.
  if relfile.endswith(os.path.sep):
    if not os.path.isdir(infile):
      raise MappingError(
          '%s is not a directory but ends with "%s"' % (infile, os.path.sep))

    # Special case './'.
    if relfile.startswith('.' + os.path.sep):
      relfile = relfile[2:]
    outfiles = symlinks
    try:
      for filename in os.listdir(infile):
        inner_relfile = os.path.join(relfile, filename)
        if blacklist and blacklist(inner_relfile):
          continue
        if os.path.isdir(os.path.join(indir, inner_relfile)):
          # Mark directories with a trailing separator for the recursive call.
          inner_relfile += os.path.sep
        outfiles.extend(
            expand_directory_and_symlink(indir, inner_relfile, blacklist,
                                         follow_symlinks))
      return outfiles
    except OSError as e:
      raise MappingError(
          'Unable to iterate over directory %s.\n%s' % (infile, e))
  else:
    # Always add individual files even if they were blacklisted.
    if os.path.isdir(infile):
      raise MappingError(
          'Input directory %s must have a trailing slash' % infile)

    if not os.path.isfile(infile):
      raise MappingError('Input file %s doesn\'t exist' % infile)

    return symlinks + [relfile]
| 471 | |
| 472 | |
def expand_directories_and_symlinks(
    indir, infiles, blacklist, follow_symlinks, ignore_broken_items):
  """Expands the directories and the symlinks, applies the blacklist and
  verifies files exist.

  Files are specified in os native path separator. When |ignore_broken_items|
  is True, entries that fail to map are logged and skipped instead of raising.
  """
  outfiles = []
  for relfile in infiles:
    try:
      expanded = expand_directory_and_symlink(
          indir, relfile, blacklist, follow_symlinks)
    except MappingError as e:
      if not ignore_broken_items:
        raise
      logging.info('warning: %s', e)
    else:
      outfiles.extend(expanded)
  return outfiles
| 491 | |
| 492 | |
def file_to_metadata(filepath, prevdict, read_only, algo):
  """Processes an input file, a dependency, and return meta data about it.

  Behaviors:
  - Retrieves the file mode, file size, file timestamp, file link
    destination if it is a file link and calculate the SHA-1 of the file's
    content if the path points to a file and not a symlink.

  Arguments:
    filepath: File to act on.
    prevdict: the previous dictionary. It is used to retrieve the cached sha-1
              to skip recalculating the hash. Optional.
    read_only: If 1 or 2, the file mode is manipulated. In practice, only save
               one of 4 modes: 0755 (rwx), 0644 (rw), 0555 (rx), 0444 (r). On
               windows, mode is not set since all files are 'executable' by
               default.
    algo:      Hashing algorithm used.

  Returns:
    The necessary dict to create a entry in the 'files' section of an .isolated
    file. Keys: 'm' (mode), 't' (mtime), 's' (size), 'h' (hash) for regular
    files; 't' and 'l' (link destination) for symlinks.
  """
  out = {}
  # Always check the file stat and check if it is a link. The timestamp is used
  # to know if the file's content/symlink destination should be looked into.
  # E.g. only reuse from prevdict if the timestamp hasn't changed.
  # There is the risk of the file's timestamp being reset to its last value
  # manually while its content changed. We don't protect against that use case.
  try:
    # lstat, not stat: a symlink must be described, not its target.
    filestats = os.lstat(filepath)
  except OSError:
    # The file is not present.
    raise MappingError('%s is missing' % filepath)
  is_link = stat.S_ISLNK(filestats.st_mode)

  if sys.platform != 'win32':
    # Ignore file mode on Windows since it's not really useful there.
    filemode = stat.S_IMODE(filestats.st_mode)
    # Remove write access for group and all access to 'others'.
    filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
    if read_only:
      filemode &= ~stat.S_IWUSR
    if filemode & stat.S_IXUSR:
      filemode |= stat.S_IXGRP
    else:
      filemode &= ~stat.S_IXGRP
    if not is_link:
      out['m'] = filemode

  # Used to skip recalculating the hash or link destination. Use the most recent
  # update time.
  out['t'] = int(round(filestats.st_mtime))

  if not is_link:
    out['s'] = filestats.st_size
    # If the timestamp wasn't updated and the file size is still the same, carry
    # on the sha-1.
    if (prevdict.get('t') == out['t'] and
        prevdict.get('s') == out['s']):
      # Reuse the previous hash if available.
      out['h'] = prevdict.get('h')
    if not out.get('h'):
      out['h'] = hash_file(filepath, algo)
  else:
    # If the timestamp wasn't updated, carry on the link destination.
    if prevdict.get('t') == out['t']:
      # Reuse the previous link destination if available.
      out['l'] = prevdict.get('l')
    if out.get('l') is None:
      # The link could be in an incorrect path case. In practice, this only
      # happen on OSX on case insensitive HFS.
      # TODO(maruel): It'd be better if it was only done once, in
      # expand_directory_and_symlink(), so it would not be necessary to do again
      # here.
      symlink_value = os.readlink(filepath)  # pylint: disable=E1101
      filedir = file_path.get_native_path_case(os.path.dirname(filepath))
      native_dest = file_path.fix_native_path_case(filedir, symlink_value)
      out['l'] = os.path.relpath(native_dest, filedir)
  return out
Marc-Antoine Ruel | 52436aa | 2014-08-28 21:57:57 -0400 | [diff] [blame^] | 572 | |
| 573 | |
def save_isolated(isolated, data):
  """Writes one or multiple .isolated files.

  Note: this reference implementation does not create child .isolated file so it
  always returns an empty list.

  Returns the list of child isolated files that are included by |isolated|.
  """
  # Round-trip |data| through the validator to make sure it is well-formed
  # .isolated content before writing it out.
  load_isolated(json.dumps(data), SUPPORTED_ALGOS[data['algo']])
  tools.write_json(isolated, data, True)
  return []
| 587 | |
| 588 | |
def load_isolated(content, algo):
  """Verifies the .isolated file is valid and loads this object with the json
  data.

  Arguments:
  - content: raw serialized content to load.
  - algo: hashlib algorithm class. Used to confirm the algorithm matches the
          algorithm used on the Isolate Server. If None, it is read from the
          'algo' key in the data itself, defaulting to 'sha-1'.

  Returns:
    The decoded dict, with 'files' paths and 'l' (link) values converted to
    the native os.path.sep.

  Raises:
    IsolatedError if the content is not valid .isolated data.
  """
  try:
    data = json.loads(content)
  except ValueError:
    raise IsolatedError('Failed to parse: %s...' % content[:100])

  if not isinstance(data, dict):
    raise IsolatedError('Expected dict, got %r' % data)

  # Check 'version' first, since it could modify the parsing after.
  value = data.get('version', '1.0')
  if not isinstance(value, basestring):
    raise IsolatedError('Expected string, got %r' % value)
  try:
    version = tuple(map(int, value.split('.')))
  except ValueError:
    raise IsolatedError('Expected valid version, got %r' % value)

  expected_version = tuple(map(int, ISOLATED_FILE_VERSION.split('.')))
  # Major version must match.
  if version[0] != expected_version[0]:
    raise IsolatedError(
        'Expected compatible \'%s\' version, got %r' %
        (ISOLATED_FILE_VERSION, value))

  if algo is None:
    # TODO(maruel): Remove the default around Jan 2014.
    # Default the algorithm used in the .isolated file itself, falls back to
    # 'sha-1' if unspecified.
    # Bug fix: this must index SUPPORTED_ALGOS (name -> class), not
    # SUPPORTED_ALGOS_REVERSE (class -> name), which would raise KeyError when
    # given an algorithm name. See save_isolated() for the same lookup.
    algo = SUPPORTED_ALGOS[data.get('algo', 'sha-1')]

  for key, value in data.iteritems():
    if key == 'algo':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)
      if value not in SUPPORTED_ALGOS:
        raise IsolatedError(
            'Expected one of \'%s\', got %r' %
            (', '.join(sorted(SUPPORTED_ALGOS)), value))
      if value != SUPPORTED_ALGOS_REVERSE[algo]:
        raise IsolatedError(
            'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))

    elif key == 'command':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty command')
      for subvalue in value:
        if not isinstance(subvalue, basestring):
          raise IsolatedError('Expected string, got %r' % subvalue)

    elif key == 'files':
      if not isinstance(value, dict):
        raise IsolatedError('Expected dict, got %r' % value)
      for subkey, subvalue in value.iteritems():
        if not isinstance(subkey, basestring):
          raise IsolatedError('Expected string, got %r' % subkey)
        if not isinstance(subvalue, dict):
          raise IsolatedError('Expected dict, got %r' % subvalue)
        for subsubkey, subsubvalue in subvalue.iteritems():
          if subsubkey == 'l':
            if not isinstance(subsubvalue, basestring):
              raise IsolatedError('Expected string, got %r' % subsubvalue)
          elif subsubkey == 'm':
            if not isinstance(subsubvalue, int):
              raise IsolatedError('Expected int, got %r' % subsubvalue)
          elif subsubkey == 'h':
            if not is_valid_hash(subsubvalue, algo):
              raise IsolatedError('Expected sha-1, got %r' % subsubvalue)
          elif subsubkey == 's':
            if not isinstance(subsubvalue, (int, long)):
              raise IsolatedError('Expected int or long, got %r' % subsubvalue)
          else:
            raise IsolatedError('Unknown subsubkey %s' % subsubkey)
        # An entry is either a file ('h' + 's', optionally 'm') or a symlink
        # ('l' alone); the checks below enforce mutual exclusion.
        if bool('h' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
              subvalue)
        if bool('h' in subvalue) != bool('s' in subvalue):
          raise IsolatedError(
              'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
              subvalue)
        if bool('s' in subvalue) == bool('l' in subvalue):
          raise IsolatedError(
              'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
              subvalue)
        if bool('l' in subvalue) and bool('m' in subvalue):
          raise IsolatedError(
              'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
              subvalue)

    elif key == 'includes':
      if not isinstance(value, list):
        raise IsolatedError('Expected list, got %r' % value)
      if not value:
        raise IsolatedError('Expected non-empty includes list')
      for subvalue in value:
        if not is_valid_hash(subvalue, algo):
          raise IsolatedError('Expected sha-1, got %r' % subvalue)

    elif key == 'os':
      if version >= (1, 4):
        raise IsolatedError('Key \'os\' is not allowed starting version 1.4')

    elif key == 'read_only':
      if not value in (0, 1, 2):
        raise IsolatedError('Expected 0, 1 or 2, got %r' % value)

    elif key == 'relative_cwd':
      if not isinstance(value, basestring):
        raise IsolatedError('Expected string, got %r' % value)

    elif key == 'version':
      # Already checked above.
      pass

    else:
      raise IsolatedError('Unknown key %r' % key)

  # Automatically fix os.path.sep if necessary. While .isolated files are always
  # in the the native path format, someone could want to download an .isolated
  # tree from another OS.
  wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
  if 'files' in data:
    data['files'] = dict(
        (k.replace(wrong_path_sep, os.path.sep), v)
        for k, v in data['files'].iteritems())
    for v in data['files'].itervalues():
      if 'l' in v:
        v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
  if 'relative_cwd' in data:
    data['relative_cwd'] = data['relative_cwd'].replace(
        wrong_path_sep, os.path.sep)
  return data