Congbin Guo | c427758 | 2018-06-06 16:44:48 -0700 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
| 2 | # Copyright 2018 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | """This module provides utils to handle response of "Range Request".""" |
| 6 | |
| 7 | from __future__ import absolute_import |
| 8 | from __future__ import division |
| 9 | from __future__ import print_function |
| 10 | |
Congbin Guo | 3c6cc4b | 2018-06-14 17:45:10 -0700 | [diff] [blame] | 11 | import collections |
| 12 | import itertools |
| 13 | import json |
| 14 | import re |
| 15 | |
Congbin Guo | c427758 | 2018-06-06 16:44:48 -0700 | [diff] [blame] | 16 | import constants |
| 17 | |
# Separators between the fields of a Content-Range value. Splitting
# 'bytes 0-50/1270' on them gives ['bytes', '0', '50', '1270'].
_RANGE_HEADER_SEPARATORS = re.compile('[-/ ]')

# The four fields of a Content-Range value, in the order they appear. Every
# field holds the text exactly as it was split out of the header.
_ContentRangeHeader = collections.namedtuple(
    '_ContentRangeHeader', ['bytes', 'start', 'end', 'total'])
| 22 | |
Congbin Guo | c427758 | 2018-06-06 16:44:48 -0700 | [diff] [blame] | 23 | |
class FormatError(Exception):
  """Raised when a response does not have the format we expect to parse."""
| 26 | |
| 27 | |
class NoFileFoundError(Exception):
  """Raised when no known file matches the byte range of a response."""
| 30 | |
| 31 | |
class ResponseQueueError(Exception):
  """Raised when a response is queued in a way that is not allowed."""
| 34 | |
| 35 | |
def _get_file_by_range_header(range_header_str, file_name_map):
  """Get file name and size by the Content-Range header.

  The format of Content-Range header is like:
    Content-Range: bytes <start>-<end>/<total>
  We get the <start> and <end> from it and retrieve the file name from
  |file_name_map|.

  Args:
    range_header_str: A string of range header, with or without the leading
      'Content-Range:' field name.
    file_name_map: A dict of {(<start:str>, <size:int>): filename, ...}.

  Returns:
    A tuple of (filename, size).

  Raises:
    FormatError: Raised when the header does not consist of exactly the four
      fields 'bytes', <start>, <end>, <total>, or when <start>/<end> are not
      integers.
    NoFileFoundError: Raised when we cannot get a file matches the range.
  """
  # Drop the 'Content-Range:' field name if present. Splitting on the bare
  # colon and stripping accepts any amount of whitespace after it (including
  # none), which would otherwise raise IndexError or add a spurious field.
  if range_header_str.lower().startswith('content-range:'):
    range_header_str = range_header_str.split(':', 1)[1].strip()

  try:
    range_header = _ContentRangeHeader._make(
        _RANGE_HEADER_SEPARATORS.split(range_header_str))
    # Both ends of the range are inclusive.
    size = int(range_header.end) - int(range_header.start) + 1
  except (IndexError, TypeError, ValueError):
    # _make() raises TypeError when the number of fields is not exactly four;
    # int() raises ValueError for a non-numeric <start> or <end>.
    raise FormatError('Wrong format of content range header: %s' %
                      range_header_str)

  try:
    # The map is keyed by the start offset as text plus the size as an int.
    filename = file_name_map[(range_header.start, size)]
  except KeyError:
    raise NoFileFoundError('Cannot find a file matches the range %s' %
                           range_header_str)

  return filename, size
| 75 | |
| 76 | |
class JsonStreamer(object):
  """A class to stream the responses for range requests.

  The class accepts responses and format the file content in all of them as a
  JSON stream. The format:
    '{"<filename>": "<content>", "<filename>": "<content>", ...}'
  """

  def __init__(self):
    # One iterator of (filename, content) pairs per queued response.
    self._files_iter_list = []
    # Turns False once a single-part response has been queued; nothing else
    # may be queued after that.
    self._can_add_more_response = True

  def queue_response(self, response, file_info_list):
    """Add a response to the queue to be streamed as JSON.

    We can add either:
      1. one and only one response for single-part range requests, or
      2. a series of responses for multi-part range requests.

    Args:
      response: An instance of requests.Response, which may be the response of a
        single range request, or a multi-part range request.
      file_info_list: A list of tarfile_utils.TarMemberInfo. We use it to look
        up file name by content start offset and size.

    Raises:
      FormatError: Raised when response to be queued isn't for a range request.
      ResponseQueueError: Raised when either queuing more than one response for
        single-part range request, or mixed responses for single-part and
        multi-part range request.
    """
    if not self._can_add_more_response:
      raise ResponseQueueError(
          'No more reponses can be added when there was a response for '
          'single-part range request in the queue!')

    file_name_map = {(f.content_start, int(f.size)): f.filename
                     for f in file_info_list}

    # Check if the response is for single range, or multi-part range. For a
    # single range request, the response must have header 'Content-Range'. For a
    # multi-part ranges request, the Content-Type header must be like
    # 'multipart/byteranges; ......'.
    content_range = response.headers.get('Content-Range', None)
    content_type = response.headers.get('Content-Type', '')

    if content_range:
      if self._files_iter_list:
        raise ResponseQueueError(
            'Cannot queue more than one responses for single-part range '
            'request, or mix responses for single-part and multi-part.')
      filename, _ = _get_file_by_range_header(content_range, file_name_map)
      # A single-part response is exactly one file: its whole body.
      self._files_iter_list = [iter([(filename, response.content)])]
      self._can_add_more_response = False

    # Media type names are case-insensitive, so compare in lower case.
    elif content_type.lower().startswith('multipart/byteranges;'):
      self._files_iter_list.append(
          iter(_FileIterator(response, file_name_map)))

    else:
      raise FormatError('The response is not for a range request.')

  def stream(self):
    """Yield the series of responses content as a JSON stream.

    The first chunk opens the JSON object, every later chunk starts with the
    ', ' separator, and the last chunk is the closing '}'. When nothing has
    been queued, a single '{}' chunk is yielded so that the output is still
    valid JSON.

    Yields:
      A JSON stream in format described above.
    """
    files_iter = itertools.chain.from_iterable(self._files_iter_list)
    encode = json.JSONEncoder().encode

    opened = False
    for filename, content in files_iter:
      prefix = ', ' if opened else '{'
      yield '%s%s: %s' % (prefix, encode(filename), encode(content))
      opened = True

    # Calling next() on an empty iterator here would leak StopIteration out of
    # the generator, so the empty case is handled explicitly instead.
    yield '}' if opened else '{}'
| 155 | |
| 156 | |
class _FileIterator(object):
  """The iterator of files in a response of multi-part range request.

  An example response is like:

    HTTP/1.1 206 Partial Content
    Content-Type: multipart/byteranges; boundary=magic_string
    Content-Length: 282

    --magic_string
    Content-Type: text/html
    Content-Range: bytes 0-50/1270

    <data>
    --magic_string
    Content-Type: text/html
    Content-Range: bytes 100-150/1270

    <data>
    --magic_string--

  In our application, each part is the content of a file. This class iterates
  the files.
  """

  def __init__(self, response, file_name_map):
    """Constructor.

    Args:
      response: An instance of requests.response.
      file_name_map: A dict of {(<start:str>, <size:int>): filename, ...}.
    """
    self._response_iter = response.iter_content(
        constants.READ_BUFFER_SIZE_BYTES)
    # Data fetched from the response but not consumed yet. None means there is
    # nothing (more) to read.
    self._chunk = None
    self._file_name_map = file_name_map

  def __iter__(self):
    # Prime the look-ahead buffer. For an empty response body this leaves
    # |_chunk| as None instead of leaking StopIteration to the caller; the
    # parser then reports the problem as a FormatError.
    self._read_next_chunk()
    return self._iter_files()

  def _read_next_chunk(self):
    """Helper function to read next chunk of data and return current chunk."""
    buffered = self._chunk
    try:
      self._chunk = next(self._response_iter)
    except StopIteration:
      self._chunk = None

    return buffered

  def _read_line(self):
    """Read one CRLF ended line from the response.

    Returns:
      The line read, without its CRLF. If the response ends before a CRLF is
      seen, whatever was left is returned. Return None if nothing to read.
    """
    if self._chunk is None:
      return None

    buffered = ''
    while True:
      # Accumulate chunks so a CRLF split across two chunks is still found.
      buffered += self._chunk
      line, separator, rest = buffered.partition('\r\n')
      if separator:
        # Keep whatever follows the CRLF for the next read.
        self._chunk = rest
        return line

      # No '\r\n' in the data read so far. Read one more chunk.
      self._read_next_chunk()
      if self._chunk is None:
        return buffered

  def _read_bytes(self, max_bytes):
    """Read at most |max_bytes| bytes from the response.

    Args:
      max_bytes: An integer of maximum bytes of bytes to read.

    Returns:
      The bytes read, which are fewer than |max_bytes| only when the response
      ends early. Return None if nothing to read.
    """
    if self._chunk is None:
      return None

    pieces = []
    # A negative request reads nothing, the same as a request for 0 bytes.
    bytes_remaining = max(max_bytes, 0)
    while self._chunk is not None:
      if len(self._chunk) > bytes_remaining:
        # The current chunk holds more than we need: take the head and leave
        # the tail buffered for the next read.
        pieces.append(self._chunk[:bytes_remaining])
        self._chunk = self._chunk[bytes_remaining:]
        break

      # The whole chunk is wanted: consume it and fetch the next one.
      bytes_remaining -= len(self._chunk)
      pieces.append(self._read_next_chunk())

    return ''.join(pieces)

  def _read_empty_line(self):
    """Read one line and assert it is empty.

    Raises:
      FormatError: Raised when there is nothing to read, or when the line read
        is not empty.
    """
    line = self._read_line()
    if line is None:
      raise FormatError('Expect an empty line, but got EOF.')

    if line:
      raise FormatError('Expect an empty line, but got "%s".' % line)

  def _iter_files(self):
    """Iterate the files in the response.

    Each part is expected to be laid out exactly as in the class docstring: a
    boundary line, a content type line, a content range line, an empty line,
    the data, and a trailing CRLF. Iteration stops when no content range line
    can be read any more.

    Yields:
      A pair of (name, content) of the file.

    Raises:
      FormatError: Raised when response content interrupted.
      NoFileFoundError: Raised when a part's range matches no known file.
    """
    self._read_empty_line()  # The first line is empty.
    while True:
      self._read_line()  # The second line is the boundary.
      self._read_line()  # The line sub content type.
      sub_range_header = self._read_line()  # The line of sub content range.
      if sub_range_header is None:
        break
      self._read_empty_line()  # Another empty line.

      filename, size = _get_file_by_range_header(sub_range_header,
                                                 self._file_name_map)
      content = self._read_bytes(size)

      # Check the size before consuming the trailing CRLF: a short read only
      # happens at the end of the response, where reading the CRLF would fail
      # first and hide this more specific error.
      bytes_read = 0 if content is None else len(content)
      if bytes_read != size:
        raise FormatError(
            '%s: Error in reading content (read %d B, expect %d B)' %
            (filename, bytes_read, size)
        )

      self._read_empty_line()  # Every content has a trailing '\r\n'.

      yield filename, content