Congbin Guo | c427758 | 2018-06-06 16:44:48 -0700 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
| 2 | # Copyright 2018 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | """This module provides utils to handle response of "Range Request".""" |
| 6 | |
| 7 | from __future__ import absolute_import |
| 8 | from __future__ import division |
| 9 | from __future__ import print_function |
| 10 | |
Congbin Guo | 3c6cc4b | 2018-06-14 17:45:10 -0700 | [diff] [blame] | 11 | import collections |
| 12 | import itertools |
| 13 | import json |
| 14 | import re |
| 15 | |
Congbin Guo | c427758 | 2018-06-06 16:44:48 -0700 | [diff] [blame] | 16 | import constants |
| 17 | |
# Characters separating the fields of '<unit> <start>-<end>/<total>'.
_RANGE_HEADER_SEPARATORS = re.compile('[-/ ]')

# Sentinel sent to the data reader coroutine to request one CRLF ended line
# (as opposed to an integer, which requests a number of bytes).
_ONE_LINE = object()

# The parsed fields of a Content-Range header value.
_ContentRangeHeader = collections.namedtuple(
    '_ContentRangeHeader', ['bytes', 'start', 'end', 'total'])
| 23 | |
Congbin Guo | c427758 | 2018-06-06 16:44:48 -0700 | [diff] [blame] | 24 | |
class FormatError(Exception):
  """Raised when a response does not have the format we expect to parse."""
  pass
| 27 | |
| 28 | |
class NoFileFoundError(Exception):
  """Raised when no known file matches the byte range of a response."""
  pass
| 31 | |
| 32 | |
class ResponseQueueError(Exception):
  """Raised when a response is queued in a way that is not allowed."""
  pass
| 35 | |
| 36 | |
def _get_file_by_range_header(range_header_str, file_name_map):
  """Get file name and size by the Content-Range header.

  The format of Content-Range header is like:
    Content-Range: bytes <start>-<end>/<total>
  We get the <start> and <end> from it and retrieve the file name from
  |file_name_map|.

  Args:
    range_header_str: A string of range header, either the whole header line
        ('Content-Range: bytes 0-50/1270') or only its value
        ('bytes 0-50/1270').
    file_name_map: A dict of {(<start:str>, <size:int>): filename, ...}.

  Returns:
    A tuple of (filename, size).

  Raises:
    FormatError: Raised when the header cannot be parsed as
        '<unit> <start>-<end>/<total>' with integer <start> and <end>.
    NoFileFoundError: Raised when we cannot get a file matches the range.
  """
  # Strip the 'Content-Range:' field name first if needed. Split on the colon
  # only, so a header without a space after the colon is still accepted.
  if range_header_str.lower().startswith('content-range:'):
    range_header_str = range_header_str.split(':', 1)[1].strip()

  try:
    # namedtuple._make() raises TypeError when the number of fields is wrong,
    # int() raises ValueError when <start> or <end> is not a number.
    range_header = _ContentRangeHeader._make(
        _RANGE_HEADER_SEPARATORS.split(range_header_str))
    size = int(range_header.end) - int(range_header.start) + 1
  except (IndexError, TypeError, ValueError):
    raise FormatError('Wrong format of content range header: %s' %
                      range_header_str)

  try:
    # The key uses <start> as the string taken from the header.
    filename = file_name_map[(range_header.start, size)]
  except KeyError:
    raise NoFileFoundError('Cannot find a file matches the range %s' %
                           range_header_str)

  return filename, size
| 76 | |
| 77 | |
class JsonStreamer(object):
  """A class to stream the responses for range requests.

  The class accepts responses and formats the file content in all of them as a
  JSON stream. The format:
    '{"<filename>": "<content>", "<filename>": "<content>", ...}'
  """

  def __init__(self):
    # Iterators of (filename, content) pairs, one per queued response.
    self._files_iter_list = []
    # Turned off as soon as a single-part range response has been queued.
    self._can_add_more_response = True

  def queue_response(self, response, file_info_list):
    """Add a response to the queue to be streamed as JSON.

    We can add either:
      1. one and only one response for single-part range requests, or
      2. a series of responses for multi-part range requests.

    Args:
      response: An instance of requests.Response, which may be the response of a
          single range request, or a multi-part range request.
      file_info_list: A list of tarfile_utils.TarMemberInfo. We use it to look
          up file name by content start offset and size.

    Raises:
      FormatError: Raised when response to be queued isn't for a range request.
      ResponseQueueError: Raised when either queuing more than one response for
          single-part range request, or mixed responses for single-part and
          multi-part range request.
    """
    if not self._can_add_more_response:
      raise ResponseQueueError(
          'No more reponses can be added when there was a response for '
          'single-part range request in the queue!')

    file_name_map = {(f.content_start, int(f.size)): f.filename
                     for f in file_info_list}

    # Check if the response is for single range, or multi-part range. For a
    # single range request, the response must have header 'Content-Range'. For a
    # multi-part ranges request, the Content-Type header must be like
    # 'multipart/byteranges; ......'.
    content_range = response.headers.get('Content-Range', None)
    content_type = response.headers.get('Content-Type', '')

    if content_range:
      if self._files_iter_list:
        raise ResponseQueueError(
            'Cannot queue more than one responses for single-part range '
            'request, or mix responses for single-part and multi-part.')
      filename, _ = _get_file_by_range_header(content_range, file_name_map)
      self._files_iter_list = [iter([(filename, response.content)])]
      self._can_add_more_response = False

    elif content_type.startswith('multipart/byteranges;'):
      self._files_iter_list.append(
          _file_iterator(response, file_name_map))

    else:
      raise FormatError('The response is not for a range request.')

  def stream(self):
    """Yield the series of responses content as a JSON stream.

    Yields:
      Chunks of a JSON stream in the format described above: the first chunk
      opens the object with the first file, each following chunk adds one
      more file, and the last chunk closes the object. When there is no file
      to stream at all, a single '{}' chunk is yielded so the output is still
      valid JSON.
    """
    files_iter = itertools.chain(*self._files_iter_list)
    json_encoder = json.JSONEncoder()

    # Text put in front of the next file: '{' opens the object before the
    # first file, ', ' separates every following file from the previous one.
    prefix = '{'
    for filename, content in files_iter:
      yield '%s%s: %s' % (prefix, json_encoder.encode(filename),
                          json_encoder.encode(content))
      prefix = ', '

    # |prefix| is still '{' only when nothing has been yielded, in which case
    # the object must be both opened and closed here.
    yield '{}' if prefix == '{' else '}'
| 156 | |
| 157 | |
def _data_reader(data_iter):
  """A coroutine to read data from |data_iter|.

  The coroutine must be primed with next() once; after that each send() passes
  in what to read and returns the data read. It accepts two type of parameter:
    1. _ONE_LINE: Read one CRLF ended line if possible. The CRLF is not
       included in the returned line.
    2. An integer N: Read at most N bytes.

  When |data_iter| is exhausted before the request can be fulfilled, whatever
  is left in the buffer is yielded as the answer (so it may be an unterminated
  line, or shorter than N bytes) and the coroutine finishes. Any send() after
  that, or a request made when nothing is left, raises StopIteration.

  Args:
    data_iter: An iterator of data source.

  Yields:
    The data read.
  """
  try:
    buffered = next(data_iter)
  except StopIteration:
    # The source is empty. Finish explicitly: letting StopIteration escape from
    # a generator body is turned into RuntimeError by PEP 479.
    return

  # Get what to be read in runtime by passing value into the generator. See
  # https://docs.python.org/2.5/whatsnew/pep-342.html for syntax details.
  to_be_read = yield

  while True:
    if to_be_read is _ONE_LINE:
      parts = buffered.split('\r\n', 1)
      if len(parts) == 2:
        line, buffered = parts
        to_be_read = (yield line)
        continue

    else:  # Read at most |to_be_read| bytes of data.
      # The negative value doubles as a slice index counted from the end of
      # the buffer, hence the strict '<': an index of 0 would slice wrongly, so
      # an exactly full buffer pulls one more chunk before being served.
      bytes_remaining = to_be_read - len(buffered)
      if bytes_remaining < 0:
        read_bytes = buffered[:bytes_remaining]
        buffered = buffered[bytes_remaining:]
        to_be_read = (yield read_bytes)
        continue

    # Not enough data buffered for the request; pull more from the source.
    try:
      buffered += next(data_iter)
    except StopIteration:
      break

  # End of data: hand out the remainder as the answer to the pending request.
  if buffered:
    yield buffered
| 200 | |
| 201 | |
def _read_line(reader):
  """Ask the data reader coroutine for one CRLF ended line.

  Args:
    reader: A primed coroutine accepting _ONE_LINE through send().

  Returns:
    Whatever the reader yields for the request, i.e. the line read.

  Raises:
    StopIteration: Propagated from send() when the reader has finished.
  """
  line = reader.send(_ONE_LINE)
  return line
| 209 | |
| 210 | |
def _read_empty_line(reader):
  """Consume one line from |reader| and require it to be empty.

  Args:
    reader: The reader passed on to _read_line().

  Raises:
    FormatError: Raised when the reader is exhausted, or the line read is not
        empty.
  """
  try:
    content = _read_line(reader)
  except StopIteration:
    raise FormatError('Expect an empty line, but got EOF.')

  if not content:
    return
  raise FormatError('Expect an empty line, but got "%s".' % content)
| 219 | |
| 220 | |
def _read_bytes(reader, max_bytes):
  """Ask the data reader coroutine for at most |max_bytes| bytes.

  Args:
    reader: A primed coroutine accepting the number of bytes through send().
    max_bytes: An integer of maximum number of bytes to read.

  Returns:
    Whatever the reader yields for the request, i.e. the bytes read.

  Raises:
    StopIteration: Propagated from send() when the reader has finished.
  """
  data = reader.send(max_bytes)
  return data
| 232 | |
| 233 | |
def _file_iterator(response, file_name_map):
  """The iterator of files in a response of multi-part range request.

  An example response is like:

    HTTP/1.1 206 Partial Content
    Content-Type: multipart/byteranges; boundary=magic_string
    Content-Length: 282

    --magic_string
    Content-Type: text/html
    Content-Range: bytes 0-50/1270

    <data>
    --magic_string
    Content-Type: text/html
    Content-Range: bytes 100-150/1270

    <data>
    --magic_string--

  In our application, each part is the content of a file. This generator
  iterates the files. Iteration ends when the data runs out while reading the
  header lines of a part, which is what happens after the closing boundary.

  Args:
    response: An instance of requests.response.
    file_name_map: A dict of {(<start:str>, <size:int>): filename, ...}.

  Yields:
    A pair of (name, content) of the file.

  Raises:
    FormatError: Raised when response content interrupted, i.e. an expected
        empty line is missing or the content of a part is shorter than its
        Content-Range header announces.
    NoFileFoundError: Raised by the range lookup when no file in
        |file_name_map| matches the range of a part.
  """
  reader = _data_reader(
      response.iter_content(constants.READ_BUFFER_SIZE_BYTES))
  try:
    next(reader)  # initialize the coroutine
  except StopIteration:
    # The response has no body at all, so there is no file to iterate.
    return

  _read_empty_line(reader)  # The first line is empty.
  while True:
    try:
      _read_line(reader)  # The second line is the boundary.
      _read_line(reader)  # The line sub content type.
      sub_range_header = _read_line(reader)  # The line of sub content range.
    except StopIteration:
      # No more data. End the iteration explicitly, as a StopIteration leaking
      # out of a generator body is turned into RuntimeError by PEP 479.
      return
    if sub_range_header is None:
      return
    _read_empty_line(reader)  # Another empty line.

    filename, size = _get_file_by_range_header(sub_range_header,
                                               file_name_map)
    try:
      content = _read_bytes(reader, size)
    except StopIteration:
      # The data ended right after the part headers; reported as a size
      # mismatch below rather than silently ending the iteration.
      content = None

    # Verify the size before looking for the trailing CRLF, so a truncated
    # part is reported with the file name and the byte counts.
    bytes_read = 0 if content is None else len(content)
    if bytes_read != size:
      raise FormatError(
          '%s: Error in reading content (read %d B, expect %d B)' %
          (filename, bytes_read, size)
      )

    _read_empty_line(reader)  # Every content has a trailing '\r\n'.

    yield filename, content