blob: c8c4daca6aad70ebd06cbb085671e8bf15922e40 [file] [log] [blame]
Congbin Guoc4277582018-06-06 16:44:48 -07001# -*- coding: utf-8 -*-
2# Copyright 2018 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""This module provides utils to handle response of "Range Request"."""
6
7from __future__ import absolute_import
8from __future__ import division
9from __future__ import print_function
10
Congbin Guo3c6cc4b2018-06-14 17:45:10 -070011import collections
12import itertools
13import json
14import re
15
Congbin Guoc4277582018-06-06 16:44:48 -070016import constants
17
# Characters separating the fields of a Content-Range value such as
# 'bytes <start>-<end>/<total>': the dash, the slash and the space.
_RANGE_HEADER_SEPARATORS = re.compile('[-/ ]')

# Unique sentinel sent to the data reader coroutine to request one line
# (as opposed to an integer, which requests a number of bytes).
_ONE_LINE = object()

# The four fields obtained by splitting a Content-Range value on the
# separators above.
_ContentRangeHeader = collections.namedtuple(
    '_ContentRangeHeader', ('bytes', 'start', 'end', 'total'))
23
Congbin Guoc4277582018-06-06 16:44:48 -070024
class FormatError(Exception):
  """Raised when a response does not have the format we expect to parse."""
27
28
class NoFileFoundError(Exception):
  """Raised when no known file matches a given content range."""
31
32
class ResponseQueueError(Exception):
  """Raised when a response is queued in a way that is not allowed."""
35
36
def _get_file_by_range_header(range_header_str, file_name_map):
  """Get file name and size by the Content-Range header.

  The format of Content-Range header is like:
    Content-Range: bytes <start>-<end>/<total>
  We get the <start> and <end> from it and retrieve the file name from
  |file_name_map|.

  Args:
    range_header_str: A string of range header, with or without the leading
      'Content-Range:' field name.
    file_name_map: A dict of {(<start:str>, <size:int>): filename, ...}.

  Returns:
    A tuple of (filename, size).

  Raises:
    FormatError: Raised when the header cannot be parsed, i.e. it does not
      split into exactly four fields or <start>/<end> are not integers.
    NoFileFoundError: Raised when we cannot get a file matches the range.
  """
  # Drop the 'Content-Range:' field name first if present. Split on the colon
  # alone and strip, so a header without a space after the colon is parsed
  # too instead of raising IndexError.
  if range_header_str.lower().startswith('content-range:'):
    range_header_str = range_header_str.split(':', 1)[1].strip()

  try:
    range_header = _ContentRangeHeader._make(
        _RANGE_HEADER_SEPARATORS.split(range_header_str))
    size = int(range_header.end) - int(range_header.start) + 1
  except (IndexError, TypeError, ValueError):
    # namedtuple._make raises TypeError on a wrong number of fields; int()
    # raises ValueError on non-numeric <start>/<end>.
    raise FormatError('Wrong format of content range header: %s' %
                      range_header_str)

  # The map is keyed by the start offset as a string plus the integer size.
  try:
    filename = file_name_map[(range_header.start, size)]
  except KeyError:
    raise NoFileFoundError('Cannot find a file matches the range %s' %
                           range_header_str)

  return filename, size
76
77
class JsonStreamer(object):
  """A class to stream the responses for range requests.

  The class accepts responses and format the file content in all of them as a
  JSON stream. The format:
    '{"<filename>": "<content>", "<filename>": "<content>", ...}'
  """

  def __init__(self):
    # One iterator of (filename, content) pairs per queued response.
    self._files_iter_list = []
    # Becomes False once a single-part response has been queued.
    self._can_add_more_response = True

  def queue_response(self, response, file_info_list):
    """Add a response to the queue to be streamed as JSON.

    We can add either:
      1. one and only one response for single-part range requests, or
      2. a series of responses for multi-part range requests.

    Args:
      response: An instance of requests.Response, which may be the response of a
        single range request, or a multi-part range request.
      file_info_list: A list of tarfile_utils.TarMemberInfo. We use it to look
        up file name by content start offset and size.

    Raises:
      FormatError: Raised when response to be queued isn't for a range request.
      ResponseQueueError: Raised when either queuing more than one response for
        single-part range request, or mixed responses for single-part and
        multi-part range request.
    """
    if not self._can_add_more_response:
      raise ResponseQueueError(
          'No more reponses can be added when there was a response for '
          'single-part range request in the queue!')

    file_name_map = {(f.content_start, int(f.size)): f.filename
                     for f in file_info_list}

    # Check if the response is for single range, or multi-part range. For a
    # single range request, the response must have header 'Content-Range'. For a
    # multi-part ranges request, the Content-Type header must be like
    # 'multipart/byteranges; ......'.
    content_range = response.headers.get('Content-Range', None)
    content_type = response.headers.get('Content-Type', '')

    if content_range:
      if self._files_iter_list:
        raise ResponseQueueError(
            'Cannot queue more than one responses for single-part range '
            'request, or mix responses for single-part and multi-part.')
      filename, _ = _get_file_by_range_header(content_range, file_name_map)
      self._files_iter_list = [iter([(filename, response.content)])]
      # A single-part response must be the only one in the queue.
      self._can_add_more_response = False

    elif content_type.startswith('multipart/byteranges;'):
      self._files_iter_list.append(
          _file_iterator(response, file_name_map))

    else:
      raise FormatError('The response is not for a range request.')

  def stream(self):
    """Yield the series of responses content as a JSON stream.

    When nothing has been queued, or the queued responses contain no file, the
    stream is the empty JSON object '{}'.

    Yields:
      A JSON stream in format described above.
    """
    files_iter = itertools.chain(*self._files_iter_list)
    json_encoder = json.JSONEncoder()

    # Do not call next() on |files_iter| directly here: a StopIteration
    # escaping a generator body yields no output at all on Python 2 and is a
    # RuntimeError under PEP 479.
    is_first = True
    for filename, content in files_iter:
      yield '%s%s: %s' % ('{' if is_first else ', ',
                          json_encoder.encode(filename),
                          json_encoder.encode(content))
      is_first = False
    yield '{}' if is_first else '}'
156
157
def _data_reader(data_iter):
  """A coroutine to read data from |data_iter|.

  After being primed with next(), it accepts two type of parameter via send():
    1. _ONE_LINE: Read one CRLF ended line if possible. The CRLF itself is not
       part of the returned line.
    2. An integer N: Read at most N bytes.

  When |data_iter| is exhausted, whatever is still buffered is yielded once
  (possibly shorter than requested), and the coroutine then finishes.

  Args:
    data_iter: An iterator of data source.

  Yields:
    The data read.
  """
  try:
    buffered = next(data_iter)
  except StopIteration:
    # No data at all. Finish explicitly: a StopIteration escaping a generator
    # body is turned into RuntimeError under PEP 479.
    return

  # Get what to be read in runtime by passing value into the generator. See
  # https://docs.python.org/2.5/whatsnew/pep-342.html for syntax details.
  to_be_read = yield

  while True:
    if to_be_read is _ONE_LINE:
      parts = buffered.split('\r\n', 1)
      if len(parts) == 2:
        line, buffered = parts
        to_be_read = (yield line)
        continue

    else:  # Read at most |to_be_read| bytes of data.
      # Negative means the buffer holds more than requested; slicing with the
      # negative value splits the buffer exactly after |to_be_read| bytes.
      # The comparison is strict because a zero index would split the buffer
      # at the wrong end.
      bytes_remaining = to_be_read - len(buffered)
      if bytes_remaining < 0:
        read_bytes = buffered[:bytes_remaining]
        buffered = buffered[bytes_remaining:]
        to_be_read = (yield read_bytes)
        continue

    # The request cannot be satisfied from the buffer yet; pull more data.
    try:
      buffered += next(data_iter)
    except StopIteration:
      break

  # Source exhausted: hand out the leftover, if any, as the final result.
  if buffered:
    yield buffered
200
201
def _read_line(reader):
  """Request one CRLF ended line from the data reader coroutine.

  Args:
    reader: A primed coroutine that accepts _ONE_LINE through send().

  Returns:
    The value the reader yields in response to the request.

  Raises:
    StopIteration: Propagated from send() when the reader has finished.
  """
  line = reader.send(_ONE_LINE)
  return line
209
210
def _read_empty_line(reader):
  """Consume one line from |reader| and verify that it is empty.

  Args:
    reader: The data reader coroutine to read the line from.

  Raises:
    FormatError: Raised when the reader is exhausted, or when the line read is
      not empty.
  """
  try:
    content = _read_line(reader)
  except StopIteration:
    raise FormatError('Expect an empty line, but got EOF.')

  if not content:
    return
  raise FormatError('Expect an empty line, but got "%s".' % content)
219
220
def _read_bytes(reader, max_bytes):
  """Request at most |max_bytes| bytes from the data reader coroutine.

  Args:
    reader: A primed coroutine that accepts an integer through send().
    max_bytes: An integer of maximum number of bytes to read.

  Returns:
    The value the reader yields in response to the request.

  Raises:
    StopIteration: Propagated from send() when the reader has finished.
  """
  data = reader.send(max_bytes)
  return data
232
233
def _file_iterator(response, file_name_map):
  """The iterator of files in a response of multi-part range request.

  An example response is like:

    HTTP/1.1 206 Partial Content
    Content-Type: multipart/byteranges; boundary=magic_string
    Content-Length: 282

    --magic_string
    Content-Type: text/html
    Content-Range: bytes 0-50/1270

    <data>
    --magic_string
    Content-Type: text/html
    Content-Range: bytes 100-150/1270

    <data>
    --magic_string--

  In our application, each part is the content of a file. This generator
  iterates the files.

  Args:
    response: An instance of requests.response.
    file_name_map: A dict of {(<start:str>, <size:int>): filename, ...}.

  Yields:
    A pair of (name, content) of the file.

  Raises:
    FormatError: Raised when response content interrupted.
  """
  reader = _data_reader(
      response.iter_content(constants.READ_BUFFER_SIZE_BYTES))
  try:
    next(reader)  # initialize the coroutine
  except StopIteration:
    return  # The response has no data at all, so there is no file to yield.

  _read_empty_line(reader)  # The first line is empty.
  while True:
    # Running out of data while reading the part headers is the normal end of
    # the stream (e.g. after the closing boundary). Finish explicitly rather
    # than letting StopIteration leak out of this generator, which is a
    # RuntimeError under PEP 479.
    try:
      _read_line(reader)  # The second line is the boundary.
      _read_line(reader)  # The line sub content type.
      sub_range_header = _read_line(reader)  # The line of sub content range.
    except StopIteration:
      return
    if sub_range_header is None:
      return
    _read_empty_line(reader)  # Another empty line.

    filename, size = _get_file_by_range_header(sub_range_header,
                                               file_name_map)
    try:
      content = _read_bytes(reader, size)
    except StopIteration:
      content = None  # Data ended before any byte of the content.

    # Verify the size before consuming the trailing CRLF, so that truncated
    # content is reported with the file name and the byte counts.
    bytes_read = 0 if content is None else len(content)
    if bytes_read != size:
      raise FormatError(
          '%s: Error in reading content (read %d B, expect %d B)' %
          (filename, bytes_read, size)
      )

    _read_empty_line(reader)  # Every content has a trailing '\r\n'.

    yield filename, content