GS Cache: list member of a tar file

This change adds a feature of GS cache server to list all members of a
tar file as lines of CSV, e.g.
    <filename>,<record start>,<record size>,<record end>,<content start>,<content size>,<content end>
    ...

This feature is useful when we extract files from a tar by using HTTP
Range header, e.g.

    curl -r <content start>-<content end> http://gs-cache/download/path/to/file.tar

Another use case is extracting some of the files and re-creating another
tar file:
    for f in file_info_list:
        reader.seek(f.record_start)
        writer.write(reader.read(f.record_size))

BUG=chromium:824580
TEST=Ran unit tests.

Change-Id: I2630a04795e16eb35dcdee46c17db64ba380ca09
Reviewed-on: https://chromium-review.googlesource.com/1047959
Commit-Ready: ChromeOS CL Exonerator Bot <chromiumos-cl-exonerator@appspot.gserviceaccount.com>
Tested-by: Congbin Guo <guocb@chromium.org>
Reviewed-by: Congbin Guo <guocb@chromium.org>
diff --git a/bin/gs_archive_server b/bin/gs_archive_server
index 06cd7ec..a0b5e56 100755
--- a/bin/gs_archive_server
+++ b/bin/gs_archive_server
@@ -9,10 +9,4 @@
 readonly homedir=$(cd "$bindir"/../gs_cache; pwd)
 export PYTHONPATH=$homedir
 
-# Run the server, or run tests
-if [[ $(basename "$0") == gs_archive_server_test ]]; then
-  exec vpython -vpython-spec $homedir/.vpython -m pytest \
-      "$homedir"/gs_archive_server_test.py "$@"
-else
-  exec vpython -vpython-spec $homedir/.vpython -m gs_archive_server "$@"
-fi
+exec vpython -vpython-spec $homedir/.vpython -m gs_archive_server "$@"
diff --git a/bin/gs_archive_server_test b/bin/gs_archive_server_test
deleted file mode 120000
index d349ad7..0000000
--- a/bin/gs_archive_server_test
+++ /dev/null
@@ -1 +0,0 @@
-gs_archive_server
\ No newline at end of file
diff --git a/bin/gs_archive_server_test b/bin/gs_archive_server_test
new file mode 100755
index 0000000..e551eaa
--- /dev/null
+++ b/bin/gs_archive_server_test
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Copyright 2018 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Run Google Storage archive server from inside virtual environment.
+set -eu
+readonly bindir=$(dirname -- "$(readlink -e -- "$0")")
+readonly homedir=$(cd "$bindir"/../gs_cache; pwd)
+export PYTHONPATH=$homedir
+
+exec vpython -vpython-spec $homedir/.vpython -m pytest \
+    "$homedir"/*.py "$homedir"/tests "$@"
diff --git a/gs_cache/.vpython b/gs_cache/.vpython
index 95ff3f5..2855ba9 100644
--- a/gs_cache/.vpython
+++ b/gs_cache/.vpython
@@ -21,16 +21,31 @@
 >
 
 wheel: <
+  name: "infra/python/wheels/coverage/${vpython_platform}"
+  version: "version:4.5.1"
+>
+
+wheel: <
   name: "infra/python/wheels/funcsigs-py2_py3"
   version: "version:1.0.2"
 >
 
 wheel: <
+  name: "infra/python/wheels/mock-py2_py3"
+  version: "version:2.0.0"
+>
+
+wheel: <
   name: "infra/python/wheels/more-itertools-py2_py3"
   version: "version:4.1.0"
 >
 
 wheel: <
+  name: "infra/python/wheels/pbr-py2_py3"
+  version: "version:3.0.0"
+>
+
+wheel: <
   name: "infra/python/wheels/pluggy-py2_py3"
   version: "version:0.6.0"
 >
@@ -51,11 +66,21 @@
 >
 
 wheel: <
+  name: "infra/python/wheels/pytest-cov-py2_py3"
+  version: "version:2.5.1"
+>
+
+wheel: <
   name: "infra/python/wheels/pytz-py2_py3"
   version: "version:2018.4"
 >
 
 wheel: <
+  name: "infra/python/wheels/requests-py2_py3"
+  version: "version:2.13.0"
+>
+
+wheel: <
   name: "infra/python/wheels/six-py2_py3"
   version: "version:1.11.0"
 >
diff --git a/gs_cache/gs_archive_server.py b/gs_cache/gs_archive_server.py
index 958904f..5884cee 100644
--- a/gs_cache/gs_archive_server.py
+++ b/gs_cache/gs_archive_server.py
@@ -17,19 +17,35 @@
 from __future__ import print_function
 
 import argparse
+import contextlib
+import functools
 import os
+import StringIO
+import subprocess
 import sys
+import tempfile
+import urllib
+import urlparse
 
 import cherrypy
+import requests
 
+import tarfile_utils
 from chromite.lib import cros_logging as logging
 from chromite.lib import gs
 
 # some http status codes
+_HTTP_BAD_REQUEST = 400
 _HTTP_UNAUTHORIZED = 401
 _HTTP_NOT_FOUND = 404
 _HTTP_SERVICE_UNAVAILABLE = 503
 
+_READ_BUFFER_SIZE_BYTES = 1024 * 1024  # 1 MB
+_WRITE_BUFFER_SIZE_BYTES = 1024 * 1024  # 1 MB
+
+# The max size of temporary spool file in memory.
+_SPOOL_FILE_SIZE_BYTES = 100 * 1024 * 1024  # 100 MB
+
 _logger = logging.getLogger(__file__)
 
 
@@ -39,13 +55,175 @@
   _logger.log(level, extra=cherrypy.request.headers, *args, **kwargs)
 
 
-class GSArchiveServer(object):
+def _check_file_extension(filename, ext_names=None):
+  """Check the file name and, optionally, the extension name.
+
+  Args:
+    filename: The file name to be checked.
+    ext_names: A list of extension names |filename| is allowed to have. If
+      None or empty, any extension is accepted.
+
+  Returns:
+    The filename if the check is good.
+
+  Raises:
+    ValueError: Raised if the checking failed.
+  """
+  if not filename:
+    raise ValueError('File name is required.')
+
+  # str.endswith accepts a tuple of suffixes and returns True if any one of
+  # them matches, which checks all candidate extensions at once.
+  if ext_names and not filename.endswith(tuple(ext_names)):
+    raise ValueError("Extension name of '%s' isn't in %s" % (filename,
+                                                             ext_names))
+  return filename
+
+
+def _to_cherrypy_error(func):
+  """A decorator to convert Exceptions raised to proper cherrypy.HTTPError."""
+  @functools.wraps(func)
+  def func_wrapper(*args, **kwargs):
+    try:
+      return func(*args, **kwargs)
+    except requests.HTTPError as err:
+      # cherrypy.HTTPError wraps the error messages with HTML tags. But
+      # requests.HTTPError also do same work. So return the error message
+      # directly.
+      cherrypy.response.status = err.response.status_code
+      return err.response.content
+    except ValueError as err:
+      # The exception message is just a plain text, so wrap it with
+      # cherrypy.HTTPError to have necessary HTML tags
+      raise cherrypy.HTTPError(_HTTP_BAD_REQUEST, err.message)
+  return func_wrapper
+
+
+class _CachingServer(object):
+  r"""The interface of caching server for GsArchiveServer.
+
+  This class provides an interface to work with the caching server (usually a
+  reversed http proxy server) which caches all intermediate results, e.g.
+  downloaded files, etc. and serves to GsArchiveServer.
+
+  The relationship of this class and other components is:
+    /-------------(python function call)-----------------------\
+    |                                                          |
+    v                                                          |
+  _CachingServer --(http/socket)--> NGINX --(http/socket)--> GsArchiveServer
+                                      ^                        |
+                                      |                     (https)
+  End user, DUTs ---(http)------------/                        |
+                                                               V
+                                                         GoogleStorage
+  """
+
+  def __init__(self, url):
+    """Constructor
+
+    Args:
+      url: A tuple of URL scheme and netloc.
+
+    Raises:
+      ValueError: Raised when input URL in wrong format.
+    """
+    self._url = url
+
+  def _call(self, action, path, args=None, headers=None):
+    """Helper function to generate all RPC calls to the proxy server."""
+    url = urlparse.urlunsplit(self._url + ('%s/%s' % (action, path),
+                                           urllib.urlencode(args or {}), None))
+    _log('Sending request to proxy: %s', url)
+    rsp = requests.get(url, headers=headers, stream=True)
+    _log('Proxy response %s', rsp.status_code)
+    rsp.raise_for_status()
+    return rsp
+
+  def download(self, path, headers=None):
+    """Call download RPC."""
+    return self._call('download', path, headers=headers)
+
+
+class GsArchiveServer(object):
   """The backend of Google Storage Cache server."""
 
-  def __init__(self):
+  def __init__(self, caching_server):
     self._gsutil = gs.GSContext()
+    self._caching_server = caching_server
 
   @cherrypy.expose
+  @_to_cherrypy_error
+  def list_member(self, *args):
+    """Get file list of a tar archive in CSV format.
+
+    An example, GET /list_member/bucket/path/to/file.tar
+    The output is in format of:
+      <file name>,<data1>,<data2>,...<data6>
+      <file name>,<data1>,<data2>,...<data6>
+      ...
+
+    Details:
+      <file name>: The file name of the member, in URL percent encoding, e.g.
+        path/to/file,name  -> path/to/file%2Cname.
+      <data1>: File record start offset, in bytes.
+      <data2>: File record size, in bytes.
+      <data3>: File record end offset, in bytes.
+      <data4>: File content start offset, in bytes.
+      <data5>: File content size, in bytes.
+      <data6>: File content end offset, in bytes.
+
+    This is an internal RPC and shouldn't be called by end user!
+
+    Args:
+      *args: All parts of tar file name (must end with '.tar').
+
+    Returns:
+      The generator of CSV stream.
+    """
+    # TODO(guocb): new parameter to filter the list
+
+    archive = _check_file_extension('/'.join(args), ext_names=['.tar'])
+    rsp = self._caching_server.download(archive, cherrypy.request.headers)
+    cherrypy.response.headers['Content-Type'] = 'text/csv'
+
+    # We run tar command to get member list of a tar file (python tarfile module
+    # is too slow). Option '--block-number/-R' of tar prints out the starting
+    # block number for each file record.
+    _log('list member of the tar %s', archive)
+    tar_tv = tempfile.SpooledTemporaryFile(max_size=_SPOOL_FILE_SIZE_BYTES)
+    tar = subprocess.Popen(['tar', 'tv', '--block-number'],
+                           stdin=subprocess.PIPE, stdout=tar_tv)
+    for chunk in rsp.iter_content(_READ_BUFFER_SIZE_BYTES):
+      tar.stdin.write(chunk)
+    tar.stdin.close()  # tar reads stdin until EOF; without close wait() hangs
+    tar.wait()
+
+    def _tar_member_list():
+      with tar_tv, contextlib.closing(StringIO.StringIO()) as stream:
+        tar_tv.seek(0)
+        for info in tarfile_utils.list_tar_members(tar_tv):
+          # some pre-computation for easier use of clients
+          content_end = info.content_start + info.size - 1
+          record_end = info.record_start + info.record_size - 1
+
+          # encode file name using URL percent encoding, so ',' => '%2C'
+          stream.write('%s,%d,%d,%d,%d,%d,%d\n' % (
+              urllib.quote(info.filename), info.record_start, info.record_size,
+              record_end, info.content_start, info.size, content_end))
+
+          if stream.tell() > _WRITE_BUFFER_SIZE_BYTES:
+            yield stream.getvalue()
+            stream.truncate(0)  # seek(0) alone would leave stale bytes behind
+
+        if stream.tell():
+          yield stream.getvalue()
+
+      _log('list_member done')
+
+    return _tar_member_list()
+
+  @cherrypy.expose
+  @_to_cherrypy_error
   def download(self, *args):
     """Download a file from Google Storage.
 
@@ -58,7 +236,7 @@
     Returns:
       The stream of downloaded file.
     """
-    path = 'gs://%s' % '/'.join(args)
+    path = 'gs://%s' % _check_file_extension('/'.join(args))
 
     _log('Downloading %s', path, level=logging.INFO)
     try:
@@ -85,14 +263,40 @@
 
   # pylint:disable=protected-access
   download._cp_config = {'response.stream': True}
+  list_member._cp_config = {'response.stream': True}
+
+
+def _url_type(input_string):
+  """Ensure |input_string| is a valid URL and convert to target type.
+
+  The target type is a tuple of (scheme, netloc).
+  """
+  split_result = urlparse.urlsplit(input_string)
+  if not split_result.scheme:
+    input_string = 'http://%s' % input_string
+
+  split_result = urlparse.urlsplit(input_string)
+  if not split_result.scheme or not split_result.netloc:
+    raise argparse.ArgumentTypeError('Wrong URL format: %s' % input_string)
+
+  return split_result.scheme, split_result.netloc
 
 
 def parse_args(argv):
   """Parse arguments."""
-  parser = argparse.ArgumentParser(description=__doc__)
+  parser = argparse.ArgumentParser(
+      formatter_class=argparse.RawDescriptionHelpFormatter,
+      description=__doc__)
   parser.add_argument('-s', '--socket', help='Unix domain socket to bind')
   parser.add_argument('-p', '--port', type=int, default=8080,
                       help='Port number to listen, default: %(default)s.')
+  # TODO(guocb): support Unix domain socket
+  parser.add_argument(
+      '-c', '--caching-server', required=True, type=_url_type,
+      help='URL of the proxy server. Valid format is '
+      '[http://]{<hostname>|<IP>}[:<port_number>]. When skipped, the default '
+      'scheme is http and port number is 80. Any other components in URL are '
+      'ignored.')
   return parser.parse_args(argv)
 
 
@@ -124,7 +328,7 @@
 
   cherrypy.config.update({'server.socket_port': args.port,
                           'server.socket_host': '127.0.0.1'})
-  cherrypy.quickstart(GSArchiveServer())
+  cherrypy.quickstart(GsArchiveServer(_CachingServer(args.caching_server)))
 
 
 if __name__ == '__main__':
diff --git a/gs_cache/gs_archive_server_test.py b/gs_cache/gs_archive_server_test.py
deleted file mode 100644
index c6047f5..0000000
--- a/gs_cache/gs_archive_server_test.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright 2018 The Chromium OS Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-
-"""Tests for gs_archive_server."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import unittest
-
-import cherrypy
-from cherrypy.test import helper
-
-import gs_archive_server
-from chromite.lib import gs
-
-_DIR = '/gs_archive_server_test'
-# some REAL files and info on Google Storage
-_TEST_DATA = {
-    'a_plain_file': {
-        'path': '%s/README.md' % _DIR,
-        'mime': 'application/octet-stream',
-        'size': 139
-    },
-}
-
-
-def access_to_gs():
-  """Skip some tests if we cannot access google storage."""
-  return gs.GSContext()._TestGSLs()  # pylint:disable=protected-access
-
-
-@unittest.skipUnless(access_to_gs(), 'Have no access to google storage')
-class UnmockedGSArchiveServerTest(helper.CPWebCase):
-  """Some integration tests using cherrypy test framework."""
-  @staticmethod
-  def setup_server():
-    """An API used by cherrypy to setup test environment."""
-    cherrypy.tree.mount(gs_archive_server.GSArchiveServer())
-
-  def test_download_a_file(self):
-    """Test normal files downloading."""
-    tested_file = _TEST_DATA['a_plain_file']
-    self.getPage('/download%(path)s' % tested_file)
-    self.assertStatus(200)
-    self.assertHeader('Content-Type', tested_file['mime'])
-    self.assertEquals(len(self.body), tested_file['size'])
-
-  def test_download_a_non_existing_file(self):
-    """Test downloading non-existing files."""
-    self.getPage('/download/chromeos-images-archive/existing/file')
-    self.assertStatus(404)
-
-  def test_download_against_unauthorized_bucket(self):
-    """Test downloading from unauthorized bucket."""
-    self.getPage('/download/another_bucket/file')
-    self.assertStatus(401)
-
-
-if __name__ == "__main__":
-  unittest.main()
diff --git a/gs_cache/pytest.ini b/gs_cache/pytest.ini
new file mode 100644
index 0000000..3cf8346
--- /dev/null
+++ b/gs_cache/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+addopts =
+  --doctest-modules
+  --cov gs_cache
diff --git a/gs_cache/tarfile_utils.py b/gs_cache/tarfile_utils.py
new file mode 100644
index 0000000..200ceb5
--- /dev/null
+++ b/gs_cache/tarfile_utils.py
@@ -0,0 +1,162 @@
+# Copyright 2018 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Utils for manipulating tar format archives.
+
+We use tar command to manipulate tar file other than using Python tarfile module
+because that module is very slow in the case of large file.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from chromite.lib import cros_logging as logging
+
+_logger = logging.getLogger(__name__)
+
+
+def _round_up_to_512(number):
+  """Up round the given |number| to smallest multiple of 512.
+
+  Examples:
+    >>> for n in (0, 1, 512, 1025):
+    ...   _round_up_to_512(n)
+    0
+    512
+    512
+    1536
+
+  Args:
+    number: Zero or positive integer.
+
+  Returns:
+    The smallest multiple of 512.
+  """
+  return (number + 511) & -512
+
+
+def _get_command_result_from_tar_tvR(an_output_line):
+  """Get an object of _TarListCommandResult from one line of `tar tvR` output.
+
+  Args:
+    an_output_line: One line of `tar tvR` output. Trailing '\n' is acceptable.
+      The last line of `tar tvR` is acceptable.
+
+  Returns:
+    An object of _TarListCommandResult.
+  """
+  separators = re.compile('[ \t:]+')
+  fields_num = len(_TarListCommandResult._fields)
+  fields = re.split(separators, an_output_line.rstrip('\n'),
+                    maxsplit=fields_num - 1)
+  try:
+    return _TarListCommandResult._make(fields)
+  except TypeError:
+    # The last line of `tar tvR` hasn't enough fields. Fill with fake data.
+    _logger.debug('This should be the last line of `tar tvR`: %s',
+                  an_output_line)
+    fields.extend(_TarListCommandResult._fields[len(fields):])
+    return _TarListCommandResult._make(fields)
+
+
+def _block_to_bytes(block_num):
+  """Get offset of the block |block_num| in bytes, i.e. times 512"""
+  return block_num << 9  # * 512
+
+
+# The tuple of tar member information to be returned to caller.
+# Fields:
+#   filename: The file name of the tar member.
+#   record_start: The zero-based start offset of the file record, in bytes.
+#   record_size: The size of the file record, in bytes.
+#   content_start: The zero-based start offset of the file content, in bytes.
+#   size: The size of the file content, in bytes.
+TarMemberInfo = collections.namedtuple(
+    'TarMemberInfo', ('filename', 'record_start', 'record_size',
+                      'content_start', 'size'))
+
+
+class _TarListCommandResult(collections.namedtuple(
+    '_TarListCommandResult', ('block', 'block_num', 'mode', 'ownership',
+                              'size_str', 'date', 'hour', 'min', 'filename'))):
+  """Information of each member in a Tar archive.
+
+  This class using the output of command `tar tvR` to compute more information
+  we need, e.g. file content start offset, etc.
+
+  The output of `tar tvR` is like:
+  block 0: -rw-r--r-- user/group <size> <date> <time> <file name>
+  ...
+  block 7: ** Block of NULs **
+  """
+
+  @property
+  def record_start(self):
+    """Start offset of the file record, in bytes."""
+    return _block_to_bytes(int(self.block_num))
+
+  @property
+  def size(self):
+    return int(self.size_str)
+
+
+def _get_prev_content_start(cur_record_start, prev_file):
+  """Deduct prev file content information from current file record information.
+
+  In tar format, each file record has a header and followed by file content.
+  Both header and file content are rounded up to 512 Bytes. The header length is
+  variable, but we can get the current file content starting offset by
+  subtracting up rounded file size from next file header starting offset, i.e.
+
+  current_offset = block(next_file) * 512 - round_up_to_512(current_size)
+
+  |********|************************.......|********|****
+  | header |         content               | header |
+  |        |<----- prev_size ----->|
+  |        |<- prev_size round up to 512 ->|
+           ^prev_content_start             ^cur_record_start
+
+  Args:
+    cur_record_start: The zero-based start position of current file record, in
+        bytes.
+    prev_file: An instance of _TarListCommandResult which has size of the
+        previous file.
+
+  Returns:
+    The zero-based start position of previous file content, in bytes.
+  """
+  return cur_record_start - _round_up_to_512(prev_file.size)
+
+
+def list_tar_members(tar_tvR_output):
+  """List the members of a tar with information.
+
+  Yield each member of the tar archive with information of record start/size,
+  content start/size, etc.
+
+  Args:
+    tar_tvR_output: The output of command 'tar tvR'. Option 'R' print out the
+        starting block number of the file record.
+
+  Yields:
+    A tuple of data described above in the same order.
+  """
+  prev_file = _get_command_result_from_tar_tvR(tar_tvR_output.readline())
+
+  for line in tar_tvR_output:
+    cur_file = _get_command_result_from_tar_tvR(line)
+
+    prev_content_start = _get_prev_content_start(cur_file.record_start,
+                                                 prev_file)
+    prev_record_size = cur_file.record_start - prev_file.record_start
+
+    yield TarMemberInfo(prev_file.filename,
+                        prev_file.record_start, prev_record_size,
+                        prev_content_start, prev_file.size)
+
+    prev_file = cur_file
diff --git a/gs_cache/tests/conftest.py b/gs_cache/tests/conftest.py
new file mode 100644
index 0000000..a43dcfe
--- /dev/null
+++ b/gs_cache/tests/conftest.py
@@ -0,0 +1,31 @@
+# Copyright 2018 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""The configuration python file for Pytest.
+
+In this file, we add below customized command line option:
+  --network: Run tests that depend on good network connectivity.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pytest
+
+
+def pytest_addoption(parser):
+  parser.addoption("--network", action="store_true", default=False,
+                   help="Run tests that depend on good network connectivity")
+
+
+def pytest_collection_modifyitems(config, items):
+  if config.getoption("--network"):
+    # run network tests
+    return
+  skip_network_tests = pytest.mark.skip(
+      reason="Skipping network test (re-run w/--network)")
+  for item in items:
+    if "network" in item.keywords:
+      item.add_marker(skip_network_tests)
diff --git a/gs_cache/tests/gs_archive_server_test.py b/gs_cache/tests/gs_archive_server_test.py
new file mode 100644
index 0000000..ae07a39
--- /dev/null
+++ b/gs_cache/tests/gs_archive_server_test.py
@@ -0,0 +1,170 @@
+# Copyright 2018 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Tests for gs_archive_server."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import base64
+import gzip
+import md5
+import os
+import StringIO
+import unittest
+
+import cherrypy
+import mock
+import pytest
+import requests
+from cherrypy.test import helper
+
+import gs_archive_server
+from chromite.lib import cros_logging as logging
+
+_TESTING_SERVER = 'http://127.0.0.1:8888'
+_DIR = '/gs_archive_server_test'
+# Some REAL files and info on Google Storage.
+_TEST_DATA = {
+    'a_plain_file': {
+        'path': '%s/README.md' % _DIR,
+        'mime': 'application/octet-stream',
+        'size': 139,
+    },
+    'a_tar_file': {
+        'path': '%s/control_files.tar' % _DIR,
+        'members_md5': '0d5d60e9f10d41c60dd85a7f0081de5d',
+    }
+}
+
+# a tgz file with only one file "bar" which content is "foo\n"
+_A_TGZ_FILE = base64.b64decode(
+    'H4sIAC8VyFoAA+3OMQ7CMAxGYc+cIkdw3DQ9T4pExYBSuc3A7WlhR2JoWd43+vfwxuJyNN3klC'
+    'R2McWcRK0feuve9499s2yqQ9r3aKZZgh5etmnLWjwEmVq9jl/+Zr8/ij8nr20+o+skt1ov/24A'
+    'AAAAAAAAAAAAAAAAAPzuBWP9bg8AKAAA'
+)
+_A_TAR_FILE = gzip.GzipFile(fileobj=StringIO.StringIO(_A_TGZ_FILE)).read()
+
+
+@pytest.mark.network
+class UnmockedGSArchiveServerTest(helper.CPWebCase):
+  """Some integration tests using cherrypy test framework."""
+  @staticmethod
+  def setup_server():
+    """An API used by cherrypy to setup test environment."""
+    cherrypy.tree.mount(gs_archive_server.GsArchiveServer(''))
+
+  def test_download_a_file(self):
+    """Test normal files downloading."""
+    tested_file = _TEST_DATA['a_plain_file']
+    self.getPage('/download%(path)s' % tested_file)
+    self.assertStatus(200)
+    self.assertHeader('Content-Type', tested_file['mime'])
+    self.assertEquals(len(self.body), tested_file['size'])
+
+  def test_download_a_non_existing_file(self):
+    """Test downloading non-existing files."""
+    self.getPage('/download/chromeos-images-archive/existing/file')
+    self.assertStatus(404)
+
+  def test_download_against_unauthorized_bucket(self):
+    """Test downloading from unauthorized bucket."""
+    self.getPage('/download/another_bucket/file')
+    self.assertStatus(401)
+
+
+class MockedGSArchiveServerTest(unittest.TestCase):
+  """Unit test of GsArchiveServer using mock objects."""
+
+  def setUp(self):
+    """Setup method."""
+    self.server = gs_archive_server.GsArchiveServer('')
+
+  def test_list_member(self):
+    """Test list_member RPC."""
+    with mock.patch.object(self.server, '_caching_server') as caching_server:
+      rsp = mock.MagicMock()
+      caching_server.download.return_value = rsp
+      rsp.iter_content.return_value = (_A_TAR_FILE[:100], _A_TAR_FILE[100:])
+      csv = list(self.server.list_member('baz.tar'))
+      self.assertEquals(len(csv), 1)
+      (filename, record_start, record_size, record_end,
+       content_start, size, content_end) = csv[0].split(',')
+      self.assertEquals(filename, 'bar')
+      self.assertEquals(record_start, '0')
+      self.assertEquals(record_size, '1024')
+      self.assertEquals(record_end, '1023')  # 1024 - 1
+      self.assertEquals(content_start, '512')
+      self.assertEquals(size, '4')
+      self.assertEquals(content_end, '515\n')  # 512 + 4 - 1
+
+      # test char quoting in file name
+      with gzip.open(os.path.join(os.path.dirname(__file__),
+                                  'index_tar_member_testing.tgz')) as f:
+        rsp.iter_content.return_value = f.read()
+        members = next(self.server.list_member('baz.tar'))
+        for csv in members.rstrip('\n').split('\n'):
+          # each line can be split into 7 elements, even ',' in filename
+          elements = csv.split(',')
+          self.assertEquals(len(elements), 7)
+          # elements from 1 to 6 are integers
+          _ = [int(d) for d in elements[1:7]]
+
+
+def testing_server_setup():
+  """Check if testing server is setup."""
+  try:
+    rsp = requests.get(_TESTING_SERVER)
+    if rsp.status_code >= 500:
+      logging.warn(
+          'Testing server %s has internal errors. Some tests are skipped!',
+          _TESTING_SERVER)
+      return False
+    return True
+  except Exception:
+    logging.warn('No testings server detected. Some tests are skipped!')
+    return False
+
+
+@unittest.skipUnless(testing_server_setup(), 'Testing servers not available!')
+class GsCacheBackendFunctionalTest(unittest.TestCase):
+  """This is a functional blackbox test
+
+  These tests depend on a full setup of the server and proxy server.
+  If either of them is not available, all tests in this class are skipped.
+  """
+
+  def _get_page(self, url, headers=None, expect_status=200):
+    headers = headers.copy() if headers else {}
+    if not os.environ.get('WITH_CACHE', None):
+      headers['x-no-cache'] = '1'  # bypass all caching to test the whole flow
+
+    rsp = requests.get('%s%s' % (_TESTING_SERVER, url), headers=headers,
+                       stream=True)
+    self.assertEquals(rsp.status_code, expect_status)
+    return rsp
+
+  def _verify_md5(self, content, expected_md5):
+    """Verify the md5 sum of input content equals to expteced value."""
+    m = md5.new()
+    m.update(content)
+    self.assertEquals(m.hexdigest(), expected_md5)
+
+  def test_download_plain_file(self):
+    """Test download RPC."""
+    tested_file = _TEST_DATA['a_plain_file']
+    rsp = self._get_page('/download%(path)s' % tested_file)
+    self.assertEquals(rsp.headers['Content-Length'], str(tested_file['size']))
+
+  def test_list_member(self):
+    """Test list member of a tar file."""
+    tested_file = _TEST_DATA['a_tar_file']
+    rsp = self._get_page('/list_member%(path)s' % tested_file)
+    self.assertEquals(rsp.headers['Content-Type'], 'text/csv;charset=utf-8')
+    self._verify_md5(rsp.content, tested_file['members_md5'])
+
+
+if __name__ == "__main__":
+  unittest.main()
diff --git a/gs_cache/tests/index_tar_member_testing.tgz b/gs_cache/tests/index_tar_member_testing.tgz
new file mode 100644
index 0000000..ea4b2fc
--- /dev/null
+++ b/gs_cache/tests/index_tar_member_testing.tgz
Binary files differ
diff --git a/gs_cache/tests/tarfile_utils_test.py b/gs_cache/tests/tarfile_utils_test.py
new file mode 100644
index 0000000..5a821a7
--- /dev/null
+++ b/gs_cache/tests/tarfile_utils_test.py
@@ -0,0 +1,70 @@
+# Copyright 2018 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Tests for tarfile_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import StringIO
+import subprocess
+import tarfile
+import unittest
+
+import tarfile_utils
+
+
+class TarfileUtilsTest(unittest.TestCase):
+  """Tests of tarfile_utils."""
+
+  def test_list_tar_members_empty_file(self):
+    """Test listing file member of an empty tar."""
+    tar_tvR = StringIO.StringIO('block 0: ** Block of NULs **\n')
+    self.assertFalse(list(tarfile_utils.list_tar_members(tar_tvR)))
+
+  def test_list_tar_members_non_empty_file(self):
+    """Test listing file member of an non-empty tar."""
+    tar_tvR = StringIO.StringIO(
+        'block 0: mode owner 1 date hour:min\tfilename\n'
+        'block 2: mode owner 123 date hour:min\tfile name with spaces\n'
+        'block 4: mode owner 0 date hour:min\tdirectory/\n'
+        'block 5: mode owner 0 date hour:min\tdirectory/symbol link -> filename'
+        '\n'
+        'block 6: ** Block of NULs **\n'
+    )
+    result = list(tarfile_utils.list_tar_members(tar_tvR))
+    self.assertEquals(result, [
+        ('filename', 0, 1024, 512, 1),
+        ('file name with spaces', 512 * 2, 1024, 512 * 3, 123),
+        ('directory/', 512 * 4, 512, 512 * 5, 0),
+        ('directory/symbol link -> filename', 512 * 5, 512, 512 * 6, 0)
+    ])
+
+  def test_list_tar_member_with_real_tar_file(self):
+    """Using a real tar file to test listing tar member."""
+    tar_name = os.path.join(os.path.dirname(__file__),
+                            'index_tar_member_testing.tgz')
+    tar_tvR = StringIO.StringIO(
+        subprocess.check_output(['tar', 'tvRzf', tar_name]))
+    members = tarfile_utils.list_tar_members(tar_tvR)
+    with tarfile.open(tar_name, 'r:gz') as tar:
+      for tar_info, result in zip(tar, members):
+        if tar_info.isreg():
+          name = tar_info.name
+
+        if tar_info.isdir():
+          name = '%s/' % tar_info.name
+
+        if tar_info.issym():
+          name = '%s -> %s' % (tar_info.name, tar_info.linkname)
+
+        self.assertEquals(name, result.filename)
+        self.assertEquals(tar_info.offset_data, result.content_start)
+        self.assertEquals(tar_info.size, result.size)
+
+
+if __name__ == '__main__':
+  unittest.main()