Speed up downloading of the LOBs.

For some reason, downloading all of the LOBs in the repo
one at a time via download_from_google_storage is *very* slow
(it can take 30 minutes or more to fetch everything).

After experimenting with a few different approaches (a single tarball,
connection pooling, HTTPS vs. HTTP/2 vs. HTTP/3, Python vs. curl, etc.),
it looks like fetching everything in Python using urllib3 connection
pools lets me fetch everything in ~30s over a good network on a
high-core machine.
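
As a rough sketch of the core idea (a standalone illustration, not
the script added below; the worker count and the sha1 list are just
placeholders):

    import concurrent.futures
    import urllib3

    # A single PoolManager is thread-safe and keeps per-host
    # connections alive, so they get reused across requests.
    http = urllib3.PoolManager()

    def fetch(sha1):
        url = ('https://storage.googleapis.com/'
               'chromium-website-lob-storage/' + sha1)
        return http.request('GET', url).data

    sha1s = []  # in practice, read from the *.sha1 files in //site
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as pool:
        blobs = list(pool.map(fetch, sha1s))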

Using a single tarball can get this down to ~6s, but that would
significantly complicate how we do things (e.g., would we need to
upload a new tarball every time a single LOB changes?). It's not
clear that HTTP/2 or HTTP/3 really helps; the per-object overhead
may be on the GCS server side.

The fetch is incremental (it only downloads new or changed files) and
is a no-op if everything is up to date, so hopefully this is an
acceptable compromise.
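
The up-to-date check is just a sha1 comparison against each file's
.sha1 sidecar; schematically (this mirrors the logic in the script
below):

    import hashlib
    import os

    def needs_fetch(path):
        # Each LOB at 'path' has a sibling 'path.sha1' holding the
        # expected checksum of its contents.
        with open(path + '.sha1') as fp:
            expected = fp.read().strip()
        if not os.path.exists(path):
            return True
        with open(path, 'rb') as fp:
            return hashlib.sha1(fp.read()).hexdigest() != expected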

We should figure out why download_from_google_storage is so much
slower, but that can be done at a later date.

Bug: 1266070
Change-Id: Ib37353870f89eca43bdaf3547779643919b28850
Reviewed-on: https://chromium-review.googlesource.com/c/website/+/3430249
Reviewed-by: Struan Shrimpton <sshrimp@google.com>
Commit-Queue: Dirk Pranke <dpranke@google.com>
diff --git a/scripts/fetch_lobs.py b/scripts/fetch_lobs.py
new file mode 100755
index 0000000..c10052e
--- /dev/null
+++ b/scripts/fetch_lobs.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env vpython3
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Download all the LOBS in //site."""
+
+import argparse
+import hashlib
+import os
+import sys
+import time
+
+import urllib3
+
+import common
+
+# urllib3 connection pool, created lazily in each worker process.
+http = None
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--force', action='store_true',
+                        help='fetch LOBs even if they appear up to date')
+    parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count(),
+                        help='number of parallel fetches')
+    parser.add_argument('-m', '--multiprocess', action='store_true',
+                        default=False,
+                        help='use multiple processes instead of threads')
+    args = parser.parse_args()
+    q = common.JobQueue(_handle, args.jobs, args.multiprocess)
+    paths = [path.replace('.sha1', '')
+             for path in common.walk(common.SITE_DIR)
+             if path.endswith('.sha1')]
+
+    for path in paths:
+        local_path = os.path.join(common.SITE_DIR, path)
+        with open(local_path + '.sha1', 'r') as fp:
+            expected_sha1 = fp.read().strip()
+
+        # Skip LOBs whose local contents already match the expected
+        # checksum, unless --force was given.
+        if not args.force and os.path.exists(local_path):
+            with open(local_path, 'rb') as fp:
+                actual_sha1 = hashlib.sha1(fp.read()).hexdigest()
+            if actual_sha1 != expected_sha1:
+                q.request(path, (args, expected_sha1))
+        else:
+            q.request(path, (args, expected_sha1))
+
+    # Nothing to fetch; everything is already up to date.
+    if not q.all_tasks():
+        return 0
+
+    start = time.time()
+    updated = 0
+    failed = False
+    total_bytes = 0
+    for path, res, resp in q.results():
+        did_update, num_bytes = resp
+        if res:
+            print('%s failed: %s' % (path, res))
+            failed = True
+        if did_update:
+            updated += 1
+        total_bytes += num_bytes
+    end = time.time()
+
+    print('Fetched %d LOBs (%.1fMB) in %.3f seconds (%.1fMbps).' %
+          (updated,
+           (total_bytes / 1_000_000),
+           (end - start),
+           (total_bytes * 8 / (end - start) / 1_000_000)))
+    return 1 if failed else 0
+
+
+def _url(expected_sha1):
+    return 'https://storage.googleapis.com/%s/%s' % (
+        'chromium-website-lob-storage', expected_sha1)
+
+
+def _handle(path, obj):
+    args, expected_sha1 = obj
+    global http
+    if http is None:
+        http = urllib3.PoolManager()
+    url = _url(expected_sha1)
+
+    # Retry transient failures a few times before giving up.
+    for i in range(4):
+        try:
+            resp = http.request('GET', url)
+            if resp.status != 200:
+                raise urllib3.exceptions.HTTPError('HTTP %d' % resp.status)
+            s = hashlib.sha1()
+            s.update(resp.data)
+            actual_sha1 = s.hexdigest()
+            if actual_sha1 != expected_sha1:
+                return ('sha1 mismatch: expected %s, got %s' % (
+                    expected_sha1, actual_sha1), (False, len(resp.data)))
+            common.write_binary_file(os.path.join(common.SITE_DIR, path),
+                                     resp.data)
+            return '', (True, len(resp.data))
+        except urllib3.exceptions.HTTPError as e:
+            if i < 3:
+                time.sleep(1)
+            else:
+                return str(e), (False, 0)
+        except Exception as e:
+            return str(e), (False, 0)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
+