pkg_size: create a simple CLI tool to dump a system size report

pkg_size dumps the sizes of the packages contained within a built root.
It also pushes gauge data to the Build API metrics append-only queue via
append_metrics_log: one gauge per package (package size) and one gauge
per invocation (total root size). This data is used for build metrics
trend reporting.

BUG=chromium:1000449
TEST=cros_sdk -- '$HOME/trunk/chromite/run_tests'

Cq-Depend: chromium:1834120
Change-Id: I735057f9bcfe4de367a8df888e76a09836e371d1
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/chromite/+/1801037
Tested-by: Will Bradley <wbbradley@chromium.org>
Commit-Queue: Will Bradley <wbbradley@chromium.org>
Reviewed-by: Mike Frysinger <vapier@chromium.org>
Reviewed-by: Alex Klein <saklein@chromium.org>
diff --git a/api/metrics.py b/api/metrics.py
index cdf438e..1dba763 100644
--- a/api/metrics.py
+++ b/api/metrics.py
@@ -38,14 +38,14 @@
   # Reduce over the input events to append output_events.
   for input_event in metrics.read_metrics_events():
     if input_event.op == metrics.OP_START_TIMER:
-      timers[input_event.key] = (input_event.name,
+      timers[input_event.arg] = (input_event.name,
                                  input_event.timestamp_epoch_millis)
     elif input_event.op == metrics.OP_STOP_TIMER:
       # TODO(wbbradley): Drop the None fallback https://crbug.com/1001909.
-      timer = timers.pop(input_event.key, None)
+      timer = timers.pop(input_event.arg, None)
       if timer is None:
         logging.error('%s: stop timer recorded, but missing start timer!?',
-                      input_event.key)
+                      input_event.arg)
       if timer:
         assert input_event.name == timer[0]
         output_event = output_events.add()
@@ -57,6 +57,11 @@
       output_event = output_events.add()
       output_event.name = make_name(input_event.name)
       output_event.timestamp_milliseconds = input_event.timestamp_epoch_millis
+    elif input_event.op == metrics.OP_GAUGE:
+      output_event = output_events.add()
+      output_event.name = make_name(input_event.name)
+      output_event.timestamp_milliseconds = input_event.timestamp_epoch_millis
+      output_event.gauge = input_event.arg
     else:
       raise ValueError('unexpected op "%s" found in metric event: %s' % (
           input_event.op, input_event))
diff --git a/api/metrics_unittest.py b/api/metrics_unittest.py
index f900f89..a4b47d1 100644
--- a/api/metrics_unittest.py
+++ b/api/metrics_unittest.py
@@ -12,8 +12,8 @@
 from chromite.api import metrics
 from chromite.api.gen.chromite.api import build_api_test_pb2
 from chromite.lib import cros_test_lib
-from chromite.utils.metrics import (MetricEvent, OP_NAMED_EVENT, OP_START_TIMER,
-                                    OP_STOP_TIMER)
+from chromite.utils.metrics import (MetricEvent, OP_GAUGE, OP_NAMED_EVENT,
+                                    OP_START_TIMER, OP_STOP_TIMER)
 
 
 class MetricsTest(cros_test_lib.TestCase):
@@ -23,8 +23,8 @@
     """Test timer math and deserialization into proto objects."""
     response = build_api_test_pb2.TestResultMessage()
     mock_events = [
-        MetricEvent(600, 'a.b', OP_START_TIMER, key='100'),
-        MetricEvent(1000, 'a.b', OP_STOP_TIMER, key='100'),
+        MetricEvent(600, 'a.b', OP_START_TIMER, arg='100'),
+        MetricEvent(1000, 'a.b', OP_STOP_TIMER, arg='100'),
     ]
     with mock.patch('chromite.api.metrics.metrics.read_metrics_events',
                     return_value=mock_events):
@@ -41,7 +41,7 @@
     """
     response = build_api_test_pb2.TestResultMessage()
     mock_events = [
-        MetricEvent(1000, 'a.named_event', OP_NAMED_EVENT, key=None),
+        MetricEvent(1000, 'a.named_event', OP_NAMED_EVENT, arg=None),
     ]
     with mock.patch('chromite.api.metrics.metrics.read_metrics_events',
                     return_value=mock_events):
@@ -50,3 +50,17 @@
       self.assertEqual(response.events[0].name, 'prefix.a.named_event')
       self.assertEqual(response.events[0].timestamp_milliseconds, 1000)
       self.assertFalse(response.events[0].duration_milliseconds)
+
+  def testDeserializeGauge(self):
+    """A single OP_GAUGE event becomes one output event carrying its value."""
+    gauge_event = MetricEvent(1000, 'a.gauge', OP_GAUGE, arg=17)
+    result = build_api_test_pb2.TestResultMessage()
+    with mock.patch('chromite.api.metrics.metrics.read_metrics_events',
+                    return_value=[gauge_event]):
+      metrics.deserialize_metrics_log(result.events)
+      # Exactly one event must come out, with every field copied through.
+      self.assertEqual(len(result.events), 1)
+      emitted = result.events[0]
+      self.assertEqual(emitted.name, 'a.gauge')
+      self.assertEqual(emitted.timestamp_milliseconds, 1000)
+      self.assertEqual(emitted.gauge, 17)
diff --git a/bin/README.md b/bin/README.md
new file mode 100644
index 0000000..7274fb8
--- /dev/null
+++ b/bin/README.md
@@ -0,0 +1,4 @@
+# chromite/bin
+
+This subdirectory is in the $PATH for the SDK. We should only put things in here
+that we expect developers or tools to need.
diff --git a/lib/portage_util.py b/lib/portage_util.py
index ec9f4bb..da89235 100644
--- a/lib/portage_util.py
+++ b/lib/portage_util.py
@@ -2258,3 +2258,51 @@
   """
   result = _Portageq(['match', '/', atom], board=board)
   return SplitCPV(result.output.strip()) if result.output else None
+
+
+class PackageNotFoundError(Error):
+  """Raised when a requested package cannot be located in the given root."""
+
+
+def GenerateInstalledPackages(db, root, packages):
+  """Yield the installed package object for each category/pv name given."""
+  for name in packages:
+    category, pv = name.split('/')
+    found = db.GetInstalledPackage(category, pv)
+    if not found:
+      message = 'Unable to locate package %s in %s' % (name, root)
+      raise PackageNotFoundError(message)
+    yield found
+
+
+def GeneratePackageSizes(db, root, installed_packages):
+  """Collect package sizes and generate package size pairs.
+
+  Only OBJ entries are sized; |root| is only used to label the error raised.
+
+  Yields:
+    (str, int): A pair of cpv and total package size.
+  """
+  visited_cpvs = set()
+  for installed_package in installed_packages:
+    # Validate before reading attributes; afterwards the check cannot fire.
+    if not installed_package:
+      raise PackageNotFoundError(
+          'Unable to locate installed_package in %s' % root)
+    package_cpv = '%s/%s' % (installed_package.category, installed_package.pf)
+    assert package_cpv not in visited_cpvs
+    visited_cpvs.add(package_cpv)
+    total_size = 0
+    for content_type, path in installed_package.ListContents():
+      if content_type == InstalledPackage.OBJ:
+        filename = os.path.join(db.root, path)
+        try:
+          filesize = os.path.getsize(filename)
+        except OSError as e:
+          logging.warning('unable to compute the size of %s (skipping): %s',
+                          filename, e)
+          continue
+        logging.debug('size of %s = %d', filename, filesize)
+        total_size += filesize
+    logging.debug('%s installed_package size is %d', package_cpv, total_size)
+    yield (package_cpv, total_size)
diff --git a/lib/portage_util_unittest.py b/lib/portage_util_unittest.py
index 8a25c6d..cd7bb04 100644
--- a/lib/portage_util_unittest.py
+++ b/lib/portage_util_unittest.py
@@ -1321,6 +1321,25 @@
     self.fake_packages.sort()
     self.assertEqual(self.fake_packages, packages)
 
+  def testGeneratePackageSizes(self):
+    """Summed package sizes equal the bytes written to the 'obj' files."""
+    payload = 'FAKE DATA'
+    obj_paths = [entry[1] for entry in self.fake_files if entry[0] == 'obj']
+    for obj_path in obj_paths:
+      # Re-root each absolute path under the fake chroot before writing it.
+      relative = os.path.relpath(obj_path, '/')
+      target = os.path.join(self.fake_chroot, relative)
+      osutils.WriteFile(target, payload, makedirs=True)
+    expected_size = len(payload) * len(obj_paths)
+
+    db = portage_util.PortageDB(self.fake_chroot)
+    sizes = portage_util.GeneratePackageSizes(db, 'fake_chroot',
+                                              db.InstalledPackages())
+    observed = 0
+    for _, size in sizes:
+      observed += size
+    self.assertEqual(observed, expected_size)
+
   def testIsPackageInstalled(self):
     """Test if checking the existence of an installed package works."""
     self.assertTrue(portage_util.IsPackageInstalled(
diff --git a/scripts/emit_metric.py b/scripts/emit_metric.py
index 9e58dba..928365c 100644
--- a/scripts/emit_metric.py
+++ b/scripts/emit_metric.py
@@ -14,16 +14,19 @@
 def main(argv):
   """Emit a metric event."""
   parser = commandline.ArgumentParser(description=__doc__)
-  parser.add_argument('op', choices=metrics.VALID_OPS,
+  parser.add_argument('op', choices=sorted(metrics.VALID_OPS),
                       help='Which metric event operator to emit.')
   parser.add_argument('name',
                       help='The name of the metric event as you would like it '
                            'to appear downstream in data stores.')
-  parser.add_argument('key', nargs='?',
-                      help='A unique key for this invocation to ensure that '
-                           'start and stop timers can be matched.')
+  parser.add_argument('arg', nargs='?',
+                      help='An accessory argument dependent upon the "op".')
   opts = parser.parse_args(argv)
 
+  if opts.arg and not metrics.OP_EXPECTS_ARG[opts.op]:
+    # We do not expect to get an |arg| for this |op|.
+    parser.error('Unexpected arg "%s" given for op "%s"' % (opts.arg,
+                                                            opts.op))
+
   timestamp = metrics.current_milli_time()
-  key = opts.key or opts.name
-  metrics.append_metrics_log(timestamp, opts.name, opts.op, key=key)
+  metrics.append_metrics_log(timestamp, opts.name, opts.op, arg=opts.arg)
diff --git a/scripts/pkg_size b/scripts/pkg_size
new file mode 120000
index 0000000..b7045c5
--- /dev/null
+++ b/scripts/pkg_size
@@ -0,0 +1 @@
+wrapper.py
\ No newline at end of file
diff --git a/scripts/pkg_size.py b/scripts/pkg_size.py
new file mode 100644
index 0000000..1a7e845
--- /dev/null
+++ b/scripts/pkg_size.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+# Copyright 2019 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""The Package Size Reporting CLI entry point."""
+
+from __future__ import print_function
+
+import json
+
+from chromite.lib import commandline
+from chromite.lib import portage_util
+from chromite.utils import metrics
+
+
+def _get_parser():
+  """Build the parser: --root, --image-type, --partition-name, packages."""
+  root_help = 'Specify the rootfs to investigate.'
+  image_help = ('Specify the type of image being investigated. '
+                'e.g. [base, dev, test]')
+  partition_help = 'Specify the partition name. e.g. [rootfs, stateful]'
+  packages_help = ('Names of packages to investigate. Must be '
+                   'specified as category/package-version.')
+
+  cli = commandline.ArgumentParser(description=__doc__)
+  cli.add_argument('--root', required=True, type='path', help=root_help)
+  cli.add_argument('--image-type', help=image_help)
+  cli.add_argument('--partition-name', help=partition_help)
+  cli.add_argument('packages', nargs='*', help=packages_help)
+  return cli
+
+
+def generate_package_size_report(db, root, image_type, partition_name,
+                                 installed_packages):
+  """Measure each package, emit one gauge per size, and build the report.
+
+  Returns:
+    A dict holding 'root', 'package_sizes' (cpv -> size) and 'total_size'.
+  """
+  pairs = portage_util.GeneratePackageSizes(db, root, installed_packages)
+  # One timestamp is shared by every gauge emitted during this invocation.
+  now = metrics.current_milli_time()
+  sizes_by_cpv = {}
+  running_total = 0
+  for cpv, nbytes in pairs:
+    sizes_by_cpv[cpv] = nbytes
+    running_total += nbytes
+    gauge_name = 'package_size.%s.%s.%s' % (image_type, partition_name, cpv)
+    metrics.append_metrics_log(now, gauge_name, metrics.OP_GAUGE, arg=nbytes)
+
+  total_name = 'total_size.%s.%s' % (image_type, partition_name)
+  metrics.append_metrics_log(now, total_name, metrics.OP_GAUGE,
+                             arg=running_total)
+  return {'root': root, 'package_sizes': sizes_by_cpv,
+          'total_size': running_total}
+
+
+def main(argv):
+  """Print a JSON size report for the packages installed under --root.
+
+  Args:
+    argv: Command line arguments handed to the parser.
+  """
+  commandline.RunInsideChroot()
+
+  args = _get_parser().parse_args(argv)
+  args.Freeze()
+  package_db = portage_util.PortageDB(root=args.root)
+  if not args.packages:
+    # No names given: report on everything the database lists as installed.
+    selected = package_db.InstalledPackages()
+  else:
+    selected = portage_util.GenerateInstalledPackages(package_db, args.root,
+                                                      args.packages)
+  report = generate_package_size_report(
+      package_db, args.root, args.image_type, args.partition_name, selected)
+  print(json.dumps(report))
diff --git a/utils/metrics.py b/utils/metrics.py
index a070dff..e80f36c 100644
--- a/utils/metrics.py
+++ b/utils/metrics.py
@@ -26,13 +26,22 @@
 
 OP_START_TIMER = 'start-timer'
 OP_STOP_TIMER = 'stop-timer'
+OP_GAUGE = 'gauge'
 OP_NAMED_EVENT = 'event'
-VALID_OPS = (OP_START_TIMER, OP_STOP_TIMER, OP_NAMED_EVENT)
+OP_EXPECTS_ARG = {
+    OP_START_TIMER: True,
+    OP_STOP_TIMER: True,
+    OP_NAMED_EVENT: False,
+    OP_GAUGE: True,
+}
+VALID_OPS = set(OP_EXPECTS_ARG)
 
-# MetricEvent store a start or a stop to a timer. Timers are keyed
-# with a unique value to make matching the bookends easier.
+# MetricEvent stores one of a few different types of metric events. The 'arg'
+# parameter is an overloaded value which is discriminated by the 'op' parameter.
+# Timers utilize 'arg' as a key value for disambiguation, and gauges use the arg
+# as their gauge value.
 MetricEvent = collections.namedtuple('MetricEvent', ('timestamp_epoch_millis',
-                                                     'name', 'op', 'key'))
+                                                     'name', 'op', 'arg'))
 
 
 class Error(Exception):
@@ -70,14 +79,14 @@
                            (len(terms), terms))
 
   assert terms[2] in {OP_START_TIMER, OP_STOP_TIMER}
-  return MetricEvent(int(terms[0]), terms[1], terms[2], terms[3])
+  return MetricEvent(int(terms[0]), terms[1], terms[2], arg=terms[3])
 
 
 def parse_named_event(terms):
   """Parse a named event line.
 
   Args:
-    terms: A list of the subdimensions of the MetricEvent type, omitting "key".
+    terms: A list of the subdimensions of the MetricEvent type, omitting "arg".
 
   Returns:
     A MetricEvent from the content of the terms.
@@ -91,7 +100,29 @@
                            (len(terms), terms))
 
   assert terms[2] == OP_NAMED_EVENT
-  return MetricEvent(int(terms[0]), terms[1], terms[2], key=None)
+  return MetricEvent(int(terms[0]), terms[1], terms[2], arg=None)
+
+
+def parse_gauge(terms):
+  """Build a gauge MetricEvent from its four terms.
+
+  Args:
+    terms: [timestamp, name, op, value]; the value is stored as |arg|.
+
+  Returns:
+    A MetricEvent whose timestamp and |arg| have been converted to int.
+
+  Raises:
+    ParseMetricError: |terms| does not contain exactly 4 entries.
+  """
+  term_count = len(terms)
+  if term_count != 4:
+    raise ParseMetricError('Incorrect number of terms for gauge. Should '
+                           'have been 4, instead it is %d. See terms %s.' %
+                           (term_count, terms))
+  timestamp, name, op, value = terms
+  assert op == OP_GAUGE
+  return MetricEvent(int(timestamp), name, op, arg=int(value))
 
 
 def get_metric_parser(op):
@@ -100,6 +131,7 @@
       OP_START_TIMER: parse_timer,
       OP_STOP_TIMER: parse_timer,
       OP_NAMED_EVENT: parse_named_event,
+      OP_GAUGE: parse_gauge,
   }[op]
 
 
@@ -153,7 +185,7 @@
   return wrapper
 
 
-def append_metrics_log(timestamp, name, op, key=None):
+def append_metrics_log(timestamp, name, op, arg=None):
   """Handle appending a list of terms to the metrics log.
 
   If the environment does not specify a metrics log, then skip silently.
@@ -162,12 +194,12 @@
     timestamp: A millisecond epoch timestamp.
     name: A period-separated string describing the event.
     op: One of the OP_* values, determining which type of event this is.
-    key: An optional key to disambiguate equivalenty named events.
+    arg: An accessory value for |op|; None or '' is omitted, but 0 is kept.
   """
   metrics_log = os.environ.get(UTILS_METRICS_LOG_ENVVAR)
   terms = [timestamp, name.replace('|', '_'), op]
-  if key:
-    terms.append(key)
+  if arg is not None and arg != '':
+    terms.append(arg)
 
   # Format the actual line to log.
   line = '|'.join(str(x) for x in terms)
@@ -187,14 +219,14 @@
   Yields:
     Context for context manager surrounding event emission.
   """
-  # Timer events use a "key" to disambiguate in case of multiple concurrent or
+  # Timer events use an |arg| to disambiguate in case of multiple concurrent or
   # overlapping timers with the same name.
   key = uuid.uuid4()
   try:
-    append_metrics_log(current_milli_time(), name, OP_START_TIMER, key=key)
+    append_metrics_log(current_milli_time(), name, OP_START_TIMER, arg=key)
     yield
   finally:
-    append_metrics_log(current_milli_time(), name, OP_STOP_TIMER, key=key)
+    append_metrics_log(current_milli_time(), name, OP_STOP_TIMER, arg=key)
 
 
 def event(name):