network-auditor: Parser v0

parser.py requires an auditd dump file passed as its first
argument. It parses the provided file and outputs the per-process
frequencies of connect() syscall usage. ONLY AF_INET or AF_INET6
sockets are counted.

BUG=b:214428470
TEST=Outputs frequencies on sample dump file. See README.md!

Change-Id: If37a8f417921ec28f8e861d11b6ef31f1c420c9e
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/dev-util/+/3471537
Reviewed-by: Zauri Meshveliani‎ <zauri@chromium.org>
Reviewed-by: Ramya Gopalan <ramyagopalan@google.com>
Commit-Queue: Ramya Gopalan <ramyagopalan@google.com>
Tested-by: Ramya Gopalan <ramyagopalan@google.com>
Reviewed-by: Jorge Lucangeli Obes <jorgelo@chromium.org>
Commit-Queue: Zauri Meshveliani‎ <zauri@chromium.org>
diff --git a/contrib/net-auditor-parser/parser.py b/contrib/net-auditor-parser/parser.py
new file mode 100644
index 0000000..dfb9573
--- /dev/null
+++ b/contrib/net-auditor-parser/parser.py
@@ -0,0 +1,214 @@
+# // Copyright 2022 The Chromium OS Authors. All rights reserved.
+# // Use of this source code is governed by a BSD-style license that can be
+# // found in the LICENSE file.
+
+"""Parse auditd logs.
+
+Parse connect() syscall usage on a per-process basis.
+Print the frequency histogram to stdout.
+
+Example:
+    python3 parser.py data/sample_data.txt
+"""
+
+
+import sys
+
+
+DELIM = '----'  # Separator between events in the dump file.
+SYSCALL = 'SYSCALL'  # Log entry type carrying the process fields.
+SOCKADDR = 'SOCKADDR'  # Log entry type carrying the socket address fields.
+EVENTID = 'event_id'
+TYPE = 'type'
+FAM = 'fam'
+LADDR = 'laddr'
+LPORT = 'lport'
+PID = 'pid'
+PPID = 'ppid'
+UID = 'uid'
+COMM = 'comm'
+EXE = 'exe'
+# Addresses skipped by visualize(): loopback and unspecified.
+IGNORE_ADDR_LIST = ['127.0.0.1', '0.0.0.0', '::', '::1']
+
+
+def main(argv):
+    """Parse the log file and show frequency histogram.
+
+    Format of the output is: '<number> <process_name>', meaning that the
+    <process_name> invoked the connect() syscall <number> times.
+
+    Args:
+        argv: A single element list, specifying the log file path
+              e.g. ["data/sample_output.txt"].
+    """
+    data = parse_file(argv[0])
+    visualize(data)
+
+
+def visualize(data):
+    """Print frequency histogram of per-process usage of sys_connect().
+
+    We want to observe what information is sent off of devices, so only
+    'inet'/'inet6' sockets are counted and addresses in IGNORE_ADDR_LIST
+    (loopback and unspecified) are skipped. `data` is a list of event
+    dictionaries; output lines are sorted by descending count.
+    """
+    stats = {}
+    for data_point in data:
+        # Skip events lacking the SYSCALL (comm) or SOCKADDR (fam) fields.
+        if COMM in data_point and FAM in data_point:
+            comm, fam, addr = [
+                data_point[COMM],
+                data_point[FAM],
+                data_point.get(LADDR, None),
+            ]
+
+            # Ignore non-internet protocols.
+            if fam not in ['inet', 'inet6']:
+                continue
+            # Ignore loopback and unspecified (wildcard) addresses.
+            if addr in IGNORE_ADDR_LIST:
+                continue
+
+            stats[comm] = stats.get(comm, 0) + 1
+        else:
+            continue
+
+    for item in sorted(stats, key=stats.get, reverse=True):
+        print(stats[item], '\t', item)
+
+
+def parse_file(file_name):
+    """Parse log file and return as a list of dictionary items.
+
+    Args:
+        file_name: path to the log file.
+
+    Returns:
+        A list with one parse_event() dictionary per non-empty event.
+    """
+    events = None
+    with open(file_name, 'r') as file:
+        events = file.read().split(DELIM)
+        events = list(filter(len, events))  # Drop zero-length chunks.
+
+    data = []
+    for event in events:
+        data.append(parse_event(event))
+
+    return data
+
+
+def parse_event(event):
+    """Parse single event information into a data point.
+
+    Multiple log entries are logged for each event. Different events are
+    separated by the '----' delimiter in a log file.
+    This function takes all the log entries of a single event, parses out the
+    required fields into a dictionary and returns it.
+
+    Args:
+        event: String containing all the logs of a single event.
+
+    Returns:
+        A dictionary with the relevant event information.
+    """
+    data_point = {}
+
+    # Each helper fills in its fields of data_point in place.
+    parse_eventid(event, data_point)
+    parse_syscall_bits(event, data_point)
+    parse_sockaddr_bits(event, data_point)
+
+    return data_point
+
+
+def parse_type(event, log_type):
+    """Return the log entry with the desired log type.
+
+    Args:
+        event: String of all the log entries associated with a single event.
+        log_type: Type of the log entry we're searching for e.g. 'SYSCALL'.
+
+    Returns:
+        The first log entry of the given type (should be EXACTLY one).
+        Returns empty string if the type is not found.
+    """
+    log_entries = event.split('\n')
+    log_entries = list(filter(len, log_entries))
+    log_entries = [e.rstrip() for e in log_entries]
+
+    for entry in log_entries:
+        cur_type = entry.split()[0].replace('type=', '')  # First token.
+        if cur_type == log_type:
+            return entry
+
+    return ''
+
+
+def parse_eventid(event, data_point):
+    """Populate the EVENTID field (an int) of the data_point from event."""
+    log_entry = event.split()[2].rstrip()  # Third whitespace token.
+
+    eventid = log_entry.split(')')[0]
+    eventid = eventid[eventid.rfind(':')+1:]  # Text after the last ':'.
+
+    data_point[EVENTID] = int(eventid)
+
+
+def parse_syscall_bits(event, data_point):
+    """Populate PPID, PID, UID, COMM and EXE from the event's SYSCALL entry."""
+    sys_entry = parse_type(event, SYSCALL)
+    if sys_entry == '':
+        return  # No SYSCALL entry in this event.
+
+    ppid, pid, auid, uid, gid, comm, exe, subj = [
+        ' ppid=',
+        ' pid=',
+        ' auid=',
+        ' uid=',
+        ' gid=',
+        ' comm=',
+        ' exe=',
+        ' subj=',
+        ]  # Field markers; each value ends where the next marker begins.
+    data_point[PPID] = int(sys_entry[sys_entry.find(ppid)+len(ppid)
+                                     : sys_entry.find(pid)])
+    data_point[PID] = int(sys_entry[sys_entry.find(pid)+len(pid)
+                                    : sys_entry.find(auid)])
+    data_point[UID] = sys_entry[sys_entry.find(uid)+len(uid)
+                                : sys_entry.find(gid)]
+    data_point[COMM] = sys_entry[sys_entry.find(comm)+len(comm)
+                                 : sys_entry.find(exe)]
+    data_point[EXE] = sys_entry[sys_entry.find(exe)+len(exe)
+                                : sys_entry.find(subj)]
+
+
+def parse_sockaddr_bits(event, data_point):
+    """Populate FAM (and LADDR/LPORT for inet[6]) from the SOCKADDR entry."""
+    sockaddr_entry = parse_type(event, SOCKADDR)
+    if sockaddr_entry == '':
+        return  # No SOCKADDR entry in this event.
+
+    fam, laddr, lport = [
+        ' fam=',
+        ' laddr=',
+        ' lport=',
+        ]
+    data_point[FAM] = sockaddr_entry[sockaddr_entry.find(fam)+len(fam)
+                                     : sockaddr_entry.find(laddr)]
+    if data_point[FAM] == 'inet' or data_point[FAM] == 'inet6':
+        data_point[LADDR] = sockaddr_entry[sockaddr_entry.find(laddr)
+                                           + len(laddr)
+                                           : sockaddr_entry.find(lport)]
+        data_point[LPORT] = int(sockaddr_entry[
+            sockaddr_entry.find(lport) + len(lport)
+            : sockaddr_entry.find(' ', sockaddr_entry.find(lport)+1)])
+    else:
+        # TODO(zauri): do we need non-inet[6] packets?
+        pass
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])  # Drop the program name; main() expects the path.