zauri | 9de5cf9 | 2022-02-17 13:13:18 +0000 | [diff] [blame^] | 1 | # // Copyright 2022 The Chromium OS Authors. All rights reserved. |
| 2 | # // Use of this source code is governed by a BSD-style license that can be |
| 3 | # // found in the LICENSE file. |
| 4 | |
"""Parse auditd logs.

Parse the connect() syscall usage on a per-process basis.
Print the frequency histogram to stdout.

Example:
    python3 parser.py data/sample_data.txt
"""
| 13 | |
| 14 | |
| 15 | import sys |
| 16 | |
| 17 | |
# Delimiter between the log entries of different events in the log file.
DELIM = '----'

# Types of the log entries that get parsed.
SYSCALL = 'SYSCALL'
SOCKADDR = 'SOCKADDR'

# Field names, used as keys of the per-event data point dictionaries.
EVENTID = 'event_id'
TYPE = 'type'
FAM = 'fam'
LADDR = 'laddr'
LPORT = 'lport'
PID = 'pid'
PPID = 'ppid'
UID = 'uid'
COMM = 'comm'
EXE = 'exe'

# Addresses that visualize() leaves out of the histogram.
IGNORE_ADDR_LIST = [
    '127.0.0.1',
    '0.0.0.0',
    '::',
    '::1',
]
| 33 | |
| 34 | |
def main(argv):
    """Parse the log file and show frequency histogram.

    Format of the output is: '<number> <process_name>', meaning that the
    <process_name> invoked the connect() syscall <number> of times.

    Args:
        argv: Command line arguments without the program name. The first
            element is the log file path, e.g. ["data/sample_data.txt"];
            any further elements are ignored.

    Raises:
        SystemExit: If no log file path was given.
    """
    if not argv:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: parser.py <audit_log_file>')

    data = parse_file(argv[0])
    visualize(data)
| 47 | |
| 48 | |
def visualize(data):
    """Print frequency histogram of per-process usage of sys_connect().

    The goal is to observe what leaves the device, so an event is counted
    only if its socket uses an internet protocol family ('inet' or 'inet6')
    and its address is not listed in IGNORE_ADDR_LIST.

    Args:
        data: Iterable of data point dictionaries, as built by parse_event().
    """
    counts = {}
    for point in data:
        # Both the process name and the socket family are required; events
        # missing either of them are skipped.
        if COMM not in point or FAM not in point:
            continue
        # Only internet protocol sockets are of interest.
        if point[FAM] not in ['inet', 'inet6']:
            continue
        # Leave out the ignored (e.g. loopback) addresses.
        if point.get(LADDR, None) in IGNORE_ADDR_LIST:
            continue

        name = point[COMM]
        counts[name] = counts.get(name, 0) + 1

    # Most frequent callers first.
    ranked = sorted(counts, key=counts.get, reverse=True)
    for name in ranked:
        print(counts[name], '\t', name)
| 80 | |
| 81 | |
def parse_file(file_name):
    """Parse log file and return as a list of dictionary items.

    Args:
        file_name: Path to the log file.

    Returns:
        A list containing per-event information, one dictionary per event.
    """
    with open(file_name, 'r') as log_file:
        chunks = log_file.read().split(DELIM)

    # Splitting on the delimiter can leave empty or whitespace-only pieces
    # (e.g. blank lines before the first delimiter). They contain no log
    # entries, so drop them rather than hand them to parse_event().
    return [parse_event(chunk) for chunk in chunks if chunk.strip()]
| 101 | |
| 102 | |
def parse_event(event):
    """Turn the log entries of one event into a data point.

    An event consists of several log entries; in the log file the entries
    of different events are separated by the '----' delimiter. This function
    receives the entries of a single event and collects the required fields
    into a dictionary.

    Args:
        event: String containing all the logs of a single event.

    Returns:
        A dictionary with the relevant event information.
    """
    fields = {}

    # Every parser adds its own fields to the dictionary in place.
    for populate in (parse_eventid, parse_syscall_bits, parse_sockaddr_bits):
        populate(event, fields)

    return fields
| 125 | |
| 126 | |
def parse_type(event, log_type):
    """Return the log entry with the desired log type.

    Args:
        event: String of all the log entries associated with a single event,
            one entry per line.
        log_type: Type of the log entry we're searching for, e.g. 'SYSCALL'.

    Returns:
        The first log entry of the given type (should be EXACTLY one), with
        trailing whitespace removed.
        Returns empty string if the type is not found.
    """
    for line in event.split('\n'):
        entry = line.rstrip()
        tokens = entry.split()
        # Blank and whitespace-only lines (such as the '\r' left over from
        # CRLF line endings) have no tokens, so they cannot be a log entry.
        if not tokens:
            continue
        # The first token looks like 'type=<TYPE>'.
        if tokens[0].replace('type=', '') == log_type:
            return entry

    return ''
| 148 | |
| 149 | |
def parse_eventid(event, data_point):
    """Populate EVENTID field of the data_point.

    The id is taken from the third whitespace-separated token of the event:
    the text before the first ')' and after the last ':' of that part,
    converted to int.
    """
    third_token = event.split()[2]
    before_paren = third_token.partition(')')[0]
    # rpartition() yields the whole string when there is no ':' in it.
    data_point[EVENTID] = int(before_paren.rpartition(':')[2])
| 158 | |
| 159 | |
def parse_syscall_bits(event, data_point):
    """Populate the SYSCALL-related fields into a data point.

    Sets PPID and PID (as int) and UID, COMM and EXE (as str). Nothing is
    set if the event has no SYSCALL log entry.
    """
    entry = parse_type(event, SYSCALL)
    if entry == '':
        return

    def between(start_key, end_key):
        """Return the text after start_key, up to where end_key is found."""
        begin = entry.find(start_key) + len(start_key)
        return entry[begin:entry.find(end_key)]

    # Each value is delimited by the key of the field expected right after
    # it, so the extraction relies on this field order within the entry.
    data_point[PPID] = int(between(' ppid=', ' pid='))
    data_point[PID] = int(between(' pid=', ' auid='))
    data_point[UID] = between(' uid=', ' gid=')
    data_point[COMM] = between(' comm=', ' exe=')
    data_point[EXE] = between(' exe=', ' subj=')
| 186 | |
| 187 | |
def parse_sockaddr_bits(event, data_point):
    """Populate the SOCKADDR-related bits into a data point.

    Sets FAM when the SOCKADDR log entry has a ' fam=' field. For the 'inet'
    and 'inet6' families LADDR and LPORT (as int) are set as well, each only
    if the corresponding field is present. Nothing is set if the event has
    no SOCKADDR log entry.

    Args:
        event: String of all the log entries associated with a single event.
        data_point: Dictionary that receives the parsed fields.

    Raises:
        ValueError: If the lport value is not an integer.
    """
    sockaddr_entry = parse_type(event, SOCKADDR)
    if not sockaddr_entry:
        return

    def field_value(key):
        """Return the value following key up to the next space, or None."""
        start = sockaddr_entry.find(key)
        if start == -1:
            return None
        start += len(key)
        end = sockaddr_entry.find(' ', start)
        if end == -1:
            # The field is the last one on the line; without this, slicing
            # up to -1 would silently cut off the last character.
            end = len(sockaddr_entry)
        return sockaddr_entry[start:end]

    family = field_value(' fam=')
    if family is None:
        return
    data_point[FAM] = family

    if family not in ('inet', 'inet6'):
        # TODO(zauri): do we need non-inet[6] packets?
        return

    address = field_value(' laddr=')
    if address is not None:
        data_point[LADDR] = address

    port = field_value(' lport=')
    if port is not None:
        data_point[LPORT] = int(port)
| 211 | |
| 212 | |
# When run as a script, pass the command line arguments (without the
# program name) to main().
if __name__ == '__main__':
    main(sys.argv[1:])