blob: dfb95735d6de30f3820c53514bff8c7052b4dc42 [file] [log] [blame]
zauri9de5cf92022-02-17 13:13:18 +00001# // Copyright 2022 The Chromium OS Authors. All rights reserved.
2# // Use of this source code is governed by a BSD-style license that can be
3# // found in the LICENSE file.
4
5"""Parse auditd logs.
6
7Parse the connect() syscall usage per process basis.
Print the frequency histogram to stdout.
9
10Example:
11 python3 parser.py data/sample_data.txt
12"""
13
14
15import sys
16
17
# Separator between events in the log file; parse_file() splits on it.
DELIM = '----'
# Log entry types that are looked up inside each event via parse_type().
SYSCALL = 'SYSCALL'
SOCKADDR = 'SOCKADDR'
# Keys of the per-event data point dictionary built by the parse_* helpers.
# NOTE(review): TYPE is not referenced anywhere in the visible code.
EVENTID = 'event_id'
TYPE = 'type'
FAM = 'fam'
LADDR = 'laddr'
LPORT = 'lport'
PID = 'pid'
PPID = 'ppid'
UID = 'uid'
COMM = 'comm'
EXE = 'exe'

# Addresses that visualize() leaves out of the histogram.
IGNORE_ADDR_LIST = ['127.0.0.1', '0.0.0.0', '::', '::1']
33
34
def main(argv):
    """Parse the log file and show frequency histogram.

    Format of the output is: '<number> <process_name>', meaning that the
    <process_name> invoked the connect() syscall <number> of times.

    Args:
        argv: A list whose first element is the log file path
            e.g. ["data/sample_output.txt"].

    Raises:
        SystemExit: If argv is empty, i.e. no log file path was given.
    """
    if not argv:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: parser.py <log_file>')

    data = parse_file(argv[0])
    visualize(data)
47
48
def visualize(data):
    """Print how many counted events each process name accounts for.

    Only events that carry both a process name (COMM) and a socket family
    (FAM) are considered. Of those, an event is counted when its family is
    'inet' or 'inet6' and its address is not listed in IGNORE_ADDR_LIST.

    One line is printed per process name, highest count first.

    Args:
        data: Iterable of data point dictionaries, as returned by
            parse_file().
    """
    inet_families = ('inet', 'inet6')
    counts = {}

    for record in data:
        # Events without syscall or sockaddr information cannot be counted.
        if COMM not in record or FAM not in record:
            continue
        # Only internet-protocol sockets are of interest.
        if record[FAM] not in inet_families:
            continue
        # Drop addresses from the ignore list.
        if record.get(LADDR, None) in IGNORE_ADDR_LIST:
            continue

        name = record[COMM]
        counts[name] = counts.get(name, 0) + 1

    # sorted() is stable, so equal counts keep their first-seen order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    for name, count in ranked:
        print(count, '\t', name)
80
81
def parse_file(file_name):
    """Parse log file and return as a list of dictionary items.

    The file content is split on DELIM and every non-blank chunk is handed
    to parse_event().

    Args:
        file_name: Path to the log file.

    Returns:
        A list containing per-event information (one dictionary per event).
    """
    with open(file_name, 'r') as file:
        chunks = file.read().split(DELIM)

    # Skip chunks that are empty or consist only of whitespace (for example
    # the newline left over after a trailing delimiter); they carry no log
    # entries and cannot be parsed as an event.
    return [parse_event(chunk) for chunk in chunks if chunk.strip()]
101
102
def parse_event(event):
    """Build the data point for one event.

    An event consists of several log entries; events are separated from each
    other by the '----' delimiter in the log file. Each field parser below
    receives the full event text and adds the fields it understands to a
    shared dictionary.

    Args:
        event: String containing all the logs of a single event.

    Returns:
        A dictionary with the relevant event information.
    """
    record = {}

    # Same order as the fields are listed: event id, syscall, sockaddr.
    for populate in (parse_eventid, parse_syscall_bits, parse_sockaddr_bits):
        populate(event, record)

    return record
125
126
def parse_type(event, log_type):
    """Return the log entry with the desired log type.

    The type of an entry is its first whitespace-separated token, with an
    optional leading 'type=' removed.

    Args:
        event: String of all the log entries associated with a single event.
        log_type: Type of the log entry we're searching for e.g. 'SYSCALL'.

    Returns:
        The first log entry of the given type (should be EXACTLY one), with
        trailing whitespace removed.
        Returns empty string if the type is not found.
    """
    prefix = 'type='

    for line in event.split('\n'):
        entry = line.rstrip()
        tokens = entry.split(None, 1)
        # Blank and whitespace-only lines have no tokens; skip them instead
        # of failing on the index lookup.
        if not tokens:
            continue

        cur_type = tokens[0]
        # Strip 'type=' only where it is a prefix of the token.
        if cur_type.startswith(prefix):
            cur_type = cur_type[len(prefix):]

        if cur_type == log_type:
            return entry

    return ''
148
149
def parse_eventid(event, data_point):
    """Populate EVENTID field of the data_point.

    The id is taken from the first 'audit(...)' stamp in the event: it is the
    number between the last ':' inside the parentheses and the closing ')'.
    If no such stamp with a numeric id is present, data_point is left
    unchanged.

    Args:
        event: String of all the log entries associated with a single event.
        data_point: Dictionary to store the parsed event id into.
    """
    opener = 'audit('

    start = event.find(opener)
    if start == -1:
        return
    start += len(opener)

    end = event.find(')', start)
    if end == -1:
        return

    # The stamp looks like '<timestamp>:<id>'; the timestamp itself may
    # contain ':' characters, so split on the last one.
    _, separator, eventid = event[start:end].rpartition(':')
    if not separator or not eventid.isdecimal():
        return

    data_point[EVENTID] = int(eventid)
158
159
def parse_syscall_bits(event, data_point):
    """Copy the SYSCALL entry's process fields into data_point.

    Does nothing when the event has no SYSCALL entry. Otherwise stores PPID
    and PID as integers, and UID, COMM and EXE as strings.

    Each value is the text between its own ' name=' marker and the marker of
    a field expected to follow it. Markers are located with str.find(), so a
    missing marker yields index -1 and the slice is taken from that index.

    Args:
        event: String of all the log entries associated with a single event.
        data_point: Dictionary that receives the parsed fields.
    """
    record = parse_type(event, SYSCALL)
    if not record:
        return

    def between(opening, closing):
        # Text after the `opening` marker, up to where `closing` starts.
        begin = record.find(opening) + len(opening)
        return record[begin:record.find(closing)]

    data_point[PPID] = int(between(' ppid=', ' pid='))
    data_point[PID] = int(between(' pid=', ' auid='))
    data_point[UID] = between(' uid=', ' gid=')
    data_point[COMM] = between(' comm=', ' exe=')
    data_point[EXE] = between(' exe=', ' subj=')
186
187
def _sockaddr_field(entry, marker):
    """Return the value following marker up to the next space, or None."""
    start = entry.find(marker)
    if start == -1:
        return None
    start += len(marker)

    end = entry.find(' ', start)
    if end == -1:
        # The value is the last thing on the line; take all of it rather
        # than letting a -1 index cut off the final character.
        end = len(entry)
    return entry[start:end]


def parse_sockaddr_bits(event, data_point):
    """Populate the SOCKADDR-related bits into a data point.

    Does nothing when the event has no SOCKADDR entry or the entry has no
    ' fam=' field. Otherwise FAM is always stored; LADDR (string) and LPORT
    (integer) are stored only for the 'inet' and 'inet6' families, and only
    when the respective field is present (and numeric, for the port).

    Args:
        event: String of all the log entries associated with a single event.
        data_point: Dictionary that receives the parsed fields.
    """
    sockaddr_entry = parse_type(event, SOCKADDR)
    if sockaddr_entry == '':
        return

    family = _sockaddr_field(sockaddr_entry, ' fam=')
    if family is None:
        return
    data_point[FAM] = family

    if family not in ('inet', 'inet6'):
        # TODO(zauri): do we need non-inet[6] packets?
        return

    address = _sockaddr_field(sockaddr_entry, ' laddr=')
    if address is not None:
        data_point[LADDR] = address

    port = _sockaddr_field(sockaddr_entry, ' lport=')
    if port is not None and port.isdecimal():
        data_point[LPORT] = int(port)
211
212
# When run as a script, hand the command-line arguments (without the program
# name) to main().
if __name__ == '__main__':
    main(sys.argv[1:])