#! /usr/bin/env python
# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from mtlib.log import Log, FeedbackDownloader, FeedbackLog
from mtreplay import MTReplay
import collections
import json
import multiprocessing
import os
import random
import re
import traceback
import urllib

script_dir = os.path.dirname(os.path.realpath(__file__))
log_dir = os.path.join(script_dir, '../cache/logs/')
if not os.path.exists(log_dir):
  os.mkdir(log_dir)


def SortedDict(d):
  return collections.OrderedDict(sorted(d.items(), key=lambda t: t[0]))
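# e.g. SortedDict({'b': 1, 'a': 2}) == OrderedDict([('a', 2), ('b', 1)])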


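# multiprocessing.Pool needs picklable, top-level callables, so these
# module-level wrappers forward the work to MTStat methods.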
def MTStatSearchSubprocess(params):
  return MTStat().FindMatchesInFile(params[0], params[1])


def MTStatDownloadSubprocess(id):
  return MTStat().DownloadFile(id)


class SearchMatch(object):
  # Example: MTStat:124.321:Key=Value
  mtstat_regex = re.compile(r'MTStat:([0-9]+\.[0-9]+):(\w+)([=:])(\w+)')
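  # Captured groups: (1) timestamp '124.321', (2) key 'Key',
  # (3) operator '=' or ':', (4) value 'Value'.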

  def __init__(self, line, file):
    self.line = line
    self.file = file

    self.key = None
    self.value = None
    self.operator = None
    self.timestamp = None
    self.ismtstat = False

    match = SearchMatch.mtstat_regex.search(self.line)
    if match:
      self.timestamp = float(match.group(1))
      self.key = match.group(2)
      self.operator = match.group(3)
      self.value = match.group(4)
      self.ismtstat = True


class MTStat(object):

  search_operators = {
      ">": lambda a, b: float(a) > float(b),
      ">=": lambda a, b: float(a) >= float(b),
      "<": lambda a, b: float(a) < float(b),
      "<=": lambda a, b: float(a) <= float(b),
      "=": lambda a, b: str(a) == str(b),
      "==": lambda a, b: str(a) == str(b),
      "!=": lambda a, b: str(a) != str(b)
  }
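  # e.g. MTStat.search_operators['>=']('5.0', '3') evaluates to True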

  def FindMatchesInFile(self, filename, regex_str=None):
    """Searches for all MTStat entries in a log file.

    Replays a log file and searches for MTStat log entries.
    Returns two values: a list of SearchMatches and the number of
    SYN reports that have been processed.
    """
    try:
      log = Log(filename)

      # find platform for file
      replay = MTReplay()
      platform = replay.PlatformOf(log)
      if not platform:
        print 'No platform for %s' % os.path.basename(filename)
        return ([], 0)

      # count the number of SYN reports in the log file. In the evdev
      # text log a SYN_REPORT event is rendered as type/code/value
      # "0000 0000 0", so counting those occurrences approximates the
      # number of event frames.
      num_syn = len(tuple(re.finditer("0000 0000 0", log.evdev)))

      # run replay
      result = replay.Replay(log)

      # by default we search for MTStat lines
      if regex_str:
        regex = re.compile(regex_str)
      else:
        regex = SearchMatch.mtstat_regex

      # find all lines matching the regex
      stats = []
      for line in result.gestures_log.splitlines():
        if regex.search(line):
          stats.append(SearchMatch(line, filename))
      return (stats, num_syn)
    except Exception:
      print filename, traceback.format_exc()
      return ([], 0)

  def _FindAllMatches(self, number, parallel, search_regex=None):
    # make sure the replay binaries are up to date
    MTReplay().Recompile()

    # list all log files
    files = [os.path.abspath(os.path.join(log_dir, f))
             for f in os.listdir(log_dir)
             if f.isdigit()]

    # randomly select a subset of files
    if number is not None:
      files = random.sample(files, number)

    # arguments for MTStatSearchSubprocess
    parameters = [(file, search_regex) for file in files]

    print "Processing %d log files" % len(files)

    # process all files, either in parallel or sequentially
    if parallel:
      pool = multiprocessing.Pool()
      pool_results = pool.map(MTStatSearchSubprocess, parameters)
      pool.terminate()
    else:
      pool_results = map(MTStatSearchSubprocess, parameters)

    # merge the results from each file into one big list
    entries = []
    syn_count = 0
    for file_entries, file_syn_count in pool_results:
      syn_count = syn_count + file_syn_count

      # the '=' operator is used to make only the latest
      # entry of each file count.
      file_uniques = {}
      for entry in file_entries:
        if entry.operator == '=':
          file_uniques[entry.key] = entry
        else:
          entries.append(entry)

      # add the last value of each unique entry
      for value in file_uniques.values():
        entries.append(value)

    # SYN reports arrive at approximately 60 Hz on most platforms
    syn_per_second = 60.0
    hours = syn_count / syn_per_second / 60.0 / 60.0
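    # worked example: 216000 SYN reports / 60 Hz = 3600 s, i.e. ~1.00 hours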

    print "Processed ~%.2f hours of interaction" % hours

    return entries

  def Search(self, query=None, regex=None, number=None, parallel=True):
    """Searches for occurrences of a specific tag or regex.

    Specify either a 'query' or a 'regex'. Queries are formatted
    in a simple "key operator value" format. For example:
    "MyKey > 5" will return all matches where MyKey has a value
    greater than 5.
    Supported operators are: >, <, >=, <=, =, !=

    number: optional number of random reports to use
    parallel: use parallel processing

    Returns a dictionary of lists containing the matches
    for each file.
    """
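    # Hypothetical usage (assumes logs are already cached):
    #   entries_by_file = MTStat().Search(query='FingerCount >= 2')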
    entries = self._FindAllMatches(number, parallel, search_regex=regex)

    if query:
      match = re.match(r"\s*(\w+)\s*([<>=!]+)\s*([0-9a-zA-Z]+)\s*", query)
      if not match:
        print query, "is not a valid query"
        return {}
      search_key = match.group(1)
      search_op = MTStat.search_operators[match.group(2)]
      search_value = match.group(3)

    entries_by_file = collections.defaultdict(list)
    for entry in entries:
      if query is None or (entry.key == search_key and
                           search_op(entry.value, search_value)):
        entries_by_file[entry.file].append(entry)
    return entries_by_file

  def GatherStats(self, number=None, parallel=True, num_bins=10):
    """Gathers stats on feedback reports.

    Returns a dictionary with a histogram for each recorded key.
    """
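    # Shape of the result (hypothetical keys and counts):
    #   {'FingerCount': OrderedDict([('<2', 10), ('<4', 3)]), ...}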
    entries = self._FindAllMatches(number, parallel)

    # gather values for each key in a list
    value_collection = collections.defaultdict(list)
    for entry in entries:
      value_collection[entry.key].append(entry.value)

    # build histograms
    histograms = {}
    for key, values in value_collection.items():
      histograms[key] = self._Histogram(values, num_bins)
    return SortedDict(histograms)

  def _Histogram(self, values, num_bins):
    def RawHistogram(values):
      return SortedDict(collections.Counter(values))

    # convert all items to integers
    integers = []
    for value in values:
      try:
        integers.append(int(value))
      except ValueError:
        # not an integer; fall back to counting the raw values
        return RawHistogram(values)

    # don't condense lists that are already small enough
    if len(set(integers)) <= num_bins:
      return RawHistogram(integers)

    # all values are integers; use bins for the histogram
    histogram = collections.OrderedDict()
    integers = sorted(integers)

    # calculate the bin size (add one at the end to include the last value)
    begin = integers[0]
    end = integers[-1] + 1
    bin_size = float(end - begin) / float(num_bins)
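    # e.g. integer values spanning [0, 100) with num_bins=10 yield
    # bin_size=10.0 and histogram keys "<10", "<20", ..., "<100"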

    # remove each bin's integers from the list and count them
    for i in range(num_bins):
      high_v = round((i + 1) * bin_size) + begin

      filtered = filter(lambda value: value >= high_v, integers)
      histogram["<%d" % high_v] = len(integers) - len(filtered)
      integers = filtered
    return histogram

  def DownloadFile(self, id):
    """Download one feedback log into the cache."""
    downloader = FeedbackDownloader()

    filename = os.path.join(log_dir, id)
    if os.path.exists(filename):
      print 'Skipping existing report', id
      return

    print 'Downloading new report', id
    try:
      # might throw IO/Tar/Zip/etc exceptions
      report = FeedbackLog(id, force_latest='pad')
      # Test parse; will throw an exception on a malformed log
      json.loads(report.activity)
    except Exception:
      print 'Invalid report %s' % id
      return

    # check if the report contains logs and actual events
    if report.activity and report.evdev and 'E:' in report.evdev:
      report.SaveAs(filename)
    else:
      print 'Invalid report %s' % id

  def Download(self, num, offset=0, parallel=True):
    """Download 'num' new feedback logs into the cache."""
    downloader = FeedbackDownloader()

    # download a list of feedback report ids
    params = {
        '$limit': str(num),
        '$offset': str(offset),
        'mapping': ':list',
        'productId': '208'  # ChromeOS
    }
    url = ('https://feedback.corp.google.com/resty/ReportSearch?' +
           urllib.urlencode(params))
    data = downloader.DownloadFile(url)
    data = data[data.find('{'):]  # strip any prefix before the JSON payload

    reports_json = json.loads(data)
    report_ids = [item['id'] for item in reports_json['results']]

    # download and check each report
    if parallel:
      pool = multiprocessing.Pool()
      results = pool.map(MTStatDownloadSubprocess, report_ids)
      pool.terminate()
    else:
      results = map(MTStatDownloadSubprocess, report_ids)