Fix test runner crash when not in a git repo
[ghc.git] / testsuite / driver / perf_notes.py
1 #!/usr/bin/env python3
2
3 #
4 # (c) Jared Weakly 2017
5 #
6 # This file is a utility to facilitate the comparison of performance
7 # metrics across arbitrary commits. It produces a table comparing
8 # metrics between measurements taken for the given commits in the test
9 # environment (which defaults to 'local' if not given by --test-env).
10 #
11
12 import argparse
13 import re
14 import subprocess
15 import time
16 import sys
17
18 from collections import namedtuple
19 from math import ceil, trunc
20
21 from testutil import passed, failBecause
22
23
24 # Check if "git rev-parse" can be run successfully.
25 # True implies the current directory is a git repo.
26 _inside_git_repo_cache = None
27 def inside_git_repo():
28 global _inside_git_repo_cache
29 if _inside_git_repo_cache is None:
30 try:
31 subprocess.check_call(['git', 'rev-parse', 'HEAD'],
32 stdout=subprocess.DEVNULL)
33 _inside_git_repo_cache = True
34 except subprocess.CalledProcessError:
35 _inside_git_repo_cache = False
36 return _inside_git_repo_cache
37
38 # Check if the worktree is dirty.
39 def is_worktree_dirty():
40 return subprocess.check_output(['git', 'status', '--porcelain']) != b''
41
42 #
43 # Some data access functions. At the moment this uses git notes.
44 #
45
46 # The metrics (a.k.a. stats) are named tuples, PerfStat, in this form:
47 #
48 # ( test_env : 'val', # Test environment.
49 # test : 'val', # Name of the test
50 # way : 'val',
51 # metric : 'val', # Metric being recorded
52 # value : 'val', # The statistic result e.g. runtime
53 # )
54
55 # All the fields of a metric (excluding commit field).
56 PerfStat = namedtuple('PerfStat', ['test_env','test','way','metric','value'])
57
58 # A baseline recovered from stored metrics.
59 Baseline = namedtuple('Baseline', ['perfStat','commit','commitDepth'])
60
61 class MetricChange:
62 NewMetric = 'NewMetric'
63 NoChange = 'NoChange'
64 Increase = 'Increase'
65 Decrease = 'Decrease'
66
67 def parse_perf_stat(stat_str):
68 field_vals = stat_str.strip('\t').split('\t')
69 return PerfStat(*field_vals)
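
# Illustrative example (the test name and values below are invented, not real
# measurements): a raw note line is tab-separated, and parse_perf_stat turns
# it into a PerfStat, e.g.
#
#   parse_perf_stat("local\tT123\tnormal\tmax_bytes_used\t32161912")
#     == PerfStat(test_env='local', test='T123', way='normal',
#                 metric='max_bytes_used', value='32161912')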
70
71 # Get all recorded (in a git note) metrics for a given commit.
72 # Returns an empty array if the note is not found.
73 def get_perf_stats(commit='HEAD', namespace='perf'):
74 try:
75 log = subprocess.check_output(['git', 'notes', '--ref=' + namespace, 'show', commit], stderr=subprocess.STDOUT).decode('utf-8')
76 except subprocess.CalledProcessError:
77 return []
78
79 log = log.strip('\n').split('\n')
80 log = list(filter(None, log))
81 log = [parse_perf_stat(stat_str) for stat_str in log]
82 return log
83
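# For reference, the note read above can also be inspected by hand; the call
# is equivalent to running, e.g.:
#
#   $ git notes --ref=perf show HEAD
#
# where each line of the note is one tab-separated PerfStat record.
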
84 # Check if a str is a 40 character git commit hash.
85 # str -> bool
86 _commit_hash_re = re.compile('[0-9a-f]' * 40)
87 def is_commit_hash(hash):
88 return _commit_hash_re.fullmatch(hash) is not None
89
90 # Convert a <ref> to a commit hash.
91 # str -> str
92 def commit_hash(commit):
93 if is_commit_hash(commit):
94 return commit
95 return subprocess.check_output(['git', 'rev-parse', commit], \
96 stderr=subprocess.STDOUT) \
97 .decode() \
98 .strip()
99
100 # Get allowed changes to performance. This is extracted from the commit message of
101 # the given commit in this form:
102 # Metric (Increase | Decrease) ['metric' | \['metrics',..\]] [\((test_env|way)='abc',...\)]: TestName01, TestName02, ...
103 # Returns a *dictionary* from test name to a *list* of items of the form:
104 # {
105 # 'direction': either 'Increase' or 'Decrease',
106 # 'metrics': ['metricA', 'metricB', ...],
107 # 'opts': {
108 # 'optionA': 'string value',
109 # 'optionB': 'string value', # e.g. test_env: "x86_64-linux"
110 # ...
111 # }
112 # }
113 _get_allowed_perf_changes_cache = {}
114 def get_allowed_perf_changes(commit='HEAD'):
115 global _get_allowed_perf_changes_cache
116 commit = commit_hash(commit)
117 if not commit in _get_allowed_perf_changes_cache:
118 commitByteStr = subprocess.check_output(\
119 ['git', '--no-pager', 'log', '-n1', '--format=%B', commit])
120 _get_allowed_perf_changes_cache[commit] \
121 = parse_allowed_perf_changes(commitByteStr.decode())
122 return _get_allowed_perf_changes_cache[commit]
123
124 def parse_allowed_perf_changes(commitMsg):
125 # Helper regex. Non-capturing unless postfixed with Cap.
126 s = r"(?:\s*\n?\s+)" # Space, possible new line with an indent.
127 qstr = r"(?:'(?:[^'\\]|\\.)*')" # Quoted string.
128 qstrCap = r"(?:'((?:[^'\\]|\\.)*)')" # Quoted string. Captures the string without the quotes.
129 innerQstrList = r"(?:"+qstr+r"(?:"+s+r"?,"+s+r"?"+qstr+r")*)?" # Inside of a list of strings.
130 qstrList = r"(?:\["+s+r"?"+innerQstrList+s+r"?\])" # A list of strings (using box brackets).
131
132 exp = (r"^Metric"
133 +s+r"(Increase|Decrease)"
134 +s+r"?("+qstr+r"|"+qstrList+r")?" # Metric or list of metrics.s..
135 +s+r"?(\(" + r"(?:[^')]|"+qstr+r")*" + r"\))?" # Options surrounded in parenthesis. (allow parenthases in quoted strings)
136 +s+r"?:?" # Optional ":"
137 +s+r"?((?:(?!\n\n)(?!\n[^\s])(?:.|\n))*)" # Test names. Stop parsing on empty or non-indented new line.
138 )
139
140 matches = re.findall(exp, commitMsg, re.M)
141 changes = {}
142 for (direction, metrics_str, opts_str, tests_str) in matches:
143 tests = re.findall(r"(\w+)", tests_str)
144 for test in tests:
145 changes.setdefault(test, []).append({
146 'direction': direction,
147 'metrics': re.findall(qstrCap, metrics_str),
148 'opts': dict(re.findall(r"(\w+)"+s+r"?="+s+r"?"+qstrCap, opts_str))
149 })
150
151 return changes
152
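# A sketch of the expected input/output (test names and values here are
# hypothetical, purely for illustration). Given a commit message containing:
#
#   Metric Increase 'bytes allocated' (test_env='x86_64-linux-deb9'):
#       T123
#       T456
#
# parse_allowed_perf_changes returns roughly:
#
#   { 'T123': [{ 'direction': 'Increase',
#                'metrics': ['bytes allocated'],
#                'opts': {'test_env': 'x86_64-linux-deb9'} }],
#     'T456': [ ...the same entry... ] }
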
153 # Calculates a suggested string to append to the git commit message in order
154 # to accept the given changes.
155 # changes: [(MetricChange, PerfStat)]
156 def allow_changes_string(changes):
157 Dec = MetricChange.Decrease
158 Inc = MetricChange.Increase
159
160 # We only care about increase / decrease metrics.
161 changes = [change for change in changes if change[0] in [Inc, Dec]]
162
163 # Map tests to a map from change direction to metrics.
164 test_to_dir_to_metrics = {}
165 for (change, perf_stat) in changes:
166 change_dir_to_metrics = test_to_dir_to_metrics.setdefault(perf_stat.test, { Inc: [], Dec: [] })
167 change_dir_to_metrics[change].append(perf_stat.metric)
168
169 # Split into 3 groups.
170 # Tests where all changes are *increasing*.
171 # Tests where all changes are *decreasing*.
172 # Tests where changes are *mixed* increasing and decreasing.
173 groupDec = []
174 groupInc = []
175 groupMix = []
176 for (test, decsAndIncs) in test_to_dir_to_metrics.items():
177 decs = decsAndIncs[Dec]
178 incs = decsAndIncs[Inc]
179 if decs and incs:
180 groupMix.append(test)
181 elif not decs:
182 groupInc.append(test)
183 else:
184 groupDec.append(test)
185
186 msgs = []
187 nltab = '\n '
188
189 # Decreasing group.
190 if groupDec:
191 msgs.append('Metric Decrease:' + nltab + nltab.join(groupDec))
192
193 # Increasing group.
194 if groupInc:
195 msgs.append('Metric Increase:' + nltab + nltab.join(groupInc))
196
197 # Mixed group.
198 if groupMix:
199 # Split mixed group tests by decrease/increase, then by metric.
200 dir_to_metric_to_tests = {
201 Dec: {},
202 Inc: {}
203 }
204 for test in groupMix:
205 for change_dir, metrics in test_to_dir_to_metrics[test].items():
206 for metric in metrics:
207 dir_to_metric_to_tests[change_dir].setdefault(metric, []).append(test)
208
209 for change_dir in [Dec, Inc]:
210 metric_to_tests = dir_to_metric_to_tests[change_dir]
211 for metric in sorted(metric_to_tests.keys()):
212 tests = metric_to_tests[metric]
213 msgs.append('Metric ' + change_dir + ' \'' + metric + '\':' + nltab + nltab.join(tests))
214
215 return '\n\n'.join(msgs)
216
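# As a rough example (hypothetical tests): if T1 only increased and T2 only
# decreased, allow_changes_string produces something like
#
#   Metric Decrease:
#       T2
#
#   Metric Increase:
#       T1
#
# which can be pasted into a commit message to accept the changes.
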
217 # Formats a list of metrics into a string. Used e.g. to save metrics to a file or git note.
218 def format_perf_stat(stats):
219 # If a single stat, convert to a singleton list.
220 if not isinstance(stats, list):
221 stats = [stats]
222
223 return "\n".join(["\t".join([str(stat_val) for stat_val in stat]) for stat in stats])
224
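# Note: for a single PerfStat, format_perf_stat yields the tab-separated line
# form that parse_perf_stat above reads back (field values come back as
# strings).
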
225 # Appends a list of metrics to the git note of the given commit.
226 # Tries up to max_tries times to write to git notes should it fail for some reason.
227 # Each retry will wait 1 second.
228 # Returns True if the note was successfully appended.
229 def append_perf_stat(stats, commit='HEAD', namespace='perf', max_tries=5):
230 # Append to git note
231 print('Appending ' + str(len(stats)) + ' stats to git notes.')
232 stats_str = format_perf_stat(stats)
233 def try_append():
234 try:
235 return subprocess.check_output(['git', 'notes', '--ref=' + namespace, 'append', commit, '-m', stats_str])
236 except subprocess.CalledProcessError:
237 return b'Git - fatal'
238
239 tries = 0
240 while tries < max_tries:
241 if not b'Git - fatal' in try_append():
242 return True
243 tries += 1
244 time.sleep(1)
245
246 print("\nAn error occurred while writing the performance metrics to git notes.\n \
247 This is usually due to a lock-file existing somewhere in the git repo.")
248
249 return False
250
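# For reference, each attempt above is equivalent to running, e.g.:
#
#   $ git notes --ref=perf append HEAD -m "<tab-separated stat lines>"
#
# so a stale lock file somewhere under the git repo can make every retry fail.
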
251 #
252 # Baseline calculation
253 #
254
255 # Max number of ancestor commits to search when compiling a baseline performance metric.
256 BaselineSearchDepth = 75
257
258 # The git notes name space for local results.
259 LocalNamespace = "perf"
260
261 # The git notes name space for ci results.
262 CiNamespace = "ci/" + LocalNamespace
263
264 # (isCalculated, best fit ci test_env or None)
265 BestFitCiTestEnv = (False, None)
266
267 # test_env string or None
268 def best_fit_ci_test_env():
269 global BestFitCiTestEnv
270 if not BestFitCiTestEnv[0]:
271 platform = sys.platform
272 isArch64 = sys.maxsize > 2**32
273 arch = "x86_64" if isArch64 else "i386"
274
275 if platform.startswith("linux"):
276 test_env = arch + "-linux-deb9"
277 elif platform.startswith("win32"):
278 # There are no windows CI test results.
279 test_env = None
280 elif isArch64 and platform.startswith("darwin"):
281 test_env = arch + "-darwin"
282 elif isArch64 and platform.startswith("freebsd"):
283 test_env = arch + "-freebsd"
284 else:
285 test_env = None
286
287 BestFitCiTestEnv = (True, test_env)
288
289 return BestFitCiTestEnv[1]
290
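# For example, on a 64-bit Linux machine the best fit ci test_env above is
# "x86_64-linux-deb9"; on Windows it is None, since there are no windows CI
# test results.
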
291 _baseline_depth_commit_log = {}
292
293 # Get the commit hashes for the last BaselineSearchDepth commits from and
294 # including the input commit. The output commits are all commit hashes.
295 # str -> [str]
296 def baseline_commit_log(commit):
297 global _baseline_depth_commit_log
298 commit = commit_hash(commit)
299 if not commit in _baseline_depth_commit_log:
300 _baseline_depth_commit_log[commit] = \
301 subprocess.check_output(['git', 'log', '--format=%H', \
302 '-n' + str(BaselineSearchDepth), commit]) \
303 .decode().split('\n')
304 return _baseline_depth_commit_log[commit]
305
306 # Cache of baseline values. This is a dict of dicts indexed on:
307 # (useCiNamespace, commit) -> (test_env, test, metric, way) -> baseline
308 # (bool , str ) -> (str , str , str , str) -> float
309 _commit_metric_cache = {}
310
311 # Get the baseline (expected value) of a test at a given commit. This searches
312 # git notes from older commits for recorded metrics (locally and from ci). More
313 # recent commits are favoured, and local results are favoured over ci results.
314 #
315 # commit: str - must be a commit hash (see commit_hash())
316 # name: str - test name
317 # test_env: str - test environment (note a best fit test_env will be used
318 # instead when looking for ci results)
319 # metric: str - test metric
320 # way: str - test way
321 # returns: the Baseline named tuple or None if no metric was found within
322 # BaselineSearchDepth commits and since the last expected change.
323 def baseline_metric(commit, name, test_env, metric, way):
324 # For performance reasons (in order to avoid calling commit_hash), we assert
325 # commit is already a commit hash.
326 assert is_commit_hash(commit)
327
328 # Get all recent commit hashes.
329 commit_hashes = baseline_commit_log(commit)
330 def depth_to_commit(depth):
331 return commit_hashes[depth]
332
333 def has_expected_change(commit):
334 return get_allowed_perf_changes(commit).get(name) \
335 is not None
336
337 # Bool -> String
338 def namespace(useCiNamespace):
339 return CiNamespace if useCiNamespace else LocalNamespace
340
341 ci_test_env = best_fit_ci_test_env()
342
343 # gets the metric of a given commit
344 # (Bool, Int) -> (float | None)
345 def commit_metric(useCiNamespace, depth):
346 global _commit_metric_cache
347 currentCommit = depth_to_commit(depth)
348
349 # Get test environment.
350 effective_test_env = ci_test_env if useCiNamespace else test_env
351 if effective_test_env is None:
352 # This can happen when no best fit ci test is found.
353 return None
354
355 # Check for cached value.
356 cacheKeyA = (useCiNamespace, currentCommit)
357 cacheKeyB = (effective_test_env, name, metric, way)
358 if cacheKeyA in _commit_metric_cache:
359 return _commit_metric_cache[cacheKeyA].get(cacheKeyB)
360
361 # Cache miss.
362 # Calculate baselines from the current commit's git note.
363 # Note that the git note may contain data for other tests. All tests'
364 # baselines will be collected and cached for future use.
365 allCommitMetrics = get_perf_stats(
366 currentCommit,
367 namespace(useCiNamespace))
368
369 # Collect recorded values by cacheKeyB.
370 values_by_cache_key_b = {}
371 for perfStat in allCommitMetrics:
372 currentCacheKey = (perfStat.test_env, perfStat.test, \
373 perfStat.metric, perfStat.way)
374 currentValues = values_by_cache_key_b.setdefault(currentCacheKey, [])
375 currentValues.append(float(perfStat.value))
376
377 # Calculate the baseline (average of values) for each cacheKeyB.
378 baseline_by_cache_key_b = {}
379 for currentCacheKey, currentValues in values_by_cache_key_b.items():
380 baseline_by_cache_key_b[currentCacheKey] = Baseline( \
381 PerfStat( \
382 currentCacheKey[0],
383 currentCacheKey[1],
384 currentCacheKey[3],
385 currentCacheKey[2],
386 sum(currentValues) / len(currentValues)),
387 currentCommit,
388 depth)
389
390 # Save baselines to the cache.
391 _commit_metric_cache[cacheKeyA] = baseline_by_cache_key_b
392 return baseline_by_cache_key_b.get(cacheKeyB)
393
394 # Searches through previous commits, trying local then ci results for each commit in turn.
395 def search(useCiNamespace, depth):
396 # Stop if reached the max search depth, or if
397 # there is an expected change at the child commit (depth-1). This is a
398 # subtlety: Metrics recorded on commit x incorporate the expected
399 # changes for commit x. Hence metrics from x are still a valid baseline,
400 # while older commits are not. This is why we check for expected changes
401 # on depth-1 rather than depth.
402 if depth >= BaselineSearchDepth or has_expected_change( \
403 depth_to_commit(depth - 1)):
404 return None
405
406 # Check for a metric on this commit.
407 current_metric = commit_metric(useCiNamespace, depth)
408 if current_metric is not None:
409 return current_metric
410
411 # Metric is not available.
412 # If tried local, now try CI. Else move to the parent commit.
413 if not useCiNamespace:
414 return search(True, depth)
415 else:
416 return search(False, depth + 1)
417
418 # Start search from parent commit using local name space.
419 return search(False, 1)
420
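# Illustrative search order for baseline_metric (not executable code): starting
# at the parent commit, the lookup tries
#
#   (local, HEAD~1), (ci, HEAD~1), (local, HEAD~2), (ci, HEAD~2), ...
#
# and stops at the first recorded metric, at BaselineSearchDepth, or once the
# child of the commit being inspected declares an expected change for the test.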
421
422 # Check test stats. This prints the results for the user.
423 # actual: the PerfStat with actual value.
424 # baseline: the expected Baseline value (this should generally be derived from baseline_metric())
425 # tolerance_dev: allowed deviation of the actual value from the expected value.
426 # allowed_perf_changes: allowed changes in stats. This is a dictionary as returned by get_allowed_perf_changes().
427 # force_print: Print stats even if the test stat was in the tolerance range.
428 # Returns a (MetricChange, pass/fail object) tuple. Passes if the stats are within the expected value ranges.
429 def check_stats_change(actual, baseline, tolerance_dev, allowed_perf_changes = {}, force_print = False):
430 expected_val = baseline.perfStat.value
431 full_name = actual.test + ' (' + actual.way + ')'
432
433 lowerBound = trunc( int(expected_val) * ((100 - float(tolerance_dev))/100))
434 upperBound = trunc(0.5 + ceil(int(expected_val) * ((100 + float(tolerance_dev))/100)))
435
436 actual_dev = round(((float(actual.value) * 100)/ int(expected_val)) - 100, 1)
437
438 # Find the direction of change.
439 change = MetricChange.NoChange
440 if actual.value < lowerBound:
441 change = MetricChange.Decrease
442 elif actual.value > upperBound:
443 change = MetricChange.Increase
444
445 # Is the change allowed?
446 allowed_change_directions = [MetricChange.NoChange] + [ allow_stmt['direction']
447 for allow_stmt in allowed_perf_changes.get(actual.test, [])
448
449 # The list of metrics is not specified, or the metric is in the list of metrics.
450 if not allow_stmt['metrics'] or actual.metric in allow_stmt['metrics']
451
452 # way/test are not specified, or match the actual way/test.
453 if ((not 'way' in allow_stmt['opts'].keys()) or actual.way == allow_stmt['opts']['way'])
454 if ((not 'test_env' in allow_stmt['opts'].keys()) or actual.test_env == allow_stmt['opts']['test_env'])
455 ]
456 change_allowed = change in allowed_change_directions
457
458 # Print errors and create pass/fail object.
459 result = passed()
460 if not change_allowed:
461 error = change + ' from ' + baseline.perfStat.test_env + \
462 ' baseline @ HEAD~' + str(baseline.commitDepth)
463 print(actual.metric, error + ':')
464 result = failBecause('stat ' + error, tag='stat')
465
466 if not change_allowed or force_print:
467 length = max(len(str(x)) for x in [expected_val, lowerBound, upperBound, actual.value])
468
469 def display(descr, val, extra):
470 print(descr, str(val).rjust(length), extra)
471
472 display(' Expected ' + full_name + ' ' + actual.metric + ':', expected_val, '+/-' + str(tolerance_dev) + '%')
473 display(' Lower bound ' + full_name + ' ' + actual.metric + ':', lowerBound, '')
474 display(' Upper bound ' + full_name + ' ' + actual.metric + ':', upperBound, '')
475 display(' Actual ' + full_name + ' ' + actual.metric + ':', actual.value, '')
476 if actual.value != expected_val:
477 display(' Deviation ' + full_name + ' ' + actual.metric + ':', actual_dev, '%')
478
479 return (change, result)
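
# A worked example of the bounds above (numbers invented): with an expected
# value of 1000 and tolerance_dev = 5, lowerBound = trunc(1000 * 0.95) = 950
# and upperBound = trunc(0.5 + ceil(1000 * 1.05)) = 1050, so an actual value
# of 1100 is reported as an Increase with a deviation of +10.0%.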
480
481 if __name__ == '__main__':
482 parser = argparse.ArgumentParser()
483 parser.add_argument("--test-env",
484 help="The given test environment to be compared.")
485 parser.add_argument("--test-name",
486 help="If given, filters table to include only \
487 tests matching the given regular expression.")
488 parser.add_argument("--add-note", nargs=3,
489 help="Development only. --add-note N commit seed \
490 Adds N fake metrics to the given commit using the random seed.")
491 parser.add_argument("commits", nargs=argparse.REMAINDER,
492 help="The rest of the arguments will be the commits that will be used.")
493 args = parser.parse_args()
494
495 env = 'local'
496 name = re.compile('.*')
497 # metrics is a list of CommitAndStat tuples: (str commit, PerfStat stat)
498 CommitAndStat = namedtuple('CommitAndStat', ['commit', 'stat'])
499 metrics = []
500 singleton_commit = len(args.commits) == 1
501
502 #
503 # Main logic of the program when called from the command-line.
504 #
505
506 if args.commits:
507 for c in args.commits:
508 metrics += [CommitAndStat(c, stat) for stat in get_perf_stats(c)]
509
510 if args.test_env:
511 metrics = [test for test in metrics if test.stat.test_env == args.test_env]
512
513 if args.test_name:
514 nameRe = re.compile(args.test_name)
515 metrics = [test for test in metrics if nameRe.search(test.stat.test)]
516
517 if args.add_note:
518 def note_gen(n, commit, delta=''):
519 note = []
520 # Generates simple fake data. Likely not comprehensive enough to catch all edge cases.
521 if not delta:
522 note.extend([PerfStat('local', 'T'+ str(i*100), 'some_way', 'some_field', str(i*1000)) for i in range(1,int(int(n)/2)+1)])
523 note.extend([PerfStat('non-local', 'W'+ str(i*100), 'other_way', 'other_field', str(i*100)) for i in range(int(int(n)/2)+1,int(n)+1)])
524 if delta:
525 hu = abs(hash(delta))
526 hv = abs(hash(hu))
527 u = int(hu % 100)
528 v = int(hv % 10)
529 note.extend([PerfStat('local', 'T'+ str(i*100), 'some_way', 'some_field', str(i*u)) for i in range(1,int(int(n)/2)+1)])
530 note.extend([PerfStat('non-local', 'W'+ str(i*100), 'other_way', 'other_field', str(i*v)) for i in range(int(int(n)/2)+1,int(n)+1)])
531
532 append_perf_stat(note, commit)
533
534 note_gen(args.add_note[0],args.add_note[1],args.add_note[2])
535
536 #
537 # String utilities for pretty-printing
538 #
539
540 row_fmt = '{:18}' * len(args.commits)
541 commits = row_fmt.format(*[c[:10] for c in args.commits])
542
543 def cmtline(insert):
544 return row_fmt.format(*[insert for c in args.commits]).strip()
545
546 def header(unit):
547 first_line = "{:27}{:30}".format(' ',' ') + cmtline(unit)
548 second_line = ("{:27}{:30}".format('Test','Metric') + commits).strip()
549
550 # Test Metric c1 c2 c3 ...
551 print("-" * (len(second_line)+1))
552 print(first_line)
553 print(second_line)
554 print("-" * (len(second_line)+1))
555
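# Roughly, the resulting table looks like this (illustrative commits and
# values, column spacing approximate):
#
#   -----------------------------------------------------------------------
#                                                    commit            commit
#   Test                       Metric                abc1234567        def5678901
#   -----------------------------------------------------------------------
#   T123                       max_bytes_used        32161912.0        31871040.0
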
556 def commit_string(test, flag):
557 def delta(v1, v2):
558 return round((100 * (v1 - v2)/v2),2)
559
560 # Get the average value per commit (or None if that commit contains no metrics).
561 # Note: if the test environment is not set, this will combine metrics from all test environments.
562 averageValuesOrNones = []
563 for commit in args.commits:
564 values = [float(t.stat.value) for t in metrics if t.commit == commit and t.stat.test == test]
565 if values == []:
566 averageValuesOrNones.append(None)
567 else:
568 averageValuesOrNones.append(sum(values) / len(values))
569
570 if flag == 'metrics':
571 strings = [str(v) if v is not None else '-' for v in averageValuesOrNones]
572 if flag == 'percentages':
573 # If the baseline commit has no stats, then we cannot produce any percentages.
574 baseline = averageValuesOrNones[0]
575 if baseline is None:
576 strings = ['-' for v in averageValuesOrNones]
577 else:
578 baseline = float(baseline)
579 strings = ['-' if val == None else str(delta(baseline,float(val))) + '%' for val in averageValuesOrNones]
580
581 return row_fmt.format(*strings).strip()
582
583 #
584 # The pretty-printed output
585 #
586
587 header('commit')
588 # Printing out metrics.
589 all_tests = sorted(set([(test.stat.test, test.stat.metric) for test in metrics]))
590 for test, metric in all_tests:
591 print("{:27}{:30}".format(test, metric) + commit_string(test,'metrics'))
592
593 # Has no meaningful output if there is no commit to compare to.
594 if not singleton_commit:
595 header('percent')
596
597 # Printing out percentages.
598 for test, metric in all_tests:
599 print("{:27}{:30}".format(test, metric) + commit_string(test,'percentages'))