#!/usr/bin/env python2
import time, re, os, sys

# Compiler executable to invoke.  The GCC environment variable overrides the
# default; an unset or empty variable falls back to 'gcc'.
GCC = os.environ.get('GCC') or 'gcc'
# Value of the BENCH_VAL environment variable (kept as a string), falling
# back to '1000000' when unset or empty.
BENCH_VAL = os.environ.get('BENCH_VAL') or '1000000'

def combinator(args):
    """Generate every selection of at most one option from each group.

    ``args`` is a sequence of tuples of option strings.  From each group
    either nothing or exactly one member is picked, and each selection is
    yielded as a tuple in group order.  The last group varies fastest and
    the empty selection ``()`` is yielded first.
    """
    if not args:
        # No groups left: the only possible selection is the empty one.
        yield ()
        return
    first_group, remaining_groups = args[0], args[1:]
    for pick in (None,) + first_group:
        for tail in combinator(remaining_groups):
            # Falsy entries (the "nothing picked" None) are dropped.
            yield tuple(opt for opt in (pick,) + tail if opt)

# Groups of gcc options to explore.  combinator() picks at most one option
# from each group, so a group of k options contributes k + 1 choices
# (the extra one being "none of them").
args = [
    ('-O2', '-O3', '-O4'),
    ('-fomit-frame-pointer',),
    ('-m64',),
    ('-march=core2', '-march=pentium4', '-march=opteron'),
    ('-ftracer',),
    ('-ffast-math',),
    ('-funroll-loops', '-funroll-all-loops'),
    ('-finline-limit=999', '-finline-limit=999999'),
    ('-fwhole-program',),
]

# Timing line searched for in the benchmark binary's output; group 1 is the
# number that precedes "us".
R_BENCH = re.compile(r'processed in ([0-9]+) us')
# Compiler complaint about an option it does not know; group 1 is the option.
# The opening quote may be ", ` or ' (the closing one " or '), so messages
# that quote the option as '...' are recognised as well.  The hyphenated
# spelling "command-line" is accepted alongside "command line".
R_BADCMD = re.compile(r'(?:unrecognized option|unrecognized command[- ]line option) ["`\']([^"\']+)["\']')
# "bad value" complaint for a -m<something>= switch; only the value "core2"
# is matched, and it is captured as group 1.
R_BADOPT = re.compile(r'error: bad value \((core2)\) for -m\w+= switch')
# Substring of the compile output that appears when -m64 is combined with a
# CPU selection that cannot do x86-64.
NOT64 = 'CPU you selected does not support x86-64 instruction set'

def getstatusoutput(cmd):
    """Run ``cmd`` through the shell and return ``(status, output)``.

    ``cmd`` may be a string or a list/tuple of words; a sequence is joined
    with single spaces (no quoting is applied).  stderr is merged into
    stdout.  ``status`` is whatever closing the pipe reports, with ``None``
    (success) mapped to 0, and ``output`` has at most one trailing newline
    removed.
    """
    command = ' '.join(cmd) if isinstance(cmd, (list, tuple)) else cmd

    # The braces group the command so the redirection covers all of it.
    stream = os.popen('{ ' + command + '; } 2>&1', 'r')
    output = stream.read()
    status = stream.close()
    if status is None:
        status = 0
    if output.endswith('\n'):
        output = output[:-1]
    return status, output

def get_total_runs():
    """Return how many option combinations ``args`` expands to.

    Each group contributes one choice per option plus one for "none", and
    the counts of all groups are multiplied together.
    """
    choices_per_group = [len(group) + 1 for group in args]
    total = 1
    for choices in choices_per_group:
        total = total * choices
    return total

def tval(x):
    """Format a duration of ``x`` seconds as zero-padded ``MM:SS``.

    Minutes are not wrapped into hours, so long durations simply show more
    than two minute digits.
    """
    minutes = x / 60
    seconds = x % 60
    return '%02d:%02d' % (minutes, seconds)

def which_cpu(opts):
    """Return the first option in ``opts`` that starts with -mcpu or -march.

    Raises IndexError when ``opts`` contains no such option.
    """
    cpu_opts = [opt for opt in opts if opt.startswith(('-mcpu', '-march'))]
    return cpu_opts[0]

def _diagnose_compile_failure(opts, output):
    """Return the set of options that a failed compile's output blames.

    Members are either a single option string (drop every combination that
    contains it) or a ``(cpu_option, '-m64')`` pair (drop only combinations
    that contain both).  The set is empty when nothing is recognised.
    """
    cull = set()
    for match in R_BADCMD.finditer(output):
        cull.add(match.group(1))
    # Bad -mcpu sets -march and/or -mtune, so be more lenient on that error.
    for match in R_BADOPT.finditer(output):
        bad_value = match.group(1)
        cull.update(opt for opt in opts if bad_value in opt)
    if NOT64 in output:
        # hrmm. this one is interesting.
        assert '-m64' in opts
        cull.add((which_cpu(opts), '-m64'))
    if ('64-bit mode not compiled in' in output or
            'stubs-64.h: No such file' in output):
        cull.add('-m64')
    return cull


def do_timerun(n, total, opts, benchval):
    """Compile runner.c with ``opts``, run it once and report its timing.

    n        -- 1-based number of this run, used for progress output and for
                the time-remaining estimate.
    total    -- number of runs expected in this batch.
    opts     -- sequence of extra compiler options.
    benchval -- value passed to the compile as -DBENCH_VAL.

    Returns the timing parsed from the benchmark's output as a float when
    the build and the run both succeed.  When the compile fails, returns the
    set of options to cull (possibly empty).  When the run fails or prints
    no timing line, returns an empty set so callers can treat every failure
    the same way.
    """
    sys.stdout.write("Testing %5d/%d...    " % (n, total))
    sys.stdout.flush()

    cmd = (GCC, '-DSINGLERUN', '-DBENCH_VAL='+str(benchval), '-DTESTING',
           '-o', 'runner_temp', 'runner.c') + tuple(opts)
    (rc, output) = getstatusoutput(cmd)
    if rc != 0:
        # Try to figure out why, whether we need to cull args
        cull = _diagnose_compile_failure(opts, output)
        if cull:
            sys.stdout.write("compile fail: %r %r\n" % (rc, cull))
        else:
            sys.stdout.write("compile fail: %d in %r\n" % (rc, output))
        return cull

    sys.stdout.write("run... ")
    sys.stdout.flush()

    (rc, output) = getstatusoutput('./runner_temp')
    if rc != 0:
        sys.stdout.write("run fail: %d\n" % (rc,))
        return set()

    m = R_BENCH.search(output)
    if m is None:
        # The binary exited cleanly but printed nothing we can parse.
        sys.stdout.write("run fail: no timing in %r\n" % (output,))
        return set()

    speed = float(m.group(1))
    elapsed = time.time() - START_TIME
    # Assume the remaining runs take as long, on average, as those so far.
    remaining = elapsed * (total-n) / n
    sys.stdout.write("%s %s remain\n" % (speed, tval(remaining)))
    return speed
# The time-remaining estimate in do_timerun() assumes the runs still to come
# take as long, on average, as the runs so far.  With n runs done out of d:
#
#  elapsed     remaining
#  -------  =  ---------
#     n          (d-n)
#
# Wall-clock start of the current batch of runs; main() resets it before the
# long-form pass.
START_TIME = time.time()

def main():
    """Search for the fastest set of compiler options for runner.c.

    Every combination generated from ``args`` is first compiled and timed
    with a short run (logged to compile-opts-fast.csv); options the compiler
    rejects are culled from the combinations still to try.  Combinations
    within 5% of the best short time are then re-run with a long benchmark
    (logged to compile-opts.csv) and the fastest of those is written to
    the file compile-opts.

    Exits with an error message if no combination produced a timing.

    TODOs
    - since there's such a large variation in run times, try the initial
      compiles with a short run.  Then take the top quarter and run a longer
      time trial.  Long run should be incremental.
    - make it parallel so it can take up available cores
    - split out the quick run to a separate command, so the slow version can
      be run only after major changes.
    - parse 'cc1: error: unrecognized command line option "-fwhole-program"'
      and cull.
    - same for -m64, not that it takes very long to spin through them...
    - add "trials left" and "estimated time" ('cull' step needs to just
      estimate)  some options make the compile take longer.
    """
    global START_TIME

    to_try = list(combinator(args))
    print("About to try %d sets of options" % len(to_try))

    def remove_from_set(which):
        """Drop every pending combination that uses ``which``."""
        print("Removing %r from the options to try" % (which,))
        if isinstance(which, tuple):
            assert len(which) == 2
            # only remove when all are there
            to_try[:] = [o for o in to_try
                         if not (which[0] in o and which[1] in o)]
        else:
            # regular string
            to_try[:] = [o for o in to_try if which not in o]

    results = []
    i = 1
    with open('compile-opts-fast.csv', 'wb') as spreadsheet:
        while to_try:
            opts = to_try.pop()
            # to_try shrinks as options are culled, so the total is re-estimated.
            r = do_timerun(i, i+len(to_try), opts, 100000)
            if isinstance(r, (int, long, float)):
                results.append((r, opts))
                # Space-join the options so each row has exactly two
                # comma-separated fields, as in compile-opts.csv.
                spreadsheet.write('%s,%s\n' % (r, ' '.join(opts)))
            else:
                # A failed run may yield None or an empty set: nothing to cull.
                for bad in r or ():
                    remove_from_set(bad)
            i += 1

    if not results:
        sys.exit("No set of options compiled and ran successfully")

    results.sort()

    # Now re-run with the ones that came within 5% of the best time.
    time_to_beat = results[0][0] * 1.05
    winners = [x for x in results if x[0] <= time_to_beat]

    START_TIME = time.time()
    print("Running long form")

    winner_results = []
    with open('compile-opts.csv', 'wb') as spreadsheet:
        try:
            for i, (_, opts) in enumerate(winners, 1):
                r = do_timerun(i, len(winners), opts, 16000000)
                if isinstance(r, (int, long, float)):
                    winner_results.append((r, opts))
                    spreadsheet.write('%s,%s\n' % (r, ' '.join(opts)))
                else:
                    print("weird failure %s" % (r,))
        except KeyboardInterrupt:
            # Keep whatever long-form results were gathered before the interrupt.
            print("Interrupted")

    if not winner_results:
        sys.exit("No long-form run finished; compile-opts not written")

    # save the fastest options as the ones to use in the build
    winner_results.sort()
    with open('compile-opts', 'wb') as out:
        out.write(" ".join(winner_results[0][1]))

# Run the option search only when executed as a script, not when imported.
if __name__ == '__main__':
    main()