#!/usr/bin/python
#
# Plot the amount of CPU or wall clock time used by builds, installs,
# or tests, or the amount of diskspace used by the install,
# as a function of the CVS source date
#
# usage:
#
#  python timeplot.py [build|install|test] [cpu|wall|user|system] time
#  python timeplot.py disk
#
# Note limitations:
#  - old sparc tests lack qemu version because it is not printed in dmesg
#  - some qemu version information is missing because of qemu printing "2.5+"
#

from __future__ import print_function

import gzip
import sys
import optparse
import re

import matplotlib
#matplotlib.use('GTKAgg')
import matplotlib.pyplot as pyplot
import matplotlib.dates
import numpy

from bracket import *
from utils import ts2py

from htmlgen import h1, html, head, body, img, a, div, p, style, table, tr, td, title

# CSS embedded in the generated index page: collapses the table
# borders and gives every header and data cell a thin black border
# and some padding
the_style = '''
table {
    border-collapse: collapse;
}
th,td {
    border: 1px solid black;
    padding: 5px;
}
'''

# Report the message "s" on standard error, followed by a newline,
# and terminate the program with exit status 1

def fatal(s):
    sys.stderr.write('%s\n' % (s,))
    raise SystemExit(1)

# Parse a time like 30:24.56 and return a float number of seconds
#
# t is either a plain number of seconds ("12.5") or colon-separated
# fields ("30:24.56", "1:02:03.4") where each field is worth 60 times
# the field to its right.  Both text and byte strings are accepted,
# since the values come from regular expression matches against
# binary log contents.
#
# Raises ValueError if a field is not a valid number.

def time2seconds(t):
    try:
        return float(t)
    except ValueError:
        # Not a plain number; fall through to the colon-separated form
        pass
    if isinstance(t, bytes):
        t = t.decode('ASCII')
    total = 0
    multiplier = 1
    # Walk the fields from least significant (seconds) to most significant
    for field in reversed(t.split(':')):
        total += multiplier * float(field)
        multiplier *= 60
    return total

# Translate a human-readable date range name into a pair
# (start, end) of Unix timestamps
#
# Recognized names are 'all data', 'last year', 'last month', and
# '2017'.  Open-ended ranges use 0 as the start and 0xFFFFFFFF as
# the end.  Any other name raises RuntimeError.

def parse_date_range(range):
    earliest = 0
    latest = 0xFFFFFFFF
    if range == 'all data':
        bounds = (earliest, latest)
    elif range == 'last year':
        bounds = (sourcedate2ts('1 year ago'), latest)
    elif range == 'last month':
        bounds = (sourcedate2ts('1 month ago'), latest)
    elif range == '2017':
        year_start = sourcedate2ts('2017.01.01.00.00.00')
        year_end = sourcedate2ts('2018.01.01.00.00.00')
        bounds = (year_start, year_end)
    else:
        raise RuntimeError("bad date range " + range)
    return bounds

# Find the measurement value t (e.g., time) for "query" at timestamp "ts"
# along with some auxiliary information for keying
#
# Returns tuple (t, { auxiliary key-value pairs }), or None.  Any
# exception raised while calculating is printed and also results
# in None rather than being propagated to the caller.

def calculate(ts, query):
    result = None
    try:
        result = _calculate(ts, query)
    except Exception as err:
        # Best effort: report the problem and carry on without a value
        print(err)
    return result

# Read the gzip-compressed file "path" and return its decompressed
# contents as bytes, or None if the file cannot be opened.

def _read_gzipped_log(path):
    try:
        f = gzip.open(path, 'rb')
    except (IOError, OSError):
        return None
    # The context manager closes the file even if decompression fails
    # part way through; such errors propagate to the caller
    with f:
        return f.read()

# Extract the measurement selected by "query" from the logs of the
# build with timestamp "ts".
#
# query is a list like ['test', 'cpu', 'time'] or ['disk']: the last
# element is the measurement, and for 'time' the two elements before
# it are the phase (build, install, test, or build_8) and the time
# type (user, system, cpu, or wall).
#
# Returns a tuple (t, aux_data) where t is a float (seconds for
# 'time', the df value divided by 1024 for 'disk') and aux_data is
# a dict that may contain 'qemu_version', 'tb_size', and 'accel'.
# Returns None if the phase did not succeed, the log is missing, or
# the expected output is not found in it.  Exits via fatal() on an
# unknown measurement, phase, or time type.

def _calculate(ts, query):
    measurement = query[-1]
    if measurement == 'time':
        timetype = query[-2]
        phase = query[-3]

    db = get_db(ts)

    aux_data = { }

    if measurement == 'time':
        if phase == 'build':
            fn = 'build.log.tail.gz'
            if get_cached_status_if_any(ts, 'build_status') != 0:
                return None
        elif phase == 'test':
            fn = 'test.log.gz'
            # Omit runs where the tests didn't complete
            if not db.get('passed_tests'):
                return None
        elif phase == 'install':
            fn = 'install.log.gz'
            if get_cached_status_if_any(ts, 'install_status') != 0:
                return None
        elif phase == 'build_8':
            # From running tests/build_8.py
            fn = 'build_8.log.gz'
            if get_cached_status_if_any(ts, 'build_8_status') != 0:
                return None
        else:
            fatal("unknown phase %s" % phase)

        # Extract build time from log
        content = _read_gzipped_log(os.path.join(results_dir(ts), fn))
        if content is None:
            return None

        # Parse the time(1) output
        m = re.search(br'([\d\.]+) real\s*([\d\.]+) user\s*([\d\.]+) sys', content)
        if m:
            realtime_s, usertime_s, systemtime_s = m.groups()
        else:
            # Linux style
            m = re.search(br'(\d[\d\.:]+)user ([\d\.:]+)system ([\d\.:]+)elapsed', content)
            if m:
                usertime_s, systemtime_s, realtime_s = m.groups()
            else:
                return None

        # Convert strings to float
        realtime   = time2seconds(realtime_s)
        usertime   = time2seconds(usertime_s)
        systemtime = time2seconds(systemtime_s)

        cputime = usertime + systemtime

        if timetype == 'user':
            t = usertime
        elif timetype == 'system':
            t = systemtime
        elif timetype == 'cpu':
            t = cputime
        elif timetype == 'wall':
            t = realtime
        else:
            fatal("unknown time type %s" % timetype)

    elif measurement == 'disk':
        content = _read_gzipped_log(os.path.join(results_dir(ts), 'test.log.gz'))
        if content is None:
            return None
        # The second number on the df-pre-test line is the one plotted
        m = re.search(br'^df-pre-test /dev/(?:.d0a|dk0)\s+(\d+)\s+(\d+)', content, re.MULTILINE)
        if m:
            t = float(m.group(2)) / 1024.0
        else:
            return None

    else:
        fatal("unknown measurement %s" % measurement)

    # Determine qemu version, if any
    if measurement == 'time' and (phase == 'install' or phase == 'test'):
        qemu_version = None
        # Anita >= 1.40 runs qemu --version at the very beginning
        m = re.search(br'QEMU emulator version ([\d\.]+)', content)
        if m:
            qemu_version = m.group(1)
        else:
            # Try to extract it from the CPU manufacturer ID string
            m = re.search(br'QEMU Virtual CPU version ([\d\.]+\+?)', content)
            if m:
                qemu_version = m.group(1)
        if qemu_version is not None:
            aux_data['qemu_version'] = 'qemu ' + qemu_version.decode('ASCII')

    # Determine value of -tb-size option, if any
    m = re.search(br'--vmm-args=.*-tb-size (\d+)', content)
    if m:
        aux_data['tb_size'] = m.group(1).decode('ASCII')

    # Determine value of -accel option, if any
    m = re.search(br'--vmm-args=.*-accel ([a-z]+)', content)
    if m:
        aux_data['accel'] = m.group(1).decode('ASCII')
    else:
        # -enable-kvm is equivalent to -accel kvm
        m = re.search(br'-vmm-args=-enable-kvm', content)
        if m:
            aux_data['accel'] = 'kvm'

    return t, aux_data

# Store the value "v" under key "k" for the build with timestamp
# "ts" using db_put(), after printing a message describing the
# entry being cached

def db_put_verbose(ts, k, v):
    message = "caching %s %s = %s" % (ts2rcs(ts), k, v)
    print(message)
    db_put(ts, k, v)

# Plot the measurement selected by "query" as a function of the CVS
# source date and save the plot as a PNG file
#
# query is a list like ['test', 'cpu', 'time'] or ['disk']
# date_range is a string accepted by parse_date_range(), e.g., 'last year'
# key_on is a list of key names (e.g., 'host', 'qemu_version'); the
#   builds are grouped into one plotted line per distinct combination
#   of the values of those keys
# dir is the directory where the plot is to be stored
#
# Measurement values not yet cached in the database are calculated
# from the logs and then cached.
#
# returns the relative image file path (for use in HTML links),
# or None if there is no data to plot

def make_resource_plot(query, date_range, key_on, dir):
    measurement = query[-1]
    if measurement == 'time':
        timetype = query[-2]
        phase = query[-3]

    ts_min, ts_max = parse_date_range(date_range)

    build_dates = existing_build_dates()

    # Construct the database key this measurement result is/will be stored under
    db_key = '_'.join(query)

    # When applicable, things like the qemu version and tb_size option
    # are cached in the database too, under keys prefixed with the phase
    if measurement == 'time':
        cached_keys = ['qemu_version', 'tb_size', 'accel']
    else:
        cached_keys = []

    def db_key_name(cached_key):
        return phase + '_' + cached_key

    # Abbreviate key names for display in the plot legend
    def explain(k):
        if k == 'tb_size':
            k = 'tb'
        return k

    data_by_key = {}

    for ts in build_dates:
        if ts < ts_min or ts >= ts_max:
            continue
        db = get_db(ts)

        keys = {}

        host = db.get('build_host')
        if host is None:
            continue

        # Strip domain part (the naive way)
        host = re.sub(r'\..*$', '', host)
        keys['host'] = host

        # Extract njobs from build_sh_cmd
        build_sh_cmd = db.get('build_sh_cmd')
        if build_sh_cmd:
            words = [word.strip("'") for word in build_sh_cmd.split(' ')]
            if '-j' in words:
                j_index = words.index('-j')
                # Ignore a trailing -j that has no argument
                if j_index + 1 < len(words):
                    keys['njobs'] = words[j_index + 1]

        osrel = db.get('build_host_osrel')
        if osrel:
            keys['osrel'] = osrel

        # See if we have a cached measurement value in the database
        t = db.get(db_key)
        # Uncomment to force re-caching (causes duplicate db entries)
        # t = None #XXX
        if t is None:
            # No cached value, calculate
            r = calculate(ts, query)
            if r is None:
                continue
            t, key_values = r
            print(ts2rcs(ts), ' '.join(query), t, key_values)
            db_put_verbose(ts, db_key, str(t))
            for k in cached_keys:
                keys[k] = key_values.get(k)
                if key_values.get(k) is not None and db.get(db_key_name(k)) is None:
                    db_put_verbose(ts, db_key_name(k), key_values[k])
        else:
            # Have cached value, use it
            t = float(t)
            for k in cached_keys:
                keys[k] = db.get(db_key_name(k))

        cpu_brand = db.get('build_machdep_cpu_brand')
        if cpu_brand is not None:
            # Collapse multiple spaces
            cpu_brand = re.sub(r' +', ' ', cpu_brand)
            # Shorten
            cpu_brand = re.sub(r'Intel\(R\) (Xeon|Core)\((R|TM)\) +', '', cpu_brand)
            cpu_brand = re.sub(r'CPU ', '', cpu_brand)
            cpu_brand = re.sub(r' Processor', '', cpu_brand)
        keys['cpu_brand'] = cpu_brand

        ncpu = db.get('build_hw_ncpu')
        keys['ncpu'] = ncpu

        # Generate a key string from the requested keys that have a value
        key_parts = [explain(k) + '=' + keys[k] for k in key_on if keys.get(k) is not None]
        key = ' / '.join(key_parts)

        data_by_key.setdefault(key, []).append((ts, t))

    if not data_by_key:
        print("warning: no data for", query, file=sys.stderr)
        return None

    y_words = [measurement]
    if measurement == 'time':
        y_words = [phase, timetype] + y_words

    x_words = [date_range]

    title_words = [config['arch']] + y_words + x_words

    # Sort key: the mean timestamp of a group, ignoring implausibly
    # small timestamps
    def by_average_date(k):
        x = [ts for ts, _ in data_by_key[k] if ts > 100000000]
        if not x:
            # Nothing left to average; sort such groups first
            return 0.0
        return float(sum(x)) / len(x)

    keys = sorted(data_by_key, key = by_average_date)

    if measurement == 'disk':
        unit = 'MiB'
    else:
        unit = 'seconds'

    def no_spaces(s):
        return s.replace(' ', '-')

    relfn = '%s.png' % ('-'.join(y_words + [no_spaces(word) for word in x_words]))
    absfn = os.path.join(dir, relfn)

    fig = pyplot.figure()
    try:
        ax = fig.add_subplot(111)
        ax.set_title(' '.join(title_words))

        yvalues = []
        for key in keys:
            x, y = zip(*data_by_key[key])
            ax.plot([ts2py(t) for t in x], y, label = key)
            yvalues.extend(y)

        # Build the legend once, after all the lines have been added
        leg = ax.legend(loc='upper left', prop = { 'size': 7 })
        leg.get_frame().set_alpha(0.5)

        # Scale the y axis to the 98th percentile plus some headroom
        # so that a few outliers do not flatten the rest of the data
        y_max = numpy.percentile(yvalues, 98) * 1.2

        ax.set_xlabel('CVS source date')
        ax.set_ylabel('%s (%s)' % (' '.join(y_words).capitalize(), unit))
        ax.set_ylim(bottom = 0)
        ax.set_ylim(top = y_max)

        ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d"))

        # This formats the date labels diagonally to avoid overlap
        fig.autofmt_xdate()

        fig.savefig(absfn)
    finally:
        # Release the figure; otherwise every plot made by this
        # process stays in memory until exit
        pyplot.close(fig)

    return relfn

# Generate the resource usage plots and an index.html page linking
# to them in the 'resource' subdirectory of config['htmldir'], then
# publish the reports
#
# argv1 is the argument list handed to optparse, so it should not
# include the program name.  If it contains a query such as
# ['test', 'cpu', 'time'] or ['disk'], only that query is plotted;
# if it is empty, all the build, install, and test time plots and
# the disk plot are generated.

def timeplot_main(argv1):
    parser = optparse.OptionParser()
    (options, query) = parser.parse_args(argv1)

    if query:
        queries = [query]
    else:
        queries = [[phase, t, 'time']
                   for phase in ['build', 'install', 'test']
                   for t in ['wall', 'cpu', 'user', 'system']]
        queries.append(['disk'])

    date_ranges = [
        'all data',
        'last year',
        'last month',
    ]

    dir = os.path.join(config['htmldir'], 'resource')
    mkdir_p(dir)

    lines = ''
    for query in queries:
        if query[-1] == 'disk':
            key_on = []
        else:
            key_on = ['host', 'cpu_brand', 'qemu_version', 'tb_size', 'accel']

        if query[0] == 'build':
            key_on += ['osrel', 'njobs']

        links = []
        for date_range in date_ranges:
            print(query, date_range)
            img_url = make_resource_plot(query, date_range, key_on, dir)
            if img_url:
                links.append(a({'href': img_url}, date_range))
            else:
                links.append('Missing')
        # One row per query: a label cell followed by one sibling
        # cell per date range (cells must not be nested in the label cell)
        lines += tr(td(' '.join(query)), *[td(link) for link in links])

    with open(os.path.join(dir, 'index.html'), 'w') as f:
        intro = h1('Resource usage plots') + \
            p('''Below are links to plots of resources consumed by the %s builds and test runs
as a function of the CVS source date.''' % config['arch'])
        print(html(head(title('Resource plots'), style(the_style)), body(intro + table(lines))), file=f)
    publish_reports()
