# openreplay/ee/api/chalicelib/core/sessions_insights.py

import schemas_ee
from chalicelib.core import metrics
from chalicelib.utils import ch_client


def _table_slice(table, index):
    """Return the values found at position ``index`` of every row in ``table``.

    :param table: iterable of indexable rows.
    :param index: position (or key) to read from each row.
    :return: a new list with one entry per row, in row order.
    """
    return [record[index] for record in table]


def _table_where(table, index, value):
    """Keep only the rows of ``table`` whose ``index`` entry equals ``value``.

    :param table: iterable of indexable rows.
    :param index: position (or key) compared in each row.
    :param value: value the entry is compared against with ``==``.
    :return: a new list holding the matching rows (the row objects themselves, not copies).
    """
    return [record for record in table if record[index] == value]


def _sum_table_index(table, index):
    """Sum the ``index`` entry of every row, ignoring rows where it is ``None``.

    :param table: iterable of indexable rows.
    :param index: position (or key) of the value to add up.
    :return: the total of all non-``None`` values; ``0`` when there are none.
    """
    return sum(row[index] for row in table if row[index] is not None)


def _mean_table_index(table, index):
    """Return the arithmetic mean of the ``index`` entry over the rows of ``table``.

    Rows whose entry is ``None`` are left out of both the sum and the count, so
    missing measurements do not drag the average towards zero.

    :param table: iterable of indexable rows.
    :param index: position (or key) of the value to average.
    :return: the mean of the non-``None`` values, or ``0.0`` when there is
             nothing to average (empty table or only ``None`` entries).
    """
    values = [row[index] for row in table if row[index] is not None]
    if not values:
        # Nothing to average: return a neutral number instead of dividing by zero,
        # callers do arithmetic on the result.
        return 0.0
    return sum(values) / len(values)


def _sort_table_index(table, index, reverse=False):
    """Return the rows of ``table`` ordered by their ``index`` entry.

    The input is left untouched; a new list is returned.

    :param table: iterable of indexable rows.
    :param index: position (or key) used as the sort key.
    :param reverse: when true, order from largest to smallest key.
    :return: a new, sorted list of the same row objects.
    """
    ordered = list(table)
    ordered.sort(key=lambda row: row[index], reverse=reverse)
    return ordered


def _select_rec(l, selector):
    """Recursively index into nested sequences, one selector component per level.

    The first component of ``selector`` is applied to ``l``; the remaining
    components are applied to what that selection produced:

    * a ``slice`` component selects several items and the rest of the selector
      is applied to each of them;
    * an ``int`` component selects one item, the rest of the selector is applied
      to it and the outcome is wrapped in a one-element list.

    :param l: nested indexable structure (e.g. list of lists).
    :param selector: non-empty sequence of ``int``/``slice`` components.
    :return: the selected value when ``selector`` has one component, otherwise
             a list built as described above.
    :raises IndexError: if ``selector`` is empty.
    :raises TypeError: if a non-final component is neither ``int`` nor ``slice``.
    """
    head = selector[0]
    selected = l[head]
    if len(selector) == 1:
        return selected

    rest = selector[1:]
    if isinstance(head, slice):
        return [_select_rec(item, rest) for item in selected]
    if isinstance(head, int):
        return [_select_rec(selected, rest)]
    # Fail loudly instead of implicitly returning None for an unsupported component.
    raise TypeError(f"selector components must be int or slice, got {type(head).__name__}")


# TODO Deal with None values

def __get_two_values(response, time_index='hh', name_index='name'):
    """Split query rows into the first two time buckets that appear in ``response``.

    Rows are consumed in order. The first distinct value of the ``time_index``
    column opens bucket 1, the second distinct value opens bucket 2, and
    iteration stops as soon as a third distinct value shows up. Callers in this
    module pass rows ordered by time descending, so bucket 1 is the most recent
    period and bucket 2 the one before it.

    :param response: list of dict rows, all sharing the same keys.
    :param time_index: name of the column holding the time bucket.
    :param name_index: name of the column whose distinct values are collected.
    :return: ``(table_hh1, table_hh2, columns, names_hh1, names_hh2)`` where the
             tables are lists of rows (each row a list of values in column
             order), ``columns`` is the list of column names and the ``names``
             lists hold the distinct ``name_index`` values of each bucket in
             order of first appearance. All five are empty for an empty response.
    :raises ValueError: if ``time_index`` or ``name_index`` is not a column.
    """
    if not response:
        # No rows at all: there are no columns to inspect, return empty results.
        return [], [], [], [], []

    columns = list(response[0].keys())
    name_pos = columns.index(name_index)
    time_pos = columns.index(time_index)

    tables = ([], [])
    names = ([], [])
    seen_times = []
    for record in response:
        row = list(record.values())
        stamp = row[time_pos]
        if stamp not in seen_times:
            if len(seen_times) == 2:
                # A third time bucket starts here; only the first two are of interest.
                break
            seen_times.append(stamp)

        # Rows go to the bucket that is currently open (0 -> first, 1 -> second).
        bucket = len(seen_times) - 1
        tables[bucket].append(row)
        name = row[name_pos]
        if name not in names[bucket]:
            names[bucket].append(name)
    return tables[0], tables[1], columns, names[0], names[1]


def __handle_timestep(time_step):
    """Translate a time step into a ClickHouse bucketing expression and its length.

    :param time_step: ``'hour'``, ``'day'``, ``'week'`` or an ``int`` number of minutes.
    :return: ``(template, seconds)`` where ``template`` is a ``str.format``-style
             expression with a ``{0}`` placeholder for the datetime column and
             ``seconds`` is the duration of one step.
    :raises AssertionError: if ``time_step`` is neither a known name nor an ``int``.
    """
    placeholder = "{0}"
    named_steps = (
        ('hour', "toStartOfHour({0})", 3600),
        ('day', "toStartOfDay({0})", 24 * 3600),
        ('week', "toStartOfWeek({0})", 7 * 24 * 3600),
    )
    for step_name, template, seconds in named_steps:
        if time_step == step_name:
            return template, seconds

    # Anything that is not a named step must be a plain integer amount of minutes.
    assert type(time_step) == int, \
        "time_step must be {'hour', 'day', 'week'} or an integer representing the time step in minutes"
    interval_expr = f"toStartOfInterval({placeholder}, INTERVAL {time_step} minute)"
    return interval_expr, int(time_step) * 60


def query_requests_by_period(project_id, start_time, end_time, conn=None):
    """Compare request success rates between the two most recent time buckets.

    REQUEST events of the project are grouped per time bucket, host and path.
    The bucket length comes from ``metrics.__get_step_size`` (called with
    ``density=3``). Rows are fetched newest bucket first and only the first two
    buckets are analysed.

    :param project_id: project whose events are queried.
    :param start_time: range start, in milliseconds (the query divides it by 1000).
    :param end_time: range end, in milliseconds (the query divides it by 1000).
    :param conn: optional open ClickHouse client; a temporary one is created
                 when omitted.
    :return: dict with
             ``'ratio'``: ``(url_path, success_rate)`` pairs of the latest bucket,
             lowest success rate first;
             ``'increase'``: ``(url_path, delta)`` pairs for paths present in both
             buckets, where delta is latest mean success rate minus previous one,
             smallest first;
             ``'newEvents'``: paths seen in the latest bucket but not in the
             previous one.
             All three are empty lists when the query returns no rows.
    """
    params = {
        "project_id": project_id, "startTimestamp": start_time, "endTimestamp": end_time,
        "step_size": metrics.__get_step_size(endTimestamp=end_time, startTimestamp=start_time, density=3)
    }
    conditions = ["event_type = 'REQUEST'"]
    where_clause = " AND ".join(conditions)
    # project_id is bound through params like every other value instead of being
    # interpolated into the SQL text.
    # NOTE: avg_duration is selected but is not part of the returned payload.
    query = f"""WITH toUInt32(toStartOfInterval(toDateTime(%(startTimestamp)s/1000), INTERVAL %(step_size)s second)) AS start,
                     toUInt32(toStartOfInterval(toDateTime(%(endTimestamp)s/1000), INTERVAL %(step_size)s second)) AS end
                SELECT T1.hh, count(T2.session_id) as sessions, avg(T2.success) as success_rate, T2.url_host as names,
                        T2.url_path as source, avg(T2.duration) as avg_duration
                FROM (SELECT arrayJoin(arrayMap(x -> toDateTime(x), range(start, end, %(step_size)s))) as hh) AS T1
                LEFT JOIN (SELECT session_id, url_host, url_path, success, message, duration, toStartOfInterval(datetime, INTERVAL %(step_size)s second) as dtime
                           FROM experimental.events
                           WHERE project_id = %(project_id)s
                                AND {where_clause}) AS T2 ON T2.dtime = T1.hh
                GROUP BY T1.hh, T2.url_host, T2.url_path
                ORDER BY T1.hh DESC;"""
    if conn is None:
        with ch_client.ClickHouseClient() as own_conn:
            res = own_conn.execute(query=own_conn.format(query=query, params=params))
    else:
        res = conn.execute(query=conn.format(query=query, params=params))

    if not res:
        # Nothing came back (e.g. an empty time range): no insight to compute.
        return {'ratio': [], 'increase': [], 'newEvents': []}

    table_hh1, table_hh2, columns, this_period_paths, last_period_paths = __get_two_values(res, time_index='hh',
                                                                                           name_index='source')
    del res

    previous_paths = set(last_period_paths)
    new_paths = [p for p in this_period_paths if p not in previous_paths]
    common_paths = [p for p in this_period_paths if p in previous_paths]

    source_idx = columns.index('source')
    success_idx = columns.index('success_rate')
    delta_success = dict()
    for path in common_paths:
        current_rows = _table_where(table_hh1, source_idx, path)
        previous_rows = _table_where(table_hh2, source_idx, path)
        delta_success[path] = (_mean_table_index(current_rows, success_idx)
                               - _mean_table_index(previous_rows, success_idx))

    latest_by_success = _sort_table_index(table_hh1, success_idx)
    return {'ratio': list(zip(_table_slice(latest_by_success, source_idx),
                              _table_slice(latest_by_success, success_idx))),
            'increase': sorted(delta_success.items(), key=lambda k: k[1], reverse=False),
            'newEvents': new_paths}


def query_most_errors_by_period(project_id, start_time, end_time, conn=None):
    """Rank errors of the latest time bucket and compare them with the previous one.

    ERROR events of the project are grouped per time bucket and error name. The
    bucket length comes from ``metrics.__get_step_size`` (called with
    ``density=3``). Rows are fetched newest bucket first and only the first two
    buckets are analysed.

    :param project_id: project whose events are queried.
    :param start_time: range start, in milliseconds (the query divides it by 1000).
    :param end_time: range end, in milliseconds (the query divides it by 1000).
    :param conn: optional open ClickHouse client; a temporary one is created
                 when omitted.
    :return: dict with
             ``'ratio'``: ``(error_name, share)`` pairs where share is the error's
             part of the latest bucket's total count, largest first;
             ``'increase'``: ``(error_name, delta)`` pairs for errors present in
             both buckets, where delta is latest count minus previous count,
             largest first;
             ``'newEvents'``: errors seen in the latest bucket but not in the
             previous one.
             All three are empty lists when the query returns no rows.
    """
    params = {
        "project_id": project_id, "startTimestamp": start_time, "endTimestamp": end_time,
        "step_size": metrics.__get_step_size(endTimestamp=end_time, startTimestamp=start_time, density=3)
    }
    conditions = ["event_type = 'ERROR'"]
    where_clause = " AND ".join(conditions)
    # project_id is bound through params like every other value instead of being
    # interpolated into the SQL text.
    query = f"""WITH toUInt32(toStartOfInterval(toDateTime(%(startTimestamp)s/1000), INTERVAL %(step_size)s second)) AS start,
                     toUInt32(toStartOfInterval(toDateTime(%(endTimestamp)s/1000), INTERVAL %(step_size)s second)) AS end
                SELECT T1.hh, count(T2.session_id) as sessions, T2.name as names,
                        groupUniqArray(T2.source) as sources
                FROM (SELECT arrayJoin(arrayMap(x -> toDateTime(x), range(start, end, %(step_size)s))) as hh) AS T1
                    LEFT JOIN (SELECT session_id, name, source, message, toStartOfInterval(datetime, INTERVAL %(step_size)s second) as dtime
                               FROM experimental.events
                               WHERE project_id = %(project_id)s
                                    AND {where_clause}) AS T2 ON T2.dtime = T1.hh
                GROUP BY T1.hh, T2.name
                ORDER BY T1.hh DESC;"""
    if conn is None:
        with ch_client.ClickHouseClient() as own_conn:
            res = own_conn.execute(query=own_conn.format(query=query, params=params))
    else:
        res = conn.execute(query=conn.format(query=query, params=params))

    if not res:
        # Nothing came back (e.g. an empty time range): no insight to compute.
        return {'ratio': [], 'increase': [], 'newEvents': []}

    table_hh1, table_hh2, columns, this_period_errors, last_period_errors = __get_two_values(res, time_index='hh',
                                                                                             name_index='names')
    del res

    previous_errors = set(last_period_errors)
    new_errors = [e for e in this_period_errors if e not in previous_errors]
    common_errors = [e for e in this_period_errors if e in previous_errors]

    sessions_idx = columns.index('sessions')
    names_idx = columns.index('names')

    total = _sum_table_index(table_hh1, sessions_idx)
    percentage_errors = dict()
    for name in this_period_errors:
        count = _sum_table_index(_table_where(table_hh1, names_idx, name), sessions_idx)
        # With a zero total every individual count is zero as well; report a 0 share
        # rather than dividing by zero.
        percentage_errors[name] = count / total if total else 0

    error_increase = dict()
    for name in common_errors:
        # Compare occurrence counts (the sessions column), not the name column itself.
        current = _sum_table_index(_table_where(table_hh1, names_idx, name), sessions_idx)
        previous = _sum_table_index(_table_where(table_hh2, names_idx, name), sessions_idx)
        error_increase[name] = current - previous

    return {'ratio': sorted(percentage_errors.items(), key=lambda k: k[1], reverse=True),
            'increase': sorted(error_increase.items(), key=lambda k: k[1], reverse=True),
            'newEvents': new_errors}


def query_cpu_memory_by_period(project_id, start_time, end_time, conn=None):
    """Compare average CPU and JS heap usage between the two most recent time buckets.

    PERFORMANCE events of the project are grouped per time bucket and host. The
    bucket length comes from ``metrics.__get_step_size`` (called with
    ``density=3``). Rows are fetched newest bucket first and only the first two
    buckets are analysed.

    :param project_id: project whose events are queried.
    :param start_time: range start, in milliseconds (the query divides it by 1000).
    :param end_time: range end, in milliseconds (the query divides it by 1000).
    :param conn: optional open ClickHouse client; a temporary one is created
                 when omitted.
    :return: dict with
             ``'cpuIncrease'``: latest mean CPU minus previous mean CPU (absolute);
             ``'memoryIncrease'``: change of the mean used JS heap size relative to
             the previous bucket (absolute change when the previous mean is 0).
             Both are ``0`` when there is no previous bucket to compare against.
    """
    params = {
        "project_id": project_id, "startTimestamp": start_time, "endTimestamp": end_time,
        "step_size": metrics.__get_step_size(endTimestamp=end_time, startTimestamp=start_time, density=3)
    }
    conditions = ["event_type = 'PERFORMANCE'"]
    where_clause = " AND ".join(conditions)
    # project_id is bound through params like every other value instead of being
    # interpolated into the SQL text.
    query = f"""WITH toUInt32(toStartOfInterval(toDateTime(%(startTimestamp)s/1000), INTERVAL %(step_size)s second)) AS start,
                     toUInt32(toStartOfInterval(toDateTime(%(endTimestamp)s/1000), INTERVAL %(step_size)s second)) AS end
                SELECT T1.hh, count(T2.session_id) as sessions, avg(T2.avg_cpu) as cpu_used,
                        avg(T2.avg_used_js_heap_size) as memory_used, T2.url_host as names, groupUniqArray(T2.url_path) as sources
                FROM (SELECT arrayJoin(arrayMap(x -> toDateTime(x), range(start, end, %(step_size)s))) as hh) AS T1
                LEFT JOIN (SELECT session_id, url_host, url_path, avg_used_js_heap_size, avg_cpu, toStartOfInterval(datetime, INTERVAL %(step_size)s second) as dtime
                           FROM experimental.events
                           WHERE project_id = %(project_id)s
                                AND {where_clause}) AS T2 ON T2.dtime = T1.hh
                GROUP BY T1.hh, T2.url_host
                ORDER BY T1.hh DESC;"""
    if conn is None:
        with ch_client.ClickHouseClient() as own_conn:
            res = own_conn.execute(query=own_conn.format(query=query, params=params))
    else:
        res = conn.execute(query=conn.format(query=query, params=params))

    no_comparison = {'cpuIncrease': 0, 'memoryIncrease': 0}
    if not res:
        # Nothing came back (e.g. an empty time range): nothing to compare.
        return no_comparison

    table_hh1, table_hh2, columns, _, _ = __get_two_values(res, time_index='hh', name_index='names')
    del res
    if not table_hh2:
        # Only one bucket is available, so there is no baseline to compare against.
        return no_comparison

    memory_idx = columns.index('memory_used')
    cpu_idx = columns.index('cpu_used')

    previous_memory = _mean_table_index(table_hh2, memory_idx)
    current_memory = _mean_table_index(table_hh1, memory_idx)
    # A zero baseline cannot be used as divisor; fall back to 1 for the division
    # only, so the numerator still reflects the real difference.
    divisor = 1 if previous_memory == 0 else previous_memory
    return {'cpuIncrease': _mean_table_index(table_hh1, cpu_idx) - _mean_table_index(table_hh2, cpu_idx),
            'memoryIncrease': (current_memory - previous_memory) / divisor}


def query_click_rage_by_period(project_id, start_time, end_time, conn=None):
    """Compare click-rage issues per host between the two most recent time buckets.

    ISSUE events of type ``click_rage`` within the requested range are grouped
    per time bucket and host. The bucket length comes from
    ``metrics.__get_step_size`` (called with ``density=3``). Rows are fetched
    newest bucket first and only the first two buckets are analysed.

    :param project_id: project whose events are queried.
    :param start_time: range start, in milliseconds (the query divides it by 1000).
    :param end_time: range end, in milliseconds (the query divides it by 1000).
    :param conn: optional open ClickHouse client; a temporary one is created
                 when omitted.
    :return: dict with
             ``'ratio'``: ``(host, share)`` pairs, one per row of the latest bucket,
             where share is the row's part of that bucket's total count;
             ``'increase'``: ``(host, relative_change)`` pairs for hosts present in
             both buckets, largest first; hosts that are ``None`` or whose
             previous count is 0 are left out;
             ``'newEvents'``: hosts seen in the latest bucket but not in the
             previous one.
             All three are empty lists when the query returns no rows.
    """
    params = {
        "project_id": project_id, "startTimestamp": start_time, "endTimestamp": end_time,
        "step_size": metrics.__get_step_size(endTimestamp=end_time, startTimestamp=start_time, density=3)}

    conditions = ["issue_type = 'click_rage'", "event_type = 'ISSUE'"]
    where_clause = " AND ".join(conditions)
    query = f"""WITH toUInt32(toStartOfInterval(toDateTime(%(startTimestamp)s/1000), INTERVAL %(step_size)s second)) AS start,
                     toUInt32(toStartOfInterval(toDateTime(%(endTimestamp)s/1000), INTERVAL %(step_size)s second)) AS end
                SELECT T1.hh, count(T2.session_id) as sessions, T2.url_host as names, groupUniqArray(T2.url_path) as sources
                FROM (SELECT arrayJoin(arrayMap(x -> toDateTime(x), range(start, end, %(step_size)s))) as hh) AS T1
                LEFT JOIN (SELECT session_id, url_host, url_path, toStartOfInterval(datetime, INTERVAL %(step_size)s second ) as dtime
                           FROM experimental.events
                           WHERE project_id = %(project_id)s
                                AND datetime >= toDateTime(%(startTimestamp)s/1000)
                                AND datetime < toDateTime(%(endTimestamp)s/1000)
                                AND {where_clause}) AS T2 ON T2.dtime = T1.hh
                GROUP BY T1.hh, T2.url_host
                ORDER BY T1.hh DESC;"""
    if conn is None:
        with ch_client.ClickHouseClient() as own_conn:
            res = own_conn.execute(query=own_conn.format(query=query, params=params))
    else:
        res = conn.execute(query=conn.format(query=query, params=params))

    if not res:
        # Nothing came back (e.g. an empty time range): no insight to compute.
        return {'ratio': [], 'increase': [], 'newEvents': []}

    table_hh1, table_hh2, columns, this_period_rage, last_period_rage = __get_two_values(res, time_index='hh',
                                                                                         name_index='names')
    del res

    previous_hosts = set(last_period_rage)
    new_names = [h for h in this_period_rage if h not in previous_hosts]
    common_names = [h for h in this_period_rage if h in previous_hosts]

    sessions_idx = columns.index('sessions')
    names_idx = columns.index('names')

    raged_increment = dict()
    # TODO: confirm that filtering rows by host name matches the intent of the
    #  earlier array-based selection this loop was ported from.
    for host in common_names:
        if host is None:
            continue
        previous = _sum_table_index(_table_where(table_hh2, names_idx, host), sessions_idx)
        if previous == 0:
            # A relative change against an empty baseline is undefined.
            continue
        current = _sum_table_index(_table_where(table_hh1, names_idx, host), sessions_idx)
        raged_increment[host] = (current - previous) / previous

    total = _sum_table_index(table_hh1, sessions_idx)
    # With a zero total every row count is zero too; report a 0 share rather than
    # dividing by zero.
    ratio = [(row[names_idx], row[sessions_idx] / total if total else 0) for row in table_hh1]
    return {'ratio': ratio,
            'increase': sorted(raged_increment.items(), key=lambda k: k[1], reverse=True),
            'newEvents': new_names,
            }


def fetch_selected(project_id, data: schemas_ee.GetInsightsSchema):
    """Run the insight query of every category requested in ``data``.

    A single ClickHouse client is opened and shared by all the queries.

    :param project_id: project whose events are analysed.
    :param data: request payload; ``categories``, ``startTimestamp`` and
                 ``endTimestamp`` are read from it.
    :return: dict mapping each requested ``schemas_ee.InsightCategories`` member
             to the result of its query function.
    """
    categories = schemas_ee.InsightCategories
    # Evaluated top to bottom, which also fixes the key order of the result.
    handlers = (
        (categories.errors, query_most_errors_by_period),
        (categories.network, query_requests_by_period),
        (categories.rage, query_click_rage_by_period),
        (categories.resources, query_cpu_memory_by_period),
    )

    results = {}
    with ch_client.ClickHouseClient() as conn:
        for category, handler in handlers:
            if category not in data.categories:
                continue
            results[category] = handler(project_id=project_id,
                                        start_time=data.startTimestamp,
                                        end_time=data.endTimestamp,
                                        conn=conn)
    return results

# if __name__ == '__main__':
#     # configs
#     start = '2022-04-19'
#     end = '2022-04-21'
#     projectId = 1307
#     time_step = 'hour'
#
#     # Errors widget
#     print('Errors example')
#     res = query_most_errors_by_period(projectId, start_time=start, end_time=end, time_step=time_step)
#     print(res)
#
#     # Resources widgets
#     print('resources example')
#     res = query_cpu_memory_by_period(projectId, start_time=start, end_time=end, time_step=time_step)
#
#     # Network widgets
#     print('Network example')
#     res = query_requests_by_period(projectId, start_time=start, end_time=end, time_step=time_step)
#     print(res)