# openreplay/ee/api/chalicelib/core/sessions_insights.py
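#
# Period-over-period "insights" computed from the ClickHouse experimental.events
# table: each query below builds a short time series, keeps the two most recent
# buckets (current vs. previous period) and reports the entries that are new,
# grew the most, or take the biggest share. Categories: errors, network
# requests, click rage and CPU/memory. The _table_* helpers below operate on
# query results kept as plain lists of row lists.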

from typing import Optional
import schemas
from chalicelib.core import metrics, sessions_exp
from chalicelib.utils import ch_client

def _table_slice(table, index):
col = list()
for row in table:
col.append(row[index])
return col
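
# _table_where: keep only the rows whose value in column `index` equals `value`.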
def _table_where(table, index, value):
new_table = list()
for row in table:
if row[index] == value:
new_table.append(row)
return new_table
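
# _sum_table_index: sum of column `index` over all rows, skipping None values.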
def _sum_table_index(table, index):
# print(f'index {index}')
s = 0
count = 0
for row in table:
v = row[index]
if v is None:
continue
# print(v)
s += v
count += 1
return s
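
# _mean_table_index: mean of column `index`; note that the sum skips None values
# while the divisor is the total number of rows.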
def _mean_table_index(table, index):
s = _sum_table_index(table, index)
c = len(table)
return s / c
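
# _sort_table_index: rows sorted by column `index`.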
def _sort_table_index(table, index, reverse=False):
return sorted(table, key=lambda k: k[index], reverse=reverse)
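
# _select_rec: recursively apply a list of int/slice selectors to nested lists
# (currently not referenced elsewhere in this file).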
def _select_rec(l, selector):
# print('selector:', selector)
# print('list:', l)
if len(selector) == 1:
return l[selector[0]]
else:
s = selector[0]
L = l[s]
type_ = type(s)
if type_ == slice:
return [_select_rec(l_, selector[1:]) for l_ in L]
elif type_ == int:
return [_select_rec(L, selector[1:])]
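
# __get_two_values: given rows ordered by `time_index` (hh) descending, split them
# into the two most recent time buckets. Returns the rows of the latest bucket,
# the rows of the previous one, the column names, and the distinct `name_index`
# values seen in each bucket.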
def __get_two_values(response, time_index='hh', name_index='name'):
columns = list(response[0].keys())
name_index_val = columns.index(name_index)
time_index_value = columns.index(time_index)
table = [list(r.values()) for r in response]
table_hh1 = list()
table_hh2 = list()
hh_vals = list()
names_hh1 = list()
names_hh2 = list()
for e in table:
if e[time_index_value] not in hh_vals and len(hh_vals) == 2:
break
elif e[time_index_value] not in hh_vals:
hh_vals.append(e[time_index_value])
if len(hh_vals) == 1:
table_hh1.append(e)
if e[name_index_val] not in names_hh1:
names_hh1.append(e[name_index_val])
elif len(hh_vals) == 2:
table_hh2.append(e)
if e[name_index_val] not in names_hh2:
names_hh2.append(e[name_index_val])
return table_hh1, table_hh2, columns, names_hh1, names_hh2
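
# Network insights: compare the average request duration per (url_host, url_path)
# between the current and the previous bucket and report the sources with the
# biggest increase or the biggest share of the total duration.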
def query_requests_by_period(project_id, start_time, end_time, filters: Optional[schemas.SessionsSearchPayloadSchema]):
params = {
"project_id": project_id, "startTimestamp": start_time, "endTimestamp": end_time,
"step_size": metrics.__get_step_size(endTimestamp=end_time, startTimestamp=start_time, density=3)
}
params, sub_query = __filter_subquery(project_id=project_id, filters=filters, params=params)
conditions = ["event_type = 'REQUEST'"]
query = f"""WITH toUInt32(toStartOfInterval(toDateTime(%(startTimestamp)s/1000), INTERVAL %(step_size)s second)) AS start,
toUInt32(toStartOfInterval(toDateTime(%(endTimestamp)s/1000), INTERVAL %(step_size)s second)) AS end
SELECT T1.hh, countIf(T2.session_id != 0) as sessions, avg(T2.success) as success_rate, T2.url_host as names,
T2.url_path as source, avg(T2.duration) as avg_duration
FROM (SELECT arrayJoin(arrayMap(x -> toDateTime(x), range(start, end, %(step_size)s))) as hh) AS T1
LEFT JOIN (SELECT session_id, url_host, url_path, success, message, duration, toStartOfInterval(datetime, INTERVAL %(step_size)s second) as dtime
FROM experimental.events
{sub_query}
                         WHERE project_id = %(project_id)s
                           AND datetime >= toDateTime(%(startTimestamp)s/1000)
                           AND datetime < toDateTime(%(endTimestamp)s/1000)
                           AND {" AND ".join(conditions)}) AS T2 ON T2.dtime = T1.hh
GROUP BY T1.hh, T2.url_host, T2.url_path
ORDER BY T1.hh DESC;"""
with ch_client.ClickHouseClient() as conn:
query = conn.format(query=query, params=params)
# print("--------")
# print(query)
# print("--------")
res = conn.execute(query=query)
if res is None or sum([r.get("sessions") for r in res]) == 0:
return []
table_hh1, table_hh2, columns, this_period_hosts, last_period_hosts = __get_two_values(res, time_index='hh',
name_index='source')
test = [k[4] for k in table_hh1]
# print(f'length {len(test)}, uniques {len(set(test))}')
del res
new_hosts = [x for x in this_period_hosts if x not in last_period_hosts]
common_names = [x for x in this_period_hosts if x not in new_hosts]
source_idx = columns.index('source')
duration_idx = columns.index('avg_duration')
# success_idx = columns.index('success_rate')
# delta_duration = dict()
# delta_success = dict()
new_duration_values = dict()
duration_values = dict()
for n in common_names:
d1_tmp = _table_where(table_hh1, source_idx, n)
d2_tmp = _table_where(table_hh2, source_idx, n)
old_duration = _mean_table_index(d2_tmp, duration_idx)
new_duration = _mean_table_index(d1_tmp, duration_idx)
if old_duration == 0:
continue
duration_values[n] = new_duration, old_duration, (new_duration - old_duration) / old_duration
# delta_duration[n] = (_mean_table_index(d1_tmp, duration_idx) - _duration1) / _duration1
# delta_success[n] = _mean_table_index(d1_tmp, success_idx) - _mean_table_index(d2_tmp, success_idx)
for n in new_hosts:
d1_tmp = _table_where(table_hh1, source_idx, n)
new_duration_values[n] = _mean_table_index(d1_tmp, duration_idx)
# names_idx = columns.index('names')
total = _sum_table_index(table_hh1, duration_idx)
d1_tmp = _sort_table_index(table_hh1, duration_idx, reverse=True)
_tmp = _table_slice(d1_tmp, duration_idx)
_tmp2 = _table_slice(d1_tmp, source_idx)
increase = sorted(duration_values.items(), key=lambda k: k[1][-1], reverse=True)
ratio = sorted(zip(_tmp2, _tmp), key=lambda k: k[1], reverse=True)
# names_ = set([k[0] for k in increase[:3]+ratio[:3]]+new_hosts[:3])
names_ = set([k[0] for k in increase[:3] + ratio[:3]]) # we took out new hosts since they dont give much info
results = list()
for n in names_:
if n is None:
continue
data_ = {'category': schemas.InsightCategories.network, 'name': n,
'value': None, 'oldValue': None, 'ratio': None, 'change': None, 'isNew': True}
for n_, v in ratio:
if n == n_:
if n in new_hosts:
data_['value'] = new_duration_values[n]
data_['ratio'] = 100 * v / total
break
for n_, v in increase:
if n == n_:
data_['value'] = v[0]
data_['oldValue'] = v[1]
data_['change'] = 100 * v[2]
data_['isNew'] = False
break
results.append(data_)
return results
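
# __filter_subquery: when the payload carries events/filters, build the session
# search sub-query through sessions_exp.search_query_parts_ch and wrap it as an
# INNER JOIN ... USING(session_id); otherwise return params unchanged.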
def __filter_subquery(project_id: int, filters: Optional[schemas.SessionsSearchPayloadSchema], params: dict):
sub_query = ""
    if filters and (len(filters.events) > 0 or len(filters.filters) > 0):
qp_params, sub_query = sessions_exp.search_query_parts_ch(data=filters, project_id=project_id,
error_status=None,
errors_only=True, favorite_only=None,
issue=None, user_id=None)
params = {**params, **qp_params}
sub_query = f"INNER JOIN {sub_query} USING(session_id)"
return params, sub_query
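
# Error insights: count sessions per error message for the current and the
# previous bucket, then report new errors, the biggest increases and the biggest
# shares of affected sessions.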
def query_most_errors_by_period(project_id, start_time, end_time,
filters: Optional[schemas.SessionsSearchPayloadSchema]):
params = {
"project_id": project_id, "startTimestamp": start_time, "endTimestamp": end_time,
"step_size": metrics.__get_step_size(endTimestamp=end_time, startTimestamp=start_time, density=3)
}
params, sub_query = __filter_subquery(project_id=project_id, filters=filters, params=params)
conditions = ["event_type = 'ERROR'"]
query = f"""WITH toUInt32(toStartOfInterval(toDateTime(%(startTimestamp)s/1000), INTERVAL %(step_size)s second)) AS start,
toUInt32(toStartOfInterval(toDateTime(%(endTimestamp)s/1000), INTERVAL %(step_size)s second)) AS end
SELECT T1.hh, countIf(T2.session_id != 0) as sessions, T2.message_name as names,
groupUniqArray(T2.source) as sources
FROM (SELECT arrayJoin(arrayMap(x -> toDateTime(x), range(start, end, %(step_size)s))) as hh) AS T1
LEFT JOIN (SELECT session_id, concat(name,': ', message) as message_name, source, toStartOfInterval(datetime, INTERVAL %(step_size)s second) as dtime
FROM experimental.events
{sub_query}
                         WHERE project_id = %(project_id)s
AND datetime >= toDateTime(%(startTimestamp)s/1000)
AND datetime < toDateTime(%(endTimestamp)s/1000)
AND {" AND ".join(conditions)}) AS T2 ON T2.dtime = T1.hh
GROUP BY T1.hh, T2.message_name
ORDER BY T1.hh DESC;"""
with ch_client.ClickHouseClient() as conn:
query = conn.format(query=query, params=params)
# print("--------")
# print(query)
# print("--------")
res = conn.execute(query=query)
if res is None or sum([r.get("sessions") for r in res]) == 0:
return []
table_hh1, table_hh2, columns, this_period_errors, last_period_errors = __get_two_values(res, time_index='hh',
name_index='names')
del res
# print(table_hh1)
# print('\n')
# print(table_hh2)
# print('\n')
new_errors = [x for x in this_period_errors if x not in last_period_errors]
common_errors = [x for x in this_period_errors if x not in new_errors]
sessions_idx = columns.index('sessions')
names_idx = columns.index('names')
percentage_errors = dict()
total = _sum_table_index(table_hh1, sessions_idx)
# error_increase = dict()
new_error_values = dict()
error_values = dict()
for n in this_period_errors:
if n is None:
continue
percentage_errors[n] = _sum_table_index(_table_where(table_hh1, names_idx, n), sessions_idx)
new_error_values[n] = _sum_table_index(_table_where(table_hh1, names_idx, n), sessions_idx)
for n in common_errors:
if n is None:
continue
sum_old_errors = _sum_table_index(_table_where(table_hh2, names_idx, n), sessions_idx)
if sum_old_errors == 0:
continue
sum_new_errors = _sum_table_index(_table_where(table_hh1, names_idx, n), sessions_idx)
# error_increase[n] = (new_errors - old_errors) / old_errors
error_values[n] = sum_new_errors, sum_old_errors, (sum_new_errors - sum_old_errors) / sum_old_errors
ratio = sorted(percentage_errors.items(), key=lambda k: k[1], reverse=True)
increase = sorted(error_values.items(), key=lambda k: k[1][-1], reverse=True)
names_ = set([k[0] for k in increase[:3] + ratio[:3]] + new_errors[:3])
results = list()
for n in names_:
if n is None:
continue
data_ = {'category': schemas.InsightCategories.errors, 'name': n,
'value': None, 'oldValue': None, 'ratio': None, 'change': None, 'isNew': True}
for n_, v in ratio:
if n == n_:
if n in new_errors:
data_['value'] = new_error_values[n]
data_['ratio'] = 100 * v / total
break
for n_, v in increase:
if n == n_:
data_['value'] = v[0]
data_['oldValue'] = v[1]
data_['change'] = 100 * v[2]
data_['isNew'] = False
break
results.append(data_)
return results
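
# Resource insights: compare average CPU usage and average JS heap size between
# the current and the previous bucket and return up to two entries ('cpu' and
# 'memory') with their relative change.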
def query_cpu_memory_by_period(project_id, start_time, end_time,
filters: Optional[schemas.SessionsSearchPayloadSchema]):
params = {
"project_id": project_id, "startTimestamp": start_time, "endTimestamp": end_time,
"step_size": metrics.__get_step_size(endTimestamp=end_time, startTimestamp=start_time, density=3)
}
params, sub_query = __filter_subquery(project_id=project_id, filters=filters, params=params)
conditions = ["event_type = 'PERFORMANCE'"]
query = f"""WITH toUInt32(toStartOfInterval(toDateTime(%(startTimestamp)s/1000), INTERVAL %(step_size)s second)) AS start,
toUInt32(toStartOfInterval(toDateTime(%(endTimestamp)s/1000), INTERVAL %(step_size)s second)) AS end
SELECT T1.hh, countIf(T2.session_id != 0) as sessions, avg(T2.avg_cpu) as cpu_used,
avg(T2.avg_used_js_heap_size) as memory_used, T2.url_host as names, groupUniqArray(T2.url_path) as sources
FROM (SELECT arrayJoin(arrayMap(x -> toDateTime(x), range(start, end, %(step_size)s))) as hh) AS T1
LEFT JOIN (SELECT session_id, url_host, url_path, avg_used_js_heap_size, avg_cpu, toStartOfInterval(datetime, INTERVAL %(step_size)s second) as dtime
FROM experimental.events
{sub_query}
                         WHERE project_id = %(project_id)s
                           AND datetime >= toDateTime(%(startTimestamp)s/1000)
                           AND datetime < toDateTime(%(endTimestamp)s/1000)
                           AND {" AND ".join(conditions)}) AS T2 ON T2.dtime = T1.hh
GROUP BY T1.hh, T2.url_host
ORDER BY T1.hh DESC;"""
    with ch_client.ClickHouseClient() as conn:
        query = conn.format(query=query, params=params)
        # print("--------")
        # print(query)
        # print("--------")
        res = conn.execute(query=query)
    if res is None or sum([r.get("sessions") for r in res]) == 0:
        return []
    table_hh1, table_hh2, columns, this_period_resources, last_period_resources = __get_two_values(res, time_index='hh',
                                                                                                   name_index='names')
    # print(f'TB1\n{table_hh1}')
    # print(f'TB2\n{table_hh2}')
    del res
    memory_idx = columns.index('memory_used')
    cpu_idx = columns.index('cpu_used')
    mem_newvalue = _mean_table_index(table_hh1, memory_idx)
    mem_oldvalue = _mean_table_index(table_hh2, memory_idx)
    cpu_newvalue = _mean_table_index(table_hh1, cpu_idx)
    cpu_oldvalue = _mean_table_index(table_hh2, cpu_idx)
    # a mean of 0 is treated as missing data: the value becomes None and no change is computed
    cpu_ratio = 0
    mem_ratio = 0
    if mem_newvalue == 0:
        mem_newvalue = None
        mem_ratio = None
    if mem_oldvalue == 0:
        mem_oldvalue = None
        mem_ratio = None
    if cpu_newvalue == 0:
        cpu_newvalue = None
        cpu_ratio = None
    if cpu_oldvalue == 0:
        cpu_oldvalue = None
        cpu_ratio = None
    output = list()
    if cpu_oldvalue is not None or cpu_newvalue is not None:
        output.append({'category': schemas.InsightCategories.resources,
                       'name': 'cpu',
                       'value': cpu_newvalue,
                       'oldValue': cpu_oldvalue,
                       'change': 100 * (
                               cpu_newvalue - cpu_oldvalue) / cpu_oldvalue if cpu_ratio is not None else cpu_ratio,
                       'isNew': True if cpu_newvalue is not None and cpu_oldvalue is None else False})
    if mem_oldvalue is not None or mem_newvalue is not None:
        output.append({'category': schemas.InsightCategories.resources,
                       'name': 'memory',
                       'value': mem_newvalue,
                       'oldValue': mem_oldvalue,
                       'change': 100 * (
                               mem_newvalue - mem_oldvalue) / mem_oldvalue if mem_ratio is not None else mem_ratio,
                       'isNew': True if mem_newvalue is not None and mem_oldvalue is None else False})
    return output
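
# Click-rage insights: count sessions with click_rage issues per url_path for the
# current and the previous bucket, then report new pages, the biggest increases
# and the biggest shares.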
def query_click_rage_by_period(project_id, start_time, end_time,
filters: Optional[schemas.SessionsSearchPayloadSchema]):
params = {
"project_id": project_id, "startTimestamp": start_time, "endTimestamp": end_time,
"step_size": metrics.__get_step_size(endTimestamp=end_time, startTimestamp=start_time, density=3)}
params, sub_query = __filter_subquery(project_id=project_id, filters=filters, params=params)
conditions = ["issue_type = 'click_rage'", "event_type = 'ISSUE'"]
query = f"""WITH toUInt32(toStartOfInterval(toDateTime(%(startTimestamp)s/1000), INTERVAL %(step_size)s second)) AS start,
toUInt32(toStartOfInterval(toDateTime(%(endTimestamp)s/1000), INTERVAL %(step_size)s second)) AS end
SELECT T1.hh, countIf(T2.session_id != 0) as sessions, groupUniqArray(T2.url_host) as names, T2.url_path as sources
FROM (SELECT arrayJoin(arrayMap(x -> toDateTime(x), range(start, end, %(step_size)s))) as hh) AS T1
LEFT JOIN (SELECT session_id, url_host, url_path, toStartOfInterval(datetime, INTERVAL %(step_size)s second ) as dtime
FROM experimental.events
{sub_query}
WHERE project_id = %(project_id)s
AND datetime >= toDateTime(%(startTimestamp)s/1000)
AND datetime < toDateTime(%(endTimestamp)s/1000)
AND {" AND ".join(conditions)}) AS T2 ON T2.dtime = T1.hh
GROUP BY T1.hh, T2.url_path
ORDER BY T1.hh DESC;"""
with ch_client.ClickHouseClient() as conn:
query = conn.format(query=query, params=params)
# print("--------")
# print(query)
# print("--------")
res = conn.execute(query=query)
if res is None or sum([r.get("sessions") for r in res]) == 0:
return []
table_hh1, table_hh2, columns, this_period_rage, last_period_rage = __get_two_values(res, time_index='hh',
name_index='sources')
del res
new_names = [x for x in this_period_rage if x not in last_period_rage]
common_names = [x for x in this_period_rage if x not in new_names]
sessions_idx = columns.index('sessions')
names_idx = columns.index('sources')
# raged_increment = dict()
raged_values = dict()
new_raged_values = dict()
# TODO verify line (188) _tmp = table_hh2[:, sessions_idx][n].sum()
for n in common_names:
if n is None:
continue
_oldvalue = _sum_table_index(_table_where(table_hh2, names_idx, n), sessions_idx)
_newvalue = _sum_table_index(_table_where(table_hh1, names_idx, n), sessions_idx)
# raged_increment[n] = (_newvalue - _oldvalue) / _oldvalue
raged_values[n] = _newvalue, _oldvalue, (_newvalue - _oldvalue) / _oldvalue
for n in new_names:
if n is None:
continue
_newvalue = _sum_table_index(_table_where(table_hh1, names_idx, n), sessions_idx)
new_raged_values[n] = _newvalue
total = _sum_table_index(table_hh1, sessions_idx)
names, ratio = _table_slice(table_hh1, names_idx), _table_slice(table_hh1, sessions_idx)
ratio = sorted(zip(names, ratio), key=lambda k: k[1], reverse=True)
increase = sorted(raged_values.items(), key=lambda k: k[1][-1], reverse=True)
names_ = set([k[0] for k in increase[:3] + ratio[:3]] + new_names[:3])
results = list()
for n in names_:
if n is None:
continue
data_ = {'category': schemas.InsightCategories.rage, 'name': n,
'value': None, 'oldValue': None, 'ratio': None, 'change': None, 'isNew': True}
for n_, v in ratio:
if n == n_:
if n in new_names:
data_['value'] = new_raged_values[n]
data_['ratio'] = 100 * v / total
break
for n_, v in increase:
if n == n_:
data_['value'] = v[0]
data_['oldValue'] = v[1]
data_['change'] = 100 * v[2]
data_['isNew'] = False
break
results.append(data_)
return results
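
# Entry point: run the insight queries selected in data.metricValue (all
# categories when none is given), using the first series' filter as the session
# filter. Each item returned has the shape
# {'category', 'name', 'value', 'oldValue', 'ratio', 'change', 'isNew'}.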
def fetch_selected(project_id, data: schemas.GetInsightsSchema):
output = list()
if data.metricValue is None or len(data.metricValue) == 0:
data.metricValue = []
for v in schemas.InsightCategories:
data.metricValue.append(v)
filters = None
if len(data.series) > 0:
filters = data.series[0].filter
if schemas.InsightCategories.errors in data.metricValue:
output += query_most_errors_by_period(project_id=project_id, start_time=data.startTimestamp,
end_time=data.endTimestamp, filters=filters)
if schemas.InsightCategories.network in data.metricValue:
output += query_requests_by_period(project_id=project_id, start_time=data.startTimestamp,
end_time=data.endTimestamp, filters=filters)
if schemas.InsightCategories.rage in data.metricValue:
output += query_click_rage_by_period(project_id=project_id, start_time=data.startTimestamp,
end_time=data.endTimestamp, filters=filters)
if schemas.InsightCategories.resources in data.metricValue:
output += query_cpu_memory_by_period(project_id=project_id, start_time=data.startTimestamp,
end_time=data.endTimestamp, filters=filters)
return output