diff --git a/api/chalicelib/core/metrics/custom_metrics.py b/api/chalicelib/core/metrics/custom_metrics.py index 1e359bddd..a62f37cdd 100644 --- a/api/chalicelib/core/metrics/custom_metrics.py +++ b/api/chalicelib/core/metrics/custom_metrics.py @@ -209,14 +209,15 @@ def get_issues(project: schemas.ProjectContext, user_id: int, data: schemas.Card def __get_global_card_info(data: schemas.CardSchema): - r = {"hideExcess": data.hide_excess, "compareTo": data.compare_to} + r = {"hideExcess": data.hide_excess, "compareTo": data.compare_to, "rows": data.rows} return r def __get_path_analysis_card_info(data: schemas.CardPathAnalysis): r = {"start_point": [s.model_dump() for s in data.start_point], "start_type": data.start_type, - "excludes": [e.model_dump() for e in data.excludes]} + "excludes": [e.model_dump() for e in data.excludes], + "rows": data.rows} return r diff --git a/api/chalicelib/core/metrics/product_analytics.py b/api/chalicelib/core/metrics/product_analytics.py index ffa4fd319..424d5de1d 100644 --- a/api/chalicelib/core/metrics/product_analytics.py +++ b/api/chalicelib/core/metrics/product_analytics.py @@ -25,15 +25,17 @@ def __transform_journey(rows, reverse_path=False): nodes = [] nodes_values = [] links = [] + drops = [] + max_depth = 0 for r in rows: - source = f"{r['event_number_in_session']}_{r['event_type']}_{r['e_value']}" + source = f"{r['event_number_in_session'] - 1}_{r['event_type']}_{r['e_value']}" if source not in nodes: nodes.append(source) nodes_values.append({"depth": r['event_number_in_session'] - 1, "name": r['e_value'], "eventType": r['event_type']}) - # if r['next_value']: - target = f"{r['event_number_in_session'] + 1}_{r['next_type']}_{r['next_value']}" + + target = f"{r['event_number_in_session']}_{r['next_type']}_{r['next_value']}" if target not in nodes: nodes.append(target) nodes_values.append({"depth": r['event_number_in_session'], @@ -52,6 +54,40 @@ def __transform_journey(rows, reverse_path=False): link["target"] = sr_idx 
links.append(link) + max_depth = max(max_depth, r['event_number_in_session'])  # deepest step seen so far; do not rely on row order + if r["next_type"] == "DROP": + for d in drops: + if d["depth"] == r['event_number_in_session']: + d["sessions_count"] += r["sessions_count"] + break + else: + drops.append({"depth": r['event_number_in_session'], "sessions_count": r["sessions_count"]}) + + for i in range(len(drops)): + + if drops[i]["depth"] < max_depth: + source = f"{drops[i]['depth']}_DROP_None" + target = f"{drops[i]['depth'] + 1}_DROP_None" + sr_idx = nodes.index(source) + + if i < len(drops) - 1 and drops[i]["depth"] + 1 == drops[i + 1]["depth"]: + tg_idx = nodes.index(target) + else: + nodes.append(target) + nodes_values.append({"depth": drops[i]["depth"] + 1, + "name": None, + "eventType": "DROP"}) + tg_idx = len(nodes) - 1 + + link = {"eventType": "DROP", "sessionsCount": drops[i]["sessions_count"], "value": None} + if not reverse_path: + link["source"] = sr_idx + link["target"] = tg_idx + else: + link["source"] = tg_idx + link["target"] = sr_idx + links.append(link) + return {"nodes": nodes_values, "links": sorted(links, key=lambda x: (x["source"], x["target"]), reverse=False)} diff --git a/api/chalicelib/core/metrics/product_analytics_ch.py b/api/chalicelib/core/metrics/product_analytics_ch.py index 4c5f3334d..6246e5013 100644 --- a/api/chalicelib/core/metrics/product_analytics_ch.py +++ b/api/chalicelib/core/metrics/product_analytics_ch.py @@ -23,246 +23,237 @@ JOURNEY_TYPES = { } -# Q6: use events as a sub_query to support filter of materialized columns when doing a join -# query: Q5, the result is correct, -# startPoints are computed before ranked_events to reduce the number of window functions over rows -# replaced time_to_target by time_from_previous -# compute avg_time_from_previous at the same level as sessions_count (this was removed in v1.22) -# sort by top 5 according to sessions_count at the CTE level -# final part project data without grouping -# if start-point is selected, the selected event is ranked n°1 -def 
path_analysis(project_id: int, data: schemas.CardPathAnalysis): - # This code is used for testing only +def __get_test_data(): with ch_client.ClickHouseClient(database="experimental") as ch: ch_query1 = """ -CREATE TEMPORARY TABLE pre_ranked_events_1736344377403 AS - (WITH initial_event AS (SELECT events.session_id, MIN(datetime) AS start_event_timestamp - FROM experimental.events AS events - WHERE ((event_type = 'LOCATION' AND (url_path = '/en/deployment/'))) - AND events.project_id = toUInt16(65) - AND events.datetime >= toDateTime(1735599600000 / 1000) - AND events.datetime < toDateTime(1736290799999 / 1000) - GROUP BY 1), - pre_ranked_events AS (SELECT * - FROM (SELECT session_id, - event_type, - datetime, - url_path AS e_value, - row_number() OVER (PARTITION BY session_id - ORDER BY datetime , - message_id ) AS event_number_in_session - FROM experimental.events AS events - INNER JOIN initial_event ON (events.session_id = initial_event.session_id) - WHERE events.project_id = toUInt16(65) - AND events.datetime >= toDateTime(1735599600000 / 1000) - AND events.datetime < toDateTime(1736290799999 / 1000) - AND (events.event_type = 'LOCATION') - AND events.datetime >= initial_event.start_event_timestamp - ) AS full_ranked_events - WHERE event_number_in_session <= 5) - SELECT * - FROM pre_ranked_events); - """ + CREATE TEMPORARY TABLE pre_ranked_events_1736344377403 AS + (WITH initial_event AS (SELECT events.session_id, MIN(datetime) AS start_event_timestamp + FROM experimental.events AS events + WHERE ((event_type = 'LOCATION' AND (url_path = '/en/deployment/'))) + AND events.project_id = toUInt16(65) + AND events.datetime >= toDateTime(1735599600000 / 1000) + AND events.datetime < toDateTime(1736290799999 / 1000) + GROUP BY 1), + pre_ranked_events AS (SELECT * + FROM (SELECT session_id, + event_type, + datetime, + url_path AS e_value, + row_number() OVER (PARTITION BY session_id + ORDER BY datetime , + message_id ) AS event_number_in_session + FROM 
experimental.events AS events + INNER JOIN initial_event ON (events.session_id = initial_event.session_id) + WHERE events.project_id = toUInt16(65) + AND events.datetime >= toDateTime(1735599600000 / 1000) + AND events.datetime < toDateTime(1736290799999 / 1000) + AND (events.event_type = 'LOCATION') + AND events.datetime >= initial_event.start_event_timestamp + ) AS full_ranked_events + WHERE event_number_in_session <= 5) + SELECT * + FROM pre_ranked_events); + """ ch.execute(query=ch_query1, parameters={}) ch_query1 = """ - CREATE TEMPORARY TABLE ranked_events_1736344377403 AS - (WITH pre_ranked_events AS (SELECT * - FROM pre_ranked_events_1736344377403), - start_points AS (SELECT DISTINCT session_id - FROM pre_ranked_events - WHERE ((event_type = 'LOCATION' AND (e_value = '/en/deployment/'))) - AND pre_ranked_events.event_number_in_session = 1), - ranked_events AS (SELECT pre_ranked_events.*, - leadInFrame(e_value) - OVER (PARTITION BY session_id ORDER BY datetime - ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS next_value, - leadInFrame(toNullable(event_type)) - OVER (PARTITION BY session_id ORDER BY datetime - ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS next_type - FROM start_points - INNER JOIN pre_ranked_events USING (session_id)) - SELECT * - FROM ranked_events); - """ + CREATE TEMPORARY TABLE ranked_events_1736344377403 AS + (WITH pre_ranked_events AS (SELECT * + FROM pre_ranked_events_1736344377403), + start_points AS (SELECT DISTINCT session_id + FROM pre_ranked_events + WHERE ((event_type = 'LOCATION' AND (e_value = '/en/deployment/'))) + AND pre_ranked_events.event_number_in_session = 1), + ranked_events AS (SELECT pre_ranked_events.*, + leadInFrame(e_value) + OVER (PARTITION BY session_id ORDER BY datetime + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS next_value, + leadInFrame(toNullable(event_type)) + OVER (PARTITION BY session_id ORDER BY datetime + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING) AS next_type + FROM start_points + INNER JOIN pre_ranked_events USING (session_id)) + SELECT * + FROM ranked_events); + """ ch.execute(query=ch_query1, parameters={}) ch_query1 = """ - WITH ranked_events AS (SELECT * - FROM ranked_events_1736344377403), - n1 AS (SELECT event_number_in_session, - event_type, - e_value, - next_type, - next_value, - COUNT(1) AS sessions_count - FROM ranked_events - WHERE event_number_in_session = 1 - GROUP BY event_number_in_session, event_type, e_value, next_type, next_value - ORDER BY sessions_count DESC), - n2 AS (SELECT event_number_in_session, - event_type, - e_value, - next_type, - next_value, - COUNT(1) AS sessions_count - FROM ranked_events - WHERE event_number_in_session = 2 - GROUP BY event_number_in_session, event_type, e_value, next_type, next_value - ORDER BY sessions_count DESC), - n3 AS (SELECT event_number_in_session, - event_type, - e_value, - next_type, - next_value, - COUNT(1) AS sessions_count - FROM ranked_events - WHERE event_number_in_session = 3 - GROUP BY event_number_in_session, event_type, e_value, next_type, next_value - ORDER BY sessions_count DESC), + WITH ranked_events AS (SELECT * + FROM ranked_events_1736344377403), + n1 AS (SELECT event_number_in_session, + event_type, + e_value, + next_type, + next_value, + COUNT(1) AS sessions_count + FROM ranked_events + WHERE event_number_in_session = 1 + GROUP BY event_number_in_session, event_type, e_value, next_type, next_value + ORDER BY sessions_count DESC), + n2 AS (SELECT event_number_in_session, + event_type, + e_value, + next_type, + next_value, + COUNT(1) AS sessions_count + FROM ranked_events + WHERE event_number_in_session = 2 + GROUP BY event_number_in_session, event_type, e_value, next_type, next_value + ORDER BY sessions_count DESC), + n3 AS (SELECT event_number_in_session, + event_type, + e_value, + next_type, + next_value, + COUNT(1) AS sessions_count + FROM ranked_events + WHERE event_number_in_session = 3 + GROUP BY 
event_number_in_session, event_type, e_value, next_type, next_value + ORDER BY sessions_count DESC), - drop_n AS (-- STEP 1 - SELECT event_number_in_session, - event_type, - e_value, - 'DROP' AS next_type, - NULL AS next_value, - sessions_count - FROM n1 - WHERE isNull(n1.next_type) - UNION ALL - -- STEP 2 - SELECT event_number_in_session, - event_type, - e_value, - 'DROP' AS next_type, - NULL AS next_value, - sessions_count - FROM n2 - WHERE isNull(n2.next_type)), - top_n AS (SELECT event_number_in_session, - event_type, - e_value, - SUM(sessions_count) AS sessions_count - FROM n1 - GROUP BY event_number_in_session, event_type, e_value - LIMIT 1 - UNION ALL - -- STEP 2 - SELECT event_number_in_session, - event_type, - e_value, - SUM(sessions_count) AS sessions_count - FROM n2 - GROUP BY event_number_in_session, event_type, e_value - ORDER BY sessions_count DESC - LIMIT 3 - UNION ALL - -- STEP 3 - SELECT event_number_in_session, - event_type, - e_value, - SUM(sessions_count) AS sessions_count - FROM n3 - GROUP BY event_number_in_session, event_type, e_value - ORDER BY sessions_count DESC - LIMIT 3), - top_n_with_next AS (SELECT n1.* - FROM n1 - UNION ALL - SELECT n2.* - FROM n2 - INNER JOIN top_n ON (n2.event_number_in_session = top_n.event_number_in_session - AND n2.event_type = top_n.event_type - AND n2.e_value = top_n.e_value)), - others_n AS ( - -- STEP 2 - SELECT n2.* - FROM n2 - WHERE (n2.event_number_in_session, n2.event_type, n2.e_value) NOT IN - (SELECT event_number_in_session, event_type, e_value - FROM top_n - WHERE top_n.event_number_in_session = 2) - UNION ALL - -- STEP 3 - SELECT n3.* - FROM n3 - WHERE (n3.event_number_in_session, n3.event_type, n3.e_value) NOT IN - (SELECT event_number_in_session, event_type, e_value - FROM top_n - WHERE top_n.event_number_in_session = 3)) -SELECT * -FROM ( --- Top to Top: valid - SELECT top_n_with_next.* - FROM top_n_with_next - INNER JOIN top_n - ON (top_n_with_next.event_number_in_session + 1 = 
top_n.event_number_in_session - AND top_n_with_next.next_type = top_n.event_type - AND top_n_with_next.next_value = top_n.e_value) - UNION ALL --- Top to Others: valid - SELECT top_n_with_next.event_number_in_session, - top_n_with_next.event_type, - top_n_with_next.e_value, - 'OTHER' AS next_type, - NULL AS next_value, - SUM(top_n_with_next.sessions_count) AS sessions_count - FROM top_n_with_next - WHERE (top_n_with_next.event_number_in_session + 1, top_n_with_next.next_type, top_n_with_next.next_value) IN - (SELECT others_n.event_number_in_session, others_n.event_type, others_n.e_value FROM others_n) - GROUP BY top_n_with_next.event_number_in_session, top_n_with_next.event_type, top_n_with_next.e_value - UNION ALL --- Top go to Drop: valid - SELECT drop_n.event_number_in_session, - drop_n.event_type, - drop_n.e_value, - drop_n.next_type, - drop_n.next_value, - drop_n.sessions_count - FROM drop_n - INNER JOIN top_n ON (drop_n.event_number_in_session = top_n.event_number_in_session - AND drop_n.event_type = top_n.event_type - AND drop_n.e_value = top_n.e_value) - ORDER BY drop_n.event_number_in_session - UNION ALL --- Others got to Drop: valid - SELECT others_n.event_number_in_session, - 'OTHER' AS event_type, - NULL AS e_value, - 'DROP' AS next_type, - NULL AS next_value, - SUM(others_n.sessions_count) AS sessions_count - FROM others_n - WHERE isNull(others_n.next_type) - AND others_n.event_number_in_session < 3 - GROUP BY others_n.event_number_in_session, next_type, next_value - UNION ALL --- Others got to Top:valid - SELECT others_n.event_number_in_session, - 'OTHER' AS event_type, - NULL AS e_value, - others_n.next_type, - others_n.next_value, - SUM(others_n.sessions_count) AS sessions_count - FROM others_n - WHERE isNotNull(others_n.next_type) - AND (others_n.event_number_in_session + 1, others_n.next_type, others_n.next_value) IN - (SELECT top_n.event_number_in_session, top_n.event_type, top_n.e_value FROM top_n) - GROUP BY others_n.event_number_in_session, 
others_n.next_type, others_n.next_value - UNION ALL --- Others got to Others - SELECT others_n.event_number_in_session, - 'OTHER' AS event_type, - NULL AS e_value, - 'OTHER' AS next_type, - NULL AS next_value, - SUM(sessions_count) AS sessions_count - FROM others_n - WHERE isNotNull(others_n.next_type) - AND others_n.event_number_in_session < 3 - AND (others_n.event_number_in_session + 1, others_n.next_type, others_n.next_value) NOT IN - (SELECT event_number_in_session, event_type, e_value FROM top_n) - GROUP BY others_n.event_number_in_session) -ORDER BY event_number_in_session, sessions_count DESC;""" + drop_n AS (-- STEP 1 + SELECT event_number_in_session, + event_type, + e_value, + 'DROP' AS next_type, + NULL AS next_value, + sessions_count + FROM n1 + WHERE isNull(n1.next_type) + UNION ALL + -- STEP 2 + SELECT event_number_in_session, + event_type, + e_value, + 'DROP' AS next_type, + NULL AS next_value, + sessions_count + FROM n2 + WHERE isNull(n2.next_type)), + top_n AS (SELECT event_number_in_session, + event_type, + e_value, + SUM(sessions_count) AS sessions_count + FROM n1 + GROUP BY event_number_in_session, event_type, e_value + LIMIT 1 + UNION ALL + -- STEP 2 + SELECT event_number_in_session, + event_type, + e_value, + SUM(sessions_count) AS sessions_count + FROM n2 + GROUP BY event_number_in_session, event_type, e_value + ORDER BY sessions_count DESC + LIMIT 3 + UNION ALL + -- STEP 3 + SELECT event_number_in_session, + event_type, + e_value, + SUM(sessions_count) AS sessions_count + FROM n3 + GROUP BY event_number_in_session, event_type, e_value + ORDER BY sessions_count DESC + LIMIT 3), + top_n_with_next AS (SELECT n1.* + FROM n1 + UNION ALL + SELECT n2.* + FROM n2 + INNER JOIN top_n ON (n2.event_number_in_session = top_n.event_number_in_session + AND n2.event_type = top_n.event_type + AND n2.e_value = top_n.e_value)), + others_n AS ( + -- STEP 2 + SELECT n2.* + FROM n2 + WHERE (n2.event_number_in_session, n2.event_type, n2.e_value) NOT IN + (SELECT 
event_number_in_session, event_type, e_value + FROM top_n + WHERE top_n.event_number_in_session = 2) + UNION ALL + -- STEP 3 + SELECT n3.* + FROM n3 + WHERE (n3.event_number_in_session, n3.event_type, n3.e_value) NOT IN + (SELECT event_number_in_session, event_type, e_value + FROM top_n + WHERE top_n.event_number_in_session = 3)) + SELECT * + FROM ( + -- Top to Top: valid + SELECT top_n_with_next.* + FROM top_n_with_next + INNER JOIN top_n + ON (top_n_with_next.event_number_in_session + 1 = top_n.event_number_in_session + AND top_n_with_next.next_type = top_n.event_type + AND top_n_with_next.next_value = top_n.e_value) + UNION ALL + -- Top to Others: valid + SELECT top_n_with_next.event_number_in_session, + top_n_with_next.event_type, + top_n_with_next.e_value, + 'OTHER' AS next_type, + NULL AS next_value, + SUM(top_n_with_next.sessions_count) AS sessions_count + FROM top_n_with_next + WHERE (top_n_with_next.event_number_in_session + 1, top_n_with_next.next_type, top_n_with_next.next_value) IN + (SELECT others_n.event_number_in_session, others_n.event_type, others_n.e_value FROM others_n) + GROUP BY top_n_with_next.event_number_in_session, top_n_with_next.event_type, top_n_with_next.e_value + UNION ALL + -- Top go to Drop: valid + SELECT drop_n.event_number_in_session, + drop_n.event_type, + drop_n.e_value, + drop_n.next_type, + drop_n.next_value, + drop_n.sessions_count + FROM drop_n + INNER JOIN top_n ON (drop_n.event_number_in_session = top_n.event_number_in_session + AND drop_n.event_type = top_n.event_type + AND drop_n.e_value = top_n.e_value) + ORDER BY drop_n.event_number_in_session + UNION ALL + -- Others got to Drop: valid + SELECT others_n.event_number_in_session, + 'OTHER' AS event_type, + NULL AS e_value, + 'DROP' AS next_type, + NULL AS next_value, + SUM(others_n.sessions_count) AS sessions_count + FROM others_n + WHERE isNull(others_n.next_type) + AND others_n.event_number_in_session < 3 + GROUP BY others_n.event_number_in_session, next_type, 
next_value + UNION ALL + -- Others got to Top:valid + SELECT others_n.event_number_in_session, + 'OTHER' AS event_type, + NULL AS e_value, + others_n.next_type, + others_n.next_value, + SUM(others_n.sessions_count) AS sessions_count + FROM others_n + WHERE isNotNull(others_n.next_type) + AND (others_n.event_number_in_session + 1, others_n.next_type, others_n.next_value) IN + (SELECT top_n.event_number_in_session, top_n.event_type, top_n.e_value FROM top_n) + GROUP BY others_n.event_number_in_session, others_n.next_type, others_n.next_value + UNION ALL + -- Others got to Others + SELECT others_n.event_number_in_session, + 'OTHER' AS event_type, + NULL AS e_value, + 'OTHER' AS next_type, + NULL AS next_value, + SUM(sessions_count) AS sessions_count + FROM others_n + WHERE isNotNull(others_n.next_type) + AND others_n.event_number_in_session < 3 + AND (others_n.event_number_in_session + 1, others_n.next_type, others_n.next_value) NOT IN + (SELECT event_number_in_session, event_type, e_value FROM top_n) + GROUP BY others_n.event_number_in_session) + ORDER BY event_number_in_session, sessions_count DESC;""" rows = ch.execute(query=ch_query1, parameters={}) drop = 0 for r in rows: @@ -272,6 +263,14 @@ ORDER BY event_number_in_session, sessions_count DESC;""" return __transform_journey(rows=rows, reverse_path=False) + +# startPoints are computed before ranked_events to reduce the number of window functions over rows +# compute avg_time_from_previous at the same level as sessions_count (this was removed in v1.22) +# if start-point is selected, the selected event is ranked n°1 +def path_analysis(project_id: int, data: schemas.CardPathAnalysis): + # # This code is used for testing only + # return __get_test_data() + # ------ end of testing code --- sub_events = [] start_points_conditions = [] @@ -567,55 +566,77 @@ ORDER BY event_number_in_session, sessions_count DESC;""" main_events_table += " INNER JOIN initial_event ON (events.session_id = initial_event.session_id)" 
sessions_conditions = [] - steps_query = ["""n1 AS (SELECT event_number_in_session, - event_type, - e_value, - next_type, - next_value, - COUNT(1) AS sessions_count - FROM ranked_events - WHERE event_number_in_session = 1 - AND isNotNull(next_value) - GROUP BY event_number_in_session, event_type, e_value, next_type, next_value - ORDER BY sessions_count DESC - LIMIT %(eventThresholdNumberInGroup)s)"""] - projection_query = ["""SELECT event_number_in_session, - event_type, - e_value, - next_type, - next_value, - sessions_count - FROM n1"""] - for i in range(2, data.density + 1): - steps_query.append(f"""n{i} AS (SELECT * - FROM (SELECT re.event_number_in_session AS event_number_in_session, - re.event_type AS event_type, - re.e_value AS e_value, - re.next_type AS next_type, - re.next_value AS next_value, - COUNT(1) AS sessions_count - FROM n{i - 1} INNER JOIN ranked_events AS re - ON (n{i - 1}.next_value = re.e_value AND n{i - 1}.next_type = re.event_type) - WHERE re.event_number_in_session = {i} - GROUP BY re.event_number_in_session, re.event_type, re.e_value, re.next_type, re.next_value) AS sub_level - ORDER BY sessions_count DESC - LIMIT %(eventThresholdNumberInGroup)s)""") - projection_query.append(f"""SELECT event_number_in_session, + steps_query = [] + # This is used if data.hideExcess is True + projection_query = [] + drop_query = [] + top_query = [] + top_with_next_query = [] + other_query = [] + for i in range(1, data.density + (0 if data.hide_excess else 1)): + steps_query.append(f"""n{i} AS (SELECT event_number_in_session, + event_type, + e_value, + next_type, + next_value, + COUNT(1) AS sessions_count + FROM ranked_events + WHERE event_number_in_session = {i} + GROUP BY event_number_in_session, event_type, e_value, next_type, next_value + ORDER BY sessions_count DESC)""") + if data.hide_excess: + projection_query.append(f"""\ + SELECT event_number_in_session, event_type, e_value, next_type, next_value, sessions_count - FROM n{i}""") + FROM n{i} + WHERE 
isNotNull(next_type)""") + else: + top_query.append(f"""\ + SELECT event_number_in_session, + event_type, + e_value, + SUM(sessions_count) AS sessions_count + FROM n{i} + GROUP BY event_number_in_session, event_type, e_value + ORDER BY sessions_count DESC + LIMIT %(visibleRows)s""") + + if i < data.density: + drop_query.append(f"""SELECT event_number_in_session, + event_type, + e_value, + 'DROP' AS next_type, + NULL AS next_value, + sessions_count + FROM n{i} + WHERE isNull(n{i}.next_type)""") + if not data.hide_excess: + top_with_next_query.append(f"""\ + SELECT n{i}.* + FROM n{i} + INNER JOIN top_n + ON (n{i}.event_number_in_session = top_n.event_number_in_session + AND n{i}.event_type = top_n.event_type + AND n{i}.e_value = top_n.e_value)""") + + if i > 1 and not data.hide_excess: + other_query.append(f"""SELECT n{i}.* + FROM n{i} + WHERE (event_number_in_session, event_type, e_value) NOT IN + (SELECT event_number_in_session, event_type, e_value + FROM top_n + WHERE top_n.event_number_in_session = {i})""") with ch_client.ClickHouseClient(database="experimental") as ch: time_key = TimeUTC.now() _now = time() params = {"project_id": project_id, "startTimestamp": data.startTimestamp, "endTimestamp": data.endTimestamp, "density": data.density, - # This is ignored because UI will take care of it - # "eventThresholdNumberInGroup": 4 if data.hide_excess else 8, - "eventThresholdNumberInGroup": 8, + "visibleRows": data.rows, **extra_values} ch_query1 = f"""\ @@ -640,7 +661,7 @@ FROM pre_ranked_events;""" ch.execute(query=ch_query1, parameters=params) if time() - _now > 2: logger.warning(f">>>>>>>>>PathAnalysis long query EE ({int(time() - _now)}s)<<<<<<<<<") - logger.warning(ch.format(ch_query1, params)) + logger.warning(ch.format(query=ch_query1, parameters=params)) logger.warning("----------------------") _now = time() @@ -663,22 +684,115 @@ FROM ranked_events;""" ch.execute(query=ch_query2, parameters=params) if time() - _now > 2: 
logger.warning(f">>>>>>>>>PathAnalysis long query EE ({int(time() - _now)}s)<<<<<<<<<") - logger.warning(ch.format(ch_query2, params)) + logger.warning(ch.format(query=ch_query2, parameters=params)) logger.warning("----------------------") _now = time() + sub_cte = "" + if not data.hide_excess: + sub_cte = f""", + top_n AS ({"\nUNION ALL\n".join(top_query)}), + top_n_with_next AS ({"\nUNION ALL\n".join(top_with_next_query)}), + others_n AS ({"\nUNION ALL\n".join(other_query)})""" + projection_query = """\ + -- Top to Top: valid + SELECT top_n_with_next.* + FROM top_n_with_next + INNER JOIN top_n + ON (top_n_with_next.event_number_in_session + 1 = top_n.event_number_in_session + AND top_n_with_next.next_type = top_n.event_type + AND top_n_with_next.next_value = top_n.e_value) + UNION ALL + -- Top to Others: valid + SELECT top_n_with_next.event_number_in_session, + top_n_with_next.event_type, + top_n_with_next.e_value, + 'OTHER' AS next_type, + NULL AS next_value, + SUM(top_n_with_next.sessions_count) AS sessions_count + FROM top_n_with_next + WHERE (top_n_with_next.event_number_in_session + 1, top_n_with_next.next_type, top_n_with_next.next_value) IN + (SELECT others_n.event_number_in_session, others_n.event_type, others_n.e_value FROM others_n) + GROUP BY top_n_with_next.event_number_in_session, top_n_with_next.event_type, top_n_with_next.e_value + UNION ALL + -- Top go to Drop: valid + SELECT drop_n.event_number_in_session, + drop_n.event_type, + drop_n.e_value, + drop_n.next_type, + drop_n.next_value, + drop_n.sessions_count + FROM drop_n + INNER JOIN top_n ON (drop_n.event_number_in_session = top_n.event_number_in_session + AND drop_n.event_type = top_n.event_type + AND drop_n.e_value = top_n.e_value) + ORDER BY drop_n.event_number_in_session + UNION ALL + -- Others got to Drop: valid + SELECT others_n.event_number_in_session, + 'OTHER' AS event_type, + NULL AS e_value, + 'DROP' AS next_type, + NULL AS next_value, + SUM(others_n.sessions_count) AS sessions_count 
+ FROM others_n + WHERE isNull(others_n.next_type) + AND others_n.event_number_in_session < %(density)s + GROUP BY others_n.event_number_in_session, next_type, next_value + UNION ALL + -- Others got to Top:valid + SELECT others_n.event_number_in_session, + 'OTHER' AS event_type, + NULL AS e_value, + others_n.next_type, + others_n.next_value, + SUM(others_n.sessions_count) AS sessions_count + FROM others_n + WHERE isNotNull(others_n.next_type) + AND (others_n.event_number_in_session + 1, others_n.next_type, others_n.next_value) IN + (SELECT top_n.event_number_in_session, top_n.event_type, top_n.e_value FROM top_n) + GROUP BY others_n.event_number_in_session, others_n.next_type, others_n.next_value + UNION ALL + -- Others got to Others + SELECT others_n.event_number_in_session, + 'OTHER' AS event_type, + NULL AS e_value, + 'OTHER' AS next_type, + NULL AS next_value, + SUM(sessions_count) AS sessions_count + FROM others_n + WHERE isNotNull(others_n.next_type) + AND others_n.event_number_in_session < %(density)s + AND (others_n.event_number_in_session + 1, others_n.next_type, others_n.next_value) NOT IN + (SELECT event_number_in_session, event_type, e_value FROM top_n) + GROUP BY others_n.event_number_in_session""" + else: + projection_query.append("""\ + SELECT event_number_in_session, + event_type, + e_value, + next_type, + next_value, + sessions_count + FROM drop_n""") + projection_query = "\nUNION ALL\n".join(projection_query) + ch_query3 = f"""\ -WITH ranked_events AS (SELECT * - FROM ranked_events_{time_key}), - {",".join(steps_query)} -SELECT * -FROM ({" UNION ALL ".join(projection_query)}) AS chart_steps -ORDER BY event_number_in_session;""" + WITH ranked_events AS (SELECT * + FROM ranked_events_{time_key}), + {",\n".join(steps_query)}, + drop_n AS ({"\nUNION ALL\n".join(drop_query)}) + {sub_cte} + SELECT * + FROM ( + {projection_query} + ) AS chart_steps + ORDER BY event_number_in_session, sessions_count DESC;""" logger.debug("---------Q3-----------") rows = 
ch.execute(query=ch_query3, parameters=params) if time() - _now > 2: logger.warning(f">>>>>>>>>PathAnalysis long query EE ({int(time() - _now)}s)<<<<<<<<<") - logger.warning(ch.format(ch_query3, params)) + logger.warning(ch.format(query=ch_query3, parameters=params)) logger.warning("----------------------") return __transform_journey(rows=rows, reverse_path=reverse) diff --git a/api/schemas/schemas.py b/api/schemas/schemas.py index 809aca8e9..82104a41d 100644 --- a/api/schemas/schemas.py +++ b/api/schemas/schemas.py @@ -133,7 +133,7 @@ class _TimedSchema(BaseModel): class NotificationsViewSchema(_TimedSchema): - ids: List[int] = Field(default=[]) + ids: List[int] = Field(default_factory=list) startTimestamp: Optional[int] = Field(default=None) endTimestamp: Optional[int] = Field(default=None) @@ -545,7 +545,7 @@ class SessionSearchEventSchema2(BaseModel): operator: Union[SearchEventOperator, ClickEventExtraOperator] = Field(...) source: Optional[List[Union[ErrorSource, int, str]]] = Field(default=None) sourceOperator: Optional[MathOperator] = Field(default=None) - filters: Optional[List[RequestGraphqlFilterSchema]] = Field(default=[]) + filters: Optional[List[RequestGraphqlFilterSchema]] = Field(default_factory=list) _remove_duplicate_values = field_validator('value', mode='before')(remove_duplicate_values) _single_to_list_values = field_validator('value', mode='before')(single_to_list) @@ -579,7 +579,7 @@ class SessionSearchEventSchema2(BaseModel): class SessionSearchFilterSchema(BaseModel): is_event: Literal[False] = False - value: List[Union[IssueType, PlatformType, int, str]] = Field(default=[]) + value: List[Union[IssueType, PlatformType, int, str]] = Field(default_factory=list) type: FilterType = Field(...) operator: Union[SearchEventOperator, MathOperator] = Field(...) 
source: Optional[Union[ErrorSource, str]] = Field(default=None) @@ -658,8 +658,8 @@ Field(discriminator='is_event'), BeforeValidator(add_missing_is_event)] class SessionsSearchPayloadSchema(_TimedSchema, _PaginatedSchema): - events: List[SessionSearchEventSchema2] = Field(default=[], doc_hidden=True) - filters: List[GroupedFilterType] = Field(default=[]) + events: List[SessionSearchEventSchema2] = Field(default_factory=list, doc_hidden=True) + filters: List[GroupedFilterType] = Field(default_factory=list) sort: str = Field(default="startTs") order: SortOrderType = Field(default=SortOrderType.DESC) events_order: Optional[SearchEventOrder] = Field(default=SearchEventOrder.THEN) @@ -816,7 +816,7 @@ Field(discriminator='is_event')] class PathAnalysisSchema(_TimedSchema, _PaginatedSchema): density: int = Field(default=7) - filters: List[ProductAnalyticsFilter] = Field(default=[]) + filters: List[ProductAnalyticsFilter] = Field(default_factory=list) type: Optional[str] = Field(default=None) _transform_filters = field_validator('filters', mode='before') \ @@ -928,10 +928,10 @@ class CardSessionsSchema(_TimedSchema, _PaginatedSchema): startTimestamp: int = Field(default=TimeUTC.now(-7)) endTimestamp: int = Field(default=TimeUTC.now()) density: int = Field(default=7, ge=1, le=200) - series: List[CardSeriesSchema] = Field(default=[]) + series: List[CardSeriesSchema] = Field(default_factory=list) - # events: List[SessionSearchEventSchema2] = Field(default=[], doc_hidden=True) - filters: List[GroupedFilterType] = Field(default=[]) + # events: List[SessionSearchEventSchema2] = Field(default_factory=list, doc_hidden=True) + filters: List[GroupedFilterType] = Field(default_factory=list) compare_to: Optional[List[str]] = Field(default=None) @@ -1037,9 +1037,11 @@ class __CardSchema(CardSessionsSchema): view_type: Any metric_type: MetricType = Field(...) 
metric_of: Any - metric_value: List[IssueType] = Field(default=[]) + metric_value: List[IssueType] = Field(default_factory=list) # This is used to save the selected session for heatmaps session_id: Optional[int] = Field(default=None) + # This is used to specify the number of top values for PathAnalysis + rows: int = Field(default=3, ge=1, le=10) @computed_field @property @@ -1185,14 +1187,15 @@ class CardPathAnalysis(__CardSchema): metric_type: Literal[MetricType.PATH_ANALYSIS] metric_of: MetricOfPathAnalysis = Field(default=MetricOfPathAnalysis.session_count) view_type: MetricOtherViewType = Field(...) - metric_value: List[ProductAnalyticsSelectedEventType] = Field(default=[]) + metric_value: List[ProductAnalyticsSelectedEventType] = Field(default_factory=list) density: int = Field(default=4, ge=2, le=10) + rows: int = Field(default=3, ge=1, le=10) start_type: Literal["start", "end"] = Field(default="start") - start_point: List[PathAnalysisSubFilterSchema] = Field(default=[]) - excludes: List[PathAnalysisSubFilterSchema] = Field(default=[]) + start_point: List[PathAnalysisSubFilterSchema] = Field(default_factory=list) + excludes: List[PathAnalysisSubFilterSchema] = Field(default_factory=list) - series: List[CardPathAnalysisSeriesSchema] = Field(default=[]) + series: List[CardPathAnalysisSeriesSchema] = Field(default_factory=list) @model_validator(mode="before") @classmethod @@ -1258,13 +1261,13 @@ class ProjectConditions(BaseModel): condition_id: Optional[int] = Field(default=None) name: str = Field(...) 
capture_rate: int = Field(..., ge=0, le=100) - filters: List[GroupedFilterType] = Field(default=[]) + filters: List[GroupedFilterType] = Field(default_factory=list) class ProjectSettings(BaseModel): rate: int = Field(..., ge=0, le=100) conditional_capture: bool = Field(default=False) - conditions: List[ProjectConditions] = Field(default=[]) + conditions: List[ProjectConditions] = Field(default_factory=list) class CreateDashboardSchema(BaseModel): @@ -1272,7 +1275,7 @@ class CreateDashboardSchema(BaseModel): description: Optional[str] = Field(default='') is_public: bool = Field(default=False) is_pinned: bool = Field(default=False) - metrics: Optional[List[int]] = Field(default=[]) + metrics: Optional[List[int]] = Field(default_factory=list) class EditDashboardSchema(CreateDashboardSchema): @@ -1281,7 +1284,7 @@ class EditDashboardSchema(CreateDashboardSchema): class UpdateWidgetPayloadSchema(BaseModel): - config: dict = Field(default={}) + config: dict = Field(default_factory=dict) class AddWidgetToDashboardPayloadSchema(UpdateWidgetPayloadSchema): @@ -1379,7 +1382,7 @@ class IntegrationType(str, Enum): class SearchNoteSchema(_PaginatedSchema): sort: str = Field(default="createdAt") order: SortOrderType = Field(default=SortOrderType.DESC) - tags: Optional[List[str]] = Field(default=[]) + tags: Optional[List[str]] = Field(default_factory=list) shared_only: bool = Field(default=False) mine_only: bool = Field(default=False) search: Optional[str] = Field(default=None) @@ -1426,8 +1429,8 @@ class _HeatMapSearchEventRaw(SessionSearchEventSchema2): class HeatMapSessionsSearch(SessionsSearchPayloadSchema): - events: Optional[List[_HeatMapSearchEventRaw]] = Field(default=[]) - filters: List[Union[SessionSearchFilterSchema, _HeatMapSearchEventRaw]] = Field(default=[]) + events: Optional[List[_HeatMapSearchEventRaw]] = Field(default_factory=list) + filters: List[Union[SessionSearchFilterSchema, _HeatMapSearchEventRaw]] = Field(default_factory=list) 
@model_validator(mode="before") @classmethod @@ -1442,14 +1445,14 @@ class HeatMapSessionsSearch(SessionsSearchPayloadSchema): class HeatMapFilterSchema(BaseModel): - value: List[Literal[IssueType.CLICK_RAGE, IssueType.DEAD_CLICK]] = Field(default=[]) + value: List[Literal[IssueType.CLICK_RAGE, IssueType.DEAD_CLICK]] = Field(default_factory=list) type: Literal[FilterType.ISSUE] = Field(...) operator: Literal[SearchEventOperator.IS, MathOperator.EQUAL] = Field(...) class GetHeatMapPayloadSchema(_TimedSchema): url: Optional[str] = Field(default=None) - filters: List[HeatMapFilterSchema] = Field(default=[]) + filters: List[HeatMapFilterSchema] = Field(default_factory=list) click_rage: bool = Field(default=False) operator: Literal[SearchEventOperator.IS, SearchEventOperator.STARTS_WITH, SearchEventOperator.CONTAINS, SearchEventOperator.ENDS_WITH] = Field(default=SearchEventOperator.STARTS_WITH) @@ -1470,7 +1473,7 @@ class FeatureFlagVariant(BaseModel): class FeatureFlagConditionFilterSchema(BaseModel): is_event: Literal[False] = False type: FilterType = Field(...) - value: List[str] = Field(default=[], min_length=1) + value: List[str] = Field(default_factory=list, min_length=1) operator: Union[SearchEventOperator, MathOperator] = Field(...) source: Optional[str] = Field(default=None) sourceOperator: Optional[Union[SearchEventOperator, MathOperator]] = Field(default=None) @@ -1486,7 +1489,7 @@ class FeatureFlagCondition(BaseModel): condition_id: Optional[int] = Field(default=None) name: str = Field(...) 
rollout_percentage: Optional[int] = Field(default=0) - filters: List[FeatureFlagConditionFilterSchema] = Field(default=[]) + filters: List[FeatureFlagConditionFilterSchema] = Field(default_factory=list) class SearchFlagsSchema(_PaginatedSchema): @@ -1513,8 +1516,8 @@ class FeatureFlagSchema(BaseModel): flag_type: FeatureFlagType = Field(default=FeatureFlagType.SINGLE_VARIANT) is_persist: Optional[bool] = Field(default=False) is_active: Optional[bool] = Field(default=True) - conditions: List[FeatureFlagCondition] = Field(default=[], min_length=1) - variants: List[FeatureFlagVariant] = Field(default=[]) + conditions: List[FeatureFlagCondition] = Field(default_factory=list, min_length=1) + variants: List[FeatureFlagVariant] = Field(default_factory=list) class ModuleType(str, Enum):