Dev (#2926)

* fix(chalice): fixed path-finder first step-type issue
* refactor(chalice): removed time between steps in path-finder

parent 80462e4534 · commit 2291980a89
4 changed files with 28 additions and 36 deletions
@@ -18,10 +18,7 @@ def __transform_journey(rows, reverse_path=False):
             break
         number_of_step1 += 1
         total_100p += r["sessions_count"]
-    # for i in range(number_of_step1):
-    #     rows[i]["value"] = 100 / number_of_step1

-    # for i in range(number_of_step1, len(rows)):
     for i in range(len(rows)):
         rows[i]["value"] = rows[i]["sessions_count"] * 100 / total_100p
@@ -32,22 +29,17 @@ def __transform_journey(rows, reverse_path=False):
         source = f"{r['event_number_in_session']}_{r['event_type']}_{r['e_value']}"
         if source not in nodes:
             nodes.append(source)
-            nodes_values.append({"name": r['e_value'], "eventType": r['event_type'],
-                                 "avgTimeFromPrevious": 0, "sessionsCount": 0})
+            nodes_values.append({"name": r['e_value'], "eventType": r['event_type']})
         if r['next_value']:
             target = f"{r['event_number_in_session'] + 1}_{r['next_type']}_{r['next_value']}"
             if target not in nodes:
                 nodes.append(target)
-                nodes_values.append({"name": r['next_value'], "eventType": r['next_type'],
-                                     "avgTimeFromPrevious": 0, "sessionsCount": 0})
+                nodes_values.append({"name": r['next_value'], "eventType": r['next_type']})

             sr_idx = nodes.index(source)
             tg_idx = nodes.index(target)
-            if r["avg_time_from_previous"] is not None:
-                nodes_values[tg_idx]["avgTimeFromPrevious"] += r["avg_time_from_previous"] * r["sessions_count"]
-                nodes_values[tg_idx]["sessionsCount"] += r["sessions_count"]
-            link = {"eventType": r['event_type'], "sessionsCount": r["sessions_count"],
-                    "value": r["value"], "avgTimeFromPrevious": r["avg_time_from_previous"]}
+            link = {"eventType": r['event_type'], "sessionsCount": r["sessions_count"], "value": r["value"]}
             if not reverse_path:
                 link["source"] = sr_idx
                 link["target"] = tg_idx
@@ -55,12 +47,6 @@ def __transform_journey(rows, reverse_path=False):
                 link["source"] = tg_idx
                 link["target"] = sr_idx
             links.append(link)
-    for n in nodes_values:
-        if n["sessionsCount"] > 0:
-            n["avgTimeFromPrevious"] = n["avgTimeFromPrevious"] / n["sessionsCount"]
-        else:
-            n["avgTimeFromPrevious"] = None
-        n.pop("sessionsCount")

     return {"nodes": nodes_values,
             "links": sorted(links, key=lambda x: (x["source"], x["target"]), reverse=False)}
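For context, a minimal sketch of what the simplified transform now returns, on hypothetical input rows (both the per-node and per-link timing fields are gone after this commit):

    # Hypothetical rows after the percentage pass shown above
    # (value = sessions_count * 100 / total_100p)
    rows = [
        {"event_number_in_session": 1, "event_type": "LOCATION", "e_value": "/home",
         "next_type": "CLICK", "next_value": "signup", "sessions_count": 80, "value": 80.0},
        {"event_number_in_session": 1, "event_type": "LOCATION", "e_value": "/pricing",
         "next_type": None, "next_value": None, "sessions_count": 20, "value": 20.0},
    ]
    # __transform_journey(rows) now returns roughly:
    # {"nodes": [{"name": "/home", "eventType": "LOCATION"},
    #            {"name": "signup", "eventType": "CLICK"},
    #            {"name": "/pricing", "eventType": "LOCATION"}],
    #  "links": [{"eventType": "LOCATION", "sessionsCount": 80, "value": 80.0,
    #             "source": 0, "target": 1}]}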
@@ -27,7 +27,7 @@ JOURNEY_TYPES = {
 # query: Q5, the result is correct,
 # startPoints are computed before ranked_events to reduce the number of window functions over rows
 # replaced time_to_target by time_from_previous
-# compute avg_time_from_previous at the same level as sessions_count
+# compute avg_time_from_previous at the same level as sessions_count (this was removed in v1.22)
 # sort by top 5 according to sessions_count at the CTE level
 # final part project data without grouping
 # if start-point is selected, the selected event is ranked n°1
@@ -35,15 +35,29 @@ def path_analysis(project_id: int, data: schemas.CardPathAnalysis):
     sub_events = []
     start_points_conditions = []
     step_0_conditions = []
+    step_1_post_conditions = ["event_number_in_session <= %(density)s"]

     if len(data.metric_value) == 0:
         data.metric_value.append(schemas.ProductAnalyticsSelectedEventType.LOCATION)
         sub_events.append({"column": JOURNEY_TYPES[schemas.ProductAnalyticsSelectedEventType.LOCATION]["column"],
                            "eventType": schemas.ProductAnalyticsSelectedEventType.LOCATION.value})
     else:
+        if len(data.start_point) > 0:
+            extra_metric_values = []
+            for s in data.start_point:
+                if s.type not in data.metric_value:
+                    sub_events.append({"column": JOURNEY_TYPES[s.type]["column"],
+                                       "eventType": JOURNEY_TYPES[s.type]["eventType"]})
+                    step_1_post_conditions.append(
+                        f"(event_type!='{JOURNEY_TYPES[s.type]["eventType"]}' OR event_number_in_session = 1)")
+                    extra_metric_values.append(s.type)
+            data.metric_value += extra_metric_values
+
         for v in data.metric_value:
             if JOURNEY_TYPES.get(v):
                 sub_events.append({"column": JOURNEY_TYPES[v]["column"],
                                    "eventType": JOURNEY_TYPES[v]["eventType"]})

     if len(sub_events) == 1:
         main_column = sub_events[0]['column']
     else:
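A minimal sketch of what the new step_1_post_conditions guard builds, assuming a hypothetical CLICK start point on a card that otherwise tracks only LOCATION events; the start-point type joins the queried events but may only match at step 1, which is the first-step-type issue this commit fixes:

    # Hypothetical start-point type that is not already in data.metric_value
    step_1_post_conditions = ["event_number_in_session <= %(density)s"]
    start_point_event_type = "CLICK"
    step_1_post_conditions.append(
        f"(event_type!='{start_point_event_type}' OR event_number_in_session = 1)")

    # Q1 later joins these into a single WHERE clause (see the @@ -382,7 hunk below):
    print(" AND ".join(step_1_post_conditions))
    # event_number_in_session <= %(density)s AND (event_type!='CLICK' OR event_number_in_session = 1)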
@@ -317,7 +331,6 @@ def path_analysis(project_id: int, data: schemas.CardPathAnalysis):
                                    e_value,
                                    next_type,
                                    next_value,
-                                   AVG(time_from_previous) AS avg_time_from_previous,
                                    COUNT(1) AS sessions_count
                             FROM ranked_events
                             WHERE event_number_in_session = 1
@@ -330,8 +343,7 @@ def path_analysis(project_id: int, data: schemas.CardPathAnalysis):
                                  e_value,
                                  next_type,
                                  next_value,
-                                 sessions_count,
-                                 avg_time_from_previous
+                                 sessions_count
                           FROM n1"""]
     for i in range(2, data.density + 1):
         steps_query.append(f"""n{i} AS (SELECT *
@@ -340,7 +352,6 @@ def path_analysis(project_id: int, data: schemas.CardPathAnalysis):
                                      re.e_value AS e_value,
                                      re.next_type AS next_type,
                                      re.next_value AS next_value,
-                                     AVG(re.time_from_previous) AS avg_time_from_previous,
                                      COUNT(1) AS sessions_count
                               FROM n{i - 1} INNER JOIN ranked_events AS re
                                    ON (n{i - 1}.next_value = re.e_value AND n{i - 1}.next_type = re.event_type)
@@ -353,8 +364,7 @@ def path_analysis(project_id: int, data: schemas.CardPathAnalysis):
                                  e_value,
                                  next_type,
                                  next_value,
-                                 sessions_count,
-                                 avg_time_from_previous
+                                 sessions_count
                           FROM n{i}""")

     with ch_client.ClickHouseClient(database="experimental") as ch:
@@ -382,7 +392,7 @@ WITH {initial_sessions_cte}
                             FROM {main_events_table} {"INNER JOIN sub_sessions ON (sub_sessions.session_id = events.session_id)" if len(sessions_conditions) > 0 else ""}
                             WHERE {" AND ".join(ch_sub_query)}
                            ) AS full_ranked_events
-                      WHERE event_number_in_session <= %(density)s)
+                      WHERE {" AND ".join(step_1_post_conditions)})
 SELECT *
 FROM pre_ranked_events;"""
     logger.debug("---------Q1-----------")
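Note that the density cap Q1 used to hard-code here is now the first element of step_1_post_conditions, so the joined WHERE keeps both the density limit and any per-start-point type guards built above.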
@@ -404,11 +414,7 @@ WITH pre_ranked_events AS (SELECT *
                                        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS next_value,
                            leadInFrame(toNullable(event_type))
                                   OVER (PARTITION BY session_id ORDER BY datetime {path_direction}
-                                       ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS next_type,
-                           abs(lagInFrame(toNullable(datetime))
-                                  OVER (PARTITION BY session_id ORDER BY datetime {path_direction}
-                                       ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
-                               - pre_ranked_events.datetime) AS time_from_previous
+                                       ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS next_type
                     FROM start_points INNER JOIN pre_ranked_events USING (session_id))
 SELECT *
 FROM ranked_events;"""
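With time between steps removed, the lagInFrame expression that computed time_from_previous disappears from ranked_events as well, consistent with dropping AVG(time_from_previous) and avg_time_from_previous from the step CTEs above.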
@@ -6,7 +6,6 @@ from queue import Queue, Empty

 import clickhouse_connect
 from clickhouse_connect.driver.query import QueryContext
-from clickhouse_connect.driver.exceptions import DatabaseError
 from decouple import config

 logger = logging.getLogger(__name__)
@@ -32,9 +31,10 @@ if config("CH_COMPRESSION", cast=bool, default=True):
     extra_args["compression"] = "lz4"


-def transform_result(original_function):
+def transform_result(self, original_function):
     @wraps(original_function)
     def wrapper(*args, **kwargs):
+        logger.debug(self.format(query=kwargs.get("query"), parameters=kwargs.get("parameters")))
         result = original_function(*args, **kwargs)
         if isinstance(result, clickhouse_connect.driver.query.QueryResult):
             column_names = result.column_names
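transform_result is now bound with the owning client instance so the wrapper can use its format() helper to log the query before executing it. A standalone sketch of the pattern, with a simplified stand-in client (the class and its method bodies below are illustrative assumptions, not the real clickhouse_connect API):

    import logging
    from functools import wraps

    logger = logging.getLogger(__name__)

    class FakeClient:
        """Simplified stand-in for ClickHouseClient."""

        def format(self, query, parameters):
            # mimics parameter interpolation for readable debug output
            return f"{query} -- parameters: {parameters}"

        def query(self, query=None, parameters=None):
            return [{"ok": True}]  # stand-in for clickhouse_connect's query()

    def transform_result(self, original_function):
        @wraps(original_function)
        def wrapper(*args, **kwargs):
            # log the formatted query through the owning client, then execute
            logger.debug(self.format(query=kwargs.get("query"), parameters=kwargs.get("parameters")))
            return original_function(*args, **kwargs)
        return wrapper

    client = FakeClient()
    client.execute = transform_result(client, client.query)  # same wiring as in ClickHouseClient
    client.execute(query="SELECT 1", parameters={})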
@@ -140,7 +140,7 @@ class ClickHouseClient:
         else:
             self.__client = CH_pool.get_connection()

-        self.__client.execute = transform_result(self.__client.query)
+        self.__client.execute = transform_result(self, self.__client.query)
         self.__client.format = self.format

     def __enter__(self):
@@ -1209,10 +1209,10 @@ class CardPathAnalysis(__CardSchema):
             if len(s.value) == 0:
                 continue
             start_point.append(s)
-            self.metric_value.append(s.type)
+            # self.metric_value.append(s.type)

         self.start_point = start_point
-        self.metric_value = remove_duplicate_values(self.metric_value)
+        # self.metric_value = remove_duplicate_values(self.metric_value)

         return self
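With the validator no longer mutating metric_value, merging start-point types into the queried events happens once, at query-build time in path_analysis, where it can be paired with the step-1-only guard instead of leaking the start-point event type into every step of the path.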