From 261595f075ab86406364523ef1bf60c5b6004e3e Mon Sep 17 00:00:00 2001 From: Taha Yassine Kraiem Date: Wed, 24 Aug 2022 18:06:38 +0100 Subject: [PATCH] feat(chalice): changed resources_by_party to use requests only instead of fetch&script resources feat(chalice): fixed clickhouse client helper timeout --- api/chalicelib/core/metrics.py | 38 +++++++++++++-------------- ee/api/chalicelib/core/metrics.py | 5 ++-- ee/api/chalicelib/core/metrics_new.py | 27 +++++++++++-------- ee/api/chalicelib/utils/ch_client.py | 7 ++++- 4 files changed, 44 insertions(+), 33 deletions(-) diff --git a/api/chalicelib/core/metrics.py b/api/chalicelib/core/metrics.py index 9a0d213cf..23eeb3410 100644 --- a/api/chalicelib/core/metrics.py +++ b/api/chalicelib/core/metrics.py @@ -2135,44 +2135,44 @@ def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1) pg_sub_query_subset = __get_constraints(project_id=project_id, time_constraint=True, chart=False, data=args) pg_sub_query_chart = __get_constraints(project_id=project_id, time_constraint=False, project=False, - chart=True, data=args, main_table="resources", time_column="timestamp", + chart=True, data=args, main_table="requests", time_column="timestamp", duration=False) - pg_sub_query_subset.append("resources.timestamp >= %(startTimestamp)s") - pg_sub_query_subset.append("resources.timestamp < %(endTimestamp)s") - pg_sub_query_subset.append("resources.success = FALSE") + pg_sub_query_subset.append("requests.timestamp >= %(startTimestamp)s") + pg_sub_query_subset.append("requests.timestamp < %(endTimestamp)s") + # pg_sub_query_subset.append("resources.type IN ('fetch', 'script')") + pg_sub_query_subset.append("requests.success = FALSE") with pg_client.PostgresClient() as cur: - pg_query = f"""WITH resources AS ( - SELECT resources.url_host, timestamp - FROM events.resources + pg_query = f"""WITH requests AS ( + SELECT requests.host, timestamp + FROM events_common.requests INNER JOIN public.sessions USING (session_id) WHERE {" AND ".join(pg_sub_query_subset)} ) SELECT generated_timestamp AS timestamp, - SUM(CASE WHEN first.url_host = sub_resources.url_host THEN 1 ELSE 0 END) AS first_party, - SUM(CASE WHEN first.url_host != sub_resources.url_host THEN 1 ELSE 0 END) AS third_party + SUM(CASE WHEN first.host = sub_requests.host THEN 1 ELSE 0 END) AS first_party, + SUM(CASE WHEN first.host != sub_requests.host THEN 1 ELSE 0 END) AS third_party FROM generate_series(%(startTimestamp)s, %(endTimestamp)s, %(step_size)s) AS generated_timestamp LEFT JOIN ( - SELECT resources.url_host, - COUNT(resources.session_id) AS count - FROM events.resources + SELECT requests.host, + COUNT(requests.session_id) AS count + FROM events_common.requests INNER JOIN public.sessions USING (session_id) WHERE sessions.project_id = '1' - AND resources.type IN ('fetch', 'script') AND sessions.start_ts > (EXTRACT(EPOCH FROM now() - INTERVAL '31 days') * 1000)::BIGINT AND sessions.start_ts < (EXTRACT(EPOCH FROM now()) * 1000)::BIGINT - AND resources.timestamp > (EXTRACT(EPOCH FROM now() - INTERVAL '31 days') * 1000)::BIGINT - AND resources.timestamp < (EXTRACT(EPOCH FROM now()) * 1000)::BIGINT + AND requests.timestamp > (EXTRACT(EPOCH FROM now() - INTERVAL '31 days') * 1000)::BIGINT + AND requests.timestamp < (EXTRACT(EPOCH FROM now()) * 1000)::BIGINT AND sessions.duration>0 - GROUP BY resources.url_host + GROUP BY requests.host ORDER BY count DESC LIMIT 1 ) AS first ON (TRUE) LEFT JOIN LATERAL ( - SELECT resources.url_host - FROM resources + SELECT requests.host + FROM requests WHERE {" AND ".join(pg_sub_query_chart)} - ) AS sub_resources ON (TRUE) + ) AS sub_requests ON (TRUE) GROUP BY generated_timestamp ORDER BY generated_timestamp;""" cur.execute(cur.mogrify(pg_query, {"step_size": step_size, diff --git a/ee/api/chalicelib/core/metrics.py b/ee/api/chalicelib/core/metrics.py index ac8afdfb7..92adacd62 100644 --- a/ee/api/chalicelib/core/metrics.py +++ b/ee/api/chalicelib/core/metrics.py @@ -2030,6 +2030,7 @@ def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1) step_size = __get_step_size(startTimestamp, endTimestamp, density) ch_sub_query = __get_basic_constraints(table_name="resources", round_start=True, data=args) ch_sub_query.append("resources.success = 0") + ch_sub_query.append("resources.type IN ('fetch','script')") sch_sub_query = ["rs.project_id =toUInt32(%(project_id)s)", "rs.type IN ('fetch','script')"] meta_condition = __get_meta_constraint(args) ch_sub_query += meta_condition @@ -2037,8 +2038,8 @@ def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1) with ch_client.ClickHouseClient() as ch: ch_query = f"""SELECT toUnixTimestamp(toStartOfInterval(sub_resources.datetime, INTERVAL %(step_size)s second)) * 1000 AS timestamp, - SUM(if(first.url_host = sub_resources.url_host, 1, 0)) AS first_party, - SUM(if(first.url_host = sub_resources.url_host, 0, 1)) AS third_party + SUM(first.url_host = sub_resources.url_host) AS first_party, + SUM(first.url_host != sub_resources.url_host) AS third_party FROM ( SELECT resources.datetime, resources.url_host diff --git a/ee/api/chalicelib/core/metrics_new.py b/ee/api/chalicelib/core/metrics_new.py index a4a2955ef..e6b12bf56 100644 --- a/ee/api/chalicelib/core/metrics_new.py +++ b/ee/api/chalicelib/core/metrics_new.py @@ -2041,31 +2041,31 @@ def get_resources_count_by_type(project_id, startTimestamp=TimeUTC.now(delta_day def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1), endTimestamp=TimeUTC.now(), density=7, **args): - raise Exception("not supported widget") step_size = __get_step_size(startTimestamp, endTimestamp, density) - ch_sub_query = __get_basic_constraints(table_name="resources", round_start=True, data=args) - ch_sub_query.append("resources.success = 0") - sch_sub_query = ["rs.project_id =toUInt16(%(project_id)s)", "rs.type IN ('fetch','script')"] + ch_sub_query = __get_basic_constraints(table_name="requests", round_start=True, data=args) + ch_sub_query.append("requests.event_type='REQUEST'") + ch_sub_query.append("requests.success = 0") + sch_sub_query = ["rs.project_id =toUInt16(%(project_id)s)", "rs.event_type='REQUEST'"] meta_condition = __get_meta_constraint(args) ch_sub_query += meta_condition # sch_sub_query += meta_condition with ch_client.ClickHouseClient() as ch: - ch_query = f"""SELECT toUnixTimestamp(toStartOfInterval(sub_resources.datetime, INTERVAL %(step_size)s second)) * 1000 AS timestamp, - SUM(if(first.url_host = sub_resources.url_host, 1, 0)) AS first_party, - SUM(if(first.url_host = sub_resources.url_host, 0, 1)) AS third_party + ch_query = f"""SELECT toUnixTimestamp(toStartOfInterval(sub_requests.datetime, INTERVAL %(step_size)s second)) * 1000 AS timestamp, + SUM(first.url_host = sub_requests.url_host) AS first_party, + SUM(first.url_host != sub_requests.url_host) AS third_party FROM ( - SELECT resources.datetime, resources.url_host - FROM resources {"INNER JOIN sessions_metadata USING(session_id)" if len(meta_condition) > 0 else ""} + SELECT requests.datetime, requests.url_host + FROM {sessions_helper.get_main_events_table(startTimestamp)} AS requests WHERE {" AND ".join(ch_sub_query)} - ) AS sub_resources + ) AS sub_requests CROSS JOIN ( SELECT rs.url_host, COUNT(rs.session_id) AS count - FROM resources AS rs + FROM {sessions_helper.get_main_events_table(startTimestamp)} AS rs WHERE {" AND ".join(sch_sub_query)} GROUP BY rs.url_host ORDER BY count DESC @@ -2073,6 +2073,11 @@ def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1) ) AS first GROUP BY timestamp ORDER BY timestamp;""" + print(ch.format(query=ch_query, + params={"step_size": step_size, + "project_id": project_id, + "startTimestamp": startTimestamp, + "endTimestamp": endTimestamp, **__get_constraint_values(args)})) rows = ch.execute(query=ch_query, params={"step_size": step_size, "project_id": project_id, diff --git a/ee/api/chalicelib/utils/ch_client.py b/ee/api/chalicelib/utils/ch_client.py index 75e1dc063..a2d15ab17 100644 --- a/ee/api/chalicelib/utils/ch_client.py +++ b/ee/api/chalicelib/utils/ch_client.py @@ -1,8 +1,13 @@ +import logging + import clickhouse_driver from decouple import config +logging.basicConfig(level=config("LOGLEVEL", default=logging.INFO)) +logging.getLogger('apscheduler').setLevel(config("LOGLEVEL", default=logging.INFO)) + settings = None -if config('pg_timeout', cast=int, default=-1) <= 0: +if config('pg_timeout', cast=int, default=-1) > 0: settings = {"max_execution_time": config('pg_timeout', cast=int)}