feat(chalice): changed resources_by_party to use requests only, instead of fetch & script resources

feat(chalice): fixed ClickHouse client helper timeout
This commit is contained in:
Taha Yassine Kraiem 2022-08-24 18:06:38 +01:00
parent 8c7c25e7cd
commit 261595f075
4 changed files with 44 additions and 33 deletions

View file

@ -2135,44 +2135,44 @@ def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1)
pg_sub_query_subset = __get_constraints(project_id=project_id, time_constraint=True,
chart=False, data=args)
pg_sub_query_chart = __get_constraints(project_id=project_id, time_constraint=False, project=False,
chart=True, data=args, main_table="resources", time_column="timestamp",
chart=True, data=args, main_table="requests", time_column="timestamp",
duration=False)
pg_sub_query_subset.append("resources.timestamp >= %(startTimestamp)s")
pg_sub_query_subset.append("resources.timestamp < %(endTimestamp)s")
pg_sub_query_subset.append("resources.success = FALSE")
pg_sub_query_subset.append("requests.timestamp >= %(startTimestamp)s")
pg_sub_query_subset.append("requests.timestamp < %(endTimestamp)s")
# pg_sub_query_subset.append("resources.type IN ('fetch', 'script')")
pg_sub_query_subset.append("requests.success = FALSE")
with pg_client.PostgresClient() as cur:
pg_query = f"""WITH resources AS (
SELECT resources.url_host, timestamp
FROM events.resources
pg_query = f"""WITH requests AS (
SELECT requests.host, timestamp
FROM events_common.requests
INNER JOIN public.sessions USING (session_id)
WHERE {" AND ".join(pg_sub_query_subset)}
)
SELECT generated_timestamp AS timestamp,
SUM(CASE WHEN first.url_host = sub_resources.url_host THEN 1 ELSE 0 END) AS first_party,
SUM(CASE WHEN first.url_host != sub_resources.url_host THEN 1 ELSE 0 END) AS third_party
SUM(CASE WHEN first.host = sub_requests.host THEN 1 ELSE 0 END) AS first_party,
SUM(CASE WHEN first.host != sub_requests.host THEN 1 ELSE 0 END) AS third_party
FROM generate_series(%(startTimestamp)s, %(endTimestamp)s, %(step_size)s) AS generated_timestamp
LEFT JOIN (
SELECT resources.url_host,
COUNT(resources.session_id) AS count
FROM events.resources
SELECT requests.host,
COUNT(requests.session_id) AS count
FROM events_common.requests
INNER JOIN public.sessions USING (session_id)
WHERE sessions.project_id = '1'
AND resources.type IN ('fetch', 'script')
AND sessions.start_ts > (EXTRACT(EPOCH FROM now() - INTERVAL '31 days') * 1000)::BIGINT
AND sessions.start_ts < (EXTRACT(EPOCH FROM now()) * 1000)::BIGINT
AND resources.timestamp > (EXTRACT(EPOCH FROM now() - INTERVAL '31 days') * 1000)::BIGINT
AND resources.timestamp < (EXTRACT(EPOCH FROM now()) * 1000)::BIGINT
AND requests.timestamp > (EXTRACT(EPOCH FROM now() - INTERVAL '31 days') * 1000)::BIGINT
AND requests.timestamp < (EXTRACT(EPOCH FROM now()) * 1000)::BIGINT
AND sessions.duration>0
GROUP BY resources.url_host
GROUP BY requests.host
ORDER BY count DESC
LIMIT 1
) AS first ON (TRUE)
LEFT JOIN LATERAL (
SELECT resources.url_host
FROM resources
SELECT requests.host
FROM requests
WHERE {" AND ".join(pg_sub_query_chart)}
) AS sub_resources ON (TRUE)
) AS sub_requests ON (TRUE)
GROUP BY generated_timestamp
ORDER BY generated_timestamp;"""
cur.execute(cur.mogrify(pg_query, {"step_size": step_size,

View file

@ -2030,6 +2030,7 @@ def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1)
step_size = __get_step_size(startTimestamp, endTimestamp, density)
ch_sub_query = __get_basic_constraints(table_name="resources", round_start=True, data=args)
ch_sub_query.append("resources.success = 0")
ch_sub_query.append("resources.type IN ('fetch','script')")
sch_sub_query = ["rs.project_id =toUInt32(%(project_id)s)", "rs.type IN ('fetch','script')"]
meta_condition = __get_meta_constraint(args)
ch_sub_query += meta_condition
@ -2037,8 +2038,8 @@ def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1)
with ch_client.ClickHouseClient() as ch:
ch_query = f"""SELECT toUnixTimestamp(toStartOfInterval(sub_resources.datetime, INTERVAL %(step_size)s second)) * 1000 AS timestamp,
SUM(if(first.url_host = sub_resources.url_host, 1, 0)) AS first_party,
SUM(if(first.url_host = sub_resources.url_host, 0, 1)) AS third_party
SUM(first.url_host = sub_resources.url_host) AS first_party,
SUM(first.url_host != sub_resources.url_host) AS third_party
FROM
(
SELECT resources.datetime, resources.url_host

View file

@ -2041,31 +2041,31 @@ def get_resources_count_by_type(project_id, startTimestamp=TimeUTC.now(delta_day
def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1),
endTimestamp=TimeUTC.now(), density=7, **args):
raise Exception("not supported widget")
step_size = __get_step_size(startTimestamp, endTimestamp, density)
ch_sub_query = __get_basic_constraints(table_name="resources", round_start=True, data=args)
ch_sub_query.append("resources.success = 0")
sch_sub_query = ["rs.project_id =toUInt16(%(project_id)s)", "rs.type IN ('fetch','script')"]
ch_sub_query = __get_basic_constraints(table_name="requests", round_start=True, data=args)
ch_sub_query.append("requests.event_type='REQUEST'")
ch_sub_query.append("requests.success = 0")
sch_sub_query = ["rs.project_id =toUInt16(%(project_id)s)", "rs.event_type='REQUEST'"]
meta_condition = __get_meta_constraint(args)
ch_sub_query += meta_condition
# sch_sub_query += meta_condition
with ch_client.ClickHouseClient() as ch:
ch_query = f"""SELECT toUnixTimestamp(toStartOfInterval(sub_resources.datetime, INTERVAL %(step_size)s second)) * 1000 AS timestamp,
SUM(if(first.url_host = sub_resources.url_host, 1, 0)) AS first_party,
SUM(if(first.url_host = sub_resources.url_host, 0, 1)) AS third_party
ch_query = f"""SELECT toUnixTimestamp(toStartOfInterval(sub_requests.datetime, INTERVAL %(step_size)s second)) * 1000 AS timestamp,
SUM(first.url_host = sub_requests.url_host) AS first_party,
SUM(first.url_host != sub_requests.url_host) AS third_party
FROM
(
SELECT resources.datetime, resources.url_host
FROM resources {"INNER JOIN sessions_metadata USING(session_id)" if len(meta_condition) > 0 else ""}
SELECT requests.datetime, requests.url_host
FROM {sessions_helper.get_main_events_table(startTimestamp)} AS requests
WHERE {" AND ".join(ch_sub_query)}
) AS sub_resources
) AS sub_requests
CROSS JOIN
(
SELECT
rs.url_host,
COUNT(rs.session_id) AS count
FROM resources AS rs
FROM {sessions_helper.get_main_events_table(startTimestamp)} AS rs
WHERE {" AND ".join(sch_sub_query)}
GROUP BY rs.url_host
ORDER BY count DESC
@ -2073,6 +2073,11 @@ def get_resources_by_party(project_id, startTimestamp=TimeUTC.now(delta_days=-1)
) AS first
GROUP BY timestamp
ORDER BY timestamp;"""
print(ch.format(query=ch_query,
params={"step_size": step_size,
"project_id": project_id,
"startTimestamp": startTimestamp,
"endTimestamp": endTimestamp, **__get_constraint_values(args)}))
rows = ch.execute(query=ch_query,
params={"step_size": step_size,
"project_id": project_id,

View file

@ -1,8 +1,13 @@
import logging
import clickhouse_driver
from decouple import config
logging.basicConfig(level=config("LOGLEVEL", default=logging.INFO))
logging.getLogger('apscheduler').setLevel(config("LOGLEVEL", default=logging.INFO))
settings = None
if config('pg_timeout', cast=int, default=-1) <= 0:
if config('pg_timeout', cast=int, default=-1) > 0:
settings = {"max_execution_time": config('pg_timeout', cast=int)}