Api lateral join search (#100)

* feat(api): fewer env-vars in chalice.yaml

* feat(api): lateral join for sessions search
* feat(api): fixed click-not-on

* feat(db): delta and indexes for lateral-join search
* feat(api): changed search to use lateral-join
* feat(api): optimized search for negative operator
This commit is contained in:
Kraiem Taha Yassine 2021-07-21 20:35:36 +02:00 committed by GitHub
parent 91203eec0c
commit 6952deeea8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 113 additions and 84 deletions

View file

@ -3,30 +3,29 @@ from chalicelib.core import events, sessions_metas, socket_ios, metadata, events
sessions_mobs, issues, projects, errors, resources, assist
SESSION_PROJECTION_COLS = """s.project_id,
s.session_id::text AS session_id,
s.user_uuid,
s.user_id,
s.user_agent,
s.user_os,
s.user_browser,
s.user_device,
s.user_device_type,
s.user_country,
s.start_ts,
s.duration,
s.events_count,
s.pages_count,
s.errors_count,
s.user_anonymous_id,
s.platform,
s.issue_score,
to_jsonb(s.issue_types) AS issue_types,
favorite_sessions.session_id NOTNULL AS favorite,
COALESCE((SELECT TRUE
FROM public.user_viewed_sessions AS fs
WHERE s.session_id = fs.session_id
AND fs.user_id = %(userId)s LIMIT 1), FALSE) AS viewed
"""
s.session_id::text AS session_id,
s.user_uuid,
s.user_id,
s.user_agent,
s.user_os,
s.user_browser,
s.user_device,
s.user_device_type,
s.user_country,
s.start_ts,
s.duration,
s.events_count,
s.pages_count,
s.errors_count,
s.user_anonymous_id,
s.platform,
s.issue_score,
to_jsonb(s.issue_types) AS issue_types,
favorite_sessions.session_id NOTNULL AS favorite,
COALESCE((SELECT TRUE
FROM public.user_viewed_sessions AS fs
WHERE s.session_id = fs.session_id
AND fs.user_id = %(userId)s LIMIT 1), FALSE) AS viewed """
def __group_metadata(session, project_metadata):
@ -120,7 +119,14 @@ new_line = "\n"
def __get_sql_operator(op):
op = op.lower()
return "=" if op == "is" or op == "on" else "!=" if op == "isnot" else "ILIKE" if op == "contains" else "NOT ILIKE" if op == "notcontains" else "="
return {
"is": "=",
"on": "=",
"isnot": "!=",
"noton": "!=",
"contains": "ILIKE",
"notcontains": "NOT ILIKE",
}.get(op, "=")
def __is_negation_operator(op):
@ -165,27 +171,30 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False
fav_only_join = "LEFT JOIN public.user_favorite_sessions AS fs ON fs.session_id = s.session_id"
extra_constraints.append(cur.mogrify("fs.user_id = %(userId)s", {"userId": user_id}))
events_query_part = ""
strict = True
if len(data.get("events", [])) > 0:
events_query_from = []
event_index = 0
for event in data["events"]:
# TODO: remove this when message_id is removed
seq_id = False
event_type = event["type"].upper()
if event.get("operator") is None:
event["operator"] = "is"
op = __get_sql_operator(event["operator"])
is_not = False
if __is_negation_operator(op) and event_index > 0:
if __is_negation_operator(op):
is_not = True
op = __reverse_sql_operator(op)
event_from = "%s INNER JOIN public.sessions AS ms USING (session_id)"
event_where = ["ms.project_id = %(projectId)s", "main.timestamp >= %(startDate)s",
"main.timestamp <= %(endDate)s", "ms.start_ts >= %(startDate)s",
"ms.start_ts <= %(endDate)s"]
if event_index == 0:
event_from = "%s INNER JOIN public.sessions AS ms USING (session_id)"
event_where = ["ms.project_id = %(projectId)s", "main.timestamp >= %(startDate)s",
"main.timestamp <= %(endDate)s", "ms.start_ts >= %(startDate)s",
"ms.start_ts <= %(endDate)s", "ms.duration IS NOT NULL"]
else:
event_from = "%s"
event_where = ["main.timestamp >= %(startDate)s", "main.timestamp <= %(endDate)s",
f"event_{event_index - 1}.timestamp <= main.timestamp",
"main.session_id=event_0.session_id"]
event_args = {"value": helper.string_to_sql_like_with_op(event['value'], op)}
if event_type not in list(events.SUPPORTED_TYPES.keys()) \
or event.get("value") in [None, "", "*"] \
@ -206,11 +215,9 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False
event_from = event_from % f"{events.event_type.LOCATION.table} AS main "
event_where.append(f"main.{events.event_type.LOCATION.column} {op} %(value)s")
elif event_type == events.event_type.CUSTOM.ui_type:
seq_id = True
event_from = event_from % f"{events.event_type.CUSTOM.table} AS main "
event_where.append(f"main.{events.event_type.CUSTOM.column} {op} %(value)s")
elif event_type == events.event_type.REQUEST.ui_type:
seq_id = True
event_from = event_from % f"{events.event_type.REQUEST.table} AS main "
event_where.append(f"main.{events.event_type.REQUEST.column} {op} %(value)s")
elif event_type == events.event_type.GRAPHQL.ui_type:
@ -234,12 +241,10 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False
# ----- IOS
elif event_type == events.event_type.CLICK_IOS.ui_type:
seq_id = True
event_from = event_from % f"{events.event_type.CLICK_IOS.table} AS main "
event_where.append(f"main.{events.event_type.CLICK_IOS.column} {op} %(value)s")
elif event_type == events.event_type.INPUT_IOS.ui_type:
seq_id = True
event_from = event_from % f"{events.event_type.INPUT_IOS.table} AS main "
event_where.append(f"main.{events.event_type.INPUT_IOS.column} {op} %(value)s")
@ -247,19 +252,15 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False
event_where.append("main.value ILIKE %(custom)s")
event_args["custom"] = helper.string_to_sql_like_with_op(event['custom'], "ILIKE")
elif event_type == events.event_type.VIEW_IOS.ui_type:
seq_id = True
event_from = event_from % f"{events.event_type.VIEW_IOS.table} AS main "
event_where.append(f"main.{events.event_type.VIEW_IOS.column} {op} %(value)s")
elif event_type == events.event_type.CUSTOM_IOS.ui_type:
seq_id = True
event_from = event_from % f"{events.event_type.CUSTOM_IOS.table} AS main "
event_where.append(f"main.{events.event_type.CUSTOM_IOS.column} {op} %(value)s")
elif event_type == events.event_type.REQUEST_IOS.ui_type:
seq_id = True
event_from = event_from % f"{events.event_type.REQUEST_IOS.table} AS main "
event_where.append(f"main.{events.event_type.REQUEST_IOS.column} {op} %(value)s")
elif event_type == events.event_type.ERROR_IOS.ui_type:
seq_id = True
event_from = event_from % f"{events.event_type.ERROR_IOS.table} AS main INNER JOIN public.crashes_ios AS main1 USING(crash_id)"
if event.get("value") not in [None, "*", ""]:
event_where.append(f"(main1.reason {op} %(value)s OR main1.name {op} %(value)s)")
@ -267,29 +268,50 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False
else:
continue
event_index += 1
if is_not:
event_from += f""" LEFT JOIN (SELECT session_id FROM {event_from} WHERE {" AND ".join(event_where)}) AS left_not USING (session_id)"""
event_where[-1] = "left_not.session_id ISNULL"
events_query_from.append(cur.mogrify(f"""\
if event_index == 0:
events_query_from.append(cur.mogrify(f"""\
(SELECT
session_id,
0 AS timestamp,
{event_index} AS funnel_step
FROM sessions
WHERE EXISTS(SELECT session_id
FROM {event_from}
WHERE {" AND ".join(event_where)}
AND sessions.session_id=ms.session_id) IS FALSE
AND project_id = %(projectId)s
AND start_ts >= %(startDate)s
AND start_ts <= %(endDate)s
AND duration IS NOT NULL
) AS event_{event_index} {"ON(TRUE)" if event_index > 0 else ""}\
""", {**generic_args, **event_args}).decode('UTF-8'))
else:
events_query_from.append(cur.mogrify(f"""\
(SELECT
main.session_id, {'seq_index' if seq_id else 'message_id %%%% 2147483647 AS seq_index'}, timestamp, {event_index} AS funnel_step
event_0.session_id,
event_{event_index - 1}.timestamp AS timestamp,
{event_index} AS funnel_step
WHERE EXISTS(SELECT session_id FROM {event_from} WHERE {" AND ".join(event_where)}) IS FALSE
) AS event_{event_index} {"ON(TRUE)" if event_index > 0 else ""}\
""", {**generic_args, **event_args}).decode('UTF-8'))
else:
events_query_from.append(cur.mogrify(f"""\
(SELECT main.session_id, MIN(timestamp) AS timestamp,{event_index} AS funnel_step
FROM {event_from}
WHERE {" AND ".join(event_where)}
)\
GROUP BY 1
) AS event_{event_index} {"ON(TRUE)" if event_index > 0 else ""}\
""", {**generic_args, **event_args}).decode('UTF-8'))
if len(events_query_from) > 0:
events_query_part = f"""\
SELECT
session_id, MIN(timestamp) AS first_event_ts, MAX(timestamp) AS last_event_ts
FROM
({(" UNION ALL ").join(events_query_from)}) AS f_query
GROUP BY 1
{"" if event_index < 2 else f"HAVING events.funnel(array_agg(funnel_step ORDER BY timestamp,seq_index ASC), {event_index})" if strict
else f"HAVING array_length(array_agg(DISTINCT funnel_step), 1) = {len(data['events'])}"}
{fav_only_join}
"""
event_index += 1
if event_index > 0:
events_query_part = f"""SELECT
event_0.session_id,
MIN(event_0.timestamp) AS first_event_ts,
MAX(event_{event_index - 1}.timestamp) AS last_event_ts
FROM {(" INNER JOIN LATERAL ").join(events_query_from)}
GROUP BY 1
{fav_only_join}"""
else:
data["events"] = []
@ -423,8 +445,7 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False
{" AND ".join(extra_constraints)}"""
if errors_only:
main_query = cur.mogrify(f"""\
SELECT DISTINCT er.error_id, ser.status, ser.parent_error_id, ser.payload,
main_query = cur.mogrify(f"""SELECT DISTINCT er.error_id, ser.status, ser.parent_error_id, ser.payload,
COALESCE((SELECT TRUE
FROM public.user_favorite_sessions AS fs
WHERE s.session_id = fs.session_id
@ -437,13 +458,12 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False
generic_args)
elif count_only:
main_query = cur.mogrify(f"""\
SELECT COUNT(DISTINCT s.session_id) AS count_sessions, COUNT(DISTINCT s.user_uuid) AS count_users
main_query = cur.mogrify(
f"""SELECT COUNT(DISTINCT s.session_id) AS count_sessions, COUNT(DISTINCT s.user_uuid) AS count_users
{query_part};""",
generic_args)
generic_args)
else:
main_query = cur.mogrify(f"""\
SELECT * FROM
main_query = cur.mogrify(f"""SELECT * FROM
(SELECT DISTINCT ON(s.session_id) {SESSION_PROJECTION_COLS}
{query_part}
ORDER BY s.session_id desc) AS filtred_sessions

View file

@ -1,7 +1,3 @@
BEGIN;
CREATE INDEX pages_first_contentful_paint_time_idx ON events.pages (first_contentful_paint_time) WHERE first_contentful_paint_time>0;
CREATE INDEX pages_dom_content_loaded_time_idx ON events.pages (dom_content_loaded_time) WHERE dom_content_loaded_time>0;
CREATE INDEX pages_first_paint_time_idx ON events.pages (first_paint_time) WHERE first_paint_time > 0;
CREATE INDEX pages_ttfb_idx ON events.pages (ttfb) WHERE ttfb > 0;
CREATE INDEX pages_time_to_interactive_idx ON events.pages (time_to_interactive) WHERE time_to_interactive > 0;
COMMIT;

View file

@ -1,4 +1,11 @@
BEGIN ;
BEGIN;
CREATE INDEX sessions_session_id_project_id_start_ts_durationNN_idx ON sessions (session_id, project_id, start_ts) WHERE duration IS NOT NULL;
CREATE INDEX clicks_label_session_id_timestamp_idx ON events.clicks (label,session_id,timestamp);
CREATE INDEX pages_base_path_session_id_timestamp_idx ON events.pages (base_path,session_id,timestamp);
CREATE INDEX ON unstarted_sessions(project_id);
CREATE INDEX ON assigned_sessions(session_id);
CREATE INDEX ON technical_info(session_id);
CREATE INDEX inputs_label_session_id_timestamp_idx ON events.inputs (label,session_id,timestamp);
CREATE INDEX clicks_url_idx ON events.clicks (url);
CREATE INDEX clicks_url_gin_idx ON events.clicks USING GIN (url gin_trgm_ops);

View file

@ -534,6 +534,8 @@ CREATE INDEX sessions_user_anonymous_id_gin_idx ON public.sessions USING GIN (us
CREATE INDEX sessions_user_country_gin_idx ON public.sessions (project_id, user_country);
CREATE INDEX ON sessions (project_id, user_country);
CREATE INDEX ON sessions (project_id, user_browser);
CREATE INDEX sessions_session_id_project_id_start_ts_durationNN_idx ON sessions (session_id, project_id, start_ts) WHERE duration IS NOT NULL;
ALTER TABLE public.sessions
ADD CONSTRAINT web_browser_constraint CHECK ( (sessions.platform = 'web' AND sessions.user_browser NOTNULL) OR
@ -574,6 +576,7 @@ create table assigned_sessions
created_at timestamp default timezone('utc'::text, now()) NOT NULL,
provider_data jsonb default '{}'::jsonb NOT NULL
);
CREATE INDEX ON assigned_sessions(session_id);
-- --- events_common.sql ---
@ -677,6 +680,7 @@ CREATE INDEX pages_path_idx ON events.pages (path);
CREATE INDEX pages_visually_complete_idx ON events.pages (visually_complete) WHERE visually_complete > 0;
CREATE INDEX pages_dom_building_time_idx ON events.pages (dom_building_time) WHERE dom_building_time > 0;
CREATE INDEX pages_load_time_idx ON events.pages (load_time) WHERE load_time > 0;
CREATE INDEX pages_base_path_session_id_timestamp_idx ON events.pages (base_path,session_id,timestamp);
CREATE TABLE events.clicks
@ -691,6 +695,7 @@ CREATE INDEX ON events.clicks (session_id);
CREATE INDEX ON events.clicks (label);
CREATE INDEX clicks_label_gin_idx ON events.clicks USING GIN (label gin_trgm_ops);
CREATE INDEX ON events.clicks (timestamp);
CREATE INDEX clicks_label_session_id_timestamp_idx ON events.clicks (label,session_id,timestamp);
CREATE INDEX clicks_url_idx ON events.clicks (url);
CREATE INDEX clicks_url_gin_idx ON events.clicks USING GIN (url gin_trgm_ops);
CREATE INDEX clicks_url_session_id_timestamp_selector_idx ON events.clicks (url, session_id, timestamp,selector);
@ -710,6 +715,7 @@ CREATE INDEX ON events.inputs (label, value);
CREATE INDEX inputs_label_gin_idx ON events.inputs USING GIN (label gin_trgm_ops);
CREATE INDEX inputs_label_idx ON events.inputs (label);
CREATE INDEX ON events.inputs (timestamp);
CREATE INDEX inputs_label_session_id_timestamp_idx ON events.inputs (label,session_id,timestamp);
CREATE TABLE events.errors
(

View file

@ -20,7 +20,6 @@ resources:
cpu: 1m
memory: 1Mi
env:
AWS_DEFAULT_REGION: us-east-1
pg_host: postgresql.db.svc.cluster.local
pg_port: 5432
pg_dbname: postgres
@ -28,19 +27,8 @@ env:
pg_password: asayerPostgres
ch_host: clickhouse.db.svc.cluster.local
ch_port: 9000
alert_ntf: http://127.0.0.1:8000/async/alerts/notifications/%s
email_signup: http://127.0.0.1:8000/async/email_signup/%s
email_funnel: http://127.0.0.1:8000/async/funnel/%s
email_plans: http://127.0.0.1:8000/async/plans/%s
email_basic: http://127.0.0.1:8000/async/basic/%s
assign_link: http://127.0.0.1:8000/async/email_assignment
captcha_server: ''
captcha_key: ''
sessions_bucket: mobs
sessions_region: us-east-1
put_S3_TTL: '20'
sourcemaps_bucket: sourcemaps
js_cache_bucket: sessions-assets
async_Token: ''
EMAIL_HOST: ''
EMAIL_PORT: '587'

View file

@ -1,4 +1,11 @@
BEGIN ;
BEGIN;
CREATE INDEX sessions_session_id_project_id_start_ts_durationNN_idx ON sessions (session_id, project_id, start_ts) WHERE duration IS NOT NULL;
CREATE INDEX clicks_label_session_id_timestamp_idx ON events.clicks (label,session_id,timestamp);
CREATE INDEX pages_base_path_session_id_timestamp_idx ON events.pages (base_path,session_id,timestamp);
CREATE INDEX ON unstarted_sessions(project_id);
CREATE INDEX ON assigned_sessions(session_id);
CREATE INDEX ON technical_info(session_id);
CREATE INDEX inputs_label_session_id_timestamp_idx ON events.inputs (label,session_id,timestamp);
ALTER TABLE events.clicks ADD COLUMN
url text DEFAULT '' NOT NULL;

View file

@ -517,6 +517,7 @@ CREATE INDEX ON sessions (project_id, user_browser);
CREATE INDEX sessions_start_ts_idx ON public.sessions (start_ts) WHERE duration > 0;
CREATE INDEX sessions_project_id_idx ON public.sessions (project_id) WHERE duration > 0;
CREATE INDEX sessions_session_id_project_id_start_ts_idx ON sessions (session_id, project_id, start_ts) WHERE duration > 0;
CREATE INDEX sessions_session_id_project_id_start_ts_durationNN_idx ON sessions (session_id, project_id, start_ts) WHERE duration IS NOT NULL;
ALTER TABLE public.sessions
ADD CONSTRAINT web_browser_constraint CHECK ( (sessions.platform = 'web' AND sessions.user_browser NOTNULL) OR
@ -557,6 +558,7 @@ create table assigned_sessions
created_at timestamp default timezone('utc'::text, now()) NOT NULL,
provider_data jsonb default '{}'::jsonb NOT NULL
);
CREATE INDEX ON assigned_sessions(session_id);
-- --- events_common.sql ---
@ -672,6 +674,7 @@ CREATE INDEX pages_timestamp_metgt0_idx ON events.pages (timestamp) WHERE respon
time_to_interactive > 0;
CREATE INDEX pages_session_id_speed_indexgt0nn_idx ON events.pages (session_id, speed_index) WHERE speed_index > 0 AND speed_index IS NOT NULL;
CREATE INDEX pages_session_id_timestamp_dom_building_timegt0nn_idx ON events.pages (session_id, timestamp, dom_building_time) WHERE dom_building_time > 0 AND dom_building_time IS NOT NULL;
CREATE INDEX pages_base_path_session_id_timestamp_idx ON events.pages (base_path,session_id,timestamp);
CREATE TABLE events.clicks
@ -688,6 +691,7 @@ CREATE INDEX ON events.clicks (session_id);
CREATE INDEX ON events.clicks (label);
CREATE INDEX clicks_label_gin_idx ON events.clicks USING GIN (label gin_trgm_ops);
CREATE INDEX ON events.clicks (timestamp);
CREATE INDEX clicks_label_session_id_timestamp_idx ON events.clicks (label,session_id,timestamp);
CREATE INDEX clicks_url_idx ON events.clicks (url);
CREATE INDEX clicks_url_gin_idx ON events.clicks USING GIN (url gin_trgm_ops);
CREATE INDEX clicks_url_session_id_timestamp_selector_idx ON events.clicks (url, session_id, timestamp,selector);
@ -707,6 +711,7 @@ CREATE INDEX ON events.inputs (label, value);
CREATE INDEX inputs_label_gin_idx ON events.inputs USING GIN (label gin_trgm_ops);
CREATE INDEX inputs_label_idx ON events.inputs (label);
CREATE INDEX ON events.inputs (timestamp);
CREATE INDEX inputs_label_session_id_timestamp_idx ON events.inputs (label,session_id,timestamp);
CREATE TABLE events.errors
(