refactor(chalice): autocomplete for event-names

refactor(chalice): autocomplete for properties-names
refactor(chalice): autocomplete for properties-values
This commit is contained in:
Taha Yassine Kraiem 2025-05-08 18:51:44 +02:00 committed by Kraiem Taha Yassine
parent 8a69316b82
commit a13f427816
9 changed files with 448 additions and 13 deletions

View file

@ -0,0 +1,57 @@
from typing import Optional
from chalicelib.utils import helper
from chalicelib.utils.ch_client import ClickHouseClient
def search_events(project_id: int, q: Optional[str] = None):
    """Return the most frequent event names of a project for autocomplete.

    Queries ``product_analytics.autocomplete_events_grouped`` for rows of the
    given project whose ``_timestamp`` lies within the last month, ordered by
    ``data_count`` descending and capped at 20 rows.

    :param project_id: project whose event names are searched.
    :param q: optional search text; when truthy it is converted with
        ``helper.string_to_sql_like`` and matched against ``value`` via ILIKE.
    :return: ``{"values": <rows through helper.list_to_camel_case>, "_src": 2}``.
    """
    with ClickHouseClient() as ch_client:
        params = {"project_id": project_id, "limit": 20}
        conditions = ["project_id = %(project_id)s",
                      "_timestamp >= now()-INTERVAL 1 MONTH"]
        if q:
            # Narrow the result to names matching the user's input.
            conditions.append("value ILIKE %(q)s")
            params["q"] = helper.string_to_sql_like(q)

        where_clause = " AND ".join(conditions)
        statement = ("SELECT value,data_count\n"
                     "FROM product_analytics.autocomplete_events_grouped\n"
                     f"WHERE {where_clause}\n"
                     "ORDER BY data_count DESC\n"
                     "LIMIT %(limit)s;")
        records = ch_client.execute(ch_client.format(statement, parameters=params))

    return {"values": helper.list_to_camel_case(records), "_src": 2}
def search_properties(project_id: int, property_name: Optional[str] = None, event_name: Optional[str] = None,
                      q: Optional[str] = None):
    """Autocomplete property names or property values of a project.

    Queries ``product_analytics.autocomplete_event_properties_grouped`` for
    rows of the given project whose ``_timestamp`` lies within the last month,
    ordered by ``data_count`` descending and capped at 20 rows.

    * ``property_name`` together with ``q``: the property name is matched
      exactly and ``q`` is matched against the values (ILIKE).
    * ``property_name`` alone: the property name itself is matched with ILIKE
      and the distinct property names are returned under the ``value`` alias.
    * ``q`` alone: ``q`` is matched against the values of any property.

    :param project_id: project whose properties are searched.
    :param property_name: optional property name (exact or partial, see above).
    :param event_name: optional event name; when truthy, rows are restricted
        to that event.
    :param q: optional search text for property values.
    :return: ``{"values": <rows through helper.list_to_camel_case>, "_src": 2}``.
    """
    with ClickHouseClient() as ch_client:
        columns = "value"
        params = {"project_id": project_id, "limit": 20,
                  "event_name": event_name, "property_name": property_name}
        conditions = ["project_id = %(project_id)s",
                      "_timestamp >= now()-INTERVAL 1 MONTH"]

        if event_name:
            conditions.append("event_name = %(event_name)s")

        if property_name:
            if q:
                # Searching values: the property is already fully known.
                conditions.append("property_name = %(property_name)s")
            else:
                # Searching property names: return each matching name once.
                columns = "DISTINCT ON(property_name) property_name AS value"
                conditions.append("property_name ILIKE %(property_name)s")
                params["property_name"] = helper.string_to_sql_like(property_name)

        if q:
            conditions.append("value ILIKE %(q)s")
            params["q"] = helper.string_to_sql_like(q)

        where_clause = " AND ".join(conditions)
        statement = (f"SELECT {columns},data_count\n"
                     "FROM product_analytics.autocomplete_event_properties_grouped\n"
                     f"WHERE {where_clause}\n"
                     "ORDER BY data_count DESC\n"
                     "LIMIT %(limit)s;")
        records = ch_client.execute(ch_client.format(statement, parameters=params))

    return {"values": helper.list_to_camel_case(records), "_src": 2}

View file

@ -148,11 +148,11 @@ def search_events(project_id: int, data: schemas.EventsSearchPayloadSchema):
parameters=full_args)
rows = ch_client.execute(query)
if len(rows) == 0:
return {"total": 0, "rows": [], "src": 2}
return {"total": 0, "rows": [], "_src": 2}
total = rows[0]["total"]
for r in rows:
r.pop("total")
return {"total": total, "rows": rows, "src": 2}
return {"total": total, "rows": rows, "_src": 2}
def get_lexicon(project_id: int, page: schemas.PaginatedSchema):

View file

@ -73,7 +73,7 @@ def search_sessions(data: schemas.SessionsSearchPayloadSchema, project: schemas.
return {
'total': 0,
'sessions': [],
'src': 2
'_src': 2
}
if project.platform == "web":
full_args, query_part = sessions.search_query_parts_ch(data=data, error_status=error_status,
@ -216,7 +216,7 @@ def search_sessions(data: schemas.SessionsSearchPayloadSchema, project: schemas.
return {
'total': total,
'sessions': sessions_list,
'src': 2
'_src': 2
}

View file

@ -49,7 +49,7 @@ def search_sessions(data: schemas.SessionsSearchPayloadSchema, project: schemas.
return {
'total': 0,
'sessions': [],
'src': 1
'_src': 1
}
full_args, query_part = sessions_legacy.search_query_parts(data=data, error_status=error_status,
errors_only=errors_only,
@ -177,7 +177,7 @@ def search_sessions(data: schemas.SessionsSearchPayloadSchema, project: schemas.
return {
'total': total,
'sessions': helper.list_to_camel_case(sessions),
'src': 1
'_src': 1
}
@ -240,7 +240,7 @@ def search_by_metadata(tenant_id, user_id, m_key, m_value, project_id=None):
cur.execute("\nUNION\n".join(sub_queries))
rows = cur.fetchall()
for i in rows:
i["src"] = 1
i["_src"] = 1
results[str(i["project_id"])]["sessions"].append(helper.dict_to_camel_case(i))
return results
@ -248,7 +248,7 @@ def search_by_metadata(tenant_id, user_id, m_key, m_value, project_id=None):
def search_sessions_by_ids(project_id: int, session_ids: list, sort_by: str = 'session_id',
ascending: bool = False) -> dict:
if session_ids is None or len(session_ids) == 0:
return {"total": 0, "sessions": [], "src": 1}
return {"total": 0, "sessions": [], "_src": 1}
with pg_client.PostgresClient() as cur:
meta_keys = metadata.get(project_id=project_id)
params = {"project_id": project_id, "session_ids": tuple(session_ids)}
@ -267,4 +267,4 @@ def search_sessions_by_ids(project_id: int, session_ids: list, sort_by: str = 's
s["metadata"] = {}
for m in meta_keys:
s["metadata"][m["key"]] = s.pop(f'metadata_{m["index"]}')
return {"total": len(rows), "sessions": helper.list_to_camel_case(rows), "src": 1}
return {"total": len(rows), "sessions": helper.list_to_camel_case(rows), "_src": 1}

View file

@ -4,9 +4,10 @@ from fastapi import Body, Depends, Query
import schemas
from chalicelib.core import metadata
from chalicelib.core.product_analytics import events, properties
from chalicelib.core.product_analytics import events, properties, autocomplete
from or_dependencies import OR_context
from routers.base import get_routers
from typing import Optional
public_app, app, app_apikey = get_routers()
@ -53,3 +54,20 @@ def get_all_lexicon_events(projectId: int, filter_query: Annotated[schemas.Pagin
def get_all_lexicon_properties(projectId: int, filter_query: Annotated[schemas.PaginatedSchema, Query()],
context: schemas.CurrentContext = Depends(OR_context)):
return {"data": properties.get_lexicon(project_id=projectId, page=filter_query)}
@app.get('/{projectId}/events/autocomplete', tags=["autocomplete"])
def autocomplete_events(projectId: int, q: Optional[str] = None,
                        context: schemas.CurrentContext = Depends(OR_context)):
    # Route handler: autocomplete event names for the project.
    # A missing or empty `q` is forwarded as None, which the search treats as "no text filter".
    search_text = q or None
    matches = autocomplete.search_events(project_id=projectId, q=search_text)
    return {"data": matches}
@app.get('/{projectId}/properties/autocomplete', tags=["autocomplete"])
def autocomplete_properties(projectId: int, propertyName: str, eventName: Optional[str] = None,
                            q: Optional[str] = None, context: schemas.CurrentContext = Depends(OR_context)):
    # Route handler: autocomplete property names/values for the project.
    # Empty query-string values are normalized to None before reaching the search layer.
    event_filter = eventName or None
    property_filter = propertyName or None
    search_text = q or None
    matches = autocomplete.search_properties(project_id=projectId,
                                             event_name=event_filter,
                                             property_name=property_filter,
                                             q=search_text)
    return {"data": matches}

View file

@ -165,3 +165,92 @@ FROM product_analytics.events
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name;
-- Autocomplete

-- Raw feed of event names; rows expire one month after their _timestamp (TTL).
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_events
(
    project_id UInt16,
    value      String COMMENT 'The $event_name',
    _timestamp DateTime
) ENGINE = MergeTree()
      ORDER BY (project_id, value, _timestamp)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Feeds autocomplete_events from product_analytics.events, keeping only rows newer than one month.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_events_mv
    TO product_analytics.autocomplete_events AS
SELECT project_id,
       `$event_name` AS value,
       _timestamp
FROM product_analytics.events
WHERE _timestamp > now() - INTERVAL 1 MONTH;

-- Per-project occurrence count of each event name over the past month.
-- data_count is UInt64 to match the type returned by count(); UInt16 cannot hold
-- counts above 65535, which would corrupt the ORDER BY data_count ranking.
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_events_grouped
(
    project_id UInt16,
    value      String COMMENT 'The $event_name',
    data_count UInt64 COMMENT 'The number of appearance during the past month',
    _timestamp DateTime
) ENGINE = ReplacingMergeTree(_timestamp)
      ORDER BY (project_id, value)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Recomputes the grouped event-name counts every 30 minutes.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_events_grouped_mv
    REFRESH EVERY 30 MINUTE TO product_analytics.autocomplete_events_grouped AS
SELECT project_id,
       value,
       count(1)        AS data_count,
       max(_timestamp) AS _timestamp
FROM product_analytics.autocomplete_events
WHERE autocomplete_events._timestamp > now() - INTERVAL 1 MONTH
GROUP BY project_id, value;

-- Raw feed of (event, property, value) triples; rows expire one month after their _timestamp (TTL).
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_event_properties
(
    project_id    UInt16,
    event_name    String COMMENT 'The $event_name',
    property_name String,
    value         String COMMENT 'The property-value as a string',
    _timestamp    DateTime DEFAULT now()
) ENGINE = MergeTree()
      ORDER BY (project_id, event_name, property_name, value, _timestamp)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Expands each event's $properties JSON into one row per key.
-- Empty values and values that parse as a number are skipped.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_event_properties_mv
    TO product_analytics.autocomplete_event_properties AS
SELECT project_id,
       `$event_name`                                             AS event_name,
       property_name,
       JSONExtractString(toString(`$properties`), property_name) AS value,
       _timestamp
FROM product_analytics.events
         ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
WHERE length(value) > 0 AND isNull(toFloat64OrNull(value))
  AND _timestamp > now() - INTERVAL 1 MONTH;

-- Per-project occurrence count of each (event, property, value) over the past month.
-- data_count is UInt64 to match the type returned by count(); UInt16 cannot hold
-- counts above 65535, which would corrupt the ORDER BY data_count ranking.
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_event_properties_grouped
(
    project_id    UInt16,
    event_name    String COMMENT 'The $event_name',
    property_name String,
    value         String COMMENT 'The property-value as a string',
    data_count    UInt64 COMMENT 'The number of appearance during the past month',
    _timestamp    DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
      ORDER BY (project_id, event_name, property_name, value)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Recomputes the grouped property-value counts every 30 minutes.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_event_properties_grouped_mv
    REFRESH EVERY 30 MINUTE TO product_analytics.autocomplete_event_properties_grouped AS
SELECT project_id,
       event_name,
       property_name,
       value,
       count(1)        AS data_count,
       max(_timestamp) AS _timestamp
FROM product_analytics.autocomplete_event_properties
WHERE length(value) > 0
  AND autocomplete_event_properties._timestamp > now() - INTERVAL 1 MONTH
GROUP BY project_id, event_name, property_name, value;

View file

@ -791,7 +791,8 @@ CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples
ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
-- Refreshable materialized view to get random examples of property values using $properties & properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mvREFRESHEVERY30HOURTOproduct_analytics.property_values_samples AS
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv
REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
@ -812,3 +813,92 @@ FROM product_analytics.events
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name;
-- Autocomplete

-- Raw feed of event names; rows expire one month after their _timestamp (TTL).
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_events
(
    project_id UInt16,
    value      String COMMENT 'The $event_name',
    _timestamp DateTime
) ENGINE = MergeTree()
      ORDER BY (project_id, value, _timestamp)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Feeds autocomplete_events from product_analytics.events, keeping only rows newer than one month.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_events_mv
    TO product_analytics.autocomplete_events AS
SELECT project_id,
       `$event_name` AS value,
       _timestamp
FROM product_analytics.events
WHERE _timestamp > now() - INTERVAL 1 MONTH;

-- Per-project occurrence count of each event name over the past month.
-- data_count is UInt64 to match the type returned by count(); UInt16 cannot hold
-- counts above 65535, which would corrupt the ORDER BY data_count ranking.
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_events_grouped
(
    project_id UInt16,
    value      String COMMENT 'The $event_name',
    data_count UInt64 COMMENT 'The number of appearance during the past month',
    _timestamp DateTime
) ENGINE = ReplacingMergeTree(_timestamp)
      ORDER BY (project_id, value)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Recomputes the grouped event-name counts every 30 minutes.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_events_grouped_mv
    REFRESH EVERY 30 MINUTE TO product_analytics.autocomplete_events_grouped AS
SELECT project_id,
       value,
       count(1)        AS data_count,
       max(_timestamp) AS _timestamp
FROM product_analytics.autocomplete_events
WHERE autocomplete_events._timestamp > now() - INTERVAL 1 MONTH
GROUP BY project_id, value;

-- Raw feed of (event, property, value) triples; rows expire one month after their _timestamp (TTL).
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_event_properties
(
    project_id    UInt16,
    event_name    String COMMENT 'The $event_name',
    property_name String,
    value         String COMMENT 'The property-value as a string',
    _timestamp    DateTime DEFAULT now()
) ENGINE = MergeTree()
      ORDER BY (project_id, event_name, property_name, value, _timestamp)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Expands each event's $properties JSON into one row per key.
-- Empty values and values that parse as a number are skipped.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_event_properties_mv
    TO product_analytics.autocomplete_event_properties AS
SELECT project_id,
       `$event_name`                                             AS event_name,
       property_name,
       JSONExtractString(toString(`$properties`), property_name) AS value,
       _timestamp
FROM product_analytics.events
         ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
WHERE length(value) > 0 AND isNull(toFloat64OrNull(value))
  AND _timestamp > now() - INTERVAL 1 MONTH;

-- Per-project occurrence count of each (event, property, value) over the past month.
-- data_count is UInt64 to match the type returned by count(); UInt16 cannot hold
-- counts above 65535, which would corrupt the ORDER BY data_count ranking.
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_event_properties_grouped
(
    project_id    UInt16,
    event_name    String COMMENT 'The $event_name',
    property_name String,
    value         String COMMENT 'The property-value as a string',
    data_count    UInt64 COMMENT 'The number of appearance during the past month',
    _timestamp    DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
      ORDER BY (project_id, event_name, property_name, value)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Recomputes the grouped property-value counts every 30 minutes.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_event_properties_grouped_mv
    REFRESH EVERY 30 MINUTE TO product_analytics.autocomplete_event_properties_grouped AS
SELECT project_id,
       event_name,
       property_name,
       value,
       count(1)        AS data_count,
       max(_timestamp) AS _timestamp
FROM product_analytics.autocomplete_event_properties
WHERE length(value) > 0
  AND autocomplete_event_properties._timestamp > now() - INTERVAL 1 MONTH
GROUP BY project_id, event_name, property_name, value;

View file

@ -155,7 +155,8 @@ CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples
ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mvREFRESHEVERY30HOURTOproduct_analytics.property_values_samples AS
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv
REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
@ -175,3 +176,93 @@ FROM product_analytics.events
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name;
-- Autocomplete

-- Raw feed of event names; rows expire one month after their _timestamp (TTL).
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_events
(
    project_id UInt16,
    value      String COMMENT 'The $event_name',
    _timestamp DateTime
) ENGINE = MergeTree()
      ORDER BY (project_id, value, _timestamp)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Feeds autocomplete_events from product_analytics.events, keeping only rows newer than one month.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_events_mv
    TO product_analytics.autocomplete_events AS
SELECT project_id,
       `$event_name` AS value,
       _timestamp
FROM product_analytics.events
WHERE _timestamp > now() - INTERVAL 1 MONTH;

-- Per-project occurrence count of each event name over the past month.
-- data_count is UInt64 to match the type returned by count(); UInt16 cannot hold
-- counts above 65535, which would corrupt the ORDER BY data_count ranking.
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_events_grouped
(
    project_id UInt16,
    value      String COMMENT 'The $event_name',
    data_count UInt64 COMMENT 'The number of appearance during the past month',
    _timestamp DateTime
) ENGINE = ReplacingMergeTree(_timestamp)
      ORDER BY (project_id, value)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Recomputes the grouped event-name counts every 30 minutes.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_events_grouped_mv
    REFRESH EVERY 30 MINUTE TO product_analytics.autocomplete_events_grouped AS
SELECT project_id,
       value,
       count(1)        AS data_count,
       max(_timestamp) AS _timestamp
FROM product_analytics.autocomplete_events
WHERE autocomplete_events._timestamp > now() - INTERVAL 1 MONTH
GROUP BY project_id, value;

-- Raw feed of (event, property, value) triples; rows expire one month after their _timestamp (TTL).
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_event_properties
(
    project_id    UInt16,
    event_name    String COMMENT 'The $event_name',
    property_name String,
    value         String COMMENT 'The property-value as a string',
    _timestamp    DateTime DEFAULT now()
) ENGINE = MergeTree()
      ORDER BY (project_id, event_name, property_name, value, _timestamp)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Expands each event's $properties JSON into one row per key.
-- Empty values and values that parse as a number are skipped.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_event_properties_mv
    TO product_analytics.autocomplete_event_properties AS
SELECT project_id,
       `$event_name`                                             AS event_name,
       property_name,
       JSONExtractString(toString(`$properties`), property_name) AS value,
       _timestamp
FROM product_analytics.events
         ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
WHERE length(value) > 0 AND isNull(toFloat64OrNull(value))
  AND _timestamp > now() - INTERVAL 1 MONTH;

-- Per-project occurrence count of each (event, property, value) over the past month.
-- data_count is UInt64 to match the type returned by count(); UInt16 cannot hold
-- counts above 65535, which would corrupt the ORDER BY data_count ranking.
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_event_properties_grouped
(
    project_id    UInt16,
    event_name    String COMMENT 'The $event_name',
    property_name String,
    value         String COMMENT 'The property-value as a string',
    data_count    UInt64 COMMENT 'The number of appearance during the past month',
    _timestamp    DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
      ORDER BY (project_id, event_name, property_name, value)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Recomputes the grouped property-value counts every 30 minutes.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_event_properties_grouped_mv
    REFRESH EVERY 30 MINUTE TO product_analytics.autocomplete_event_properties_grouped AS
SELECT project_id,
       event_name,
       property_name,
       value,
       count(1)        AS data_count,
       max(_timestamp) AS _timestamp
FROM product_analytics.autocomplete_event_properties
WHERE length(value) > 0
  AND autocomplete_event_properties._timestamp > now() - INTERVAL 1 MONTH
GROUP BY project_id, event_name, property_name, value;

View file

@ -687,7 +687,8 @@ CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples
ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
-- Refreshable materialized view to get random examples of property values using $properties & properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mvREFRESHEVERY30HOURTOproduct_analytics.property_values_samples AS
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv
REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
@ -708,3 +709,92 @@ FROM product_analytics.events
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name;
-- Autocomplete

-- Raw feed of event names; rows expire one month after their _timestamp (TTL).
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_events
(
    project_id UInt16,
    value      String COMMENT 'The $event_name',
    _timestamp DateTime
) ENGINE = MergeTree()
      ORDER BY (project_id, value, _timestamp)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Feeds autocomplete_events from product_analytics.events, keeping only rows newer than one month.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_events_mv
    TO product_analytics.autocomplete_events AS
SELECT project_id,
       `$event_name` AS value,
       _timestamp
FROM product_analytics.events
WHERE _timestamp > now() - INTERVAL 1 MONTH;

-- Per-project occurrence count of each event name over the past month.
-- data_count is UInt64 to match the type returned by count(); UInt16 cannot hold
-- counts above 65535, which would corrupt the ORDER BY data_count ranking.
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_events_grouped
(
    project_id UInt16,
    value      String COMMENT 'The $event_name',
    data_count UInt64 COMMENT 'The number of appearance during the past month',
    _timestamp DateTime
) ENGINE = ReplacingMergeTree(_timestamp)
      ORDER BY (project_id, value)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Recomputes the grouped event-name counts every 30 minutes.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_events_grouped_mv
    REFRESH EVERY 30 MINUTE TO product_analytics.autocomplete_events_grouped AS
SELECT project_id,
       value,
       count(1)        AS data_count,
       max(_timestamp) AS _timestamp
FROM product_analytics.autocomplete_events
WHERE autocomplete_events._timestamp > now() - INTERVAL 1 MONTH
GROUP BY project_id, value;

-- Raw feed of (event, property, value) triples; rows expire one month after their _timestamp (TTL).
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_event_properties
(
    project_id    UInt16,
    event_name    String COMMENT 'The $event_name',
    property_name String,
    value         String COMMENT 'The property-value as a string',
    _timestamp    DateTime DEFAULT now()
) ENGINE = MergeTree()
      ORDER BY (project_id, event_name, property_name, value, _timestamp)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Expands each event's $properties JSON into one row per key.
-- Empty values and values that parse as a number are skipped.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_event_properties_mv
    TO product_analytics.autocomplete_event_properties AS
SELECT project_id,
       `$event_name`                                             AS event_name,
       property_name,
       JSONExtractString(toString(`$properties`), property_name) AS value,
       _timestamp
FROM product_analytics.events
         ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
WHERE length(value) > 0 AND isNull(toFloat64OrNull(value))
  AND _timestamp > now() - INTERVAL 1 MONTH;

-- Per-project occurrence count of each (event, property, value) over the past month.
-- data_count is UInt64 to match the type returned by count(); UInt16 cannot hold
-- counts above 65535, which would corrupt the ORDER BY data_count ranking.
CREATE TABLE IF NOT EXISTS product_analytics.autocomplete_event_properties_grouped
(
    project_id    UInt16,
    event_name    String COMMENT 'The $event_name',
    property_name String,
    value         String COMMENT 'The property-value as a string',
    data_count    UInt64 COMMENT 'The number of appearance during the past month',
    _timestamp    DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
      ORDER BY (project_id, event_name, property_name, value)
      TTL _timestamp + INTERVAL 1 MONTH;

-- Recomputes the grouped property-value counts every 30 minutes.
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.autocomplete_event_properties_grouped_mv
    REFRESH EVERY 30 MINUTE TO product_analytics.autocomplete_event_properties_grouped AS
SELECT project_id,
       event_name,
       property_name,
       value,
       count(1)        AS data_count,
       max(_timestamp) AS _timestamp
FROM product_analytics.autocomplete_event_properties
WHERE length(value) > 0
  AND autocomplete_event_properties._timestamp > now() - INTERVAL 1 MONTH
GROUP BY project_id, event_name, property_name, value;