From bb17f672fe12b336bcf1375cc752d152f8cd1e52 Mon Sep 17 00:00:00 2001 From: Taha Yassine Kraiem Date: Wed, 26 Mar 2025 13:06:34 +0100 Subject: [PATCH] feat(DB): use incremental&refreshable materialized views to fill extra tables --- .../clickhouse/create/init_schema.sql | 151 +++++++++++++----- 1 file changed, 113 insertions(+), 38 deletions(-) diff --git a/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql b/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql index 62bdbae69..311f51c7d 100644 --- a/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql +++ b/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql @@ -143,6 +143,17 @@ CREATE TABLE IF NOT EXISTS experimental.sessions TTL datetime + INTERVAL 1 MONTH SETTINGS index_granularity = 512; +CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions +( + project_id UInt16, + user_id UInt32, + session_id UInt64, + _timestamp DateTime DEFAULT now() +) ENGINE = ReplacingMergeTree(_timestamp) + PARTITION BY toYYYYMM(_timestamp) + ORDER BY (project_id, user_id, session_id) + TTL _timestamp + INTERVAL 3 MONTH; + CREATE TABLE IF NOT EXISTS experimental.issues ( project_id UInt16, @@ -518,8 +529,9 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_events ORDER BY (project_id, auto_captured, event_name); -- ----------------- This is experimental, if it doesn't work, we need to do it in db worker ------------- --- Incremental materialized view to fill event_properties using $properties -CREATE MATERIALIZED VIEW product_analytics.events_extractor TO product_analytics.all_events AS +-- Incremental materialized view to fill all_events using $properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_events_extractor_mv + TO product_analytics.all_events AS SELECT DISTINCT ON (project_id,auto_captured,event_name) project_id, `$auto_captured` AS auto_captured, `$event_name` AS event_name, @@ -545,8 +557,7 @@ CREATE TABLE IF NOT EXISTS product_analytics.event_properties project_id UInt16, event_name String, property_name String, --- TODO: find a fix for this --- value_type String, + value_type String, _timestamp DateTime DEFAULT now() ) ENGINE = ReplacingMergeTree(_timestamp) @@ -554,18 +565,22 @@ CREATE TABLE IF NOT EXISTS product_analytics.event_properties -- ----------------- This is experimental, if it doesn't work, we need to do it in db worker ------------- -- Incremental materialized view to fill event_properties using $properties -CREATE MATERIALIZED VIEW product_analytics.event_properties_extractor TO product_analytics.event_properties AS +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_properties_extractor_mv + TO product_analytics.event_properties AS SELECT project_id, - `$event_name` AS event_name, - property_name + `$event_name` AS event_name, + property_name, + JSONType(JSONExtractRaw(toString(`$properties`), property_name)) AS value_type FROM product_analytics.events ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name; -- Incremental materialized view to fill event_properties using properties -CREATE MATERIALIZED VIEW product_analytics.event_cproperties_extractor TO product_analytics.event_properties AS +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_cproperties_extractor + TO product_analytics.event_properties AS SELECT project_id, - `$event_name` AS event_name, - property_name + `$event_name` AS event_name, + property_name, + JSONType(JSONExtractRaw(toString(`properties`), property_name)) AS value_type FROM product_analytics.events ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name; -- -------- END --------- @@ -590,32 +605,92 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_properties ORDER BY (project_id, property_name, is_event_property); -CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions -( - project_id UInt16, - user_id UInt32, - session_id UInt64, - _timestamp DateTime DEFAULT now() -) ENGINE = ReplacingMergeTree(_timestamp) - PARTITION BY toYYYYMM(_timestamp) - ORDER BY (project_id, user_id, session_id) - TTL _timestamp + INTERVAL 3 MONTH; - -CREATE TABLE product_analytics.properties_example_values -( - id UInt64, - value String, - random_value Float64 -) - ENGINE = ReplacingMergeTree() - ORDER BY random_value; - --- Incremental materialized view to get random examples of property values -CREATE MATERIALIZED VIEW product_analytics.properties_values_random_sampler TO product_analytics.properties_example_values AS -SELECT id, - value, - rand() AS random_value +-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker ------------- +-- Incremental materialized view to fill all_properties using $properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_properties_extractor_mv + TO product_analytics.all_properties AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + display_name, + description, + status, + data_count, + query_count FROM product_analytics.events -WHERE rand() < 0.5 -- This randomly skips ~50% of inserts -ORDER BY random_value -LIMIT 10; + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name + LEFT JOIN (SELECT project_id, + property_name, + display_name, + description, + status, + data_count, + query_count + FROM product_analytics.all_properties + WHERE (all_properties.display_name != '' + OR all_properties.description != '') + AND is_event_property) AS old_data + ON (events.project_id = old_data.project_id AND property_name = old_data.property_name); + +-- Incremental materialized view to fill all_properties using properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_cproperties_extractor_mv + TO product_analytics.all_properties AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + display_name, + description, + status, + data_count, + query_count +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name + LEFT JOIN (SELECT project_id, + property_name, + display_name, + description, + status, + data_count, + query_count + FROM product_analytics.all_properties + WHERE (all_properties.display_name != '' + OR all_properties.description != '') + AND is_event_property) AS old_data + ON (events.project_id = old_data.project_id AND property_name = old_data.property_name); +-- -------- END --------- + +-- Some random examples of property-values, limited by 2 per property +-- Experimental: This table is filled by a refreshable materialized view +CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples +( + project_id UInt16, + property_name String, + is_event_property BOOL, + value String, + + _timestamp DateTime DEFAULT now() +) + ENGINE = ReplacingMergeTree(_timestamp) + ORDER BY (project_id, property_name, is_event_property); +-- Incremental materialized view to get random examples of property values using $properties & properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mvREFRESHEVERY30HOURTOproduct_analytics.property_values_samples AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + JSONExtractString(toString(`$properties`), property_name) AS value +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name +WHERE randCanonical() < 0.5 -- This randomly skips inserts + AND value != '' +LIMIT 2 BY project_id,property_name +UNION ALL +-- using union because each table should be the target of 1 single refreshable MV +SELECT project_id, + property_name, + TRUE AS is_event_property, + JSONExtractString(toString(`properties`), property_name) AS value +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name +WHERE randCanonical() < 0.5 -- This randomly skips inserts + AND value != '' +LIMIT 2 BY project_id,property_name;