feat(DB): use incremental&refreshable materialized views to fill extra tables

This commit is contained in:
Taha Yassine Kraiem 2025-03-26 13:06:34 +01:00 committed by Kraiem Taha Yassine
parent d087736df0
commit bb17f672fe

View file

@ -143,6 +143,17 @@ CREATE TABLE IF NOT EXISTS experimental.sessions
TTL datetime + INTERVAL 1 MONTH
SETTINGS index_granularity = 512;
CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions
(
project_id UInt16,
user_id UInt32,
session_id UInt64,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
PARTITION BY toYYYYMM(_timestamp)
ORDER BY (project_id, user_id, session_id)
TTL _timestamp + INTERVAL 3 MONTH;
CREATE TABLE IF NOT EXISTS experimental.issues
(
project_id UInt16,
@ -518,8 +529,9 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_events
ORDER BY (project_id, auto_captured, event_name);
-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker -------------
-- Incremental materialized view to fill event_properties using $properties
CREATE MATERIALIZED VIEW product_analytics.events_extractor TO product_analytics.all_events AS
-- Incremental materialized view to fill all_events using $properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_events_extractor_mv
TO product_analytics.all_events AS
SELECT DISTINCT ON (project_id,auto_captured,event_name) project_id,
`$auto_captured` AS auto_captured,
`$event_name` AS event_name,
@ -545,8 +557,7 @@ CREATE TABLE IF NOT EXISTS product_analytics.event_properties
project_id UInt16,
event_name String,
property_name String,
-- TODO: find a fix for this
-- value_type String,
value_type String,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
@ -554,18 +565,22 @@ CREATE TABLE IF NOT EXISTS product_analytics.event_properties
-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker -------------
-- Incremental materialized view to fill event_properties using $properties
CREATE MATERIALIZED VIEW product_analytics.event_properties_extractor TO product_analytics.event_properties AS
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_properties_extractor_mv
TO product_analytics.event_properties AS
SELECT project_id,
`$event_name` AS event_name,
property_name
`$event_name` AS event_name,
property_name,
JSONType(JSONExtractRaw(toString(`$properties`), property_name)) AS value_type
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name;
-- Incremental materialized view to fill event_properties using properties
CREATE MATERIALIZED VIEW product_analytics.event_cproperties_extractor TO product_analytics.event_properties AS
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_cproperties_extractor
TO product_analytics.event_properties AS
SELECT project_id,
`$event_name` AS event_name,
property_name
`$event_name` AS event_name,
property_name,
JSONType(JSONExtractRaw(toString(`properties`), property_name)) AS value_type
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name;
-- -------- END ---------
@ -590,32 +605,92 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_properties
ORDER BY (project_id, property_name, is_event_property);
CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions
(
project_id UInt16,
user_id UInt32,
session_id UInt64,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
PARTITION BY toYYYYMM(_timestamp)
ORDER BY (project_id, user_id, session_id)
TTL _timestamp + INTERVAL 3 MONTH;
CREATE TABLE product_analytics.properties_example_values
(
id UInt64,
value String,
random_value Float64
)
ENGINE = ReplacingMergeTree()
ORDER BY random_value;
-- Incremental materialized view to get random examples of property values
CREATE MATERIALIZED VIEW product_analytics.properties_values_random_sampler TO product_analytics.properties_example_values AS
SELECT id,
value,
rand() AS random_value
-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker -------------
-- Incremental materialized view to fill all_properties using $properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_properties_extractor_mv
TO product_analytics.all_properties AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.events
WHERE rand() < 0.5 -- This randomly skips ~50% of inserts
ORDER BY random_value
LIMIT 10;
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
LEFT JOIN (SELECT project_id,
property_name,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.all_properties
WHERE (all_properties.display_name != ''
OR all_properties.description != '')
AND is_event_property) AS old_data
ON (events.project_id = old_data.project_id AND property_name = old_data.property_name);
-- Incremental materialized view to fill all_properties using properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_cproperties_extractor_mv
TO product_analytics.all_properties AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name
LEFT JOIN (SELECT project_id,
property_name,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.all_properties
WHERE (all_properties.display_name != ''
OR all_properties.description != '')
AND is_event_property) AS old_data
ON (events.project_id = old_data.project_id AND property_name = old_data.property_name);
-- -------- END ---------
-- Some random examples of property-values, limited by 2 per property
-- Experimental: This table is filled by a refreshable materialized view
CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples
(
project_id UInt16,
property_name String,
is_event_property BOOL,
value String,
_timestamp DateTime DEFAULT now()
)
ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
-- Incremental materialized view to get random examples of property values using $properties & properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mvREFRESHEVERY30HOURTOproduct_analytics.property_values_samples AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
JSONExtractString(toString(`$properties`), property_name) AS value
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name
UNION ALL
-- using union because each table should be the target of 1 single refreshable MV
SELECT project_id,
property_name,
TRUE AS is_event_property,
JSONExtractString(toString(`properties`), property_name) AS value
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name;