feat(DB): use incremental&refreshable materialized views to fill extra tables

This commit is contained in:
Taha Yassine Kraiem 2025-03-26 13:53:15 +01:00 committed by Kraiem Taha Yassine
parent 65072f607f
commit 8fbaf25799
4 changed files with 472 additions and 30 deletions

View file

@ -1,17 +1,6 @@
CREATE OR REPLACE FUNCTION openreplay_version AS() -> 'v1.23.0-ee';
-- The full list of event-properties (used to tell which property belongs to which event)
CREATE TABLE IF NOT EXISTS product_analytics.event_properties
(
project_id UInt16,
event_name String,
property_name String,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, event_name, property_name);
DROP TABLE IF EXISTS product_analytics.all_events;
CREATE TABLE IF NOT EXISTS product_analytics.all_events
(
@ -27,3 +16,153 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_events
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, auto_captured, event_name);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_events_extractor_mv
TO product_analytics.all_events AS
SELECT DISTINCT ON (project_id,auto_captured,event_name) project_id,
`$auto_captured` AS auto_captured,
`$event_name` AS event_name,
display_name,
description
FROM product_analytics.events
LEFT JOIN (SELECT project_id,
auto_captured,
event_name,
display_name,
description
FROM product_analytics.all_events
WHERE all_events.display_name != ''
OR all_events.description != '') AS old_data
ON (events.project_id = old_data.project_id AND events.`$auto_captured` = old_data.auto_captured AND
events.`$event_name` = old_data.event_name);
CREATE TABLE IF NOT EXISTS product_analytics.event_properties
(
project_id UInt16,
event_name String,
property_name String,
value_type String,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, event_name, property_name, value_type);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_properties_extractor_mv
TO product_analytics.event_properties AS
SELECT project_id,
`$event_name` AS event_name,
property_name,
JSONType(JSONExtractRaw(toString(`$properties`), property_name)) AS value_type
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name;
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_cproperties_extractor
TO product_analytics.event_properties AS
SELECT project_id,
`$event_name` AS event_name,
property_name,
JSONType(JSONExtractRaw(toString(`properties`), property_name)) AS value_type
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name;
DROP TABLE IF EXISTS product_analytics.all_properties;
CREATE TABLE IF NOT EXISTS product_analytics.all_properties
(
project_id UInt16,
property_name String,
is_event_property BOOL,
display_name String DEFAULT '',
description String DEFAULT '',
status String DEFAULT 'visible' COMMENT 'visible/hidden/dropped',
data_count UInt32 DEFAULT 1,
query_count UInt32 DEFAULT 0,
created_at DateTime64,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_properties_extractor_mv
TO product_analytics.all_properties AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
LEFT JOIN (SELECT project_id,
property_name,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.all_properties
WHERE (all_properties.display_name != ''
OR all_properties.description != '')
AND is_event_property) AS old_data
ON (events.project_id = old_data.project_id AND property_name = old_data.property_name);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_cproperties_extractor_mv
TO product_analytics.all_properties AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name
LEFT JOIN (SELECT project_id,
property_name,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.all_properties
WHERE (all_properties.display_name != ''
OR all_properties.description != '')
AND is_event_property) AS old_data
ON (events.project_id = old_data.project_id AND property_name = old_data.property_name);
CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples
(
project_id UInt16,
property_name String,
is_event_property BOOL,
value String,
_timestamp DateTime DEFAULT now()
)
ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv
REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
JSONExtractString(toString(`$properties`), property_name) AS value
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name
UNION ALL
SELECT project_id,
property_name,
TRUE AS is_event_property,
JSONExtractString(toString(`properties`), property_name) AS value
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name;

View file

@ -88,7 +88,7 @@ CREATE TABLE IF NOT EXISTS experimental.events
) ENGINE = ReplacingMergeTree(_timestamp)
PARTITION BY toYYYYMM(datetime)
ORDER BY (project_id, datetime, event_type, session_id, message_id)
TTL datetime + INTERVAL 3 MONTH;
TTL datetime + INTERVAL 1 MONTH;
@ -140,7 +140,7 @@ CREATE TABLE IF NOT EXISTS experimental.sessions
) ENGINE = ReplacingMergeTree(_timestamp)
PARTITION BY toYYYYMMDD(datetime)
ORDER BY (project_id, datetime, session_id)
TTL datetime + INTERVAL 3 MONTH
TTL datetime + INTERVAL 1 MONTH
SETTINGS index_granularity = 512;
CREATE TABLE IF NOT EXISTS experimental.user_favorite_sessions
@ -189,7 +189,7 @@ CREATE TABLE IF NOT EXISTS experimental.issues
) ENGINE = ReplacingMergeTree(_timestamp)
PARTITION BY toYYYYMM(_timestamp)
ORDER BY (project_id, issue_id, type)
TTL _timestamp + INTERVAL 3 MONTH;
TTL _timestamp + INTERVAL 1 MONTH;
@ -330,7 +330,7 @@ CREATE TABLE IF NOT EXISTS experimental.ios_events
) ENGINE = ReplacingMergeTree(_timestamp)
PARTITION BY toYYYYMM(datetime)
ORDER BY (project_id, datetime, event_type, session_id, message_id)
TTL datetime + INTERVAL 3 MONTH;
TTL datetime + INTERVAL 1 MONTH;
SET allow_experimental_json_type = 1;
@ -639,6 +639,7 @@ CREATE TABLE IF NOT EXISTS product_analytics.group_properties
-- The full list of events
-- Experimental: This table is filled by an incremental materialized view
CREATE TABLE IF NOT EXISTS product_analytics.all_events
(
project_id UInt16,
@ -654,19 +655,66 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_events
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, auto_captured, event_name);
-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker -------------
-- Incremental materialized view to fill all_events using $properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_events_extractor_mv
TO product_analytics.all_events AS
SELECT DISTINCT ON (project_id,auto_captured,event_name) project_id,
`$auto_captured` AS auto_captured,
`$event_name` AS event_name,
display_name,
description
FROM product_analytics.events
LEFT JOIN (SELECT project_id,
auto_captured,
event_name,
display_name,
description
FROM product_analytics.all_events
WHERE all_events.display_name != ''
OR all_events.description != '') AS old_data
ON (events.project_id = old_data.project_id AND events.`$auto_captured` = old_data.auto_captured AND
events.`$event_name` = old_data.event_name);
-- -------- END ---------
-- The full list of event-properties (used to tell which property belongs to which event)
-- Experimental: This table is filled by an incremental materialized view
CREATE TABLE IF NOT EXISTS product_analytics.event_properties
(
project_id UInt16,
event_name String,
property_name String,
value_type String,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, event_name, property_name);
ORDER BY (project_id, event_name, property_name, value_type);
-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker -------------
-- Incremental materialized view to fill event_properties using $properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_properties_extractor_mv
TO product_analytics.event_properties AS
SELECT project_id,
`$event_name` AS event_name,
property_name,
JSONType(JSONExtractRaw(toString(`$properties`), property_name)) AS value_type
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name;
-- Incremental materialized view to fill event_properties using properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_cproperties_extractor
TO product_analytics.event_properties AS
SELECT project_id,
`$event_name` AS event_name,
property_name,
JSONType(JSONExtractRaw(toString(`properties`), property_name)) AS value_type
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name;
-- -------- END ---------
-- The full list of properties (events and users)
-- Experimental: This table is filled by an incremental materialized view
CREATE TABLE IF NOT EXISTS product_analytics.all_properties
(
project_id UInt16,
@ -682,3 +730,95 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_properties
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker -------------
-- Incremental materialized view to fill all_properties using $properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_properties_extractor_mv
TO product_analytics.all_properties AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
LEFT JOIN (SELECT project_id,
property_name,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.all_properties
WHERE (all_properties.display_name != ''
OR all_properties.description != '')
AND is_event_property) AS old_data
ON (events.project_id = old_data.project_id AND property_name = old_data.property_name);
-- Incremental materialized view to fill all_properties using properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_cproperties_extractor_mv
TO product_analytics.all_properties AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name
LEFT JOIN (SELECT project_id,
property_name,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.all_properties
WHERE (all_properties.display_name != ''
OR all_properties.description != '')
AND is_event_property) AS old_data
ON (events.project_id = old_data.project_id AND property_name = old_data.property_name);
-- -------- END ---------
-- Some random examples of property-values, limited by 2 per property
-- Experimental: This table is filled by a refreshable materialized view
CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples
(
project_id UInt16,
property_name String,
is_event_property BOOL,
value String,
_timestamp DateTime DEFAULT now()
)
ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
-- Incremental materialized view to get random examples of property values using $properties & properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv
REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
JSONExtractString(toString(`$properties`), property_name) AS value
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name
UNION ALL
-- using union because each table should be the target of 1 single refreshable MV
SELECT project_id,
property_name,
TRUE AS is_event_property,
JSONExtractString(toString(`properties`), property_name) AS value
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name;

View file

@ -12,17 +12,6 @@ CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions
ORDER BY (project_id, user_id, session_id)
TTL _timestamp + INTERVAL 3 MONTH;
-- The full list of event-properties (used to tell which property belongs to which event)
CREATE TABLE IF NOT EXISTS product_analytics.event_properties
(
project_id UInt16,
event_name String,
property_name String,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, event_name, property_name);
DROP TABLE IF EXISTS product_analytics.all_events;
CREATE TABLE IF NOT EXISTS product_analytics.all_events
(
@ -38,3 +27,153 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_events
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, auto_captured, event_name);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_events_extractor_mv
TO product_analytics.all_events AS
SELECT DISTINCT ON (project_id,auto_captured,event_name) project_id,
`$auto_captured` AS auto_captured,
`$event_name` AS event_name,
display_name,
description
FROM product_analytics.events
LEFT JOIN (SELECT project_id,
auto_captured,
event_name,
display_name,
description
FROM product_analytics.all_events
WHERE all_events.display_name != ''
OR all_events.description != '') AS old_data
ON (events.project_id = old_data.project_id AND events.`$auto_captured` = old_data.auto_captured AND
events.`$event_name` = old_data.event_name);
CREATE TABLE IF NOT EXISTS product_analytics.event_properties
(
project_id UInt16,
event_name String,
property_name String,
value_type String,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, event_name, property_name, value_type);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_properties_extractor_mv
TO product_analytics.event_properties AS
SELECT project_id,
`$event_name` AS event_name,
property_name,
JSONType(JSONExtractRaw(toString(`$properties`), property_name)) AS value_type
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name;
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_cproperties_extractor
TO product_analytics.event_properties AS
SELECT project_id,
`$event_name` AS event_name,
property_name,
JSONType(JSONExtractRaw(toString(`properties`), property_name)) AS value_type
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name;
DROP TABLE IF EXISTS product_analytics.all_properties;
CREATE TABLE IF NOT EXISTS product_analytics.all_properties
(
project_id UInt16,
property_name String,
is_event_property BOOL,
display_name String DEFAULT '',
description String DEFAULT '',
status String DEFAULT 'visible' COMMENT 'visible/hidden/dropped',
data_count UInt32 DEFAULT 1,
query_count UInt32 DEFAULT 0,
created_at DateTime64,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_properties_extractor_mv
TO product_analytics.all_properties AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
LEFT JOIN (SELECT project_id,
property_name,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.all_properties
WHERE (all_properties.display_name != ''
OR all_properties.description != '')
AND is_event_property) AS old_data
ON (events.project_id = old_data.project_id AND property_name = old_data.property_name);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_cproperties_extractor_mv
TO product_analytics.all_properties AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name
LEFT JOIN (SELECT project_id,
property_name,
display_name,
description,
status,
data_count,
query_count
FROM product_analytics.all_properties
WHERE (all_properties.display_name != ''
OR all_properties.description != '')
AND is_event_property) AS old_data
ON (events.project_id = old_data.project_id AND property_name = old_data.property_name);
CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples
(
project_id UInt16,
property_name String,
is_event_property BOOL,
value String,
_timestamp DateTime DEFAULT now()
)
ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv
REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS
SELECT project_id,
property_name,
TRUE AS is_event_property,
JSONExtractString(toString(`$properties`), property_name) AS value
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name
UNION ALL
SELECT project_id,
property_name,
TRUE AS is_event_property,
JSONExtractString(toString(`properties`), property_name) AS value
FROM product_analytics.events
ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name
WHERE randCanonical() < 0.5 -- This randomly skips inserts
AND value != ''
LIMIT 2 BY project_id,property_name;

View file

@ -143,6 +143,18 @@ CREATE TABLE IF NOT EXISTS experimental.sessions
TTL datetime + INTERVAL 1 MONTH
SETTINGS index_granularity = 512;
CREATE TABLE IF NOT EXISTS experimental.user_favorite_sessions
(
project_id UInt16,
user_id UInt32,
session_id UInt64,
_timestamp DateTime DEFAULT now(),
sign Int8
) ENGINE = CollapsingMergeTree(sign)
PARTITION BY toYYYYMM(_timestamp)
ORDER BY (project_id, user_id, session_id)
TTL _timestamp + INTERVAL 3 MONTH;
CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions
(
project_id UInt16,
@ -154,6 +166,17 @@ CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions
ORDER BY (project_id, user_id, session_id)
TTL _timestamp + INTERVAL 3 MONTH;
CREATE TABLE IF NOT EXISTS experimental.user_viewed_errors
(
project_id UInt16,
user_id UInt32,
error_id String,
_timestamp DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(_timestamp)
PARTITION BY toYYYYMM(_timestamp)
ORDER BY (project_id, user_id, error_id)
TTL _timestamp + INTERVAL 3 MONTH;
CREATE TABLE IF NOT EXISTS experimental.issues
(
project_id UInt16,
@ -673,7 +696,8 @@ CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples
ENGINE = ReplacingMergeTree(_timestamp)
ORDER BY (project_id, property_name, is_event_property);
-- Incremental materialized view to get random examples of property values using $properties & properties
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mvREFRESHEVERY30HOURTOproduct_analytics.property_values_samples AS
CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv
REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS
SELECT project_id,
property_name,
TRUE AS is_event_property,