Created
February 5, 2024 08:07
-
-
Save ampaze/3a22d3c2adbc1d0bbeff409966ccd132 to your computer and use it in GitHub Desktop.
How to import Matomo tracking data into Plausible Analytics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Use at your own risk, I am not responsible if you lose data or break your system! | |
-- This is based on self hosted Plausible v2.0.0 and self hosted Matomo 5.0.1 | |
-- EXPORT from Matomo MySQL | |
-- Best to start screen before running the export | |
-- mysql -u <user> -p <db> < plausible-matomo-import-events.sql > plausible-matomo-import-events.tsv | |
-- IMPORT into clickhouse | |
-- Again best to use screen | |
-- cat plausible-matomo-import-events.tsv | docker exec -i plausible-analytics-plausible_events_db-1 clickhouse-client -q "INSERT INTO plausible_events_db.events_v2 FORMAT TabSeparatedWithNames" | |
SELECT | |
va.server_time 'timestamp', | |
CASE a.`type` | |
WHEN 1 THEN 'pageview' | |
WHEN 2 THEN 'Outbound Link: Click' | |
WHEN 3 THEN 'File Download' | |
ELSE '' | |
END 'name', | |
CASE va.idsite | |
-- Match Matomo site_id with Plausible idsite | |
WHEN 1 THEN 1 | |
-- WHEN 2 THEN x | |
END 'site_id', | |
-- Try to generate a user_id. | |
-- Matomo Visitor id is potentially valid much longer than a Plausible user_id, which updates every 24 hours | |
CONV( | |
CONCAT( | |
SUBSTR(HEX(v.idvisitor), 1, 12), | |
/* Matomo/Piwik started in 2007, so there shouldn't be any dates older than that */ | |
LPAD(HEX(CAST(DATE_FORMAT(v.visit_first_action_time, "%Y%j") as UNSIGNED) - 2007000), 4, 0) | |
), 16,10 | |
) AS user_id, | |
v.idvisit as session_id, | |
COALESCE(SUBSTRING_INDEX( | |
CASE a.`type` | |
WHEN 1 THEN COALESCE(a.name, a_entry.name) | |
ELSE a_ref.name | |
END, '/', 1 | |
), '') 'hostname', | |
COALESCE(SUBSTRING( | |
CASE a.`type` WHEN 1 THEN COALESCE(a.name, a_entry.name) | |
ELSE a_ref.name | |
END, | |
LENGTH(SUBSTRING_INDEX(CASE a.`type` | |
WHEN 1 THEN COALESCE(a.name, a_entry.name) | |
ELSE a_ref.name | |
END, '/', 1 | |
)) + 1 | |
), '') 'pathname', | |
-- Only the first pageview has referer information | |
COALESCE(IF(va.`pageview_position` = 1, | |
TRIM(TRAILING '/' FROM | |
CASE WHEN v.referer_type IN (2,3,7) THEN | |
CASE | |
WHEN v.referer_url LIKE 'http://www.%' THEN SUBSTRING(v.referer_url, 12) | |
WHEN v.referer_url LIKE 'https://www.%' THEN SUBSTRING(v.referer_url, 13) | |
WHEN v.referer_url LIKE 'http://%' THEN SUBSTRING(v.referer_url, 8) | |
WHEN v.referer_url LIKE 'https://%' THEN SUBSTRING(v.referer_url, 9) | |
ELSE NULL | |
END | |
ELSE NULL | |
END) | |
, NULL) | |
, '') referrer, | |
COALESCE(IF(va.`pageview_position` = 1, v.referer_name, ''), '') referrer_source, | |
COALESCE(UPPER(location_country), '') country_code, | |
-- https://github.com/matomo-org/device-detector/blob/6.1.5/Parser/Device/AbstractDeviceParser.php#L39 | |
-- https://github.com/plausible/analytics/blob/master/lib/plausible/ingestion/event.ex#L352 | |
COALESCE(ELT(config_device_type + 1, 'Desktop', 'Mobile', 'Tablet', | |
/* 3 feature phone*/ 'Mobile', | |
/* 4 console */ 'Desktop', | |
/* 5 tv */ 'Desktop', | |
/* 6 car browser */ 'Tablet', | |
/* 8 camera */ 'Mobile', | |
/* 9 portable media player' */ 'Mobile', | |
/* 10 phablet */ 'Mobile', | |
/* 12 wearable */ 'Mobile' | |
), '') screen_size, | |
-- https://github.com/plausible/analytics/blob/master/priv/ua_inspector/short_codes.oss.yml | |
COALESCE(JSON_UNQUOTE(JSON_EXTRACT('{"AND": "Android", "WIN": "Windows", "IOS": "iOS", "MAC": "Mac", "LIN": "GNU/Linux", "UBT": "Ubuntu", "IPA": "iPadOS", "COS": "Chrome OS"}', CONCAT('$."', config_os, '"'))), '') operating_system, | |
COALESCE(CASE | |
WHEN v.config_browser_name = 'UNK' THEN NULL | |
WHEN LENGTH(v.config_browser_name) = 2 THEN | |
JSON_UNQUOTE(JSON_EXTRACT('{"FF": "Firefox", "CH": "Chrome", "MF": "Safari", "CM": "Chrome", | |
"SF": "Safari", "PS": "Microsoft Edge", "SB": "Samsung Browser", "FM": "Firefox", "EC": "Ecosia", | |
"CV": "Mobile App", "CI": "Chrome", "OP": "Opera", "EW": "Edge Webview", "MU": "MIUI Browser", "DD": "DuckDuckGo Privacy Browser", | |
"BR": "Brave", "OM": "Opera", "F1": "Firefox", "IE": "Internet Explorer" | |
}', CONCAT('$."', v.config_browser_name, '"'))) | |
ELSE v.config_browser_name | |
END, '') browser, | |
COALESCE(campaign_medium, '') utm_medium, | |
COALESCE(campaign_source, '') utm_source, | |
COALESCE(campaign_name, '') utm_campaign, | |
CASE a.`type` | |
WHEN 1 THEN '[]' -- pageview | |
WHEN 2 THEN "['url']" -- outlink | |
WHEN 3 THEN "['url']" -- download | |
ELSE '[]' | |
END 'meta.key', | |
CASE a.`type` | |
WHEN 1 THEN '[]' -- pageview | |
WHEN 2 THEN CONCAT("['", a.name, "']") -- outlink | |
WHEN 3 THEN CONCAT("['", a.name, "']") -- download | |
ELSE '[]' | |
END 'meta.value', | |
COALESCE(IF(config_browser_version IN ("", "UNK"), NULL, config_browser_version), '') browser_version, | |
COALESCE(IF(config_os_version IN ("", "UNK"), NULL, config_os_version), '') operating_system_version, | |
'' subdivision1_code, | |
'' subdivision2_code, | |
0 city_geoname_id, | |
COALESCE(campaign_content, '') utm_content, | |
COALESCE(campaign_keyword, '') utm_term, | |
'Matomo' transferred_from | |
FROM `matomo_log_link_visit_action` va | |
JOIN `matomo_log_visit` v ON va.`idvisit` = v.idvisit | |
JOIN `matomo_log_action` a ON va.`idaction_url` = a.`idaction` AND a.`type` IN (1,2,3) | |
LEFT JOIN `matomo_log_action` a_ref ON va.`idaction_url_ref` = a_ref.`idaction` -- AND a.`type` IN (2,3) | |
JOIN `matomo_log_action` a_entry ON v.`visit_entry_idaction_url` = a_entry.`idaction` -- AND a.`type` IN (2,3) | |
WHERE | |
v.config_os != 'BOT' | |
-- Everything before the tracking started with Plausible for this site id | |
AND (va.idsite = 1 AND va.server_time < '2023-09-07 10:39:14') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Use at your own risk, I am not responsible if you lose data or break your system! | |
-- This is based on a self hosted Plausible v2.0.0 | |
-- After importing the events, the next step is to the generate session data | |
-- run the following SQL in the clickhouse client | |
-- docker exec -it plausible-analytics-plausible_events_db-1 clickhouse-client | |
-- if you run into memory issues, try SET max_bytes_before_external_group_by = 1000000000; | |
INSERT INTO plausible_events_db.sessions_v2 | |
SELECT ev.session_id, 1 sign, ev.site_id, any(ev.user_id) `user_id`, | |
any(ev.hostname) `hostname`, | |
MAX(ev.`timestamp`) `timestamp`, MIN(ev.`timestamp`) `start`, | |
COUNT() == 1 `is_bounce`, | |
argMin(ev.pathname, ev.`timestamp`) entry_page, | |
argMax(ev.pathname, ev.`timestamp`) exit_page, | |
COUNT(IF(ev.name = 'pageview', 1, 0)) pageviews, | |
COUNT() `events`, | |
timestamp_diff('s', MIN(ev.`timestamp`), MAX(ev.`timestamp`)) `duration`, | |
IFNULL(any(IF(empty(ev.referrer), NULL, ev.referrer)), '') `referrer`, | |
IFNULL(any(IF(empty(ev.referrer_source), NULL, ev.referrer_source)), '') `referrer_source`, | |
IFNULL(any(IF(empty(ev.country_code), NULL, ev.country_code)), '') `country_code`, | |
IFNULL(any(IF(empty(ev.screen_size), NULL, ev.screen_size)), '') `screen_size`, | |
IFNULL(any(IF(empty(ev.operating_system), NULL, ev.operating_system)), '') `operating_system`, | |
IFNULL(any(IF(empty(ev.browser), NULL, ev.browser)), '') `browser`, | |
IFNULL(any(IF(empty(ev.utm_medium), NULL, ev.utm_medium)), '') `utm_medium`, | |
IFNULL(any(IF(empty(ev.utm_source), NULL, ev.utm_source)), '') `utm_source`, | |
IFNULL(any(IF(empty(ev.utm_campaign), NULL, ev.utm_campaign)), '') `utm_campaign`, | |
IFNULL(any(IF(empty(ev.browser_version), NULL, ev.browser_version)), '') `browser_version`, | |
IFNULL(any(IF(empty(ev.operating_system_version), NULL, ev.operating_system_version)), '') `operating_system_version`, | |
any(ev.subdivision1_code) `subdivision1_code`, | |
any(ev.subdivision2_code) `subdivision2_code`, | |
any(ev.city_geoname_id) `city_geoname_id`, | |
IFNULL(any(IF(empty(ev.utm_content), NULL, ev.utm_content)), '') `utm_content`, | |
any(ev.utm_term) `utm_term`, | |
any(ev.transferred_from) `transferred_from`, | |
argMin(ev.`meta.key`, ev.timestamp) `entry_meta.key`, | |
argMin(ev.`meta.value`, ev.timestamp) `entry_meta.value` | |
FROM plausible_events_db.events_v2 ev | |
WHERE ev.transferred_from = 'Matomo' | |
GROUP BY ev.site_id, ev.session_id |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Use the following SQL to query the timestamp for the first event for each site_id | |
-- Then use these timestamps to update the columns `stats_start_date` and `native_stats_start_at` in plausible_db.sites | |
-- docker exec -it plausible-analytics-plausible_events_db-1 clickhouse-client | |
SELECT ev.site_id, min(`timestamp`) | |
FROM plausible_events_db.events_v2 ev | |
GROUP BY ev.site_id |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment