Skip to content

Instantly share code, notes, and snippets.

@ampaze
Created February 5, 2024 08:07
Show Gist options
  • Save ampaze/3a22d3c2adbc1d0bbeff409966ccd132 to your computer and use it in GitHub Desktop.
Save ampaze/3a22d3c2adbc1d0bbeff409966ccd132 to your computer and use it in GitHub Desktop.
How to import Matomo tracking data into Plausible Analytics
-- Use at your own risk, I am not responsible if you lose data or break your system!
-- This is based on self hosted Plausible v2.0.0 and self hosted Matomo 5.0.1
-- EXPORT from Matomo MySQL
-- Best to start screen before running the export
-- mysql -u <user> -p <db> < plausible-matomo-import-events.sql > plausible-matomo-import-events.tsv
-- IMPORT into clickhouse
-- Again best to use screen
-- cat plausible-matomo-import-events.tsv | docker exec -i plausible-analytics-plausible_events_db-1 clickhouse-client -q "INSERT INTO plausible_events_db.events_v2 FORMAT TabSeparatedWithNames"
SELECT
va.server_time 'timestamp',
CASE a.`type`
WHEN 1 THEN 'pageview'
WHEN 2 THEN 'Outbound Link: Click'
WHEN 3 THEN 'File Download'
ELSE ''
END 'name',
CASE va.idsite
-- Match Matomo site_id with Plausible idsite
WHEN 1 THEN 1
-- WHEN 2 THEN x
END 'site_id',
-- Try to generate a user_id.
-- Matomo Visitor id is potentially valid much longer than a Plausible user_id, which updates every 24 hours
CONV(
CONCAT(
SUBSTR(HEX(v.idvisitor), 1, 12),
/* Matomo/Piwik started in 2007, so there shouldn't be any dates older than that */
LPAD(HEX(CAST(DATE_FORMAT(v.visit_first_action_time, "%Y%j") as UNSIGNED) - 2007000), 4, 0)
), 16,10
) AS user_id,
v.idvisit as session_id,
COALESCE(SUBSTRING_INDEX(
CASE a.`type`
WHEN 1 THEN COALESCE(a.name, a_entry.name)
ELSE a_ref.name
END, '/', 1
), '') 'hostname',
COALESCE(SUBSTRING(
CASE a.`type` WHEN 1 THEN COALESCE(a.name, a_entry.name)
ELSE a_ref.name
END,
LENGTH(SUBSTRING_INDEX(CASE a.`type`
WHEN 1 THEN COALESCE(a.name, a_entry.name)
ELSE a_ref.name
END, '/', 1
)) + 1
), '') 'pathname',
-- Only the first pageview has referer information
COALESCE(IF(va.`pageview_position` = 1,
TRIM(TRAILING '/' FROM
CASE WHEN v.referer_type IN (2,3,7) THEN
CASE
WHEN v.referer_url LIKE 'http://www.%' THEN SUBSTRING(v.referer_url, 12)
WHEN v.referer_url LIKE 'https://www.%' THEN SUBSTRING(v.referer_url, 13)
WHEN v.referer_url LIKE 'http://%' THEN SUBSTRING(v.referer_url, 8)
WHEN v.referer_url LIKE 'https://%' THEN SUBSTRING(v.referer_url, 9)
ELSE NULL
END
ELSE NULL
END)
, NULL)
, '') referrer,
COALESCE(IF(va.`pageview_position` = 1, v.referer_name, ''), '') referrer_source,
COALESCE(UPPER(location_country), '') country_code,
-- https://github.com/matomo-org/device-detector/blob/6.1.5/Parser/Device/AbstractDeviceParser.php#L39
-- https://github.com/plausible/analytics/blob/master/lib/plausible/ingestion/event.ex#L352
COALESCE(ELT(config_device_type + 1, 'Desktop', 'Mobile', 'Tablet',
/* 3 feature phone*/ 'Mobile',
/* 4 console */ 'Desktop',
/* 5 tv */ 'Desktop',
/* 6 car browser */ 'Tablet',
/* 8 camera */ 'Mobile',
/* 9 portable media player' */ 'Mobile',
/* 10 phablet */ 'Mobile',
/* 12 wearable */ 'Mobile'
), '') screen_size,
-- https://github.com/plausible/analytics/blob/master/priv/ua_inspector/short_codes.oss.yml
COALESCE(JSON_UNQUOTE(JSON_EXTRACT('{"AND": "Android", "WIN": "Windows", "IOS": "iOS", "MAC": "Mac", "LIN": "GNU/Linux", "UBT": "Ubuntu", "IPA": "iPadOS", "COS": "Chrome OS"}', CONCAT('$."', config_os, '"'))), '') operating_system,
COALESCE(CASE
WHEN v.config_browser_name = 'UNK' THEN NULL
WHEN LENGTH(v.config_browser_name) = 2 THEN
JSON_UNQUOTE(JSON_EXTRACT('{"FF": "Firefox", "CH": "Chrome", "MF": "Safari", "CM": "Chrome",
"SF": "Safari", "PS": "Microsoft Edge", "SB": "Samsung Browser", "FM": "Firefox", "EC": "Ecosia",
"CV": "Mobile App", "CI": "Chrome", "OP": "Opera", "EW": "Edge Webview", "MU": "MIUI Browser", "DD": "DuckDuckGo Privacy Browser",
"BR": "Brave", "OM": "Opera", "F1": "Firefox", "IE": "Internet Explorer"
}', CONCAT('$."', v.config_browser_name, '"')))
ELSE v.config_browser_name
END, '') browser,
COALESCE(campaign_medium, '') utm_medium,
COALESCE(campaign_source, '') utm_source,
COALESCE(campaign_name, '') utm_campaign,
CASE a.`type`
WHEN 1 THEN '[]' -- pageview
WHEN 2 THEN "['url']" -- outlink
WHEN 3 THEN "['url']" -- download
ELSE '[]'
END 'meta.key',
CASE a.`type`
WHEN 1 THEN '[]' -- pageview
WHEN 2 THEN CONCAT("['", a.name, "']") -- outlink
WHEN 3 THEN CONCAT("['", a.name, "']") -- download
ELSE '[]'
END 'meta.value',
COALESCE(IF(config_browser_version IN ("", "UNK"), NULL, config_browser_version), '') browser_version,
COALESCE(IF(config_os_version IN ("", "UNK"), NULL, config_os_version), '') operating_system_version,
'' subdivision1_code,
'' subdivision2_code,
0 city_geoname_id,
COALESCE(campaign_content, '') utm_content,
COALESCE(campaign_keyword, '') utm_term,
'Matomo' transferred_from
FROM `matomo_log_link_visit_action` va
JOIN `matomo_log_visit` v ON va.`idvisit` = v.idvisit
JOIN `matomo_log_action` a ON va.`idaction_url` = a.`idaction` AND a.`type` IN (1,2,3)
LEFT JOIN `matomo_log_action` a_ref ON va.`idaction_url_ref` = a_ref.`idaction` -- AND a.`type` IN (2,3)
JOIN `matomo_log_action` a_entry ON v.`visit_entry_idaction_url` = a_entry.`idaction` -- AND a.`type` IN (2,3)
WHERE
v.config_os != 'BOT'
-- Everything before the tracking started with Plausible for this site id
AND (va.idsite = 1 AND va.server_time < '2023-09-07 10:39:14')
-- Use at your own risk, I am not responsible if you lose data or break your system!
-- This is based on a self hosted Plausible v2.0.0
-- After importing the events, the next step is to the generate session data
-- run the following SQL in the clickhouse client
-- docker exec -it plausible-analytics-plausible_events_db-1 clickhouse-client
-- if you run into memory issues, try SET max_bytes_before_external_group_by = 1000000000;
INSERT INTO plausible_events_db.sessions_v2
SELECT ev.session_id, 1 sign, ev.site_id, any(ev.user_id) `user_id`,
any(ev.hostname) `hostname`,
MAX(ev.`timestamp`) `timestamp`, MIN(ev.`timestamp`) `start`,
COUNT() == 1 `is_bounce`,
argMin(ev.pathname, ev.`timestamp`) entry_page,
argMax(ev.pathname, ev.`timestamp`) exit_page,
COUNT(IF(ev.name = 'pageview', 1, 0)) pageviews,
COUNT() `events`,
timestamp_diff('s', MIN(ev.`timestamp`), MAX(ev.`timestamp`)) `duration`,
IFNULL(any(IF(empty(ev.referrer), NULL, ev.referrer)), '') `referrer`,
IFNULL(any(IF(empty(ev.referrer_source), NULL, ev.referrer_source)), '') `referrer_source`,
IFNULL(any(IF(empty(ev.country_code), NULL, ev.country_code)), '') `country_code`,
IFNULL(any(IF(empty(ev.screen_size), NULL, ev.screen_size)), '') `screen_size`,
IFNULL(any(IF(empty(ev.operating_system), NULL, ev.operating_system)), '') `operating_system`,
IFNULL(any(IF(empty(ev.browser), NULL, ev.browser)), '') `browser`,
IFNULL(any(IF(empty(ev.utm_medium), NULL, ev.utm_medium)), '') `utm_medium`,
IFNULL(any(IF(empty(ev.utm_source), NULL, ev.utm_source)), '') `utm_source`,
IFNULL(any(IF(empty(ev.utm_campaign), NULL, ev.utm_campaign)), '') `utm_campaign`,
IFNULL(any(IF(empty(ev.browser_version), NULL, ev.browser_version)), '') `browser_version`,
IFNULL(any(IF(empty(ev.operating_system_version), NULL, ev.operating_system_version)), '') `operating_system_version`,
any(ev.subdivision1_code) `subdivision1_code`,
any(ev.subdivision2_code) `subdivision2_code`,
any(ev.city_geoname_id) `city_geoname_id`,
IFNULL(any(IF(empty(ev.utm_content), NULL, ev.utm_content)), '') `utm_content`,
any(ev.utm_term) `utm_term`,
any(ev.transferred_from) `transferred_from`,
argMin(ev.`meta.key`, ev.timestamp) `entry_meta.key`,
argMin(ev.`meta.value`, ev.timestamp) `entry_meta.value`
FROM plausible_events_db.events_v2 ev
WHERE ev.transferred_from = 'Matomo'
GROUP BY ev.site_id, ev.session_id
-- Use the following SQL to query the timestamp for the first event for each site_id
-- Then use these timestamps to update the columns `stats_start_date` and `native_stats_start_at` in plausible_db.sites
-- docker exec -it plausible-analytics-plausible_events_db-1 clickhouse-client
SELECT ev.site_id, min(`timestamp`)
FROM plausible_events_db.events_v2 ev
GROUP BY ev.site_id
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment