Skip to content

Instantly share code, notes, and snippets.

@adamsilverstein
Created November 27, 2023 19:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adamsilverstein/0c52afe02e61bc37b04dac54842407cf to your computer and use it in GitHub Desktop.
Save adamsilverstein/0c52afe02e61bc37b04dac54842407cf to your computer and use it in GitHub Desktop.
%%bigquery bytype
WITH
embed_data AS (
SELECT
url,
JSON_EXTRACT(payload, '$._cms.wordpress.has_embed_block') AS has_embed_block,
CAST(JSON_EXTRACT(payload, '$._cms.wordpress.embed_block_count.total') AS FLOAT64)
AS embed_block_count_total,
JSON_EXTRACT(payload, '$._cms.wordpress.embed_block_count.total_by_type') AS embeds,
FROM `httparchive.pages.2023_10_01_desktop`
WHERE JSON_EXTRACT(payload, '$._cms.wordpress.has_embed_block') = 'true'
),
keys AS (
SELECT
*
FROM embed_data
CROSS JOIN UNNEST(REGEXP_EXTRACT_ALL(embeds, r'"(.*?"\s*:\d*)')) AS key_value
),
site_block_counts AS (
SELECT
url,
REGEXP_EXTRACT(key_value, r'(.*?)"\s*:\d?') AS key,
CAST(REGEXP_EXTRACT(key_value, r'.*?"\s*:(\d?)') AS INTEGER) AS value
FROM keys
),
total_sites AS (
SELECT COUNT(DISTINCT URL) AS total_sites_with_embeds
FROM embed_data
WHERE has_embed_block = 'true'
),
site_rollup AS (
SELECT
key,
AVG(value) AS average_count,
COUNT(DISTINCT url) AS site_count,
(SELECT total_sites_with_embeds FROM total_sites) AS total
FROM site_block_counts
GROUP BY key
ORDER BY site_count DESC
)
SELECT
key AS embed_type,
site_count,
site_count / total * 100 AS percent_of_sites,
average_count AS average_embeds_per_page
FROM site_rollup
@adamsilverstein
Copy link
Author

Results

index embed_type site_count percent_of_sites average_embeds_per_page
0 youtube 32166 71.75% 1.778337374867875
1 twitter 4396 9.81% 1.2570518653321183
2 vimeo 2460 5.49% 1.4341463414634126
3 embed-handler 1325 2.96% 1.5524528301886802
4 videopress 1096 2.44% 1.5885036496350347
5 spotify 1084 2.42% 1.4289667896678968
6 instagram 478 1.07% 1.6297071129707115
7 embed 267 0.60% 1.6254681647940068
8 tiktok 267 0.60% 1.4157303370786511
9 amazon 221 0.49% 1.9592760180995477
10 soundcloud 204 0.46% 1.6666666666666659
11 pinterest 175 0.39% 1.0400000000000005
12 gestor-del-servicio 167 0.37% 1.4910179640718564
13 埋め込みハンドラー 120 0.27% 1.6333333333333333
14 flickr 110 0.25% 1.890909090909091
15 facebook 93 0.21% 1.268817204301075
16 prise-en-charge-des-contenus-embarques 79 0.18% 1.4177215189873422
17 issuu 55 0.12% 1.1818181818181817
18 incorporar-manipulador 53 0.12% 1.6981132075471699
19 handler-einbetten 52 0.12% 1.634615384615385
20 handler-delloggetto-incorporato 49 0.11% 1.8571428571428572
21 rumble-com 48 0.11% 2.0625
22 genially 46 0.10% 1.4565217391304353
23 slideshare 45 0.10% 1.5777777777777784
24 insluiten-handler 42 0.09% 1.6666666666666667

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment