Skip to content

Instantly share code, notes, and snippets.

@eternal-flame-AD
Last active November 6, 2024 18:02
Show Gist options
  • Save eternal-flame-AD/7c9448fcc8627247240aa3e61e61ae30 to your computer and use it in GitHub Desktop.
Save eternal-flame-AD/7c9448fcc8627247240aa3e61e61ae30 to your computer and use it in GitHub Desktop.
Misskey SPAM statistics on database
BEGIN TRANSACTION READ WRITE;
-- write helper functions to yume schema
CREATE schema IF NOT EXISTS yume;
-- is default ID
CREATE OR REPLACE FUNCTION yume.is_default_name(text) RETURNS boolean AS $$
SELECT $1 ~ '^[0-9a-z]{10}$' $$ LANGUAGE SQL IMMUTABLE LEAKPROOF;
-- base36 to int
CREATE OR REPLACE FUNCTION yume.base36_to_int(text) RETURNS bigint AS $$
DECLARE result bigint := 0;
c char;
cd int;
len int;
BEGIN
len := length($1);
FOR i IN 1..len LOOP
c := substring($1 FROM i FOR 1);
cd := ascii(c);
IF cd >= ASCII('0') AND cd <= ASCII('9')
THEN result := result * 36 + cd - ASCII('0');
ELSE
IF cd >= ASCII('a') AND cd <= ASCII('z')
THEN result := result * 36 + cd - ASCII('a') + 10;
ELSE RAISE EXCEPTION 'Invalid character in base36 string: %', c;
END IF;
END IF;
END LOOP;
RETURN result;
END;
$$ LANGUAGE plpgsql IMMUTABLE LEAKPROOF;
-- AIDX index to millis from 2000-01-01
CREATE OR REPLACE FUNCTION yume.aidx_millis(text) RETURNS bigint AS $$
SELECT yume.base36_to_int(SUBSTRING($1 FROM 1 FOR 8));
$$ LANGUAGE SQL IMMUTABLE LEAKPROOF COST 0.5;
-- AIDX index to timestamp
CREATE OR REPLACE FUNCTION yume.aidx_time(text) RETURNS timestamp AS $$
SELECT make_timestamptz(2000, 1, 1, 0, 0, 0, 'UTC') +
yume.aidx_millis($1) / 1000 * interval '1 second' +
yume.aidx_millis($1) % 1000 * interval '1 millisecond';
$$ LANGUAGE SQL IMMUTABLE LEAKPROOF COST 1;
-- hostname last component
CREATE OR REPLACE FUNCTION yume.host_last_component(text) RETURNS text AS $$
SELECT substring((select regexp_matches($1, '\.[a-z]{2,}$'))[1] FROM 2) LIMIT 1;
$$ LANGUAGE SQL IMMUTABLE LEAKPROOF;
-- TLDS with more than 1 component
CREATE TABLE IF NOT EXISTS yume.tld_overrides (
tld text PRIMARY KEY NOT NULL
);
INSERT INTO yume.tld_overrides(tld) VALUES ('.ne.jp') ON CONFLICT DO NOTHING;
INSERT INTO yume.tld_overrides(tld) VALUES ('.co.jp') ON CONFLICT DO NOTHING;
INSERT INTO yume.tld_overrides(tld) VALUES ('.or.jp') ON CONFLICT DO NOTHING;
-- convert hostname to TLD
CREATE OR REPLACE FUNCTION yume.get_tld(text) RETURNS text AS $$
WITH overridden AS (
SELECT tld FROM yume.tld_overrides WHERE substring($1 FROM length($1) - length(tld) + 1) = tld
)
SELECT
CASE
WHEN EXISTS(SELECT * FROM overridden) THEN (SELECT tld FROM overridden)
ELSE '.' || yume.host_last_component($1)
END
END;
$$ LANGUAGE SQL STABLE LEAKPROOF;
COMMIT;
BEGIN TRANSACTION READ ONLY;
-- actually query, to save to csv wrap the query without the last semicolon in '\copy ( ... ) TO '/tmp/spam.csv' WITH (FORMAT CSV, HEADER);
WITH typed AS (
SELECT
*,
(SELECT
CASE
WHEN
cw IS NULL AND
TEXT ~ '^(@[a-zA-Z0-9_]+@[a-zA-Z0-9_.]+\s*)+$' AND
array_length(note."attachedFileTypes", 1) = 1
THEN 'qr'
WHEN
cw IS NULL AND
TEXT ~ 'サイトに核を飛ばすことです!' AND
json_array_length(note."mentionedRemoteUsers"::json) >= 5 AND
array_length(note."attachedFileTypes", 1) IS NULL
THEN 'lawyer'
ELSE NULL
END
) as ty
FROM note
)
SELECT
count(*),
ty,
"user".host as src_host,
yume.get_tld("user".host) as src_tld,
date_trunc('hour', yume.aidx_time(typed."id")) AS utc_hour,
json_array_length(typed."mentionedRemoteUsers"::json) AS mentioned
FROM typed
INNER JOIN "user" ON "user".id = typed."userId"
WHERE
ty IS NOT NULL AND
yume.is_default_name("user".username) AND
"user".name IS NULL
GROUP BY utc_hour, ty, mentioned, "user".host
ORDER BY count DESC;
ROLLBACK;
library(tidyverse)
library(cowplot)
library(colorspace)
spam <- read_csv("spam.csv") %>%
mutate(time = as_datetime(utc_hour)) %>%
group_by(src_host, ty, time) %>%
summarize(total_mention = sum(count * (mentioned + 1))) %>%
group_by(src_host) %>%
mutate(src_host_count = sum(total_mention)) %>%
ungroup() %>%
mutate(src_prop = src_host_count / sum(total_mention)) %>%
mutate(src_prop_disp = map_chr(src_prop * 100, ~sprintf("%.2f%%", .x))) %>%
mutate(src_host = str_glue("{src_host} ({src_host_count}, {src_prop_disp})")) %>%
mutate(src_host = fct_reorder(src_host, -src_host_count))
spam %>%
ggplot(aes(x=time, y=total_mention, fill=src_host)) +
geom_col(position="dodge") +
scale_x_datetime(date_labels="%Y-%m-%d %H:%M", date_breaks="1 hour") +
scale_y_log10() +
scale_fill_discrete_qualitative(palette = "Set3") +
facet_wrap(~ty, ncol = 1, scale = "free_x") +
theme_cowplot(font_size = 18) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = guide_legend(ncol = 1)) +
labs(
title = "Spam I know on mi.yumechi.jp",
x = "Hour",
y = "n_post * (n_mention + 1)",
fill = "host"
)
ggsave("spam.png", width=18, height=14, dpi=300, bg="white")
count ty src_host src_tld utc_hour mentioned
57 lawyer urweibo.com .com 2024-10-08 13:00:00 14
50 lawyer urweibo.com .com 2024-10-08 13:00:00 13
48 qr mk.hoshisaki-h.com .com 2024-11-04 05:00:00 1
46 qr mk.hoshisaki-h.com .com 2024-11-04 04:00:00 1
39 qr mk.hoshisaki-h.com .com 2024-11-04 04:00:00 3
35 lawyer urweibo.com .com 2024-10-08 13:00:00 15
35 qr mk.hoshisaki-h.com .com 2024-11-04 05:00:00 3
32 qr mk.hoshisaki-h.com .com 2024-11-04 05:00:00 2
31 qr mk.hoshisaki-h.com .com 2024-11-04 04:00:00 4
29 lawyer misskey.favskey.online .online 2024-10-08 13:00:00 15
27 qr mk.hoshisaki-h.com .com 2024-11-04 04:00:00 2
26 lawyer misskey.favskey.online .online 2024-10-08 13:00:00 14
24 qr mk.hoshisaki-h.com .com 2024-11-04 05:00:00 5
24 lawyer urweibo.com .com 2024-10-08 13:00:00 12
22 qr catcore.life .life 2024-11-04 04:00:00 2
19 qr catcore.life .life 2024-11-04 04:00:00 1
19 lawyer n7.social .social 2024-10-08 13:00:00 11
18 qr mk.hoshisaki-h.com .com 2024-11-04 04:00:00 5
18 lawyer n7.social .social 2024-10-08 13:00:00 12
18 qr catcore.life .life 2024-11-04 04:00:00 3
17 lawyer ai.wiki .wiki 2024-10-08 13:00:00 10
16 qr mk.hoshisaki-h.com .com 2024-11-04 05:00:00 4
14 lawyer ai.wiki .wiki 2024-10-08 13:00:00 9
12 lawyer misskey.favskey.online .online 2024-10-08 13:00:00 13
12 lawyer comingto.org .org 2024-10-08 13:00:00 13
12 qr catcore.life .life 2024-11-04 04:00:00 4
10 lawyer a.sukazyo.cc .cc 2024-10-08 13:00:00 14
10 lawyer catcore.life .life 2024-10-08 13:00:00 14
10 qr catcore.life .life 2024-11-04 04:00:00 5
9 qr mk.hoshisaki-h.com .com 2024-11-04 03:00:00 5
9 lawyer n7.social .social 2024-10-08 13:00:00 8
8 lawyer n7.social .social 2024-10-08 13:00:00 10
8 qr mk.hoshisaki-h.com .com 2024-11-04 03:00:00 4
8 lawyer mk.hoshisaki-h.com .com 2024-10-08 13:00:00 14
7 lawyer n7.social .social 2024-10-08 13:00:00 13
7 qr catcore.life .life 2024-11-04 03:00:00 5
7 lawyer comingto.org .org 2024-10-08 13:00:00 15
6 lawyer catcore.life .life 2024-10-08 13:00:00 12
6 lawyer comingto.org .org 2024-10-08 13:00:00 14
6 lawyer a.sukazyo.cc .cc 2024-10-08 13:00:00 12
6 lawyer misskey.favskey.online .online 2024-10-08 13:00:00 12
6 lawyer urweibo.com .com 2024-10-08 14:00:00 12
6 lawyer a.sukazyo.cc .cc 2024-10-08 13:00:00 15
5 lawyer ai.wiki .wiki 2024-10-08 13:00:00 8
5 lawyer n7.social .social 2024-10-08 13:00:00 14
5 lawyer catcore.life .life 2024-10-08 13:00:00 15
4 lawyer n7.social .social 2024-10-08 14:00:00 9
4 lawyer n7.social .social 2024-10-08 13:00:00 9
4 lawyer urweibo.com .com 2024-10-08 13:00:00 11
4 lawyer a.sukazyo.cc .cc 2024-10-08 13:00:00 13
4 lawyer catcore.life .life 2024-10-08 13:00:00 13
4 lawyer mk.hoshisaki-h.com .com 2024-10-08 13:00:00 15
4 lawyer ai.wiki .wiki 2024-10-08 14:00:00 9
4 lawyer mkacg.com .com 2024-10-08 13:00:00 5
4 lawyer n7.social .social 2024-10-08 14:00:00 11
4 lawyer urweibo.com .com 2024-10-08 14:00:00 11
4 qr catcore.life .life 2024-11-04 03:00:00 4
3 lawyer mkacg.com .com 2024-10-08 13:00:00 7
3 lawyer a.sukazyo.cc .cc 2024-10-08 13:00:00 11
3 qr mk.hoshisaki-h.com .com 2024-11-04 04:00:00 0
2 lawyer ai.wiki .wiki 2024-10-08 13:00:00 7
2 qr catcore.life .life 2024-11-04 04:00:00 0
2 lawyer ai.wiki .wiki 2024-10-08 14:00:00 10
2 lawyer mk.hoshisaki-h.com .com 2024-10-08 13:00:00 12
2 lawyer mk.hoshisaki-h.com .com 2024-10-08 13:00:00 13
2 lawyer a.sukazyo.cc .cc 2024-10-08 13:00:00 10
2 lawyer n7.social .social 2024-10-08 13:00:00 7
2 qr mk.hoshisaki-h.com .com 2024-11-04 05:00:00 0
1 qr tickler.cc .cc 2024-11-05 06:00:00 1
1 lawyer msk.ilnk.info .info 2024-10-08 13:00:00 12
1 lawyer comingto.org .org 2024-10-08 13:00:00 12
1 lawyer misskey.favskey.online .online 2024-10-08 13:00:00 11
1 qr misskey.gg .gg 2024-11-04 04:00:00 1
1 lawyer comingto.org .org 2024-10-08 13:00:00 11
1 lawyer catcore.life .life 2024-10-08 13:00:00 11
1 lawyer mkacg.com .com 2024-10-08 13:00:00 10
1 lawyer comingto.org .org 2024-10-08 13:00:00 10
1 lawyer catcore.life .life 2024-10-08 13:00:00 10
1 lawyer mstdn.beer .beer 2024-10-08 13:00:00 8
1 lawyer a.sukazyo.cc .cc 2024-10-08 13:00:00 7
1 lawyer msk.ilnk.info .info 2024-10-08 13:00:00 15
1 lawyer n7.social .social 2024-10-08 13:00:00 15
1 lawyer zmi.im .im 2024-10-08 13:00:00 14
1 lawyer n7.social .social 2024-10-08 13:00:00 16
1 lawyer n7.social .social 2024-10-08 13:00:00 19
1 lawyer n7.social .social 2024-10-08 13:00:00 17
1 lawyer n7.social .social 2024-10-08 14:00:00 7
1 qr misskey.t1nyb0x.cloud .cloud 2024-11-04 05:00:00 4
1 lawyer n7.social .social 2024-10-08 13:00:00 6
1 lawyer n7.social .social 2024-10-08 14:00:00 10
1 lawyer mkacg.com .com 2024-10-08 13:00:00 6
1 qr misskey.shunrin.com .com 2024-11-04 11:00:00 0
1 lawyer a.sukazyo.cc .cc 2024-10-08 14:00:00 12
1 lawyer mk.hoshisaki-h.com .com 2024-10-08 14:00:00 12
1 lawyer n7.social .social 2024-10-08 14:00:00 12
1 qr catcore.life .life 2024-11-04 03:00:00 3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment