Last active
November 9, 2022 22:17
-
-
Save sebastian-nagel/beb244bf1f7092a06a1479335a5e268b to your computer and use it in GitHub Desktop.
Simple spam detection for the Common Search host-level page rank list: detects blocks of hosts with similar ranks and similar host names, which may form link farms
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fileinput | |
import sys | |
import tldextract | |
from _collections import defaultdict | |
from math import log | |
# Maximum relative rank difference tolerated inside one block of hosts;
# also the base value of the per-pair similarity score in mark_spam().
RANK_DIVERGENCE_THR = 0.02
# Maximum relative deviation of a host name's length from the average
# host name length of the current block.
HOST_LENGTH_DIVERGENCE_THR = 0.15
# Weight with which a difference in host name length dampens a pair score.
LEN_NORM = 0.2
# A host whose accumulated score exceeds this value is marked as spam.
SPAM_SKIP_THR = 0.2
# Reading stops at the first rank below this value
# (for testing or to select only higher ranking hosts).
MIN_PAGERANK = 0.0
# Stop scoring a host as soon as its score exceeds twice SPAM_SKIP_THR.
FAST_DECISION = True

# Event counters ('total', 'bad line', 'bad host', 'spam', 'good').
counters = defaultdict(int)
# Number of hosts marked as spam, keyed by domain.
spam_domains = defaultdict(int)
def mark_spam(buffer):
    """Score every host of one block and print one result line per host.

    ``buffer`` is a sequence of entries
    ``(host, rank, host length, host contains a digit, suffix, domain)``.
    Each host is compared with every other entry of the block: the closer
    the ranks, the higher the pair score.  The pair score is boosted when
    both host names contain digits and when domain or suffix are equal,
    and it is damped by the difference in host name length.

    For every host one tab-separated line is printed to stdout:
    ``mark, score, rank, host, converted rank`` where mark is ``-`` for
    spam (or unparsable hosts) and ``+`` otherwise.  The module-level
    ``counters`` and ``spam_domains`` are updated as a side effect.
    """
    for pos, entry in enumerate(buffer):
        (host, rank, len_host, hasNum, suffix, domain) = entry
        score = 0.0
        if suffix == '':
            # ip address or failed to parse: rejected, but neither counted
            # as spam nor recorded in spam_domains
            print('-\t{}\t{}\t{}\t{}'.format(score, rank, host,
                                             convert_rank(rank)))
            counters['bad host'] += 1
            continue
        for other_pos, (_, r, other_len, n, s, d) in enumerate(buffer):
            # Skip the entry itself by position, not by string identity:
            # identity of equal strings is an implementation detail.
            if other_pos == pos or s == '':
                continue
            if rank == 0.0:
                # A relative difference to a zero rank is undefined:
                # two zero ranks are treated as identical, any other
                # pair contributes nothing to the score.
                if r != 0.0:
                    continue
                rank_ratio = 1.0
            else:
                rank_ratio = r / rank
            sc = RANK_DIVERGENCE_THR - abs(1.0 - abs(rank_ratio))
            if hasNum and n:
                # both host names contain numbers
                sc *= 2.0
            if domain == d:
                # same domain
                sc *= 2.0
            elif suffix == s:
                # from same tld
                sc *= 1.2
            # decrease score if lengths differ
            sc /= (1.0 + LEN_NORM * abs(len_host - other_len))
            score += sc
            if FAST_DECISION and score > (2.0 * SPAM_SKIP_THR):
                # clearly above the threshold: no need to look further
                break
        mark = '+'
        # Spam if the score passes the absolute threshold or exceeds
        # half of the host's own rank.
        if score > SPAM_SKIP_THR or score > (rank / 2.0):
            mark = '-'
            counters['spam'] += 1
            spam_domains[domain] += 1
        else:
            counters['good'] += 1
        print('{}\t{}\t{}\t{}\t{}'.format(mark, score, rank, host,
                                          convert_rank(rank)))
def convert_rank(rank):
    """Map a page rank to a log-scaled value clamped to [0.0, 10.0].

    The value is ``log(0.85 + rank)``, so a rank of 0.15 maps to 0.0.
    The clamped value is returned as a string with four decimals in
    exponent notation.
    """
    scaled = log(rank + .85)
    if scaled < 0.0:
        scaled = 0.0
    elif scaled > 10.0:
        scaled = 10.0
    return '{:.4e}'.format(scaled)
# Main script: read tab-separated "<host>\t<rank>" lines from the files
# given on the command line (or stdin), group consecutive hosts into blocks
# of similar rank and host name length, and let mark_spam() judge each block.
# The early `break` on MIN_PAGERANK stops reading at the first low rank, so
# presumably the input is sorted by decreasing rank - confirm with the caller.
buffer = []          # entries of the current block
buffer_str_len = 0   # summed host name length of the current block
for line in fileinput.input():
    counters['total'] += 1
    fields = line.split('\t')
    if len(fields) != 2:
        sys.stderr.write('bad line: ' + line)
        counters['bad line'] += 1
        continue
    (host, rank) = fields
    try:
        rank = float(rank)
    except ValueError:
        # a malformed rank is a bad line, not a reason to abort the run
        sys.stderr.write('bad line: ' + line)
        counters['bad line'] += 1
        continue
    if rank < MIN_PAGERANK:
        break
    len_host = len(host)
    if len_host == 0:
        sys.stderr.write('empty host: ' + line)
        counters['bad line'] += 1
        continue
    host_contains_number = any(char.isdigit() for char in host)
    parsed_host = tldextract.extract(host)
    domain = '.'.join([parsed_host.domain, parsed_host.suffix])
    if buffer:
        avg_buffer_host_len = buffer_str_len / len(buffer)
        len_dif = abs(1.0 - (len_host / avg_buffer_host_len))
        first_rank = buffer[0][1]
        if rank == 0.0:
            # avoid dividing by a zero rank: the rank has dropped
            # whenever the block started with a positive rank
            rank_dropped = first_rank > 0.0
        else:
            rank_dropped = (first_rank / rank) > (1.0 + RANK_DIVERGENCE_THR)
        if rank_dropped or \
                (len_dif > HOST_LENGTH_DIVERGENCE_THR and
                 domain != buffer[-1][5]):
            # host or rank are different: start next block
            mark_spam(buffer)
            buffer = []
            buffer_str_len = 0
    buffer.append([host, rank, len_host, host_contains_number,
                   parsed_host.suffix, domain])
    buffer_str_len += len_host

# judge the last (possibly empty) block
mark_spam(buffer)

sys.stderr.write('statistics:\n')
for name, count in counters.items():
    percent = 100.0 * count / counters['total']
    sys.stderr.write('%8d %6.2f%%\t%s\n' % (count, percent, name))

sys.stderr.write('spam domains total: %d\n' % len(spam_domains))
sys.stderr.write('top spam domains:\n')
ranked_domains = sorted(spam_domains, key=spam_domains.get, reverse=True)
for domains_shown, domain in enumerate(ranked_domains, 1):
    if domains_shown <= 20:
        # only the 20 most frequent spam domains go to stderr
        sys.stderr.write('%6d\t%s\n' % (spam_domains[domain], domain))
    # NOTE(review): the original indentation is lost; this line is assumed
    # to sit at loop level, i.e. every spam domain is written to stdout.
    print('D', spam_domains[domain], domain)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download the Common Search host-level page rank list, sort it, mark
# spam hosts with cs_despam_host_pagerank.py and write a Nutch seed file.

# Abort on the first failing command (e.g. a failed download) or unset
# variable instead of silently producing empty output files.
# `pipefail` is deliberately not set: the Python script may stop reading
# its input early, which makes the upstream zcat exit with SIGPIPE.
set -eu

# download data
wget https://dumps.commonsearch.org/webgraph/201606/host-level/pagerank/pagerank.txt.gz

# sort by decreasing rank and reverse host name,
# use tab as separator
# (the first perl call splits off the rank and reverses the host name,
#  the second one reverses the host name back after sorting;
#  the pattern [0-9.]+ only matches ranks written as plain decimals)
zcat pagerank.txt.gz \
    | perl -lne 's/\s+([0-9.]+)$//; print scalar reverse, "\t", $1' \
    | LC_ALL=C sort --compress-program=gzip -t$'\t' -k2,2nr -k1,1 \
    | perl -lne 's/\t([0-9.]+)$//; print scalar reverse, "\t", $1' \
    | gzip \
    > pagerank_sort.txt.gz

# try to mark spam
zcat pagerank_sort.txt.gz \
    | LC_ALL=en_US.utf8 python3 cs_despam_host_pagerank.py \
    | gzip \
    > pagerank_spam_marked.txt.gz

# write seed file for Nutch
# (only lines marked "+" are kept; the converted rank becomes nutch.score)
zcat pagerank_spam_marked.txt.gz \
    | perl -lne '($ok, $sp, $pr, $host, $sc) = split /\s+/; print "http://", $host, "/\tnutch.score=", $sc if $ok eq "+"' \
    | bzip2 \
    > cs_host_pagerank_inject.txt.bz2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment