Skip to content

Instantly share code, notes, and snippets.

@sebastian-nagel
Last active November 9, 2022 22:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebastian-nagel/beb244bf1f7092a06a1479335a5e268b to your computer and use it in GitHub Desktop.
Save sebastian-nagel/beb244bf1f7092a06a1479335a5e268b to your computer and use it in GitHub Desktop.
Simple spam detection for the Common Search host-level page rank list: detect blocks of hosts with similar rank and similar host names, which possibly form link farms
import fileinput
import sys
import tldextract
from _collections import defaultdict
from math import log
# Tuning parameters of the block-based spam heuristic.

# Relative rank tolerance: a new block is started once the first buffered
# rank exceeds the current rank by more than this fraction, and a host pair
# only adds a positive score if their relative rank difference is below it.
RANK_DIVERGENCE_THR = 0.02
# Relative deviation of a host name's length from the block's average length
# above which a new block is started (unless the domain equals that of the
# last buffered host).
HOST_LENGTH_DIVERGENCE_THR = 0.15
# Weight of the absolute host name length difference in the divisor that
# dampens a pairwise score.
LEN_NORM = .2
# Accumulated score above which a host is marked as spam; with FAST_DECISION
# scoring of a host stops early at twice this value.
SPAM_SKIP_THR = .2
MIN_PAGERANK = 0.0 # for testing or to select only higher ranking hosts
# If true, stop comparing a host against the rest of its block as soon as
# its score exceeds 2 * SPAM_SKIP_THR.
FAST_DECISION = True
# Event counters keyed by name ('total', 'bad line', 'bad host', 'spam',
# 'good'), reported as statistics at the end of the run.
counters = defaultdict(int)
# Number of spam-marked hosts per registered domain.
spam_domains = defaultdict(int)
def mark_spam(buffer):
    """Score every host of one block and print a result line per host.

    ``buffer`` is a list of records
    ``[host, rank, len_host, has_number, suffix, domain]``.  Each host is
    compared with every other host of the block; pairs with nearly equal
    rank raise the score, more so if both names contain digits or share the
    domain / suffix, less so if the name lengths differ.

    One tab-separated line is printed per host:
    ``mark, score, rank, host, converted rank`` where mark is ``-`` for
    hosts considered spam (or hosts without a parsable suffix) and ``+``
    otherwise.  The global ``counters`` and ``spam_domains`` are updated.

    NOTE: ``rank`` is used as a divisor, so a rank of exactly 0 raises
    ZeroDivisionError.
    """
    for pos, (host, rank, len_host, hasNum, suffix, domain) \
            in enumerate(buffer):
        score = 0.0
        if suffix == '':
            # ip address or failed to parse
            print('-\t{}\t{}\t{}\t{}'.format(score, rank, host,
                                             convert_rank(rank)))
            counters['bad host'] += 1
            continue
        for other_pos, (_, r, l, n, s, d) in enumerate(buffer):
            # Skip the record itself (by position, not by string identity,
            # which would also skip other records sharing an interned
            # string) and records without a parsable suffix.
            if other_pos == pos or s == '':
                continue
            # positive only if the relative rank difference is below the
            # threshold, negative otherwise
            sc = RANK_DIVERGENCE_THR - abs(1.0 - abs(r/rank))
            if hasNum and n:
                # both host names contain numbers
                sc *= 2.0
            if domain == d:
                # same domain
                sc *= 2.0
            elif suffix == s:
                # from same tld
                sc *= 1.2
            # decrease score if lengths differ
            sc /= (1.0 + LEN_NORM * abs(len_host-l))
            score += sc
            if FAST_DECISION and score > (2.0*SPAM_SKIP_THR):
                # clearly above the spam threshold: no need to look further
                break
        mark = '+'
        if score > SPAM_SKIP_THR or score > (rank/2.0):
            mark = '-'
            counters['spam'] += 1
            spam_domains[domain] += 1
        else:
            counters['good'] += 1
        print('{}\t{}\t{}\t{}\t{}'.format(mark, score, rank, host,
                                          convert_rank(rank)))
def convert_rank(rank):
    """Map a page rank onto a log scale limited to the range 0.0 .. 10.0.

    The scaled value is ``log(0.85 + rank)``, i.e. a rank of 0.15 yields
    0.0.  The result is returned as a string in ``%.4e`` notation.
    """
    scaled = log(.85 + rank)
    # clamp into [0.0, 10.0]; keeping `scaled` as first argument leaves a
    # NaN untouched, exactly like a pair of </> comparisons would
    clamped = max(min(scaled, 10.0), 0.0)
    return '%.4e' % clamped
# Main script: read "host<TAB>rank" lines, group consecutive hosts with
# similar rank and similar host name length into blocks, and let mark_spam()
# score each block.  The rank comparison and the early `break` below assume
# the input is sorted by decreasing rank.
buffer = []
buffer_str_len = 0  # summed host name lengths of the current block
for line in fileinput.input():
    counters['total'] += 1
    fields = line.split('\t')
    if len(fields) != 2:
        sys.stderr.write('bad line: ' + line)
        counters['bad line'] += 1
        continue
    (host, rank) = fields
    try:
        rank = float(rank)
    except ValueError:
        # unparsable rank: report and skip the line like any other
        # malformed input instead of aborting the whole run
        sys.stderr.write('bad rank: ' + line)
        counters['bad line'] += 1
        continue
    if rank < MIN_PAGERANK:
        # all remaining hosts rank lower, stop reading
        break
    len_host = len(host)
    if len_host == 0:
        sys.stderr.write('empty host: ' + line)
        counters['bad line'] += 1
        continue
    host_contains_number = any(char.isdigit() for char in host)
    parsed_host = tldextract.extract(host)
    domain = '.'.join([parsed_host.domain, parsed_host.suffix])
    if buffer:
        avg_buffer_host_len = buffer_str_len / len(buffer)
        len_dif = abs(1.0 - (len_host / avg_buffer_host_len))
        # rank dropped too far below the first host of the block
        rank_diverged = (buffer[0][1] / rank) > (1.0 + RANK_DIVERGENCE_THR)
        # host name length deviates from the block average and the host
        # does not belong to the same domain as the previous one
        name_diverged = (len_dif > HOST_LENGTH_DIVERGENCE_THR
                         and domain != buffer[-1][5])
        if rank_diverged or name_diverged:
            # host or rank are different: start next block
            mark_spam(buffer)
            buffer = []
            buffer_str_len = 0
    buffer.append([host, rank, len_host, host_contains_number,
                   parsed_host.suffix, domain])
    buffer_str_len += len_host
# score the last (possibly empty) block
mark_spam(buffer)

# Report counters with their share of all input lines on stderr.
sys.stderr.write('statistics:\n')
for name, count in counters.items():
    percent = 100.0 * count / counters['total']
    sys.stderr.write('%8d %6.2f%%\t%s\n' % (count, percent, name))
sys.stderr.write('spam domains total: %d\n' % len(spam_domains))
sys.stderr.write('top spam domains:\n')
# Domains ordered by number of spam hosts: the top 20 go to stderr, every
# domain is additionally written to stdout as a 'D' line.
ranked_domains = sorted(spam_domains, key=spam_domains.get, reverse=True)
for domains_shown, domain in enumerate(ranked_domains, start=1):
    if domains_shown <= 20:
        sys.stderr.write('%6d\t%s\n' % (spam_domains[domain], domain))
    print('D', spam_domains[domain], domain)
#!/bin/bash
# Driver: download the host-level page rank list, sort it, mark suspected
# spam hosts and derive a seed file for Nutch from the hosts marked good.

# download data
wget https://dumps.commonsearch.org/webgraph/201606/host-level/pagerank/pagerank.txt.gz
# sort by decreasing rank and reverse host name,
# use tab as separator
#  - 1st perl: cut the trailing rank off each line and print the remaining
#    text character-reversed, followed by a tab and the rank
#  - sort: field 2 (rank) numerically descending, ties by field 1 (the
#    reversed host name) in C locale order, so names with a common ending
#    become neighbours; temporary files are compressed with gzip
#  - 2nd perl: reverse the host name back to its original form
zcat pagerank.txt.gz \
| perl -lne 's/\s+([0-9.]+)$//; print scalar reverse, "\t", $1' \
| LC_ALL=C sort --compress-program=gzip -t$'\t' -k2,2nr -k1,1 \
| perl -lne 's/\t([0-9.]+)$//; print scalar reverse, "\t", $1' \
| gzip \
> pagerank_sort.txt.gz
# try to mark spam
# (cs_despam_host_pagerank.py is presumably the Python script this driver
# ships with; it reads the sorted "host<TAB>rank" lines from stdin)
zcat pagerank_sort.txt.gz \
| LC_ALL=en_US.utf8 python3 cs_despam_host_pagerank.py \
| gzip \
> pagerank_spam_marked.txt.gz
# write seed file for Nutch
# input columns: mark, spam score, page rank, host, converted rank;
# only hosts marked "+" are emitted, as "http://<host>/" followed by a tab
# and "nutch.score=<converted rank>"
zcat pagerank_spam_marked.txt.gz \
| perl -lne '($ok, $sp, $pr, $host, $sc) = split /\s+/; print "http://", $host, "/\tnutch.score=", $sc if $ok eq "+"' \
| bzip2 \
> cs_host_pagerank_inject.txt.bz2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment