Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
扫描discuz里的垃圾帖
<?php
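/*
* Scans the forum for spam threads and moves them to the recycle bin.
* String similarity is measured with the Levenshtein algorithm.
* Threads whose subject similarity exceeds 0.85, when there are more than 2 of them, are moved to the recycle bin.
* Every 5 minutes the newest 100 threads are checked.
* Every removed thread is recorded in /tmp/bbs_spam.log.
*
* @author Perry
*/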
/**
 * Scan the newest forum threads and move groups of near-duplicate subjects to the recycle bin.
 *
 * Loads the latest $limit threads whose displayorder is not -1 and compares every
 * subject with every other one via levenshtein_similarity(). Whenever a thread has
 * at least $minimum threads above $threshold (a non-empty subject matches itself,
 * so the thread normally counts towards its own group), the whole group gets
 * displayorder = -1. Each affected thread is appended to /tmp/bbs_spam.log and a
 * one-line summary is echoed.
 *
 * @return void
 * @throws PDOException when connecting or running a statement fails
 */
function check(){
    // NOTE(review): connection settings are hard-coded (root, empty password) — confirm for the target host.
    $host = 'localhost';
    $dbname = 'discuz';
    $dbuser = 'root';
    $dbpw = '';
    $tablepre = 'pre_';
    $limit = 100;
    $threshold = 0.85;
    $minimum = 3;

    $dbh = new PDO(
        "mysql:host=$host;dbname=$dbname",
        $dbuser,
        $dbpw,
        array(
            PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES \'UTF8\'',
            // Without this, PHP < 8 returns false from a failed query(): the SELECT
            // would then fatal on fetchAll() and a failed UPDATE would go unnoticed.
            PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
        )
    );

    $stmt = $dbh->query(
        "select tid, subject, author from {$tablepre}forum_thread where displayorder != -1 order by tid desc limit " . (int) $limit
    );
    // Only named columns are used below; FETCH_ASSOC avoids the duplicated numeric columns of FETCH_BOTH.
    $new_threads = $stmt->fetchAll(PDO::FETCH_ASSOC);
    if (empty($new_threads)) {
        return;
    }

    $hide_threads = array();
    foreach ($new_threads as $needle_thread) {
        $group = array();
        foreach ($new_threads as $thread) {
            $similarity = levenshtein_similarity($needle_thread['subject'], $thread['subject']);
            if ($similarity > $threshold) {
                $thread['similarity'] = $similarity;
                // String keys: array_merge() below overwrites equal string keys instead of
                // renumbering them, so each thread ends up in $hide_threads only once.
                $group['tid-' . $thread['tid']] = $thread;
            }
        }
        if (count($group) >= $minimum) {
            $hide_threads = array_merge($hide_threads, $group);
        }
    }

    if (empty($hide_threads)) {
        return;
    }

    $date = date('Y-m-d H:i:s');
    $log = PHP_EOL . '-------------------------' . $date . '-------------------------' . PHP_EOL
        . 'moving these threads to trash...' . PHP_EOL;
    $tids = array();
    foreach ($hide_threads as $thread) {
        $tids[] = (int) $thread['tid'];
        $log .= sprintf(
            "author %s, similarity %g, %d: %s" . PHP_EOL,
            $thread['author'],
            round($thread['similarity'], 2),
            $thread['tid'],
            $thread['subject']
        );
    }
    file_put_contents('/tmp/bbs_spam.log', $log, FILE_APPEND);
    printf('%s, moving %d threads to trash...' . PHP_EOL, $date, count($tids));

    // Bind the ids as parameters instead of splicing them into the SQL text.
    $placeholders = implode(',', array_fill(0, count($tids), '?'));
    $update = $dbh->prepare(
        "update {$tablepre}forum_thread set displayorder = -1 where tid in ($placeholders) limit " . (int) $limit
    );
    $update->execute($tids);
}
/**
 * Similarity of two UTF-8 strings based on their Levenshtein edit distance.
 *
 * The result is 1 - distance / (character length of the longer string), where
 * lengths and edits are counted in UTF-8 characters rather than bytes.
 *
 * @param string $str1 first string
 * @param string $str2 second string
 * @return int|float 1 for identical non-empty strings, 0 when both strings are
 *                   empty, otherwise 1 - distance / longer length
 */
function levenshtein_similarity($str1, $str2){
    $length1 = mb_strlen($str1, 'UTF-8');
    $length2 = mb_strlen($str2, 'UTF-8');

    // Keep the longer string in $str1 so the result is normalised by the longer length.
    if ($length1 < $length2) {
        return levenshtein_similarity($str2, $str1);
    }
    // $length1 is the longer length, so 0 here means both strings are empty.
    if ($length1 === 0) {
        return 0;
    }
    if ($str1 === $str2) {
        return 1;
    }

    // Split $str2 into characters once; calling mb_substr() inside the inner loop
    // re-extracted the same characters for every row of the table.
    $chars2 = array();
    for ($j = 0; $j < $length2; $j++) {
        $chars2[] = mb_substr($str2, $j, 1, 'UTF-8');
    }

    // Two-row dynamic programme: $prevRow holds distances for the first $i characters of $str1.
    $prevRow = range(0, $length2);
    for ($i = 0; $i < $length1; $i++) {
        $c1 = mb_substr($str1, $i, 1, 'UTF-8');
        $currentRow = array($i + 1);
        for ($j = 0; $j < $length2; $j++) {
            $insertions = $prevRow[$j + 1] + 1;
            $deletions = $currentRow[$j] + 1;
            $substitutions = $prevRow[$j] + (($c1 !== $chars2[$j]) ? 1 : 0);
            $currentRow[] = min($insertions, $deletions, $substitutions);
        }
        $prevRow = $currentRow;
    }

    return 1 - $prevRow[$length2] / $length1;
}
// Daemon loop: run one scan, pause 300 seconds, and repeat indefinitely.
do {
    check();
    sleep(300);
} while (true);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment