Created
January 9, 2013 13:53
-
-
Save perrywky/4493263 to your computer and use it in GitHub Desktop.
扫描discuz里的垃圾帖
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* | |
* 扫描论坛垃圾帖放入回收站 | |
* 使用levenshtein算法判断字符串相似度 | |
* 相似度超过0.85的主题,数量超过2条,则会被放入回收站 | |
* 每隔5分钟取最新的100个主题检查 | |
* 在/tmp/bbs_spam.log记录所有被删除的主题 | |
* | |
* @author Perry | |
*/ | |
/**
 * Scan the newest forum threads for near-duplicate subjects and move the
 * offenders to the recycle bin (displayorder = -1).
 *
 * Every fetched subject is compared against every fetched subject (itself
 * included) with levenshtein_similarity(). When a subject has at least
 * $minimum matches scoring above $threshold, the whole group of matches is
 * scheduled for removal. Removed threads are appended to /tmp/bbs_spam.log and
 * a one-line summary is echoed to standard output.
 *
 * @return void
 * @throws PDOException when connecting or running a query fails
 */
function check(){
    $host = 'localhost';
    $dbname = 'discuz';
    $dbuser = 'root';
    $dbpw = '';
    $tablepre = 'pre_';
    $limit = 100;
    $threshold = 0.85;
    $minimum = 3;

    $dbh = new PDO(
        "mysql:host=$host;dbname=$dbname",
        $dbuser,
        $dbpw,
        array(
            PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES \'UTF8\'',
            // Surface SQL failures as exceptions instead of a `false` return
            // that would blow up later on ->fetchAll().
            PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
            // Only associative keys are read below; skip the duplicated
            // numeric columns that the default FETCH_BOTH mode would add.
            PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
        )
    );

    $stmt = $dbh->query("select tid, subject, author from {$tablepre}forum_thread where displayorder != -1 order by tid desc limit $limit");
    $new_threads = $stmt->fetchAll();
    if(empty($new_threads)){
        return;
    }

    $hide_threads = array();
    foreach($new_threads as $needle_thread){
        $group = array();
        foreach($new_threads as $thread){
            $similarity = levenshtein_similarity($needle_thread['subject'], $thread['subject']);
            if($similarity > $threshold){
                $thread['similarity'] = $similarity;
                // String keys keep array_merge() from renumbering, so a thread
                // that lands in several groups is only stored once.
                $group['tid-'.$thread['tid']] = $thread;
            }
        }
        // The needle always matches itself, so $minimum counts the needle too.
        if(count($group) >= $minimum){
            $hide_threads = array_merge($hide_threads, $group);
        }
    }

    if(empty($hide_threads)){
        return;
    }

    $date = date('Y-m-d H:i:s');
    $log = PHP_EOL . '-------------------------'.$date.'-------------------------' . PHP_EOL . 'moving these threads to trash...' . PHP_EOL;
    $tids = array();
    foreach($hide_threads as $thread){
        // Force integers: these values are spliced into the UPDATE below.
        $tids[] = (int) $thread['tid'];
        $log .= sprintf("author %s, similarity %g, %d: %s".PHP_EOL, $thread['author'], round($thread['similarity'], 2), $thread['tid'], $thread['subject']);
    }
    file_put_contents('/tmp/bbs_spam.log', $log, FILE_APPEND);
    echo sprintf('%s, moving %d threads to trash...'.PHP_EOL, $date, count($tids));
    $dbh->exec(sprintf('update %sforum_thread set displayorder = -1 where tid in (%s) limit %d', $tablepre, implode(',', $tids), $limit));
}
/**
 * Normalised Levenshtein similarity between two UTF-8 strings.
 *
 * The edit distance is counted in characters rather than bytes and divided by
 * the character length of the longer string, so the result lies between 0
 * (nothing in common) and 1 (identical). The argument order does not matter.
 *
 * @param string $str1 First string, UTF-8 encoded.
 * @param string $str2 Second string, UTF-8 encoded.
 * @return int|float 1 for identical non-empty strings, 0 when both strings
 *                   are empty, otherwise 1 - distance / longer length.
 */
function levenshtein_similarity($str1, $str2){
    $length1 = mb_strlen($str1, 'UTF-8');
    $length2 = mb_strlen($str2, 'UTF-8');

    // Keep the longer string in $str1 so the divisor below is the max length.
    if ($length1 < $length2) return levenshtein_similarity($str2, $str1);
    // After the swap above, a zero length here means both strings are empty.
    if ($length1 == 0) return 0;
    if ($str1 === $str2) return 1;

    // Split the shorter string once up front; doing it inside the nested loop
    // would repeat the same mb_substr() call for every character of $str1.
    $chars2 = array();
    for ($j = 0; $j < $length2; $j++) {
        $chars2[] = mb_substr($str2, $j, 1, 'UTF-8');
    }

    // Two-row dynamic programme: $prevRow holds the distances for the
    // previous prefix of $str1, $currentRow the one being filled in.
    $prevRow = range(0, $length2);
    for ($i = 0; $i < $length1; $i++) {
        $currentRow = array($i + 1);
        $c1 = mb_substr($str1, $i, 1, 'UTF-8');
        for ($j = 0; $j < $length2; $j++) {
            $insertions = $prevRow[$j + 1] + 1;
            $deletions = $currentRow[$j] + 1;
            $substitutions = $prevRow[$j] + (($c1 !== $chars2[$j]) ? 1 : 0);
            $currentRow[] = min($insertions, $deletions, $substitutions);
        }
        $prevRow = $currentRow;
    }

    return 1 - $prevRow[$length2] / $length1;
}
// Daemon loop: run one scan, then pause 300 seconds before the next, forever.
for (;;) {
    check();
    sleep(300);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment