Last active
August 29, 2015 14:04
-
-
Save ozero/f68437e8ea14f34c8e9f to your computer and use it in GitHub Desktop.
strangeworld [at]mojie ( http://mojie.s1.xrea.com/ ) archive parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// An attempt to give structure to the past logs of the "@Mojie-goya" text-art
// board (http://mojie.s1.xrea.com/): parse the archived HTML and load it into SQLite.
main();
/**
 * Rebuild the `mojie` table in mojiedb.sqlite from every page under ./src
 * and dump the parsed posts to ext.txt (print_r form) and ext.json.
 *
 * Each ./src/*.html file is run through parse() and the result is inserted
 * with store(). The table is dropped and recreated on every run.
 *
 * @return void
 */
function main(){
    $data = array();
    $db = new SQLite3('mojiedb.sqlite');
    // IF EXISTS: on a fresh database there is no table to drop yet.
    $db->exec('DROP TABLE IF EXISTS mojie');
    $db->exec('CREATE TABLE mojie (
        id integer PRIMARY KEY,
        post_by varchar(255),
        post_date varchar(255),
        r_by varchar(255),
        r_to integer,
        body text
    )');

    // Input files look like "src/m=g&e=200109.dat&dl=1.cgi.html".
    $pathes = glob("./src/*.html");
    if($pathes === false){
        // glob() reports errors with false, which foreach cannot iterate.
        $pathes = array();
    }
    foreach($pathes as $path){
        $tmp = parse($path);
        store($db,$tmp);
        // Collect the posts for the dumps below. "+" keeps the numeric post-id
        // keys (array_merge would renumber them) and lets the first occurrence
        // of an id win, matching the PRIMARY KEY behaviour of the inserts.
        $data = $data + $tmp;
    }
    $db->close();

    /*
    Manual spam clean-up, run against the database afterwards:
    delete from mojie where post_by like '%yahoo.co.jp%';
    delete from mojie where post_by like '%docomo.ne.jp%';
    delete from mojie where post_by like '%yahoo.co.uk%';
    delete from mojie where post_by like '%hotmail.co.jp%';
    delete from mojie where post_by like '%bsmpvzei.com%';
    delete from mojie where post_by like '%xe69.org%';
    delete from mojie where post_by like '%セフレ%';
    delete from mojie where post_by like '%グリー%';
    delete from mojie where post_by like '%モバゲー%';
    */

    if(file_put_contents("ext.txt", print_r($data,true)) === false){
        echo "Error: cannot write ext.txt\n";
    }

    // The second argument of json_encode() is a flag bitmask, not a boolean;
    // the former `true` was coerced to 1, i.e. JSON_HEX_TAG, which is now
    // spelled out so the output encoding stays the same.
    $json = json_encode($data, JSON_HEX_TAG);
    if($json === false){
        echo "Error: json_encode failed (code ".json_last_error().")\n";
    }elseif(file_put_contents("ext.json", $json) === false){
        echo "Error: cannot write ext.json\n";
    }
    return;
}
/**
 * Insert parsed posts into the `mojie` table inside a single transaction.
 *
 * @param SQLite3 $db   Open connection whose database contains the `mojie` table.
 * @param array   $data Rows shaped like parse() output; each row is an array
 *                      with the keys id, by, date, r_by, r_to and body.
 * @return void
 */
function store($db, $data){
    if(empty($data)){
        // Nothing to insert; skip the empty transaction.
        return;
    }

    $stm0 = 'INSERT INTO mojie(id, post_by, post_date, r_by, r_to, body)
        VALUES (:id, :by, :date, :r_by, :r_to, :body)';

    $db->exec('begin');

    // Prepare once and re-bind per row instead of re-compiling the same SQL
    // for every post.
    $stmt = $db->prepare($stm0);
    if($stmt === false){
        echo "Error: cannot prepare insert statement: ".$db->lastErrorMsg()."\n";
        $db->exec('rollback');
        return;
    }

    foreach($data as $v0){
        $stmt->bindValue(':id', $v0['id'], SQLITE3_INTEGER);
        $stmt->bindValue(':by', $v0['by'], SQLITE3_TEXT);
        $stmt->bindValue(':date', $v0['date'], SQLITE3_TEXT);
        $stmt->bindValue(':r_by', $v0['r_by'], SQLITE3_TEXT);
        // An empty r_to (post is not a reply) is bound as integer and so is
        // stored as 0, as before.
        $stmt->bindValue(':r_to', $v0['r_to'], SQLITE3_INTEGER);
        $stmt->bindValue(':body', $v0['body'], SQLITE3_TEXT);
        $stmt->execute();
        // Make the statement ready for the next row.
        $stmt->reset();
    }

    $stmt->close();
    $db->exec('commit');
    return;
}
/**
 * Read a Shift_JIS (sjis-win) text file and return its lines as UTF-8,
 * right-trimmed. Returns an empty array when the file cannot be opened.
 *
 * @param string $path
 * @return array list of lines, 0-based
 */
function mojie_read_lines($path){
    $lines = array();
    $handle = @fopen($path, "r");
    if ($handle === false) {
        echo "Error: cannot open {$path}\n";
        return $lines;
    }
    // No length cap: a capped read could cut a multibyte character in half
    // and corrupt the encoding conversion of both pieces.
    while (($buffer = fgets($handle)) !== false) {
        $lines[] = rtrim(mb_convert_encoding($buffer,"utf-8","sjis-win"));
    }
    if (!feof($handle)) {
        echo "Error: unexpected fgets() fail\n";
    }
    fclose($handle);
    return $lines;
}

/**
 * Parse one archived board page into structured posts.
 *
 * The page is scanned line by line with a small state machine:
 *   - a line containing `<A name="N"></A>` opens post N ("header" state);
 *   - in "header", the title, poster and date lines are captured until `<PRE>`;
 *   - in "body", every kept line is appended to the post until `</PRE>`.
 *
 * Markup the patterns below are written for (one post, abbreviated):
 *   <FONT size="+1" color="#fffffe"><TT><B>TITLE</B></TT></FONT>
 *   投稿者:<TT><B>POSTER</B></TT>
 *   <FONT size="-1">投稿日:2001/09/22(土)00時47分15秒</FONT>
 *   <BLOCKQUOTE>
 *   <PRE>
 *   ...body lines...
 *   <A href="#100">参考:2001/09/21(金)00時44分18秒</A>
 *   </PRE>
 *
 * Body lines are dropped when they mention "http" three or more times, or
 * when they contain no kanji, kana or ASCII alphanumerics at all. A post with
 * no surviving body line produces no record.
 *
 * Progress messages are printed for every state change.
 *
 * @param string $path Path of a Shift_JIS encoded HTML page.
 * @return array Posts keyed by post id. Each post is an array with the keys
 *               id, by (poster), date ("Y/m/d H:i:s" style text), r_by
 *               (title field, which holds the replied-to name in replies),
 *               r_to (id from the reference anchor, or "") and body (kept
 *               lines, each prefixed with "\n").
 */
function parse($path){
    $src0 = mojie_read_lines($path);

    $data = array();
    $state = '';            // '', 'header', 'body' or 'footer'
    $post_id = 0;
    $post_user = "";
    $post_date = "";
    $reply_user = "";
    $reply_id = "";
    $record_open = false;   // true once the current post has a record in $data

    foreach($src0 as $k0=>$v0){
        // --- state transitions ---
        if(preg_match('#<A.name="(\d+)"></A>#',$v0,$matches) > 0){
            $state = 'header';
            $post_id = $matches[1];
            // Start every post from a clean slate so that fields missing in
            // this post are not inherited from the previous one.
            $post_user = "";
            $post_date = "";
            $reply_user = "";
            $reply_id = "";
            $record_open = false;
            print "header ({$post_id}) at line {$k0}\n";
            continue;
        }
        if(($state == "header") && (preg_match('#<PRE>#',$v0,$matches) > 0)){
            $state = 'body';
            print "body start ({$post_id}) at line {$k0}\n";
            continue;
        }
        if(($state == "body") && (preg_match('#^</PRE>$#',$v0,$matches) > 0)){
            $state = 'footer';
            print "body end ({$post_id}) at line {$k0}\n\n";
            continue;
        }

        // --- per-line processing ---
        if($state == "header"){
            // e.g. <FONT size="+1" color="#fffffe"><TT><B>>komaru</B></TT></FONT>
            if(preg_match(
                '#<FONT.size="\+1".color="\#fffffe"><TT><B>(.*?)</B></TT></FONT>#',$v0,$matches) > 0){
                $reply_user = $matches[1];
            }
            // Poster name line.
            if(preg_match('#投稿者:<TT><B>(.*?)</B></TT>#',$v0,$matches) > 0){
                $post_user = $matches[1];
            }
            // Date line: "2001/09/22(土)00時47分15秒" becomes "2001/09/22 00:47:15".
            if(preg_match('#<FONT.size="-1">投稿日:(.*?)</FONT>#',$v0,$matches) > 0){
                $post_date = $matches[1];
                $post_date = preg_replace("#\(.*?\)#"," ",$post_date);
                $post_date = str_replace("時",":",$post_date);
                $post_date = str_replace("分",":",$post_date);
                $post_date = str_replace("秒","",$post_date);
            }
            continue;
        }

        if($state == "body"){
            // Spam guard: drop a line that mentions "http" three or more times.
            if(preg_match_all("#http#",$v0,$matches) > 2){
                continue;
            }
            // Drop a line that has no kanji, kana or ASCII alphanumerics
            // (a failed match on invalid UTF-8 is dropped as well).
            if(!preg_match("/[一-龠]+|[ぁ-ん]+|[ァ-ヴー]+|[a-zA-Z0-9]+/u",$v0)){
                continue;
            }
            // Reference anchor pointing at the post this one replies to.
            // Once found it is kept for the rest of the post.
            if(preg_match('/<A.href="#(\d+)">参考:.*?<\/A>/',$v0,$matches) > 0){
                $reply_id = $matches[1];
            }

            if(!$record_open){
                $data[$post_id] = array(
                    'id'=>$post_id,
                    'by'=>$post_user,
                    'date'=>$post_date,
                    'r_by'=>$reply_user,
                    'r_to'=>$reply_id,
                    'body'=>'',
                );
                $record_open = true;
            }
            $data[$post_id]['r_to'] = $reply_id;
            // Append instead of overwrite, so the whole body is kept rather
            // than only its last line.
            $data[$post_id]['body'] .= "\n".$v0;
            continue;
        }
    }
    return $data;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Source data & parsed data (SQLite) are here: https://copy.com/khw7dDHkNLCm