Skip to content

Instantly share code, notes, and snippets.

@ozero
Last active August 29, 2015 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ozero/f68437e8ea14f34c8e9f to your computer and use it in GitHub Desktop.
Save ozero/f68437e8ea14f34c8e9f to your computer and use it in GitHub Desktop.
strangeworld [at]mojie ( http://mojie.s1.xrea.com/ ) archive parsing
<?php
// An attempt at turning the "@mojie" (@文字絵小屋) board's archived logs into structured data.
main();
/**
 * Script entry point.
 *
 * Recreates the `mojie` table in mojiedb.sqlite, parses every archived page
 * matching ./src/*.html with parse(), inserts the resulting posts with
 * store(), and finally dumps all parsed posts to ext.txt (print_r format)
 * and ext.json.
 *
 * The dumps contain a flat list of post records in the order the files were
 * processed, so posts whose ids repeat across files are all preserved.
 *
 * @return void
 */
function main(){
    $data = array();
    $db = new SQLite3('mojiedb.sqlite');
    // IF EXISTS: on a fresh database file there is no table to drop yet.
    $db->exec('DROP TABLE IF EXISTS mojie');
    $db->exec('CREATE TABLE mojie (
        id integer PRIMARY KEY,
        post_by varchar(255),
        post_date varchar(255),
        r_by varchar(255),
        r_to integer,
        body text
    )');

    // Example input path: "src/m=g&e=200109.dat&dl=1.cgi.html";
    $pathes = glob("./src/*.html");
    if ($pathes === false) {
        // glob() reports failure with false, which foreach cannot iterate.
        $pathes = array();
    }
    foreach ($pathes as $path) {
        $tmp = parse($path);
        store($db, $tmp);
        // Keep every parsed post so the dumps below are not empty.
        foreach ($tmp as $post) {
            $data[] = $post;
        }
    }
    $db->close();

    /*
    Spam cleanup (SQL to run by hand against the resulting database):
    delete from mojie where post_by like '%yahoo.co.jp%';
    delete from mojie where post_by like '%docomo.ne.jp%';
    delete from mojie where post_by like '%yahoo.co.uk%';
    delete from mojie where post_by like '%hotmail.co.jp%';
    delete from mojie where post_by like '%bsmpvzei.com%';
    delete from mojie where post_by like '%xe69.org%';
    delete from mojie where post_by like '%セフレ%';
    delete from mojie where post_by like '%グリー%';
    delete from mojie where post_by like '%モバゲー%';
    */

    if (file_put_contents("ext.txt", print_r($data, true)) === false) {
        echo "Error: cannot write ext.txt\n";
    }

    // The second json_encode() argument is a flags bitmask, not a boolean;
    // keep the Japanese text readable instead of \uXXXX escapes.
    $json = json_encode($data, JSON_UNESCAPED_UNICODE);
    if ($json === false) {
        echo "Error: json_encode() failed: " . json_last_error_msg() . "\n";
    } elseif (file_put_contents("ext.json", $json) === false) {
        echo "Error: cannot write ext.json\n";
    }
    return;
}
/**
 * Inserts parsed posts into the `mojie` table inside one transaction.
 *
 * @param SQLite3 $db   Open database handle that already has the `mojie` table.
 * @param array   $data Post records as produced by parse(); each record is an
 *                      array with the keys id, by, date, r_by, r_to and body.
 * @return void
 * @throws RuntimeException When the INSERT statement cannot be prepared.
 */
function store($db, $data){
    $sql = 'INSERT INTO mojie(id, post_by, post_date, r_by, r_to, body)
        VALUES (:id, :by, :date, :r_by, :r_to, :body)';

    // Prepare once and reuse the statement for every row.
    $stmt = $db->prepare($sql);
    if ($stmt === false) {
        throw new RuntimeException('store(): cannot prepare INSERT: ' . $db->lastErrorMsg());
    }

    $db->exec('begin');
    foreach ($data as $row) {
        $stmt->bindValue(':id', $row['id'], SQLITE3_INTEGER);
        $stmt->bindValue(':by', $row['by'], SQLITE3_TEXT);
        $stmt->bindValue(':date', $row['date'], SQLITE3_TEXT);
        $stmt->bindValue(':r_by', $row['r_by'], SQLITE3_TEXT);
        // NOTE(review): an empty r_to appears to end up as 0 through the
        // INTEGER binding -- confirm whether NULL would be preferable.
        $stmt->bindValue(':r_to', $row['r_to'], SQLITE3_INTEGER);
        $stmt->bindValue(':body', $row['body'], SQLITE3_TEXT);

        // Best effort: a rejected row (e.g. duplicate id) is reported and
        // skipped so the remaining rows are still stored.
        if ($stmt->execute() === false) {
            echo "Error: insert failed for id {$row['id']}: " . $db->lastErrorMsg() . "\n";
        }
        $stmt->reset();
    }
    $db->exec('commit');
    $stmt->close();
    return;
}
/**
 * Parses one archived board page into post records.
 *
 * The file is read line by line, converted from "sjis-win" to UTF-8, and fed
 * through a small state machine: an `<A name="N"></A>` anchor opens post N,
 * the header lines up to `<PRE>` supply title, author and date, and the lines
 * up to the closing `</PRE>` form the body.
 *
 * Progress messages are printed while scanning.
 *
 * @param string $path Path of the HTML file to parse.
 * @return array Posts keyed by post id. Each post is an array with the keys
 *               'id', 'by' (author), 'date' ("Y/m/d H:i:s"-like text),
 *               'r_by' (title text of the post), 'r_to' (id taken from the
 *               reference link, or "" when there is none) and 'body' (every
 *               kept body line, each prefixed with "\n"). A post with no
 *               kept body line produces no entry. An unreadable file yields
 *               an empty array.
 */
function parse($path){
    $src0 = array();
    // Warning suppressed on purpose: the failure is reported right below.
    $handle = @fopen($path, "r");
    if (!$handle) {
        echo "Error: cannot open {$path}\n";
        return array();
    }
    // No length cap: a capped read could cut a line in the middle of a
    // multi-byte character before the encoding conversion.
    while (($buffer = fgets($handle)) !== false) {
        $src0[] = rtrim(mb_convert_encoding($buffer, "utf-8", "sjis-win"));
    }
    if (!feof($handle)) {
        echo "Error: unexpected fgets() fail\n";
    }
    fclose($handle);

    /*
    sample:
    <!-- 101 -->
    <FONT size="+1" color="#fffffe"><TT><B>>蠍一号</B></TT></FONT>
     投稿者:<TT><B> </B></TT>
     <FONT size="-1">投稿日:2001/09/22(土)00時47分15秒</FONT>
    <BLOCKQUOTE>
    <PRE>
    <FONT color="#d1d1d1">&gt; &gt; そうだったのか(゚Д゚)
    &gt; &gt; しかしそうめんにわさびってのはどうか?
    &gt; (;゚Д゚)えっ
    &gt;     そうめんにわさびって入れないもんなのか。
    &gt;     俺はラーメンにラー油と七味唐辛子を入れて食べるシトなのでわからん・・・。</FONT>
    結構メジャーだな。わさびはそばの印象が強かったんだが少々了見が狭かったな
    <A href="http://www.google.com/search?q=%82%BB%82%A4%82%DF%82%F1+%96%F2%96%A1+%82%ED%82%B3%82%D1&amp;btnG=Google+%8C%9F%8D%F5&amp;hl=ja&amp;lr=lang_ja" target="link">http://www.google.com/search?q=%82%BB%82%A4%82%DF%82%F1+%96%F2%96%A1+%82%ED%82%B3%82%D1&amp;btnG=Google+%8C%9F%8D%F5&amp;hl=ja&amp;lr=lang_ja</A>
    でもここを見ると漏れと同じくわさびで驚く香具師がいたりするな
    <A href="http://matsuri.site.ne.jp/standard/std58.htm" target="link">http://matsuri.site.ne.jp/standard/std58.htm</A>
    <A href="#100">参考:2001/09/21(金)00時44分18秒</A>
    </PRE>
    </BLOCKQUOTE>
    <HR>
    <!-- -->
    */

    $data = array();
    $state = '';            // '', 'header', 'body' or 'footer'
    $post_id = 0;
    $post_user = "";
    $post_date = "";
    $reply_user = "";
    $reply_id = "";
    $entry_open = false;    // true once the current post has an entry in $data

    foreach ($src0 as $k0 => $v0) {
        // --- Line-scan state transitions ---
        if (preg_match('#<A.name="(\d+)"></A>#', $v0, $matches) > 0) {
            $state = 'header';
            $post_id = $matches[1];
            // Start the post with clean fields so nothing carries over from
            // the previous post when a header line is missing.
            $post_user = "";
            $post_date = "";
            $reply_user = "";
            $reply_id = "";
            $entry_open = false;
            print "header ({$post_id}) at line {$k0}\n";
            continue;
        }
        if ($state === 'header' && preg_match('#<PRE>#', $v0) > 0) {
            $state = 'body';
            print "body start ({$post_id}) at line {$k0}\n";
            continue;
        }
        if ($state === 'body' && preg_match('#^</PRE>$#', $v0) > 0) {
            $state = 'footer';
            print "body end ({$post_id}) at line {$k0}\n\n";
            continue;
        }

        // --- Per-line processing ---
        if ($state === 'header') {
            // Title line, e.g.
            // <FONT size="+1" color="#fffffe"><TT><B>>komaru</B></TT></FONT>
            if (preg_match(
                '#<FONT.size="\+1".color="\#fffffe"><TT><B>(.*?)</B></TT></FONT>#', $v0, $matches) > 0) {
                $reply_user = $matches[1];
            }
            // Author line, e.g. 投稿者:<TT><B> </B></TT>
            if (preg_match('#投稿者:<TT><B>(.*?)</B></TT>#', $v0, $matches) > 0) {
                $post_user = $matches[1];
            }
            // Date line, e.g.
            // <FONT size="-1">投稿日:2001/09/22(土)00時47分15秒</FONT>
            if (preg_match('#<FONT.size="-1">投稿日:(.*?)</FONT>#', $v0, $matches) > 0) {
                // "2001/09/22(土)00時47分15秒" becomes "2001/09/22 00:47:15".
                $post_date = $matches[1];
                $post_date = preg_replace("#\(.*?\)#", " ", $post_date);
                $post_date = str_replace("時", ":", $post_date);
                $post_date = str_replace("分", ":", $post_date);
                $post_date = str_replace("秒", "", $post_date);
            }
            continue;
        }

        if ($state === 'body') {
            // Spam guard: drop lines mentioning "http" more than twice.
            if (preg_match_all("#http#", $v0, $matches) > 2) {
                continue;
            }
            // Spam guard: drop lines with no kanji, kana or ASCII
            // alphanumerics. (The original note says "half-width only lines
            // are out", but the pattern accepts ASCII alphanumerics too.)
            if (!preg_match("/[一-龠]+|[ぁ-ん]+|[ァ-ヴー]+|[a-zA-Z0-9]+/u", $v0)) {
                continue;
            }
            // Reference link to the post being replied to; remembered for
            // the whole post once seen.
            $regex = '/<A.href="#(\d+)">参考:.*?<\/A>/';
            if (preg_match($regex, $v0, $matches) > 0) {
                $reply_id = $matches[1];
            }
            if (!$entry_open) {
                $data[$post_id] = array(
                    'id' => $post_id,
                    'by' => $post_user,
                    'date' => $post_date,
                    'r_by' => $reply_user,
                    'r_to' => $reply_id,
                    'body' => '',
                );
                $entry_open = true;
            }
            // Append instead of overwrite so multi-line bodies survive.
            $data[$post_id]['r_to'] = $reply_id;
            $data[$post_id]['body'] .= "\n" . $v0;
            continue;
        }
    }
    return $data;
}
@ozero
Copy link
Author

ozero commented Jul 25, 2014

Source data & parsed data (SQLite) are here: https://copy.com/khw7dDHkNLCm

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment