Last active
December 13, 2016 07:27
-
-
Save tacktaddy/3776eb4d237dc11bbfd2dea50bb2cc57 to your computer and use it in GitHub Desktop.
iタウンページから企業情報を収集するPHPスクリプト(大分県大分市版)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Report everything except notices
error_reporting(E_ALL ^ E_NOTICE);

// Line terminator for the output.
// Change the literal `true` below to `false` to emit '<br>' for browser output;
// leave it as `true` when the output goes to a text file.
if (true) {
    $eol = PHP_EOL;
} else {
    $eol = '<br>';
}

// Column titles of the CSV output, in output order
$print_header_list = array(
    'No',
    '市',
    '町',
    '会社名',
    '住所',
    'ホームページURL',
    'ジャンル1',
    'ジャンル2',
    'ジャンル3',
    '電話番号',
    'FAXフラグ',
    'メールアドレス',
    'タウンページURL',
);

// Emit the header row
echo implode(',', $print_header_list), $eol;

// Town-page URLs that were already printed (used as a set to skip duplicates)
$printed_list = array();

// Target city: path fragment used in URLs, and the label written to the output
$target_city = 'oita/44201'; // Oita City, Oita Prefecture
$target_city_label = '大分市';

// Listing URL template (50 entries per page, kana order); %s receives the town ID
$base_url = 'http://itp.ne.jp/' . $target_city . '/%s/genre_dir/?num=50&nad=1&st=4&ngr=1&sr=1';
// Town list: town ID => town name.
// The ID is substituted into $base_url; the name is written to the output as-is.
$townid_list = array(
    '44201459' => '都町',
    '44201406' => '府内町',
    '44201287' => '中央町',
    '44201203' => '大字下郡',
    '44201510' => '萩原',
    '44201495' => '大字永興',
    '44201108' => '金池町',
    '44201410' => '大字古国府',
    '44201085' => '大字奥田',
    '44201329' => '大字中戸次',
    '44201570' => '三佐',
    '44201587' => '大字佐賀関',
    '44201310' => '豊海',
    '44201130' => '大字上宗方',
    '44201514' => '牧',
    '44201438' => '大字光吉',
    '44201368' => '大字羽屋',
    '44201282' => '大字玉沢',
    '44201104' => '大字片島',
    '44201075' => '大手町',
    '44201478' => '大字森',
    '44201221' => '城崎町',
    '44201333' => '長浜町',
    '44201481' => '大字森町',
    '44201053' => '大字荏隈',
    '44201467' => '大字三芳',
    '44201077' => '大道町',
    '44201442' => '大字皆春',
    '44201270' => '大字田尻',
    '44201357' => '大字畑中',
    '44201336' => '荷揚町',
    '44201295' => '大字津守',
    '44201460' => '大字宮崎',
    '44201359' => '大字羽田',
    '44201380' => '東春日町',
    '44201316' => '中島西',
    '44201236' => '末広町',
    '44201490' => '大字横尾',
    '44201431' => '大字三佐',
    '44201037' => '大字猪野',
    '44201568' => '賀来北',
    '44201342' => '西新地',
    '44201025' => '生石',
    '44201425' => '大字松岡',
    '44201240' => '大字勢家',
    '44201517' => '明野北',
    '44201109' => '金池南',
    '44201087' => '大字鴛野',
    '44201038' => '今津留',
    '44201469' => '向原西',
    '44201566' => '公園通り西',
    '44201553' => '下郡北',
    '44201520' => '明野東',
    '44201500' => '大字寒田',
    '44201165' => '大字小池原',
    '44201110' => '要町',
    '44201032' => '大字市',
    '44201532' => '青崎',
    '44201328' => '大字中判田',
    '44201556' => '下郡南',
    '44201414' => '弁天',
    '44201320' => '中鶴崎',
    '44201427' => '松原町',
    '44201344' => '大字西ノ洲',
    '44201262' => '高松',
    '44201297' => '大字鶴崎',
    '44201512' => '原新町',
    '44201421' => '舞鶴町',
    '44201363' => '花津留',
    '44201149' => '大字木上',
    '44201554' => '下郡中央',
    '44201254' => '高砂町',
    '44201599' => '坂ノ市中央',
    '44201569' => '賀来南',
    '44201234' => '大字城原',
    '44201073' => '大津町',
    '44201288' => '千代町',
    '44201226' => '新川町',
    '44201412' => '大字豊饒',
    '44201506' => '新貝',
    '44201379' => '東大道',
    '44201070' => '大州浜',
    '44201507' => '新栄町',
    '44201164' => '顕徳町',
    '44201436' => '三川新町',
    '44201315' => '中島中央',
    '44201264' => '高松東',
    '44201428' => '大字政所',
    '44201592' => '大字野津原',
    '44201321' => '中津留',
    '44201513' => '日吉町',
    '44201143' => '北鶴崎',
    '44201575' => '西大道',
    '44201468' => '向原沖',
    '44201383' => '東津留',
    '44201508' => '高城新町',
    '44201536' => '大在中央',
    '44201509' => '高城西町',
    '44201044' => '岩田町',
    '44201371' => '原川',
    '44201286' => '大字旦野原',
    '44201385' => '東浜',
    '44201601' => '坂ノ市南',
    '44201128' => '大字上戸次',
    '44201535' => '大在北',
    '44201100' => '大字賀来',
    '44201245' => '碩田町',
    '44201322' => '仲西町',
    '44201491' => '大字横瀬',
    '44201451' => '南鶴崎',
    '44201343' => '西鶴崎',
    '44201484' => '大字八幡',
    '44201095' => '大字小野鶴',
    '44201212' => '大字下判田',
    '44201267' => '大字竹中',
    '44201448' => '南春日町',
    '44201452' => '南津留',
    '44201362' => '花高松',
    '44201058' => '王子中町',
    '44201278' => '大字種具',
    '44201106' => '大字葛木',
    '44201537' => '大在浜',
    '44201056' => '王子北町',
    '44201549' => '横田',
    '44201435' => '三川下',
    '44201423' => '大字曲',
    '44201059' => '王子西町',
    '44201246' => '大字千歳',
    '44201555' => '下郡東',
    '44201258' => '高城本町',
    '44201172' => '寿町',
    '44201166' => '古ケ鶴',
    '44201092' => '乙津港町',
    '44201573' => '富士見が丘東',
    '44201485' => '山津町',
    '44201470' => '向原東',
    '44201283' => '田室町',
    '44201228' => '新町',
    '44201139' => '大字神崎',
    '44201501' => '大字大在',
    '44201473' => '大字廻栖野',
    '44201313' => '中春日町',
    '44201186' => '大字佐野',
    '44201434' => '三川上',
    '44201372' => '日岡',
    '44201339' => '西春日町',
    '44201299' => '寺崎町',
    '44201180' => '大字迫',
    '44201548' => '政所',
    '44201391' => '大字日吉原',
    '44201382' => '東鶴崎',
    '44201293' => '大字常行',
    '44201060' => '王子南町',
    '44201024' => '大字家島',
    '44201324' => '大字中ノ洲',
    '44201317' => '中島東',
    '44201099' => '大字海原',
    '44201340' => '錦町',
    '44201285' => '大字駄原',
    '44201572' => '富士見が丘西',
    '44201518' => '明野高尾',
    '44201163' => '大字毛井',
    '44201608' => '賀来西',
    '44201239' => '住吉町',
    '44201211' => '大字下徳丸',
    '44201595' => '大字本神崎',
    '44201476' => '元町',
    '44201353' => '大字野田',
    '44201241' => '勢家町',
    '44201112' => '大字金谷迫',
    '44201213' => '大字下戸次',
    '44201091' => '乙津町',
    '44201026' => '大字生石',
    '44201244' => '大字関園',
    '44201062' => '王子町',
    '44201567' => '大分流通業務団地',
    '44201280' => '大字田原',
    '44201564' => '法勝台',
    '44201215' => '大字下宗方',
    '44201027' => '生石港町',
    '44201562' => '徳島',
    '44201539' => '汐見',
    '44201524' => '緑が丘',
    '44201596' => '久原中央',
    '44201585' => '大字志生木',
    '44201223' => '新春日町',
    '44201550' => '横塚',
    '44201487' => '大字屋山',
    '44201424' => '大字牧',
    '44201259' => '高城南町',
    '44201505' => '城東町',
    '44201488' => '豊町',
    '44201050' => '上野町',
    '44201542' => '須賀',
    '44201350' => '大字丹生',
    '44201252' => '高崎',
    '44201167' => '大字国分',
    '44201551' => '六坊北町',
    '44201521' => '明野南',
    '44201443' => '大字南',
    '44201312' => '大字中尾',
    '44201048' => '上野丘西',
    '44201002' => '大字丹川',
    '44201457' => '大字宮河内',
    '44201294' => '大字角子原',
    '44201033' => '大字一木',
    '44201291' => '大字辻',
    '44201182' => '大字里',
    '44201561' => '小中島',
    '44201529' => '高江西',
    '44201047' => '上野丘',
    '44201296' => '大字津留',
    '44201036' => '大字一の洲',
    '44201609' => '松が丘',
    '44201545' => '角子南',
    '44201544' => '角子原',
    '44201515' => '牧上町',
    '44201534' => '王ノ瀬',
    '44201365' => '浜の市',
    '44201600' => '坂ノ市西',
    '44201598' => '小佐井',
    '44201079' => '大字岡川',
    '44201417' => '大字細',
    '44201346' => '西浜',
    '44201303' => '東野台',
    '44201589' => '大字竹矢',
    '44201565' => '公園通り',
    '44201541' => '庄境',
    '44201458' => '宮河内ハイランド',
    '44201155' => '大字久土',
    '44201057' => '王子新町',
    '44201197' => '大字志村',
    '44201576' => '大字一尺屋',
    '44201519' => '明野西',
    '44201546' => '花江川',
    '44201543' => '竹下',
    '44201540' => '志村',
    '44201237' => '大字杉原',
    '44201141' => '大字木田',
    '44201030' => '泉町',
    '44201597' => '久原南',
    '44201577' => '大字今市',
    '44201504' => 'にじが丘',
    '44201375' => '大字東上野',
    '44201090' => '大字乙津',
    '44201586' => '大字白木',
    '44201522' => '田尻グリーンハイツ',
    '44201192' => '敷戸西町',
    '44201607' => '王子山の手町',
    '44201525' => '曙台',
    '44201093' => '大字小中島',
    '44201578' => '大字入蔵',
    '44201559' => '星和台',
    '44201558' => '敷戸台',
    '44201302' => '大字東院',
    '44201261' => '大字高瀬',
    '44201593' => '大字馬場',
    '44201499' => '王子港町',
    '44201389' => 'ひばりケ丘',
    '44201358' => '大字端登',
    '44201154' => '大字口戸',
    '44201523' => 'けやき台',
    '44201502' => '大字青崎',
    '44201190' => '敷戸北町',
    '44201580' => '大字大平',
    '44201528' => '高江中央',
    '44201463' => '大字宮苑',
    '44201126' => '大字上判田',
    '44201560' => '芳河原台',
    '44201179' => '桜ケ丘',
    '44201594' => '大字福宗',
    '44201582' => '大字木佐上',
    '44201530' => '高江南',
    '44201429' => '大字丸亀',
    '44201156' => '大字久原',
    '44201138' => '大字河原内',
    '44201065' => '大字大分',
    '44201035' => '大字市尾',
    '44201602' => '里',
    '44201538' => '久原北',
    '44201531' => '高尾台',
    '44201298' => '大字鶴瀬',
    '44201290' => '大字月形',
    '44201193' => '敷戸東町',
    '44201049' => '上野丘東',
    '44201503' => '青葉台',
    '44201579' => '大字太田',
    '44201533' => '恵比寿町',
    '44201394' => '大字広内',
    '44201253' => '大字高崎',
    '44201606' => '国分新町',
    '44201603' => '小野鶴南',
    '44201591' => '大字荷尾杵',
    '44201590' => '大字辻原',
    '44201552' => '六坊南町',
    '44201547' => '浜中',
    '44201527' => '高江北',
    '44201355' => '大字萩尾',
    '44201194' => '敷戸南町',
    '44201191' => '敷戸新町',
    '44201094' => '大字鬼崎',
    '44201074' => '大字大津留',
    '44201046' => '大字上野',
    '44201001' => '青葉町',
    '44201604' => '京が丘南',
    '44201584' => '大字下原',
    '44201571' => '希望が丘',
    '44201563' => 'かたしま台',
    '44201557' => '望みが丘',
    '44201492' => '大字横田',
    '44201373' => '大字東明野',
    '44201084' => '大字奥',
    '44201526' => '藤の台',
    '44201456' => '大字宮尾',
    '44201588' => '大字高原',
    '44201583' => '大字沢田',
    '44201581' => '大字上詰',
    '44201574' => 'はなの森',
    '44201511' => '萩原緑町',
    '44201494' => '大字吉野原',
    '44201398' => '大字福良',
    '44201605' => '季の坂',
    '44201516' => '牧緑町',
    '44201393' => '大字平横瀬',
    '44201364' => '大字浜',
    '44201266' => '大字竹下',
    '44201195' => '大字志津留',
    '44201178' => '大字坂ノ市',
);
// Start of the scraping run.
// $no is the running row number written to the first output column.
$no = 1;

// Walk every town in the town list
foreach ($townid_list as $townid => $townname) {
    // Fetch the town's top page and read the level-1 genre list from it
    $town_url = sprintf($base_url, $townid);
    $genre1_list = genre_split(fgc_utf8($town_url), $target_city, $townid);

    foreach ($genre1_list as $genre1_row) {
        // Level-2 genres listed on the level-1 genre page
        $genre2_list = genre_split(fgc_utf8($genre1_row['url']), $target_city, $townid);

        foreach ($genre2_list as $genre2_row) {
            // Level-3 genres listed on the level-2 genre page
            $genre3_list = genre_split(fgc_utf8($genre2_row['url']), $target_city, $townid);

            // Each level-3 genre URL is a paged search-result listing of companies
            foreach ($genre3_list as $genre3_row) {
                // Split "<path>/?<query>" so "/pg/<n>" can be inserted in front of the query
                $url_tmp = explode('/?', $genre3_row['url'], 2);
                if (count($url_tmp) < 2) {
                    // Not in the expected shape: the paged URL cannot be built, so skip this genre
                    continue;
                }

                for ($page_cnt = 1; ; $page_cnt++) {
                    $search_result_url = $url_tmp[0] . "/pg/{$page_cnt}/?" . $url_tmp[1];
                    $search_result_src = fgc_utf8($search_result_url);

                    // A page without the result-box marker has no company entries: paging is finished
                    if (strpos($search_result_src, 'normalResultsBox') === false) {
                        break;
                    }

                    // Collect every company box on the page (at most 50 per page)
                    preg_match_all('{<div class="normalResultsBox">(.*?)</article>}us', $search_result_src, $resultbox_list);
                    $resultboxes = isset($resultbox_list[1]) ? $resultbox_list[1] : array();

                    foreach ($resultboxes as $resultbox) {
                        // Reset all per-entry fields so that nothing leaks in from the previous entry
                        $company_name = '';
                        $townpage_link = '';
                        $address = '';
                        $hp = '';
                        $email = '';
                        $tel = array('no' => '', 'fax_flg' => '');

                        if (strpos($resultbox, "blackText") !== false) {
                            // Regular pattern (blackText): company name and town-page URL
                            preg_match('{<a class="blackText" href="(.*?)" target="_blank">(.*?)</a>}us', $resultbox, $m_name);
                            $company_name = isset($m_name[2]) ? trim($m_name[2]) : '';
                            $townpage_link = "http://itp.ne.jp" . (isset($m_name[1]) ? trim($m_name[1]) : '');

                            // Already printed: skip the entry entirely
                            if (array_key_exists($townpage_link, $printed_list)) {
                                continue;
                            }

                            // Homepage URL and e-mail address come from the result box itself
                            preg_match('{URL</span>(.*?)</ br>}us', $resultbox, $m_hp);
                            $hp = isset($m_hp[1]) ? trim($m_hp[1]) : '';
                            preg_match('{EMAIL</span>(.*?)</ br>}us', $resultbox, $m_email);
                            $email = isset($m_email[1]) ? trim($m_email[1]) : '';

                            $address = get_address($resultbox);
                            $tel = get_tel($resultbox);
                        } elseif (strpos($resultbox, "blueText") !== false) {
                            // Special pattern (blueText): company name and town-page URL
                            preg_match('{<a class="blueText" href="(.*?)" target="_blank">(.*?)</a>}us', $resultbox, $m_name);
                            $company_name = isset($m_name[2]) ? trim($m_name[2]) : '';

                            // The detail-page identifier is the first segment between URL-encoded slashes
                            $m_link = array();
                            if (isset($m_name[1])) {
                                preg_match('{%2F(.*?)%2F}us', $m_name[1], $m_link);
                            }
                            $townpage_link = "http://nttbj.itp.ne.jp/" . (isset($m_link[1]) ? trim($m_link[1]) : '') . "/index.html";

                            // Already printed: skip before paying for the extra detail-page request
                            if (array_key_exists($townpage_link, $printed_list)) {
                                continue;
                            }

                            // For the special pattern, homepage URL and e-mail are read from the detail page
                            $nttbj_src = fgc_utf8($townpage_link);
                            preg_match('{<th colspan="1">URL</th>(.*?)<td colspan="3">(.*?)<A(.*?)>(.*?)</A>}us', $nttbj_src, $m_hp);
                            $hp = isset($m_hp[4]) ? trim($m_hp[4]) : '';
                            preg_match('{<th colspan="1">E-mail</th>(.*?)<td colspan="3">(.*?)<A(.*?)>(.*?)</A>}us', $nttbj_src, $m_email);
                            $email = isset($m_email[4]) ? trim($m_email[4]) : '';

                            $address = get_address($resultbox);
                            $tel = get_tel($resultbox);
                        } else {
                            // Unknown layout: emit a marker row with empty detail fields.
                            // $townpage_link stays '' so the row is never mistaken for a duplicate
                            // of the previous entry.
                            $company_name = "例外パターン";
                        }

                        // Build the CSV row; every field is quoted, and an embedded double quote
                        // is doubled so that it cannot terminate the field early.
                        $fields = array(
                            $no,
                            $target_city_label,
                            $townname,
                            $company_name,
                            $address,
                            $hp,
                            $genre1_row['text'],
                            $genre2_row['text'],
                            $genre3_row['text'],
                            $tel['no'],
                            $tel['fax_flg'],
                            $email,
                            $townpage_link,
                        );
                        $escaped = array();
                        foreach ($fields as $field) {
                            $escaped[] = str_replace('"', '""', (string) $field);
                        }
                        echo '"' . implode('","', $escaped) . '"' . $eol;

                        // Advance the row number
                        $no++;

                        // Remember the URL for duplicate detection (marker rows have no URL to remember)
                        if ($townpage_link !== '') {
                            $printed_list[$townpage_link] = 1;
                        }
                    }
                }
            }
        }
    }
}
//=========================== | |
// 関数 | |
//=========================== | |
/**
 * Fetch contents with file_get_contents() and convert them from Shift_JIS to UTF-8.
 *
 * The fetch is retried on failure. The function waits between attempts only;
 * it does not wait after the last one, because no further attempt follows.
 *
 * @param string $url            Location handed to file_get_contents().
 * @param int    $max_attempts   Maximum number of fetch attempts (default 3).
 * @param int    $retry_wait_sec Seconds to sleep between failed attempts (default 30).
 * @return string The contents as UTF-8, or '' when every attempt failed.
 */
function fgc_utf8($url, $max_attempts = 3, $retry_wait_sec = 30){
    $contents = false;
    for ($attempt = 1; $attempt <= $max_attempts; $attempt++) {
        $contents = file_get_contents($url);
        if ($contents !== false) {
            break;
        }
        // Sleep only when another attempt is still to come
        if ($attempt < $max_attempts) {
            sleep($retry_wait_sec);
        }
    }
    // All attempts failed: return an empty string explicitly instead of converting `false`
    if ($contents === false) {
        return '';
    }
    return mb_convert_encoding($contents, 'UTF-8', 'SJIS');
}
/**
 * Extract the genre links from a listing page's HTML.
 *
 * Only the part of the page from the "業種を選ぶ" heading up to the next
 * closing </div> is searched. Within it, every genre-narrowing anchor for the
 * given city/town is turned into one result entry.
 *
 * @param string $htmlsrc     HTML source (UTF-8).
 * @param string $target_city City path fragment as it appears in the links (e.g. "oita/44201").
 * @param string $townid      Town ID as it appears in the links.
 * @return array List of entries, each array('slug' => ..., 'url' => ..., 'text' => ...);
 *               empty when the genre section or its links are not found.
 */
function genre_split($htmlsrc, $target_city, $townid){
    $ret = array();

    // No genre section on the page (e.g. the fetch failed and the source is empty)
    if (!preg_match('{業種を選ぶ(.*?)</div>}us', (string) $htmlsrc, $match_list)) {
        return $ret;
    }
    $genre_str = $match_list[0];

    // Literal start of a genre anchor; quoted as a whole so that neither the fixed text
    // nor the city/town values can be interpreted as regex syntax.
    $link_prefix = '<a href="javascript:void(0)" class="evfd" onclick="ITPPcListingSearchGenreNarrowingLink(\'2\', \'/' . $target_city . '/' . $townid . '/genre_dir/';
    $pattern = '~' . preg_quote($link_prefix, '~') . '(.*?)</a>~us';
    if (!preg_match_all($pattern, $genre_str, $match_item)) {
        return $ret;
    }

    // What remains of each anchor is "<slug><separator><label>"
    $separator = '/?num=50&nad=1&st=4&ngr=1&sr=1\');return false;">';
    foreach ($match_item[1] as $item) {
        $genre_info = explode($separator, $item);
        if (count($genre_info) < 2) {
            // Anchor is not in the expected shape; there is no usable slug/label pair
            continue;
        }
        $ret[] = array(
            'slug' => $genre_info[0],
            'url'  => sprintf("http://itp.ne.jp/%s/%s/genre_dir/%s/?num=50&nad=1&st=4&ngr=1&sr=1", $target_city, $townid, $genre_info[0]),
            'text' => $genre_info[1],
        );
    }
    return $ret;
}
/**
 * Extract the address from one company result box.
 *
 * @param string $str HTML fragment of a result box (UTF-8).
 * @return string The trimmed text captured after the "住所" label, or '' when
 *                the label is not found.
 */
function get_address($str){
    // No address label in this box: return an empty value instead of reading a missing match
    if (!preg_match('{住所</span> (.*?) <}us', (string) $str, $m_address)) {
        return '';
    }
    return trim($m_address[1]);
}
/**
 * Extract the telephone number and the FAX marker from one company result box.
 *
 * @param string $str HTML fragment of a result box (UTF-8).
 * @return array array('no' => number text taken from the <b> element after the TEL label,
 *               'fax_flg' => 'FAX' when the text between the label and the number
 *               contains "F専", otherwise ''). Both values are '' when no TEL label
 *               is found.
 */
function get_tel($str){
    $tel = array('no' => '', 'fax_flg' => '');

    // No telephone label in this box: return the empty defaults
    if (!preg_match('{TEL</span>(.*?)<b>(.*?)</b>}us', (string) $str, $m_tel)) {
        return $tel;
    }

    // Telephone number
    $tel['no'] = trim($m_tel[2]);
    // FAX marker
    if (strpos($m_tel[1], "F専") !== false) {
        $tel['fax_flg'] = 'FAX';
    }
    return $tel;
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Scraping script from https://itp.ne.jp/
Learn more -> https://takahashi-it.com/php/itownpage-scraping-script/