Skip to content

Instantly share code, notes, and snippets.

@tacktaddy
Last active December 13, 2016 07:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tacktaddy/3776eb4d237dc11bbfd2dea50bb2cc57 to your computer and use it in GitHub Desktop.
Save tacktaddy/3776eb4d237dc11bbfd2dea50bb2cc57 to your computer and use it in GitHub Desktop.
iタウンページから企業情報を収集するPHPスクリプト(大分県大分市版)
<?php
// Report every error level except notices.
error_reporting(E_ALL & ~E_NOTICE);

// Record terminator.
// Leave $plain_text_output at true when the result is saved as a text file;
// switch it to false to emit '<br>' when the result is viewed in a browser.
$plain_text_output = true;
if ($plain_text_output) {
    $eol = PHP_EOL;
} else {
    $eol = '<br>';
}

// Column titles of the CSV header row, in the order the fields are printed.
$print_header_list = array(
    "No",
    "市",
    "町",
    "会社名",
    "住所",
    "ホームページURL",
    "ジャンル1",
    "ジャンル2",
    "ジャンル3",
    "電話番号",
    "FAXフラグ",
    "メールアドレス",
    "タウンページURL",
);

// Print the header row.
echo implode(',', $print_header_list), $eol;

// Town page URLs that were already printed; used to skip duplicate companies.
$printed_list = array();

// Target city: path segment used in the site URLs, and the label printed in the "市" column.
$target_city = "oita/44201"; // Oita City, Oita Prefecture
$target_city_label = "大分市";

// Listing URL template (50 results per page, sorted in Japanese syllabary order).
// The %s placeholder receives the town ID.
$base_url = "http://itp.ne.jp/" . $target_city . "/%s/genre_dir/?num=50&nad=1&st=4&ngr=1&sr=1";
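// Town list: maps each town ID (substituted into the listing URL) to the town name printed in the "町" column.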
$townid_list = array(
'44201459' => "都町",
'44201406' => "府内町",
'44201287' => "中央町",
'44201203' => "大字下郡",
'44201510' => "萩原",
'44201495' => "大字永興",
'44201108' => "金池町",
'44201410' => "大字古国府",
'44201085' => "大字奥田",
'44201329' => "大字中戸次",
'44201570' => "三佐",
'44201587' => "大字佐賀関",
'44201310' => "豊海",
'44201130' => "大字上宗方",
'44201514' => "牧",
'44201438' => "大字光吉",
'44201368' => "大字羽屋",
'44201282' => "大字玉沢",
'44201104' => "大字片島",
'44201075' => "大手町",
'44201478' => "大字森",
'44201221' => "城崎町",
'44201333' => "長浜町",
'44201481' => "大字森町",
'44201053' => "大字荏隈",
'44201467' => "大字三芳",
'44201077' => "大道町",
'44201442' => "大字皆春",
'44201270' => "大字田尻",
'44201357' => "大字畑中",
'44201336' => "荷揚町",
'44201295' => "大字津守",
'44201460' => "大字宮崎",
'44201359' => "大字羽田",
'44201380' => "東春日町",
'44201316' => "中島西",
'44201236' => "末広町",
'44201490' => "大字横尾",
'44201431' => "大字三佐",
'44201037' => "大字猪野",
'44201568' => "賀来北",
'44201342' => "西新地",
'44201025' => "生石",
'44201425' => "大字松岡",
'44201240' => "大字勢家",
'44201517' => "明野北",
'44201109' => "金池南",
'44201087' => "大字鴛野",
'44201038' => "今津留",
'44201469' => "向原西",
'44201566' => "公園通り西",
'44201553' => "下郡北",
'44201520' => "明野東",
'44201500' => "大字寒田",
'44201165' => "大字小池原",
'44201110' => "要町",
'44201032' => "大字市",
'44201532' => "青崎",
'44201328' => "大字中判田",
'44201556' => "下郡南",
'44201414' => "弁天",
'44201320' => "中鶴崎",
'44201427' => "松原町",
'44201344' => "大字西ノ洲",
'44201262' => "高松",
'44201297' => "大字鶴崎",
'44201512' => "原新町",
'44201421' => "舞鶴町",
'44201363' => "花津留",
'44201149' => "大字木上",
'44201554' => "下郡中央",
'44201254' => "高砂町",
'44201599' => "坂ノ市中央",
'44201569' => "賀来南",
'44201234' => "大字城原",
'44201073' => "大津町",
'44201288' => "千代町",
'44201226' => "新川町",
'44201412' => "大字豊饒",
'44201506' => "新貝",
'44201379' => "東大道",
'44201070' => "大州浜",
'44201507' => "新栄町",
'44201164' => "顕徳町",
'44201436' => "三川新町",
'44201315' => "中島中央",
'44201264' => "高松東",
'44201428' => "大字政所",
'44201592' => "大字野津原",
'44201321' => "中津留",
'44201513' => "日吉町",
'44201143' => "北鶴崎",
'44201575' => "西大道",
'44201468' => "向原沖",
'44201383' => "東津留",
'44201508' => "高城新町",
'44201536' => "大在中央",
'44201509' => "高城西町",
'44201044' => "岩田町",
'44201371' => "原川",
'44201286' => "大字旦野原",
'44201385' => "東浜",
'44201601' => "坂ノ市南",
'44201128' => "大字上戸次",
'44201535' => "大在北",
'44201100' => "大字賀来",
'44201245' => "碩田町",
'44201322' => "仲西町",
'44201491' => "大字横瀬",
'44201451' => "南鶴崎",
'44201343' => "西鶴崎",
'44201484' => "大字八幡",
'44201095' => "大字小野鶴",
'44201212' => "大字下判田",
'44201267' => "大字竹中",
'44201448' => "南春日町",
'44201452' => "南津留",
'44201362' => "花高松",
'44201058' => "王子中町",
'44201278' => "大字種具",
'44201106' => "大字葛木",
'44201537' => "大在浜",
'44201056' => "王子北町",
'44201549' => "横田",
'44201435' => "三川下",
'44201423' => "大字曲",
'44201059' => "王子西町",
'44201246' => "大字千歳",
'44201555' => "下郡東",
'44201258' => "高城本町",
'44201172' => "寿町",
'44201166' => "古ケ鶴",
'44201092' => "乙津港町",
'44201573' => "富士見が丘東",
'44201485' => "山津町",
'44201470' => "向原東",
'44201283' => "田室町",
'44201228' => "新町",
'44201139' => "大字神崎",
'44201501' => "大字大在",
'44201473' => "大字廻栖野",
'44201313' => "中春日町",
'44201186' => "大字佐野",
'44201434' => "三川上",
'44201372' => "日岡",
'44201339' => "西春日町",
'44201299' => "寺崎町",
'44201180' => "大字迫",
'44201548' => "政所",
'44201391' => "大字日吉原",
'44201382' => "東鶴崎",
'44201293' => "大字常行",
'44201060' => "王子南町",
'44201024' => "大字家島",
'44201324' => "大字中ノ洲",
'44201317' => "中島東",
'44201099' => "大字海原",
'44201340' => "錦町",
'44201285' => "大字駄原",
'44201572' => "富士見が丘西",
'44201518' => "明野高尾",
'44201163' => "大字毛井",
'44201608' => "賀来西",
'44201239' => "住吉町",
'44201211' => "大字下徳丸",
'44201595' => "大字本神崎",
'44201476' => "元町",
'44201353' => "大字野田",
'44201241' => "勢家町",
'44201112' => "大字金谷迫",
'44201213' => "大字下戸次",
'44201091' => "乙津町",
'44201026' => "大字生石",
'44201244' => "大字関園",
'44201062' => "王子町",
'44201567' => "大分流通業務団地",
'44201280' => "大字田原",
'44201564' => "法勝台",
'44201215' => "大字下宗方",
'44201027' => "生石港町",
'44201562' => "徳島",
'44201539' => "汐見",
'44201524' => "緑が丘",
'44201596' => "久原中央",
'44201585' => "大字志生木",
'44201223' => "新春日町",
'44201550' => "横塚",
'44201487' => "大字屋山",
'44201424' => "大字牧",
'44201259' => "高城南町",
'44201505' => "城東町",
'44201488' => "豊町",
'44201050' => "上野町",
'44201542' => "須賀",
'44201350' => "大字丹生",
'44201252' => "高崎",
'44201167' => "大字国分",
'44201551' => "六坊北町",
'44201521' => "明野南",
'44201443' => "大字南",
'44201312' => "大字中尾",
'44201048' => "上野丘西",
'44201002' => "大字丹川",
'44201457' => "大字宮河内",
'44201294' => "大字角子原",
'44201033' => "大字一木",
'44201291' => "大字辻",
'44201182' => "大字里",
'44201561' => "小中島",
'44201529' => "高江西",
'44201047' => "上野丘",
'44201296' => "大字津留",
'44201036' => "大字一の洲",
'44201609' => "松が丘",
'44201545' => "角子南",
'44201544' => "角子原",
'44201515' => "牧上町",
'44201534' => "王ノ瀬",
'44201365' => "浜の市",
'44201600' => "坂ノ市西",
'44201598' => "小佐井",
'44201079' => "大字岡川",
'44201417' => "大字細",
'44201346' => "西浜",
'44201303' => "東野台",
'44201589' => "大字竹矢",
'44201565' => "公園通り",
'44201541' => "庄境",
'44201458' => "宮河内ハイランド",
'44201155' => "大字久土",
'44201057' => "王子新町",
'44201197' => "大字志村",
'44201576' => "大字一尺屋",
'44201519' => "明野西",
'44201546' => "花江川",
'44201543' => "竹下",
'44201540' => "志村",
'44201237' => "大字杉原",
'44201141' => "大字木田",
'44201030' => "泉町",
'44201597' => "久原南",
'44201577' => "大字今市",
'44201504' => "にじが丘",
'44201375' => "大字東上野",
'44201090' => "大字乙津",
'44201586' => "大字白木",
'44201522' => "田尻グリーンハイツ",
'44201192' => "敷戸西町",
'44201607' => "王子山の手町",
'44201525' => "曙台",
'44201093' => "大字小中島",
'44201578' => "大字入蔵",
'44201559' => "星和台",
'44201558' => "敷戸台",
'44201302' => "大字東院",
'44201261' => "大字高瀬",
'44201593' => "大字馬場",
'44201499' => "王子港町",
'44201389' => "ひばりケ丘",
'44201358' => "大字端登",
'44201154' => "大字口戸",
'44201523' => "けやき台",
'44201502' => "大字青崎",
'44201190' => "敷戸北町",
'44201580' => "大字大平",
'44201528' => "高江中央",
'44201463' => "大字宮苑",
'44201126' => "大字上判田",
'44201560' => "芳河原台",
'44201179' => "桜ケ丘",
'44201594' => "大字福宗",
'44201582' => "大字木佐上",
'44201530' => "高江南",
'44201429' => "大字丸亀",
'44201156' => "大字久原",
'44201138' => "大字河原内",
'44201065' => "大字大分",
'44201035' => "大字市尾",
'44201602' => "里",
'44201538' => "久原北",
'44201531' => "高尾台",
'44201298' => "大字鶴瀬",
'44201290' => "大字月形",
'44201193' => "敷戸東町",
'44201049' => "上野丘東",
'44201503' => "青葉台",
'44201579' => "大字太田",
'44201533' => "恵比寿町",
'44201394' => "大字広内",
'44201253' => "大字高崎",
'44201606' => "国分新町",
'44201603' => "小野鶴南",
'44201591' => "大字荷尾杵",
'44201590' => "大字辻原",
'44201552' => "六坊南町",
'44201547' => "浜中",
'44201527' => "高江北",
'44201355' => "大字萩尾",
'44201194' => "敷戸南町",
'44201191' => "敷戸新町",
'44201094' => "大字鬼崎",
'44201074' => "大字大津留",
'44201046' => "大字上野",
'44201001' => "青葉町",
'44201604' => "京が丘南",
'44201584' => "大字下原",
'44201571' => "希望が丘",
'44201563' => "かたしま台",
'44201557' => "望みが丘",
'44201492' => "大字横田",
'44201373' => "大字東明野",
'44201084' => "大字奥",
'44201526' => "藤の台",
'44201456' => "大字宮尾",
'44201588' => "大字高原",
'44201583' => "大字沢田",
'44201581' => "大字上詰",
'44201574' => "はなの森",
'44201511' => "萩原緑町",
'44201494' => "大字吉野原",
'44201398' => "大字福良",
'44201605' => "季の坂",
'44201516' => "牧緑町",
'44201393' => "大字平横瀬",
'44201364' => "大字浜",
'44201266' => "大字竹下",
'44201195' => "大字志津留",
'44201178' => "大字坂ノ市",
);
// Start crawling.
//
// Walks town -> genre level 1 -> genre level 2 -> genre level 3, then pages
// through every level-3 search listing and prints one CSV row per company
// whose town page URL has not been printed yet.

/**
 * Returns capture group $group of the first match of $pattern in $subject,
 * trimmed, or '' when the pattern does not match.
 */
function itp_match_group($pattern, $subject, $group = 1){
    if (preg_match($pattern, $subject, $m) === 1 && isset($m[$group])) {
        return trim($m[$group]);
    }
    return '';
}

/**
 * Formats one CSV record: every field is wrapped in double quotes and any
 * embedded double quote is doubled so the record stays parseable.
 */
function itp_csv_record($fields){
    $quoted = array();
    foreach ($fields as $field) {
        $quoted[] = '"' . str_replace('"', '""', (string) $field) . '"';
    }
    return implode(',', $quoted);
}

// Running row number printed in the "No" column.
$no = 1;

// Loop over every town.
foreach ($townid_list as $townid => $townname) {
    // Top page of the town; its genre section yields the level-1 genre list.
    $town_url = sprintf($base_url, $townid);
    $genre1_list = genre_split(fgc_utf8($town_url), $target_city, $townid);

    foreach ($genre1_list as $genre1_row) {
        // Level-2 genres are listed on the level-1 genre page.
        $genre2_list = genre_split(fgc_utf8($genre1_row['url']), $target_city, $townid);

        foreach ($genre2_list as $genre2_row) {
            // Level-3 genres are listed on the level-2 genre page.
            $genre3_list = genre_split(fgc_utf8($genre2_row['url']), $target_city, $townid);

            foreach ($genre3_list as $genre3_row) {
                // Split the listing URL so the page number can be inserted before the query string.
                $url_tmp = explode('/?', $genre3_row['url']);
                $url_path = $url_tmp[0];
                $url_query = isset($url_tmp[1]) ? $url_tmp[1] : '';

                $page_cnt = 1;
                while (true) {
                    $search_result_url = $url_path . "/pg/{$page_cnt}/?" . $url_query;
                    $search_result_src = fgc_utf8($search_result_url);

                    // A page without any result box marks the end of this listing.
                    if (!preg_match('{normalResultsBox}', $search_result_src)) {
                        break;
                    }

                    // Collect every company box on the page (at most 50 per page).
                    preg_match_all('{<div class="normalResultsBox">(.*?)</article>}us', $search_result_src, $resultbox_list);

                    foreach ($resultbox_list[1] as $resultbox) {
                        // Reset all per-company fields so nothing leaks in from the previous box.
                        $kind = 'other';
                        $company_name = '';
                        $townpage_link = '';
                        $hp = '';
                        $email = '';
                        $address = '';
                        $tel = array('no' => '', 'fax_flg' => '');

                        if (strpos($resultbox, "blackText") !== false) {
                            // Regular pattern (blackText): company name and town page URL.
                            if (preg_match('{<a class="blackText" href="(.*?)" target="_blank">(.*?)</a>}us', $resultbox, $m_name) === 1) {
                                $kind = 'black';
                                $company_name = trim($m_name[2]);
                                $townpage_link = "http://itp.ne.jp" . trim($m_name[1]);
                            }
                        } elseif (strpos($resultbox, "blueText") !== false) {
                            // Special pattern (blueText): the town page lives on nttbj.itp.ne.jp
                            // and its slug is URL-encoded inside the href.
                            if (preg_match('{<a class="blueText" href="(.*?)" target="_blank">(.*?)</a>}us', $resultbox, $m_name) === 1) {
                                $kind = 'blue';
                                $company_name = trim($m_name[2]);
                                $slug = itp_match_group('{%2F(.*?)%2F}us', $m_name[1]);
                                if ($slug !== '') {
                                    $townpage_link = "http://nttbj.itp.ne.jp/" . $slug . "/index.html";
                                }
                            }
                        }

                        if ($kind === 'other') {
                            // Neither known pattern could be parsed; flag the row.
                            $company_name = "例外パターン";
                        }

                        // Skip companies that were already printed (same town page URL).
                        if ($townpage_link !== '' && isset($printed_list[$townpage_link])) {
                            continue;
                        }

                        if ($kind === 'black') {
                            // Homepage URL and e-mail are embedded in the result box itself.
                            $hp = itp_match_group('{URL</span>(.*?)</ br>}us', $resultbox);
                            $email = itp_match_group('{EMAIL</span>(.*?)</ br>}us', $resultbox);
                            $address = get_address($resultbox);
                            $tel = get_tel($resultbox);
                        } elseif ($kind === 'blue') {
                            // Homepage URL and e-mail come from the company's individual page.
                            if ($townpage_link !== '') {
                                $nttbj_src = fgc_utf8($townpage_link);
                                $hp = itp_match_group('{<th colspan="1">URL</th>(.*?)<td colspan="3">(.*?)<A(.*?)>(.*?)</A>}us', $nttbj_src, 4);
                                $email = itp_match_group('{<th colspan="1">E-mail</th>(.*?)<td colspan="3">(.*?)<A(.*?)>(.*?)</A>}us', $nttbj_src, 4);
                            }
                            $address = get_address($resultbox);
                            $tel = get_tel($resultbox);
                        }

                        // Print the row.
                        echo itp_csv_record(array(
                            $no,
                            $target_city_label,
                            $townname,
                            $company_name,
                            $address,
                            $hp,
                            $genre1_row['text'],
                            $genre2_row['text'],
                            $genre3_row['text'],
                            $tel['no'],
                            $tel['fax_flg'],
                            $email,
                            $townpage_link,
                        )) . $eol;

                        // Advance the row number.
                        $no++;

                        // Remember the URL to avoid printing the same company again.
                        // Rows without a URL are not recorded, so they never suppress each other.
                        if ($townpage_link !== '') {
                            $printed_list[$townpage_link] = 1;
                        }
                    }

                    // Move on to the next result page.
                    $page_cnt++;
                }
            }
        }
    }
}
//===========================
// Helper functions
//===========================
/**
 * Reads $url with file_get_contents() and converts the content from SJIS to UTF-8.
 *
 * A failed read is retried; the function waits between attempts but not after
 * the last one, since no further attempt follows.
 *
 * @param string $url        URL (or any path file_get_contents() accepts) to read.
 * @param int    $max_tries  Maximum number of read attempts (default 3).
 * @param int    $retry_wait Seconds to sleep between failed attempts (default 30).
 * @return string The content converted to UTF-8, or '' when every attempt failed.
 */
function fgc_utf8($url, $max_tries = 3, $retry_wait = 30){
    $contents = false;
    for ($i = 0; $i < $max_tries; $i++) {
        $contents = file_get_contents($url);
        if ($contents !== false) {
            break;
        }
        // Only wait when another attempt is still to come.
        if ($i < $max_tries - 1 && $retry_wait > 0) {
            sleep($retry_wait);
        }
    }
    // Never hand boolean false to the encoder; callers treat '' as "no content".
    if ($contents === false) {
        return '';
    }
    return mb_convert_encoding($contents, 'UTF-8', 'SJIS');
}
/**
 * Extracts the genre links from the "業種を選ぶ" section of a listing page.
 *
 * @param string     $htmlsrc     UTF-8 HTML source of the page.
 * @param string     $target_city City path segment used in the link URLs (e.g. "oita/44201").
 * @param string|int $townid      Town ID used in the link URLs.
 * @return array List of arrays with the keys 'slug' (path part after /genre_dir/),
 *               'url' (absolute listing URL for that genre) and 'text' (link label).
 *               Empty when the section or its links are not found.
 */
function genre_split($htmlsrc, $target_city, $townid){
    $ret = array();

    // Restrict the search to the genre section; without it there is nothing to extract.
    if (preg_match('{業種を選ぶ(.*?)</div>}us', $htmlsrc, $match_list) !== 1) {
        return $ret;
    }
    $genre_str = $match_list[0];

    // City and town are data, not pattern syntax, so they are quoted before being
    // placed into the regular expression.
    $link_prefix = '<a href="javascript:void\(0\)" class="evfd" onclick="ITPPcListingSearchGenreNarrowingLink\(\'2\', \'/'
        . preg_quote((string) $target_city) . '/' . preg_quote((string) $townid) . '/genre_dir/';
    if (!preg_match_all('{' . $link_prefix . '(.*?)</a>}us', $genre_str, $match_item)) {
        return $ret;
    }

    // Each captured item is "<slug><separator><label>".
    $separator = '/?num=50&amp;nad=1&amp;st=4&amp;ngr=1&amp;sr=1\');return false;">';
    foreach ($match_item[1] as $item) {
        $genre_info = explode($separator, $item);
        $ret[] = array(
            'slug' => $genre_info[0],
            'url'  => sprintf("http://itp.ne.jp/%s/%s/genre_dir/%s/?num=50&nad=1&st=4&ngr=1&sr=1", $target_city, $townid, $genre_info[0]),
            'text' => isset($genre_info[1]) ? $genre_info[1] : '',
        );
    }
    return $ret;
}
/**
 * Extracts the address from a company result box.
 *
 * @param string $str HTML fragment of one result box.
 * @return string The trimmed text between "住所</span> " and the next " <",
 *                or '' when the fragment contains no such address.
 */
function get_address($str){
    if (preg_match('{住所</span> (.*?) <}us', $str, $m_address) !== 1) {
        return '';
    }
    return trim($m_address[1]);
}
/**
 * Extracts the phone number from a company result box.
 *
 * @param string $str HTML fragment of one result box.
 * @return array Array with the keys 'no' (trimmed text of the first <b> element
 *               after "TEL</span>") and 'fax_flg' ('FAX' when the text between
 *               "TEL</span>" and that <b> contains "F専", otherwise '').
 *               Both values are '' when no phone number is found.
 */
function get_tel($str){
    $tel = array('no' => '', 'fax_flg' => '');
    if (preg_match('{TEL</span>(.*?)<b>(.*?)</b>}us', $str, $m_tel) !== 1) {
        return $tel;
    }
    // Phone number.
    $tel['no'] = trim($m_tel[2]);
    // Fax-only flag.
    if (strpos($m_tel[1], "F専") !== false) {
        $tel['fax_flg'] = 'FAX';
    }
    return $tel;
}
?>
@tacktaddy
Copy link
Author

tacktaddy commented Dec 13, 2016

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment