Skip to content

Instantly share code, notes, and snippets.

@guandaxia
Last active April 12, 2020 13:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save guandaxia/c9dcbee31d3d865402cf52df5b9c41f2 to your computer and use it in GitHub Desktop.
Save guandaxia/c9dcbee31d3d865402cf52df5b9c41f2 to your computer and use it in GitHub Desktop.
58同城租房爬虫——基于phpspider|-|{"files":{"58.php":{"env":"plain"}},"tag":"Uncategorized"}
<?php
// Raise PHP's memory limit for this script before the framework is loaded.
ini_set('memory_limit', '1024M');
// Load the phpspider bootstrap that lives one directory above this script.
require __DIR__ . '/../core/init.php';
/* Do NOT delete this comment */
/* 不要删除这段注释 */
// NOTE(review): the two marker comments above are kept byte-for-byte because
// they explicitly ask not to be removed; the second one repeats the first in Chinese.

// Crawl configuration for rental listings on 58.com, Tianjin sub-site (tj.58.com).
$configs = [
    'name' => '58租房',
    'log_show' => false,
    // 'log_type' => 'info, warn, error',
    'tasknum' => 5,
    'interval' => 1000,
    // Extracted records are exported as SQL statements for the `zufang` table.
    'export' => [
        'type' => 'sql',
        'file' => PATH_DATA . '/58.sql',
        'table' => 'zufang',
    ],
    'max_try' => 5,
    // 'proxy' => 'http://115.202.167.238:808',
    // 'save_running_state' => true,
    'domains' => [
        'tj.58.com',
    ],
    // Entry point of the crawl: the first page of the rental listing.
    'scan_urls' => [
        'http://tj.58.com/chuzu/0/',
    ],
    // URL patterns are single-quoted so the backslashes reach the regex engine
    // unchanged, and literal dots are escaped so they match only a "." rather
    // than any character.
    'list_url_regexes' => [
        'http://tj\.58\.com/chuzu/0/pn\d+',
    ],
    'content_url_regexes' => [
        'http://tj\.58\.com/zufang/\d{14}x\.shtml(\?\w+)*',
    ],
    // Every selector is an XPath expression and must begin with `//`.
    // Without that prefix the path is resolved relative to the document node,
    // which has no direct `div` child, so the required field would never be
    // found.
    'fields' => [
        [
            // Listing title
            'name' => 'title',
            'selector' => '//div[4]/div[1]/h1',
            'required' => true,
        ],
        [
            // Rent
            'name' => 'money',
            'selector' => '//div[4]/div[2]/div[2]/div[1]/div[1]/div/span[1]/b',
            'required' => true,
        ],
        [
            // Lease mode
            'name' => 'lease_mode',
            'selector' => '//div[4]/div[2]/div[2]/div[1]/div[1]/ul/li[1]/span[2]',
            'required' => true,
        ],
        [
            // Room type. The spelling "roome_type" (sic) is kept unchanged
            // because it is the runtime name of the extracted field.
            'name' => 'roome_type',
            'selector' => '//div[4]/div[2]/div[2]/div[1]/div[1]/ul/li[2]/span[1]',
            'required' => true,
        ],
        [
            // Orientation and floor
            'name' => 'floor',
            'selector' => '//div[4]/div[2]/div[2]/div[1]/div[1]/ul/li[3]/span[2]',
            'required' => true,
        ],
        [
            // Residential estate
            'name' => 'house_estate',
            'selector' => '//div[4]/div[2]/div[2]/div[1]/div[1]/ul/li[4]/span[2]/a',
            'required' => true,
        ],
        [
            // Address
            'name' => 'address',
            'selector' => '//div[4]/div[2]/div[2]/div[1]/div[1]/ul/li[6]/span[2]',
            'required' => true,
        ],
    ],
];
// Build the crawler from the $configs array defined earlier in this script.
$spider = new phpspider($configs);

// Hook that receives each extracted field value and returns the value to keep.
// Only the "floor" field is altered: its "&nbsp;" entities are removed.
$spider->on_extract_field = function($fieldname, $data, $page) {
    // Every other field passes through untouched.
    if ($fieldname != "floor") {
        return $data;
    }

    return str_replace("&nbsp;", "", $data);
};

// Run the crawl.
$spider->start();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment