Last active
August 29, 2015 14:10
-
-
Save Youngv/5d133ceba72d699c65fe to your computer and use it in GitHub Desktop.
抓取点评数据的脚本。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8

require 'uri'
require 'json'
require 'base64'

# Base URL of a Dianping shop page; crawl_page appends the shop id to it.
PAGE_URL = "http://www.dianping.com/shop/".freeze

# Value sent as the "ak" query parameter to the Baidu Map web APIs.
# NOTE(review): this is a credential committed to source control; consider
# loading it from the environment instead.
AK = "2ozNcQ4RfFDlSQozwNBwKnIz".freeze

# Scaled pi (pi * 3000 / 180) used by bd_decrypt. This is NOT Math::PI.
PI = 3.14159265358979324 * 3000.0 / 180.0
# Crawls the Dianping shop pages with ids i..j (inclusive) and stores every
# shop whose category tab mentions "美食" as a Restaurant record.
#
# For each id the page is downloaded, the name/city/address/phones are scraped,
# the address is geocoded through get_coordinates, and a Restaurant is saved.
# A shop is logged and skipped (the crawl continues with the next id) when
# the download fails, the page is not a food shop, geocoding fails, or the
# record cannot be saved.
#
# i - first shop id
# j - last shop id
def crawl_page(i,j)
  alphabet = ('A'..'Z').to_a + ('0'..'9').to_a
  # NOTE(review): `encrypt` is the method name this file was written against;
  # confirm it still exists in the installed hashids version.
  hashids = Hashids.new("www.ttch.com", 10, alphabet.join)

  (i..j).each do |n|
    url = PAGE_URL + n.to_s

    begin
      response = RestClient.get(url, :user_agent => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.63 Safari/537.36')
    rescue
      $logger.error "抓取失败: #{url}"
      sleep 3
      next
    end
    $logger.info n

    # The poi marker is only logged, so a page without one must not abort
    # the crawl: keep nil instead of indexing into a nil match.
    poi_match = /poi:\s"([A-Z]+)"/.match(response)
    poi_code = poi_match ? poi_match[1] : nil
    # One message string per call: Logger#info takes at most one positional
    # argument, so passing the id and the poi separately raises ArgumentError.
    $logger.info "#{n} #{poi_code}"

    page = Nokogiri::HTML(response)
    unless page.css("#Tab_Category a strong").text.include?("美食")
      $logger.error "非美食: #{url}"
      sleep 1
      next
    end

    name = page.css(".shop-title").text.gsub(/\s/, "")
    city = page.css("#G_loc").text.gsub(/\s/, "").delete("站")
    region = page.css("span[itemprop='locality region']").text.gsub(/\s/, "")
    street = page.css("span[itemprop='street-address']").text.gsub(/\s/, "")
    address = city + region + street

    # One whitespace-free entry per phone element (empty when there is none).
    phones = page.css("span[itemprop='tel']").map { |node| node.text.gsub(/\s/, "") }
    $logger.info "#{n} #{poi_code}"

    # A geocoding failure for one address is treated like a failed download:
    # log it and move on instead of terminating the whole id range.
    begin
      gg_lon, gg_lat, dist = get_coordinates(address)
    rescue => e
      $logger.error "坐标获取失败: #{url} (#{e.message})"
      sleep 1
      next
    end

    poi_hash = hashids.encrypt(Time.now.to_i, n)
    attributes = {
      name: name,
      phone: phones.join(","),
      city: city,
      address: address,
      lon: gg_lon,
      lat: gg_lat,
      url_id: n,
      poi: poi_hash
    }
    restaurant = Restaurant.new(attributes)

    if restaurant.save
      $logger.info "抓取成功: 第 #{restaurant.id} 条数据: #{url}"
      $logger.info "餐馆名称:#{name}, 地址:#{address}, 电话:#{phones.join(",")}, 高德坐标:#{[gg_lon,gg_lat]}, 误差:#{dist}, POI: #{poi_hash}"
    else
      $logger.error "保存失败错误信息: #{restaurant.errors.full_messages}"
      next
    end

    # Pause between successful requests.
    sleep 1
  end
end
# Visibility marker kept from the original file; the methods defined below
# are internal helpers.
private

# Geocodes +address+ and estimates how lossy the coordinate conversion was.
#
# The address is resolved to a Baidu point, converted with bd_decrypt, and the
# converted point is sent back through convert_gaode_to_baidu. The distance
# between the original and the round-tripped Baidu points is returned as an
# error estimate (crawl_page logs it as "误差").
#
# address - address string to geocode
#
# Returns [gg_lon, gg_lat, dist] where dist is the value returned by distance.
def get_coordinates(address)
  bd_lon, bd_lat = convert_address_to_baidu(address)
  gg_lon, gg_lat = bd_decrypt(bd_lon, bd_lat)
  bd_lon_back, bd_lat_back = convert_gaode_to_baidu(gg_lon, gg_lat)

  # distance reads index 0 as latitude and index 1 as longitude, so the pairs
  # must be handed over as [lat, lon]; passing [lon, lat] skews the result.
  dist = distance([bd_lat, bd_lon], [bd_lat_back, bd_lon_back])

  return gg_lon, gg_lat, dist
end
# Haversine distance in meters between two points on a sphere with a radius
# of 6371 km.
#
# a - [latitude, longitude] of the first point, in degrees
# b - [latitude, longitude] of the second point, in degrees
#
# Neither argument is modified (the arrays are only read).
#
# Returns the distance as a Float.
def distance(a, b)
  rad_per_deg = Math::PI / 180
  radius_m = 6371 * 1000 # Earth radius in meters

  # Deltas are taken in degrees first so that nearby points keep precision.
  dlon_rad = (b[1] - a[1]) * rad_per_deg
  dlat_rad = (b[0] - a[0]) * rad_per_deg
  lat1_rad = a[0] * rad_per_deg
  lat2_rad = b[0] * rad_per_deg

  haversine = Math.sin(dlat_rad / 2)**2 +
              Math.cos(lat1_rad) * Math.cos(lat2_rad) * Math.sin(dlon_rad / 2)**2

  # Clamp so rounding error cannot push the asin argument above 1.
  central_angle = 2 * Math.asin([Math.sqrt(haversine), 1.0].min)
  radius_m * central_angle
end
# Converts a Baidu longitude/latitude pair into the "gg" pair used by the
# rest of this file.
#
# The input is shifted by a fixed offset, expressed as a radius and an angle,
# both of which receive a small periodic correction based on the file-level
# PI constant, and then turned back into a longitude/latitude pair.
#
# bd_lon - Baidu longitude
# bd_lat - Baidu latitude
#
# Returns [gg_lon, gg_lat].
def bd_decrypt(bd_lon, bd_lat)
  shifted_lon = bd_lon - 0.0065
  shifted_lat = bd_lat - 0.006

  radius = Math.sqrt(shifted_lon * shifted_lon + shifted_lat * shifted_lat) - 0.00002 * Math.sin(shifted_lat * PI)
  angle = Math.atan2(shifted_lat, shifted_lon) - 0.000003 * Math.cos(shifted_lon * PI)

  [radius * Math.cos(angle), radius * Math.sin(angle)]
end
# Converts a "gg" (Gaode) coordinate pair to Baidu coordinates by calling the
# Baidu geoconv web API.
#
# gg_lon - longitude to convert
# gg_lat - latitude to convert
#
# Returns [bd_lon, bd_lat] taken from the "x"/"y" fields of the first result.
# Raises RuntimeError when the response carries no result entry.
def convert_gaode_to_baidu(gg_lon,gg_lat)
  # The coordinates are interpolated as-is (numeric, sent with a literal
  # comma); only the key is escaped. URI.escape is not used because it was
  # removed in Ruby 3.0. The key comes from AK like the geocoder call.
  # NOTE(review): from=3/to=5 select the source and target coordinate systems;
  # confirm the codes against the Baidu geoconv documentation.
  baidu_map_api_url = "http://api.map.baidu.com/geoconv/v1/?coords=#{gg_lon},#{gg_lat}&from=3&to=5&ak=#{URI.encode_www_form_component(AK)}"

  payload = JSON.parse(RestClient.get(baidu_map_api_url))
  results = payload["result"]
  point = results.is_a?(Array) ? results.first : nil
  # Fail with the API status instead of a NoMethodError on nil.
  raise "Baidu geoconv returned no result (status: #{payload['status'].inspect})" unless point

  return point["x"], point["y"]
end
# Geocodes +address+ with the Baidu geocoder web API.
#
# address - address string; it is form-encoded, so characters such as "&",
#           "=", "#" or non-ASCII text cannot alter or truncate the query
#
# Returns [bd_lon, bd_lat] taken from result.location.lng / .lat.
# Raises RuntimeError when the response carries no location.
def convert_address_to_baidu(address)
  # URI.encode_www_form escapes every parameter value individually.
  # URI.escape over the whole URL left "&", "=" and "#" inside the address
  # untouched, and the method was removed in Ruby 3.0.
  query = URI.encode_www_form(address: address, output: "json", ak: AK)
  baidu_map_api_url = "http://api.map.baidu.com/geocoder/v2/?#{query}"

  payload = JSON.parse(RestClient.get(baidu_map_api_url))
  result = payload["result"]
  location = result.is_a?(Hash) ? result["location"] : nil
  # Fail with the API status instead of a NoMethodError/TypeError further down.
  raise "Baidu geocoder returned no location (status: #{payload['status'].inspect})" unless location

  return location["lng"], location["lat"]
end
# Decodes a poi string into a latitude/longitude pair.
#
# The last character of +poi+ contributes its character code; every other
# character is mapped to a base-36 digit. The position of the largest digit
# splits the digit string into two hexadecimal numbers, from which the
# coordinates are derived in units of 1/100000.
#
# NOTE(review): nothing in the visible file calls this method; it looks like a
# port of a client-side decoder — confirm against real poi values.
#
# poi - non-empty String (crawl_page extracts values matching [A-Z]+)
#
# Returns a Hash {lat: Float, lng: Float}.
# Raises ArgumentError when poi is empty or contains no split marker.
def decode(poi)
  raise ArgumentError, "poi must be a non-empty string" if poi.to_s.empty?

  hex_base = 16
  offset = 10
  wrap = 7
  char_base = 36

  tail_code = poi[-1].ord
  digits = String.new
  marker_index = -1
  largest = 0

  poi[0...-1].each_char.with_index do |char, index|
    digit = char.to_i(char_base) - offset
    digit -= wrap if digit >= offset
    digits << digit.to_s(char_base)
    # Remember the first position holding the largest digit seen so far.
    if digit > largest
      marker_index = index
      largest = digit
    end
  end
  raise ArgumentError, "poi contains no split marker" if marker_index < 0

  # The marker itself belongs to neither number. String#to_i stops at the
  # first character that is not a valid digit, so a slice starting at the
  # marker parsed as 0 whenever the marker was not a hex digit. Likewise,
  # [0, marker_index] yields "" (not the whole string) when the marker is first.
  first = digits[0, marker_index].to_i(hex_base)
  second = digits[(marker_index + 1)..-1].to_s.to_i(hex_base)

  lng = (first + second - tail_code) / 2
  lat = (second - lng) / 100000.0
  lng /= 100000.0

  { lat: lat, lng: lng }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment