Skip to content

Instantly share code, notes, and snippets.

@Youngv
Last active August 29, 2015 14:10
Show Gist options
  • Save Youngv/5d133ceba72d699c65fe to your computer and use it in GitHub Desktop.
Save Youngv/5d133ceba72d699c65fe to your computer and use it in GitHub Desktop.
抓取点评数据的脚本。
#encoding: utf-8
require 'uri'
require 'json'
require 'base64'
# Base URL of a Dianping shop page; crawl_page appends the numeric shop id.
PAGE_URL = "http://www.dianping.com/shop/"
# Value sent as the "ak" query parameter to the Baidu Map web APIs.
# NOTE(review): this is a credential committed in source; consider loading it
# from the environment instead.
AK = "2ozNcQ4RfFDlSQozwNBwKnIz"
# NOT the circle constant: this is pi * 3000 / 180, the angular factor that
# bd_decrypt feeds into its sin/cos correction terms. Use Math::PI for pi.
PI = 3.14159265358979324 * 3000.0 / 180.0
# Crawls the Dianping shop pages whose ids lie in i..j and stores every
# food-category shop as a Restaurant record.
#
# For each id the page is downloaded, the name / city / address / phone
# numbers are scraped, the address is geocoded through get_coordinates, and
# a Restaurant is saved. A failure on one shop (download error, non-food
# shop, geocoding error, validation error) is logged and the crawl moves on
# to the next id.
#
# @param i [Integer] first shop id (inclusive)
# @param j [Integer] last shop id (inclusive)
def crawl_page(i,j)
  alphabet = ('A'..'Z').to_a + ('0'..'9').to_a
  hashids = Hashids.new("www.ttch.com", 10, alphabet.join)

  (i..j).each do |n|
    url = PAGE_URL + n.to_s

    begin
      response = RestClient.get(url, :user_agent => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.63 Safari/537.36')
    rescue StandardError
      $logger.error "抓取失败: #{url}"
      sleep 3
      next
    end

    $logger.info n
    # The poi token is only logged. Pages without one used to raise
    # NoMethodError on nil and abort the whole crawl.
    poi = /poi:\s"([A-Z]+)"/.match(response)
    poi_code = poi ? poi[1] : nil
    # A single message argument works with any logger; the stdlib Logger#info
    # accepts at most one positional argument.
    $logger.info "#{n},#{poi_code}"

    page = Nokogiri::HTML(response)
    unless page.css("#Tab_Category a strong").text.include?("美食")
      $logger.error "非美食: #{url}"
      sleep 1
      next
    end

    name = page.css(".shop-title").text.gsub(/\s/, "")
    city = page.css("#G_loc").text.gsub(/\s/, "").delete("站")
    region = page.css("span[itemprop='locality region']").text.gsub(/\s/, "")
    street = page.css("span[itemprop='street-address']").text.gsub(/\s/, "")
    address = city + region + street
    # One entry per matched phone node (empty array when there is none).
    phones = page.css("span[itemprop='tel']").map { |node| node.text.gsub(/\s/, "") }
    $logger.info "#{n},#{poi_code}"

    begin
      gg_lon, gg_lat, dist = get_coordinates(address)
    rescue StandardError => e
      # Geocoding involves two remote calls; treat a failure like a failed
      # page download instead of aborting the remaining ids.
      $logger.error "获取坐标失败: #{url} (#{e.message})"
      sleep 1
      next
    end

    hash = hashids.encrypt(Time.now.to_i, n)
    restaurant = Restaurant.new(
      name: name,
      phone: phones.join(","),
      city: city,
      address: address,
      lon: gg_lon,
      lat: gg_lat,
      url_id: n,
      poi: hash
    )

    unless restaurant.save
      $logger.error "保存失败错误信息: #{restaurant.errors.full_messages}"
      next
    end

    $logger.info "抓取成功: 第 #{restaurant.id} 条数据: #{url}"
    $logger.info "餐馆名称:#{name}, 地址:#{address}, 电话:#{phones.join(",")}, 高德坐标:#{[gg_lon,gg_lat]}, 误差:#{dist}, POI: #{hash}"
    sleep 1
  end
end
private
# Geocodes an address and reports how far the coordinate conversion drifts.
#
# The address is resolved with convert_address_to_baidu, converted locally
# with bd_decrypt, and the converted point is sent back through
# convert_gaode_to_baidu. The distance between the original point and the
# round-tripped point is returned as an error estimate.
#
# @param address [String] address to geocode
# @return [Array] [gg_lon, gg_lat, dist] - the locally converted longitude
#   and latitude, and the round-trip distance as computed by distance
def get_coordinates(address)
  bd_lon, bd_lat = convert_address_to_baidu(address)
  gg_lon, gg_lat = bd_decrypt(bd_lon, bd_lat)
  bd_lon_f, bd_lat_f = convert_gaode_to_baidu(gg_lon, gg_lat)
  # distance reads each point as [lat, lon], so latitude has to come first;
  # passing [lon, lat] yields a wrong error estimate.
  dist = distance([bd_lat, bd_lon], [bd_lat_f, bd_lon_f])
  [gg_lon, gg_lat, dist]
end
# Great-circle (haversine) distance between two points, in meters.
#
# @param a [Array<Numeric>] first point as [latitude, longitude] in degrees
# @param b [Array<Numeric>] second point as [latitude, longitude] in degrees
# @return [Float] distance in meters on a sphere of radius 6371 km
#
# Neither argument is modified.
def distance(a, b)
  rad_per_deg = Math::PI / 180 # PI / 180
  rkm = 6371                   # Earth radius in kilometers
  rm = rkm * 1000              # Radius in meters

  dlat_rad = (b[0] - a[0]) * rad_per_deg # Delta, converted to rad
  dlon_rad = (b[1] - a[1]) * rad_per_deg
  lat1_rad = a[0] * rad_per_deg
  lat2_rad = b[0] * rad_per_deg

  haversine = Math.sin(dlat_rad / 2)**2 +
              Math.cos(lat1_rad) * Math.cos(lat2_rad) * Math.sin(dlon_rad / 2)**2
  # Rounding can push the value a hair outside 0..1 for near-antipodal
  # points, which would make Math.sqrt / Math.asin raise Math::DomainError.
  haversine = [[haversine, 0.0].max, 1.0].min

  c = 2 * Math.asin(Math.sqrt(haversine))
  rm * c # Delta in meters
end
# Converts a Baidu longitude/latitude pair into the coordinate pair that
# get_coordinates calls gg_lon / gg_lat.
# NOTE(review): presumably BD-09 -> GCJ-02 - confirm against the map vendor docs.
#
# The point is shifted by fixed offsets, expressed in polar form, nudged by
# small sin/cos correction terms that use the file-level PI constant
# (pi * 3000 / 180, not Math::PI), and converted back to x/y.
#
# @param bd_lon [Numeric] Baidu longitude in degrees
# @param bd_lat [Numeric] Baidu latitude in degrees
# @return [Array<Float>] [gg_lon, gg_lat]
def bd_decrypt(bd_lon, bd_lat)
  shifted_lon = bd_lon - 0.0065
  shifted_lat = bd_lat - 0.006

  radius = Math.sqrt(shifted_lon * shifted_lon + shifted_lat * shifted_lat) -
           0.00002 * Math.sin(shifted_lat * PI)
  angle = Math.atan2(shifted_lat, shifted_lon) -
          0.000003 * Math.cos(shifted_lon * PI)

  [radius * Math.cos(angle), radius * Math.sin(angle)]
end
# Converts a coordinate pair to Baidu coordinates through the Baidu
# "geoconv" web service.
# NOTE(review): from=3 / to=5 presumably select GCJ-02 -> BD-09 lon/lat -
# confirm against the Baidu geoconv documentation.
#
# @param gg_lon [Numeric] source longitude
# @param gg_lat [Numeric] source latitude
# @return [Array] [bd_lon, bd_lat] taken from the first entry of "result"
# @raise [StandardError] whatever RestClient / JSON raise on transport or
#   parse errors; NoMethodError when the reply carries no "result" entry
def convert_gaode_to_baidu(gg_lon, gg_lat)
  # URI.escape was removed in Ruby 3.0; encode_www_form builds and escapes the
  # query string. The key comes from the shared AK constant, not a second copy.
  query = URI.encode_www_form(coords: "#{gg_lon},#{gg_lat}", from: 3, to: 5, ak: AK)
  response = JSON.parse(RestClient.get("http://api.map.baidu.com/geoconv/v1/?#{query}"))
  point = response["result"][0]
  [point["x"], point["y"]]
end
# Geocodes a free-text address through the Baidu geocoder web service.
#
# @param address [String] address to look up
# @return [Array] [bd_lon, bd_lat] read from result.location.lng / .lat
# @raise [StandardError] whatever RestClient / JSON raise on transport or
#   parse errors; NoMethodError when the reply carries no result location
def convert_address_to_baidu(address)
  # encode_www_form escapes every reserved character in the address. The
  # former URI.escape (removed in Ruby 3.0) left "&", "#" and "=" untouched,
  # so such addresses corrupted the query string.
  query = URI.encode_www_form(address: address, output: "json", ak: AK)
  response = JSON.parse(RestClient.get("http://api.map.baidu.com/geocoder/v2/?#{query}"))
  location = response["result"]["location"]
  [location["lng"], location["lat"]]
end
# Decodes a POI token into a latitude / longitude pair.
# NOTE(review): presumably the token is the `poi: "..."` value that
# crawl_page matches in shop pages - confirm.
#
# Algorithm, as implemented here:
# 1. The last character is set aside; its character code is subtracted later.
# 2. Every other character is read as a base-36 digit, reduced by 10, and
#    reduced by a further 7 when the result is still >= 10.
# 3. The first occurrence of the largest digit splits the digit string into
#    two hexadecimal numbers; that digit belongs to neither number.
# 4. lng = (first + second - last_code) / 2 and lat = second - lng, both
#    scaled down by 100000.
#
# @param poi [String] encoded token
# @return [Hash] {lat: Float, lng: Float}
# @raise [ArgumentError] when the token is empty or contains no positive
#   digit to split on
def decode(poi)
  raise ArgumentError, "poi must be a non-empty string" if poi.nil? || poi.empty?

  hex_base = 16
  offset = 10
  wrap = 7
  char_base = 36

  last_code = poi[-1].ord
  digits = poi[0..-2].each_char.map do |char|
    digit = char.to_i(char_base) - offset
    digit >= offset ? digit - wrap : digit
  end

  largest = digits.max
  if largest.nil? || largest <= 0
    raise ArgumentError, "poi has no separator digit: #{poi.inspect}"
  end
  split_at = digits.index(largest) # first occurrence of the largest digit

  encoded = digits.map { |digit| digit.to_s(char_base) }.join
  # The separator itself is skipped: a leading digit above "f" makes
  # String#to_i(16) return 0, which zeroed the second number. Using
  # [0, split_at] also yields "" (not the whole string) when split_at is 0.
  first = encoded[0, split_at].to_i(hex_base)
  second = encoded[(split_at + 1)..-1].to_i(hex_base)

  lng_scaled = (first + second - last_code) / 2
  lat = (second - lng_scaled) / 100000.0
  { lat: lat, lng: lng_scaled / 100000.0 }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment