Skip to content

Instantly share code, notes, and snippets.

@parano
Created February 29, 2012 08:41
Show Gist options
  • Save parano/a5f4015ba7091e83f775 to your computer and use it in GitHub Desktop.
Save parano/a5f4015ba7091e83f775 to your computer and use it in GitHub Desktop.
scrape jingdian
#!/usr/bin/ruby -w
require 'rexml/document'
include REXML
# Print the "key" attribute of every <prov> element found under
# configuration/property/class in tour_treepath_area.xml.
#
# The document is built inside the File.open block so the file handle is
# closed as soon as REXML has finished reading it.
xmldoc = File.open("tour_treepath_area.xml") { |xmlfile| Document.new(xmlfile) }
xmldoc.elements.each("configuration/property/class") do |class_element|
  class_element.elements.each("prov") do |prov_element|
    puts prov_element.attributes["key"]
  end
end
#!/usr/bin/ruby -w
# coding: utf-8
require 'rexml/document'
include REXML
# Merge the "prov,area,landmark" records of all_data.txt into the tree read
# from tour_treepath_area.xml and write the merged document to results.xml.
#
# For each record:
#   * unknown prov            -> a "doesn't exist!" message is printed
#   * known prov, new area    -> an <area key=... desc=...> child is added
#   * known prov, known area  -> the landmark is merged into the area's
#                                '|'-separated desc, skipping duplicates
if $0 == __FILE__
  # Block forms of File.open guarantee every handle is closed, even when
  # parsing or merging raises.
  doc = File.open("tour_treepath_area.xml") { |xml| Document.new(xml) }

  File.open("all_data.txt", "r") do |data|
    data.each do |line|
      # Remove every whitespace character (including the line ending) from
      # each comma-separated field.
      prov_key, area_key, desc = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }

      prov_xpath = "*/*/*/prov[@key='#{prov_key}']"
      # Look the area up first: a hit implies the prov exists, so the common
      # case costs a single tree scan.
      area = doc.elements["#{prov_xpath}/area[@key='#{area_key}']"]

      if area
        # to_s tolerates an area without a desc attribute; uniq prevents the
        # same landmark from being listed twice.
        names = area.attributes["desc"].to_s.split('|') + desc.to_s.split('|')
        area.attributes["desc"] = names.uniq.join('|')
      elsif (prov = doc.elements[prov_xpath])
        prov.add_element "area", { "key" => area_key.to_s, "desc" => desc.to_s }
      else
        puts "#{prov_key} doesn't exist!"
      end
    end
  end

  File.open("results.xml", 'w') { |out| out.puts doc.to_s }
end
#!/usr/bin/ruby -w
# coding: utf-8
require 'rexml/document'
include REXML
# Print one keyword line per <area> element of treepath.xml in the form
#   <desc with '|' replaced by '$'>|A|地区#<class key>#<prov key>#<area key>
# walking configuration/property/class -> prov -> area.
if $0 == __FILE__
  # The block form closes the XML file once the document has been built.
  # (tour_treepath_area.xml was the alternative input used during development.)
  doc = File.open("treepath.xml") { |xml| Document.new(xml) }

  doc.elements.each("configuration/property/class") do |e_class|
    place = e_class.attributes["key"]
    e_class.elements.each("prov") do |e_prov|
      prov = e_prov.attributes["key"]
      e_prov.elements.each("area") do |e_area|
        area = e_area.attributes["key"]
        # The keyword format uses '$' between landmark names because '|'
        # separates the fields of the output line itself.
        desc = e_area.attributes["desc"].split('|').join('$')
        puts "#{desc}|A|地区##{place}##{prov}##{area}"
      end
    end
  end
end
#!/usr/bin/ruby -w
# coding: utf-8
require 'rexml/document'
include REXML
# Collapse consecutive "prov,area,landmark" records of new_data.txt that share
# the same prov and area into a single line:
#   prov,area,landmark1|landmark2|...
# The result is printed to standard output. Only adjacent records are merged,
# so the input is expected to be sorted.
if $0 == __FILE__
  previous_key = nil # [prov, area] of the record printed last; nil before the first

  File.open("new_data.txt", "r") do |data|
    data.each do |line|
      # Remove every whitespace character (including the line ending) from
      # each comma-separated field.
      prov, area, desc = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }
      key = [prov, area]

      if key == previous_key
        print "|#{desc}"
      else
        # Terminate the previous group; nothing precedes the first group, so
        # the output no longer starts with an empty line.
        print "\n" unless previous_key.nil?
        print "#{prov},#{area},#{desc}"
      end
      previous_key = key
    end
  end

  # Finish the last group with a newline so the output is a well-formed text
  # file that can safely be appended to.
  print "\n" unless previous_key.nil?
end
#!/usr/bin/ruby -w
# coding: utf-8
require 'rexml/document'
include REXML
# Merge the pre-grouped "prov,area,name1|name2|..." records of
# improved_data.txt into the tree read from tour_treepath_area.xml and write
# the merged document to results.xml.
#
# For each record:
#   * unknown prov            -> a "doesn't exist!" message is printed
#   * known prov, new area    -> an <area key=... desc=...> child is added
#   * known prov, known area  -> the names are merged into the area's
#                                '|'-separated desc without duplicates
if $0 == __FILE__
  # Block forms of File.open guarantee every handle is closed, even when
  # parsing or merging raises.
  doc = File.open("tour_treepath_area.xml") { |xml| Document.new(xml) }

  File.open("improved_data.txt", "r") do |data|
    data.each do |line|
      # Remove every whitespace character (including the line ending) from
      # each comma-separated field.
      prov_key, area_key, desc = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }

      prov_xpath = "*/*/*/prov[@key='#{prov_key}']"
      # Look the area up first: a hit implies the prov exists, so the common
      # case costs a single tree scan instead of three.
      area = doc.elements["#{prov_xpath}/area[@key='#{area_key}']"]

      if area
        # to_s tolerates an area that has no desc attribute yet.
        names = area.attributes["desc"].to_s.split('|') + desc.to_s.split('|')
        area.attributes["desc"] = names.uniq.join('|')
      elsif (prov = doc.elements[prov_xpath])
        prov.add_element "area", { "key" => area_key.to_s, "desc" => desc.to_s }
      else
        puts "#{prov_key} doesn't exist!"
      end
    end
  end

  File.open("results.xml", 'w') { |out| out.puts doc.to_s }
end
#!/usr/bin/ruby
# coding: utf-8
require 'uri'
#require 'iconv'
require 'open-uri'
require 'thread'
def scrape(from,to,n)
file = File.open("data#{n}.txt","w+")
num = 1
for num in from..to
begin
page = open(@url + num.to_s)
text = page.read; nil
text = text.to_s.scan(@regexp)
if( @regexp_title =~ text.to_s )
print "#{n}:#{num.to_s}:#{$1},#{$2},"
file << "#{$1},#{$2},"
end
if( @regexp_jingdian =~ text.to_s )
print "#{$1}\n"
file << "#{$1}\n"
end
rescue Timeout::Error
puts "#{num} Time out"
end
end
file.close
end
# Entry point of the threaded scraper: sets the URL prefix and the regexes
# that scrape reads, then splits page ids 10376..38144 across 40 threads.
if $0 == __FILE__
  @url = 'http://jingdian.tuniu.com/fengjing/'
  @regexp = /\<h1\>.*span\>/
  @regexp_jingdian = /\<h1\>(.*)\<\/h1\>/
  @regexp_jingdian1 = /title=\".*\"\>(.*)\<\/a\>\//
  @regexp_jingdian2 = /href=.*"\>(.*)\<\/a\>"/
  @regexp_title = /href=.*"\>(.*)\<\/a\>\/<a.*"\>(.*)\<\/a\>/

  # Workers 1..39 each take 694 consecutive ids; worker 40 takes whatever is
  # left up to and including 38144.
  workers = (1..40).map do |n|
    Thread.new do
      first_page = 10376 + (n - 1) * 694
      last_page = n == 40 ? 38144 : first_page + 693
      scrape(first_page, last_page, n)
    end
  end

  # Wait for every worker before announcing completion.
  workers.each(&:join)
  puts "shit"
end
#!/usr/bin/ruby
# coding: utf-8
require 'uri'
require 'iconv'
require 'open-uri'
# Sequential scraper: fetch pages 1..38144 below the URL prefix and append
# what the regexes extract to data.txt, echoing the same text to standard
# output.
if $0 == __FILE__
  url = 'http://jingdian.tuniu.com/fengjing/'
  regexp = /\<h1\>.*span\>/
  regexp_jingdian = /\<h1\>(.*)\<\/h1\>/
  regexp_title = /href=.*"\>(.*)\<\/a\>\/<a.*"\>(.*)\<\/a\>/

  # The block form closes data.txt even if an unexpected error escapes.
  File.open("data.txt", "w+") do |file|
    (1..38144).each do |num|
      begin
        # URI.open is required on Ruby 3.0+, where Kernel#open no longer
        # fetches URLs; the block form closes the response stream right away.
        body = URI.open(url + num.to_s) { |page| page.read }
        # Join the matched fragments as raw text. Array#to_s would produce the
        # inspect form, which escapes quotes and (depending on the locale)
        # non-ASCII characters before the regexes below ever see them.
        text = body.scan(regexp).join

        if (title = regexp_title.match(text))
          print "#{title[1]},#{title[2]},"
          file << "#{title[1]},#{title[2]},"
        end
        if (name = regexp_jingdian.match(text))
          print "#{name[1]}\n"
          file << "#{name[1]}\n"
        end
      rescue Timeout::Error
        puts "#{num} Time out"
      rescue OpenURI::HTTPError => e
        # A missing or failing page must not abort the remaining ids.
        puts "#{num} HTTP error: #{e.message}"
      end
    end
  end
end
@parano
Copy link
Author

parano commented Feb 29, 2012

在终端输入awk -F, '{ print $1 $2 }' all_data.txt | sort | uniq
查看不同的prov共有多少
用 dom-like-parsing.rb 查看原有xml中的prov的数量,查看是否已经全部覆盖
然后发现prov都是40个,完全一样
那么只需要根据prov来加入area以及在对应area的description中加入景点即可

@parano
Copy link
Author

parano commented Mar 1, 2012

使用from_data_to_treepath.rb 脚本把all_data.txt中的数据全部插入到已有的treepath中,得到一个汇总的xml文件
再通过from_treepath_to_keywords.rb 脚本把汇总的xml整理为keywords形式,重定向到一个keywords文件

@parano
Copy link
Author

parano commented Mar 3, 2012

发现上面的方法执行后存在景点名重复,并且很多景点最后都少了一个字
找到bug源自于

  • 误用了String类的chop函数:chop总是删掉字符串的最后一个字符(结尾是\r\n时会把这两个字符一起删掉),所以当字符串结尾不是换行符时,就会删掉景点名的最后一个字;这里应该使用chomp
  • 没有对area的desc的内容做判断是否重复后再进行插入

@parano
Copy link
Author

parano commented Mar 3, 2012

为了去除重复的地点名称,在每次插入desc时,加上一个include?判断是否有重复。这样以后程序每插入一条记录都要在大的treepath中查询,并对一大串desc做split include 这样的操作,程序肯定会非常非常慢。为了改进程序效率,这里先对all_data做了一点简单的处理:
sort -k1 all_data.txt >> new_data.txt
然后运行脚本 ./improve_data.rb >> improved_data.txt
得到一个改进后的数据文件,形如下面的文件 :

上海,南汇,上海南汇老港镇滨海农家乐|上海工商外国语学院|上海滨海农家乐|上海野生动物园|东海影视乐园|中荷玫瑰园|南汇区体育场|南汇区体育馆|南汇博物馆|卢潮港|古钟园|周浦公园|射击俱乐部|新场古镇|桃源民俗村|海港卡丁车运动场|滨海森林公园|芦潮港|鲜花港
上海,卢湾,上海公安博物馆|上海市跳水池|上海第二医科大学(上海交通大学医学部)|中共一大会址|南园公园|卢浦大桥|卢湾区工人体育场|周公馆|复兴公园|复兴坊|孙中山故居|延安中路大型公共绿地|新天地|梅兰坊|淮海公园|淮海坊|淮海路购物街|渔阳里弄|绍兴公园|韬奋纪念馆
上海,嘉定,上海大学悉尼工商学院|上海音乐学院|南翔双塔|南翔火车站|古漪园|古猗园|叶池|吴兴寺|嘉定体育中心|嘉定儿童公园|嘉定博物馆|四海壶具博物馆|国际赛车场|孔庙|安亭公园|安亭火车站|梦幻乐园|汇龙潭|汇龙潭公园|法华塔|浏河岛风景区|环球乐园|秋霞圃|衡山路|陆俨少艺术院|陶庵留碧碑
上海,奉贤,奉贤区体育中心|奉贤博物馆

最后运行改进版本的improved_from_data_to_treepath.rb即可快速得到新生成的xml,同样再次运行原来的from_treepath_to_keywords.rb获得keywords文件

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment