Created
February 29, 2012 08:41
-
-
Save parano/a5f4015ba7091e83f775 to your computer and use it in GitHub Desktop.
scrape jingdian
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -w
# Prints the "key" attribute of every <prov> element found under
# configuration/property/class in tour_treepath_area.xml, one per line.
require 'rexml/document'
include REXML

# Document.new parses the whole source before returning, so the file handle
# can be closed as soon as the block finishes.
xmldoc = File.open("tour_treepath_area.xml") { |xmlfile| Document.new(xmlfile) }

xmldoc.elements.each("configuration/property/class") do |class_node|
  class_node.elements.each("prov") { |prov_node| puts prov_node.attributes["key"] }
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -w
# coding: utf-8
#
# Merges the records of all_data.txt into the XML tree loaded from
# tour_treepath_area.xml and writes the merged document to results.xml.
#
# Every input line is split on "," and the first three fields are used as
# prov key, area key and desc text:
#   * no <prov> with that key        -> a notice is printed, the line is skipped
#   * <prov> exists, <area> does not -> a new <area key=... desc=...> is added
#   * both exist                     -> the text is appended to the area's desc,
#                                       separated by "|"
require 'rexml/document'
include REXML

if $0 == __FILE__
  # Block forms close every file handle, also when an error is raised.
  doc = File.open("tour_treepath_area.xml") { |xml| Document.new(xml) }

  File.foreach("all_data.txt") do |line|
    # Remove all whitespace (including the trailing newline) from each field.
    fields = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }
    prov_key = fields[0].to_s
    area_key = fields[1].to_s
    desc = fields[2].to_s

    # Each XPath is evaluated once per line and the result reused.
    prov_path = "*/*/*/prov[@key='#{prov_key}']"
    prov = doc.elements[prov_path]
    if prov.nil?
      puts "#{prov_key} doesn't exist!"
      next
    end

    area = doc.elements["#{prov_path}/area[@key='#{area_key}']"]
    if area.nil?
      prov.add_element "area", { "key" => area_key, "desc" => desc }
    else
      # An existing <area> without a desc attribute simply receives the text
      # instead of raising NoMethodError on nil.
      existing = area.attributes["desc"]
      area.attributes["desc"] = existing ? "#{existing}|#{desc}" : desc
    end
  end

  File.open("results.xml", "w") { |out| out.puts doc.to_s }
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -w
# coding: utf-8
#
# Walks configuration/property/class -> prov -> area in treepath.xml and prints
# one keyword line per <area>: the area's desc entries with "|" replaced by
# "$" as separator, followed by "|A|" and the class/prov/area keys joined by "#".
require 'rexml/document'
include REXML

if $0 == __FILE__
  source = File.new("treepath.xml")
  doc = Document.new(source)
  # Alternative input: tour_treepath_area.xml
  doc.elements.each("configuration/property/class") { |class_node|
    place = class_node.attributes["key"]
    class_node.elements.each("prov") { |prov_node|
      prov = prov_node.attributes["key"]
      prov_node.elements.each("area") { |area_node|
        area = area_node.attributes["key"]
        entries = area_node.attributes["desc"].split('|')
        desc = entries.join('$')
        puts "#{desc}|A|地区##{place}##{prov}##{area}"
      }
    }
  }
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -w
# coding: utf-8
#
# Collapses consecutive lines of new_data.txt that share the same first two
# comma-separated fields into a single output line of the form
#   field0,field1,desc1|desc2|...
# written to standard output. Only adjacent lines are merged, so the input is
# expected to be sorted beforehand.
require 'rexml/document'
include REXML

if $0 == __FILE__
  previous_group = nil
  wrote_record = false

  File.foreach("new_data.txt") do |line|
    # Remove all whitespace (including the trailing newline) from each field.
    fields = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }
    group = [fields[0], fields[1]]

    if wrote_record && group == previous_group
      print "|#{fields[2]}"
    else
      # Terminate the previous record rather than prefixing every record with
      # a newline: the output no longer starts with an empty line.
      print "\n" if wrote_record
      print "#{fields[0]},#{fields[1]},#{fields[2]}"
      wrote_record = true
    end
    previous_group = group
  end

  # Finish the last record with a newline.
  print "\n" if wrote_record
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -w
# coding: utf-8
#
# Merges the records of improved_data.txt into the XML tree loaded from
# tour_treepath_area.xml and writes the merged document to results.xml.
#
# Every input line is split on "," and the first three fields are used as
# prov key, area key and a "|"-separated list of desc entries:
#   * no <prov> with that key        -> a notice is printed, the line is skipped
#   * <prov> exists, <area> does not -> a new <area key=... desc=...> is added
#   * both exist                     -> the entries are appended to the area's
#                                       desc
# In both writing cases duplicate desc entries are removed.
require 'rexml/document'
include REXML

if $0 == __FILE__
  # Block forms close every file handle, also when an error is raised.
  doc = File.open("tour_treepath_area.xml") { |xml| Document.new(xml) }

  File.foreach("improved_data.txt") do |line|
    # Remove all whitespace (including the trailing newline) from each field.
    fields = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }
    prov_key = fields[0].to_s
    area_key = fields[1].to_s
    desc = fields[2].to_s

    # Each XPath is evaluated once per line and the result reused.
    prov_path = "*/*/*/prov[@key='#{prov_key}']"
    prov = doc.elements[prov_path]
    if prov.nil?
      puts "#{prov_key} doesn't exist!"
      next
    end

    area = doc.elements["#{prov_path}/area[@key='#{area_key}']"]
    if area.nil?
      # The incoming list may itself contain repeats, so de-duplicate here too.
      unique_desc = desc.split('|').uniq.join('|')
      prov.add_element "area", { "key" => area_key, "desc" => unique_desc }
    else
      # compact drops a missing desc attribute instead of raising on nil.
      merged = [area.attributes["desc"], desc].compact.join('|')
      area.attributes["desc"] = merged.split('|').uniq.join('|')
    end
  end

  File.open("results.xml", "w") { |out| out.puts doc.to_s }
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
# coding: utf-8 | |
require 'uri' | |
#require 'iconv' | |
require 'open-uri' | |
require 'thread' | |
# Fetches the page at @url + number for every number in from..to and writes
# the text extracted from each page to data<n>.txt, echoing it to stdout.
#
# Reads the instance variables @url, @regexp, @regexp_title and
# @regexp_jingdian, which the caller must set before calling.
#
# from, to - first and last page number (inclusive)
# n        - worker number; used in the output file name and the progress prefix
#
# A Timeout::Error on a page is reported and that page is skipped. Any other
# error propagates to the caller, but the output file is still closed.
def scrape(from, to, n)
  File.open("data#{n}.txt", "w+") do |file|
    (from..to).each do |num|
      begin
        # URI#read is provided by open-uri. Unlike Kernel#open on a URL string
        # it also works on Ruby 3.0+, and it releases the fetched page itself.
        html = URI.parse(@url + num.to_s).read
        # The patterns are matched against the string form of the scan result,
        # exactly as before.
        snippet = html.to_s.scan(@regexp).to_s
        if @regexp_title =~ snippet
          print "#{n}:#{num.to_s}:#{$1},#{$2},"
          file << "#{$1},#{$2},"
        end
        if @regexp_jingdian =~ snippet
          print "#{$1}\n"
          file << "#{$1}\n"
        end
      rescue Timeout::Error
        puts "#{num} Time out"
      end
    end
  end
end
# Entry point: sets the URL prefix and the patterns in instance variables,
# then splits the page numbers 10376..38144 across 40 threads. Workers 1-39
# each take 694 consecutive numbers; worker 40 takes the rest up to 38144.
if $0 == __FILE__
  @url = 'http://jingdian.tuniu.com/fengjing/'
  @regexp = /\<h1\>.*span\>/
  @regexp_jingdian = /\<h1\>(.*)\<\/h1\>/
  @regexp_jingdian1 = /title=\".*\"\>(.*)\<\/a\>\//
  @regexp_jingdian2 = /href=.*"\>(.*)\<\/a\>"/
  @regexp_title = /href=.*"\>(.*)\<\/a\>\/<a.*"\>(.*)\<\/a\>/

  # from to 10376..38144
  threads = (1..40).map do |n|
    Thread.new do
      tem_from = 10376 + (n - 1) * 694
      # The last worker runs to the end of the range instead of a fixed-size slice.
      tem_to = n == 40 ? 38144 : tem_from + 693
      scrape(tem_from, tem_to, n)
    end
  end

  threads.each(&:join)
  puts "shit"
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
# coding: utf-8
#
# Single-threaded scraper: fetches url + number for every number in 1..38144
# and writes the text extracted from each page to data.txt, echoing it to
# stdout.
require 'uri'
# require 'iconv'  # not used by this script; iconv left the standard library in Ruby 2.0
require 'open-uri'

if $0 == __FILE__
  url = 'http://jingdian.tuniu.com/fengjing/'
  regexp = /\<h1\>.*span\>/
  regexp_jingdian = /\<h1\>(.*)\<\/h1\>/
  regexp_title = /href=.*"\>(.*)\<\/a\>\/<a.*"\>(.*)\<\/a\>/

  # The block form closes data.txt even when a request raises.
  File.open("data.txt", "w+") do |file|
    (1..38144).each do |num|
      begin
        # URI#read is provided by open-uri. Unlike Kernel#open on a URL string
        # it also works on Ruby 3.0+, and it releases the fetched page itself.
        html = URI.parse(url + num.to_s).read
        # The patterns are matched against the string form of the scan result,
        # exactly as before.
        snippet = html.to_s.scan(regexp).to_s
        if regexp_title =~ snippet
          print "#{$1},#{$2},"
          file << "#{$1},#{$2},"
        end
        if regexp_jingdian =~ snippet
          print "#{$1}\n"
          file << "#{$1}\n"
        end
      rescue Timeout::Error
        # Same handling as the multi-threaded scraper: report and move on.
        puts "#{num} Time out"
      end
    end
  end
end
发现上面的方法执行后存在景点名重复,并且很多景点最后都少了一个字
找到bug源自于
- 误用了String类的chop函数:chop总是删除字符串的最后一个字符(只有结尾是\r\n时才会把这两个字符一起删除),所以当字符串结尾不是换行符时就会删掉最后一个字;这里应该使用chomp
- 没有对area的desc的内容做判断是否重复后再进行插入
为了去除重复的地点名称,在每次插入desc时,加上一个include?判断是否有重复。这样以后程序每插入一条记录都要在大的treepath中查询,并对一大串desc做split include 这样的操作,程序肯定会非常非常慢。为了改进程序效率,这里先对all_data做了一点简单的处理:
sort -k1 all_data.txt >> new_data.txt
然后运行脚本 ./improve_data.rb >> improved_data.txt
得到一个改进后的数据文件,形如下面的文件 :
上海,南汇,上海南汇老港镇滨海农家乐|上海工商外国语学院|上海滨海农家乐|上海野生动物园|东海影视乐园|中荷玫瑰园|南汇区体育场|南汇区体育馆|南汇博物馆|卢潮港|古钟园|周浦公园|射击俱乐部|新场古镇|桃源民俗村|海港卡丁车运动场|滨海森林公园|芦潮港|鲜花港
上海,卢湾,上海公安博物馆|上海市跳水池|上海第二医科大学(上海交通大学医学部)|中共一大会址|南园公园|卢浦大桥|卢湾区工人体育场|周公馆|复兴公园|复兴坊|孙中山故居|延安中路大型公共绿地|新天地|梅兰坊|淮海公园|淮海坊|淮海路购物街|渔阳里弄|绍兴公园|韬奋纪念馆
上海,嘉定,上海大学悉尼工商学院|上海音乐学院|南翔双塔|南翔火车站|古漪园|古猗园|叶池|吴兴寺|嘉定体育中心|嘉定儿童公园|嘉定博物馆|四海壶具博物馆|国际赛车场|孔庙|安亭公园|安亭火车站|梦幻乐园|汇龙潭|汇龙潭公园|法华塔|浏河岛风景区|环球乐园|秋霞圃|衡山路|陆俨少艺术院|陶庵留碧碑
上海,奉贤,奉贤区体育中心|奉贤博物馆
最后运行改进版本的improved_from_data_to_treepath.rb即可快速得到新生成的xml,同样再次运行原来的from_treepath_to_keywords.rb获得keywords文件
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
使用from_data_to_treepath.rb 脚本把all_data.txt中的数据全部插入到已有的treepath中,得到一个汇总的xml文件
再通过from_treepath_to_keywords.rb 脚本把汇总的xml整理为keywords形式,重定向到一个keywords文件