parano/dom-like-parsing.rb

## dom-like-parsing.rb
#!/usr/bin/ruby -w

require 'rexml/document'
include REXML

# Print the "key" attribute of every <prov> element found under
# configuration/property/class in tour_treepath_area.xml.
#
# The block form of File.open closes the XML file once the document has been
# built; the original File.new handle was never closed.
xmldoc = File.open("tour_treepath_area.xml") { |xmlfile| Document.new(xmlfile) }

# Debugging aids kept from the original script:
#   puts xmldoc.root.attributes["ver"]
#   xmldoc.elements.each("configuration/property/class") { |e| puts e.attributes["key"] }

xmldoc.elements.each("configuration/property/class") do |class_node|
  class_node.elements.each("prov") do |prov_node|
    puts prov_node.attributes["key"]
  end
end

## from_data_to_treepath.rb
#!/usr/bin/ruby -w
# coding: utf-8

require 'rexml/document'
include REXML

if $0 == __FILE__
  # Merge the comma-separated records of all_data.txt (prov key, area key,
  # description) into the tour_treepath_area.xml tree and write the result to
  # results.xml.
  #
  # Every file is opened in block form so it is closed (and, for the output,
  # flushed) even when an error is raised; the original never closed
  # results.xml.
  doc = File.open("tour_treepath_area.xml") { |xml| Document.new(xml) }

  # <prov> elements are never added below, so collect them once up front
  # instead of re-running the same XPath for every input line.
  all_provs = doc.elements.to_a("*/*/*/prov")

  File.foreach("all_data.txt") do |line|
    # Skip blank lines rather than reporting an empty province key.
    next if line.strip.empty?

    # Remove every whitespace character from each field.
    prov_key, area_key, desc = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }

    # Attribute values are compared in Ruby instead of being interpolated into
    # an XPath string, so a quote in the data cannot break the query.
    provs = all_provs.select { |prov| prov.attributes["key"] == prov_key }
    if provs.empty?
      puts "#{prov_key} doesn't exist!"
      next
    end

    areas = provs.flat_map { |prov| prov.elements.to_a("area") }
    area = areas.find { |candidate| candidate.attributes["key"] == area_key.to_s }

    if area.nil?
      # Unknown area: create it under the first matching province.
      provs.first.add_element "area", {"key" => area_key.to_s, "desc" => desc.to_s}
    else
      # Known area: append the description, tolerating a missing desc attribute.
      existing = area.attributes["desc"]
      area.attributes["desc"] = existing.nil? ? desc.to_s : "#{existing}|#{desc}"
    end
  end

  File.open("results.xml", 'w') { |out| out.puts doc.to_s }
end

## from_treepath_keywords.rb
#!/usr/bin/ruby -w
# coding: utf-8

require 'rexml/document'
include REXML

if $0 == __FILE__
  # Print one keyword line per <area> element of treepath.xml: the area's
  # '|'-separated descriptions re-joined with '$', followed by the fixed
  # "|A|" marker and the class / prov / area keys separated by '#'.
  #
  # The block form of File.open closes the input once the tree is built.
  doc = File.open("treepath.xml") { |xml| Document.new(xml) }
  #doc = Document.new File.new "tour_treepath_area.xml"

  doc.elements.each("configuration/property/class") do |e_class|
    place = e_class.attributes["key"]
    e_class.elements.each("prov") do |e_prov|
      prov = e_prov.attributes["key"]
      e_prov.elements.each("area") do |e_area|
        area = e_area.attributes["key"]
        # to_s guards against an <area> that has no desc attribute, which
        # previously raised NoMethodError on nil.
        desc = e_area.attributes["desc"].to_s.split('|').join('$')
        puts "#{desc}|A|地区##{place}##{prov}##{area}"
      end
    end
  end
end

## improve_data.rb
#!/usr/bin/ruby -w
# coding: utf-8

require 'rexml/document'
include REXML

if $0 == __FILE__
  # Collapse consecutive new_data.txt records that share the same first two
  # fields into a single output line whose third fields are joined with '|'.
  # Output goes to stdout; each new group starts with a newline, so the
  # output begins with an empty line and has no trailing newline.
  previous = []

  # File.foreach closes the file even if an error is raised mid-way.
  File.foreach("new_data.txt") do |line|
    # Remove every whitespace character from each field. A separately named
    # block parameter avoids the shadowing warning the original emitted
    # under -w.
    current = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }

    if current[0] == previous[0] && current[1] == previous[1]
      print "|#{current[2]}"
    else
      print "\n#{current[0]},#{current[1]},#{current[2]}"
    end

    previous = current
  end
end

## improved_from_data_to_treepath.rb
#!/usr/bin/ruby -w
# coding: utf-8

require 'rexml/document'
include REXML

if $0 == __FILE__
  # Merge the comma-separated records of improved_data.txt (prov key, area
  # key, '|'-separated descriptions) into the tour_treepath_area.xml tree and
  # write the result to results.xml. Descriptions appended to an existing
  # area are de-duplicated.
  #
  # Every file is opened in block form so it is closed (and, for the output,
  # flushed) even when an error is raised; the original never closed
  # results.xml.
  doc = File.open("tour_treepath_area.xml") { |xml| Document.new(xml) }

  # <prov> elements are never added below, so collect them once up front
  # instead of re-running the same XPath for every input line.
  all_provs = doc.elements.to_a("*/*/*/prov")

  File.foreach("improved_data.txt") do |line|
    # Skip blank lines rather than reporting an empty province key.
    next if line.strip.empty?

    # Remove every whitespace character from each field.
    prov_key, area_key, desc = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }

    # Attribute values are compared in Ruby instead of being interpolated into
    # an XPath string, so a quote in the data cannot break the query.
    provs = all_provs.select { |prov| prov.attributes["key"] == prov_key }
    if provs.empty?
      puts "#{prov_key} doesn't exist!"
      next
    end

    areas = provs.flat_map { |prov| prov.elements.to_a("area") }
    area = areas.find { |candidate| candidate.attributes["key"] == area_key.to_s }

    if area.nil?
      # Unknown area: create it under the first matching province.
      provs.first.add_element "area", {"key" => area_key.to_s, "desc" => desc.to_s}
    else
      # Known area: append, then drop repeated descriptions. A missing desc
      # attribute is treated as "nothing recorded yet".
      existing = area.attributes["desc"]
      merged = existing.nil? ? desc.to_s : "#{existing}|#{desc}"
      area.attributes["desc"] = merged.split('|').uniq.join('|')
    end
  end

  File.open("results.xml", 'w') { |out| out.puts doc.to_s }
end

## multithread.rb
#!/usr/bin/ruby
# coding: utf-8
require 'uri'
#require 'iconv'
require 'open-uri'
require 'thread'


# Fetch pages @url + from .. @url + to and record what the configured
# regular expressions capture, both on stdout and in data<n>.txt.
#
# from, to - first and last page number (inclusive) to fetch.
# n        - worker number; used in the output file name and the progress
#            prefix printed for each page.
#
# Reads @url, @regexp, @regexp_title and @regexp_jingdian from the caller's
# instance variables. A Timeout::Error on a page is reported and skipped; any
# other error propagates to the caller. Returns nil.
def scrape(from,to,n)
  # Block form guarantees the output file is closed even when a fetch raises
  # something other than a timeout.
  File.open("data#{n}.txt","w+") do |file|
    (from..to).each do |num|
      begin
        # URI#read comes from open-uri; Kernel#open no longer opens URLs on
        # Ruby 3.0+.
        html = URI.parse(@url + num.to_s).read

        # scan returns an Array; the original matched against that array's
        # to_s, so the same stringification is kept here.
        text = html.to_s.scan(@regexp).to_s

        if @regexp_title =~ text
          print "#{n}:#{num.to_s}:#{$1},#{$2},"
          file << "#{$1},#{$2},"
        end

        if @regexp_jingdian =~ text
          print "#{$1}\n"
          file << "#{$1}\n"
        end
      rescue Timeout::Error
        puts "#{num} Time out"
      end
    end
  end

  nil
end

if __FILE__ == $0
  # Scraper configuration. scrape reads @url, @regexp, @regexp_title and
  # @regexp_jingdian; the two numbered variants are not referenced by the
  # code in this script.
  @url = 'http://jingdian.tuniu.com/fengjing/'
  @regexp = /\<h1\>.*span\>/
  @regexp_jingdian = /\<h1\>(.*)\<\/h1\>/
  @regexp_jingdian1 = /title=\".*\"\>(.*)\<\/a\>\//
  @regexp_jingdian2 = /href=.*"\>(.*)\<\/a\>"/
  @regexp_title = /href=.*"\>(.*)\<\/a\>\/<a.*"\>(.*)\<\/a\>/

  # Pages 10376..38144 are divided among 40 threads: workers 1-39 take 694
  # consecutive pages each, and worker 40 takes everything that remains up
  # to 38144.
  workers = (1..40).map do |n|
    Thread.new do
      tem_from = 10376 + (n - 1) * 694
      tem_to = n == 40 ? 38144 : tem_from + 693
      scrape(tem_from, tem_to, n)
    end
  end

  # Wait for every worker before announcing completion.
  workers.each(&:join)
  puts "shit"
end

## singlethread.rb
#!/usr/bin/ruby
# coding: utf-8
require 'uri'
require 'iconv'
require 'open-uri'

if $0 == __FILE__
  # Sequentially fetch pages 1..38144 below url and record what the regular
  # expressions capture, both on stdout and in data.txt.
  url = 'http://jingdian.tuniu.com/fengjing/'
  regexp = /\<h1\>.*span\>/
  regexp_jingdian = /\<h1\>(.*)\<\/h1\>/
  regexp_title = /href=.*"\>(.*)\<\/a\>\/<a.*"\>(.*)\<\/a\>/

  # Block form guarantees data.txt is closed even when a fetch raises.
  File.open("data.txt","w+") do |file|
    (1..38144).each do |num|
      # URI#read comes from open-uri; Kernel#open no longer opens URLs on
      # Ruby 3.0+.
      html = URI.parse(url + num.to_s).read

      # scan returns an Array; the original matched against that array's
      # to_s, so the same stringification is kept here.
      text = html.to_s.scan(regexp).to_s

      if regexp_title =~ text
        print "#{$1},#{$2},"
        file << "#{$1},#{$2},"
      end

      if regexp_jingdian =~ text
        print "#{$1}\n"
        file << "#{$1}\n"
      end
    end
  end
end
	#!/usr/bin/ruby -w

	require 'rexml/document'
	include REXML

	# Print the "key" attribute of every <prov> element found under
	# configuration/property/class in tour_treepath_area.xml.
	#
	# The block parameters were written as \|e\| here, which is not valid Ruby;
	# they are plain |...| again. The block form of File.open also closes the
	# XML file once the document has been built.
	xmldoc = File.open("tour_treepath_area.xml") { |xmlfile| Document.new(xmlfile) }

	# Debugging aids kept from the original script:
	#   puts xmldoc.root.attributes["ver"]
	#   xmldoc.elements.each("configuration/property/class") { |e| puts e.attributes["key"] }

	xmldoc.elements.each("configuration/property/class") do |class_node|
	  class_node.elements.each("prov") do |prov_node|
	    puts prov_node.attributes["key"]
	  end
	end
	#!/usr/bin/ruby -w
	# coding: utf-8

	require 'rexml/document'
	include REXML

	if $0 == __FILE__
	  # Merge the comma-separated records of all_data.txt (prov key, area key,
	  # description) into the tour_treepath_area.xml tree and write the result
	  # to results.xml.
	  #
	  # The block parameters were written as \|x\| here, which is not valid
	  # Ruby; they are plain |...| again. Every file is opened in block form so
	  # it is closed (and, for the output, flushed) even when an error is
	  # raised.
	  doc = File.open("tour_treepath_area.xml") { |xml| Document.new(xml) }

	  # <prov> elements are never added below, so collect them once up front.
	  # The path is the four-level */*/*/prov form used by the other copy of
	  # this script in the file.
	  all_provs = doc.elements.to_a("*/*/*/prov")

	  File.foreach("all_data.txt") do |line|
	    # Skip blank lines rather than reporting an empty province key.
	    next if line.strip.empty?

	    # Remove every whitespace character from each field.
	    prov_key, area_key, desc = line.split(",").map { |field| field.gsub(/[\n\s]/, '') }

	    # Attribute values are compared in Ruby instead of being interpolated
	    # into an XPath string, so a quote in the data cannot break the query.
	    provs = all_provs.select { |prov| prov.attributes["key"] == prov_key }
	    if provs.empty?
	      puts "#{prov_key} doesn't exist!"
	      next
	    end

	    areas = provs.flat_map { |prov| prov.elements.to_a("area") }
	    area = areas.find { |candidate| candidate.attributes["key"] == area_key.to_s }

	    if area.nil?
	      # Unknown area: create it under the first matching province.
	      provs.first.add_element "area", {"key" => area_key.to_s, "desc" => desc.to_s}
	    else
	      # Known area: append the description, tolerating a missing desc.
	      existing = area.attributes["desc"]
	      area.attributes["desc"] = existing.nil? ? desc.to_s : "#{existing}|#{desc}"
	    end
	  end

	  File.open("results.xml", 'w') { |out| out.puts doc.to_s }
	end
	#!/usr/bin/ruby
	# coding: utf-8
	require 'uri'
	#require 'iconv'
	require 'open-uri'
	require 'thread'


	# Fetch pages @url + from .. @url + to and record what the configured
	# regular expressions capture, both on stdout and in data<n>.txt.
	#
	# from, to - first and last page number (inclusive) to fetch.
	# n        - worker number; used in the output file name and the progress
	#            prefix printed for each page.
	#
	# Reads @url, @regexp, @regexp_title and @regexp_jingdian from the caller's
	# instance variables. A Timeout::Error on a page is reported and skipped;
	# any other error propagates to the caller. Returns nil.
	def scrape(from,to,n)
	  # Block form guarantees the output file is closed even when a fetch
	  # raises something other than a timeout.
	  File.open("data#{n}.txt","w+") do |file|
	    (from..to).each do |num|
	      begin
	        # URI#read comes from open-uri; Kernel#open no longer opens URLs
	        # on Ruby 3.0+.
	        html = URI.parse(@url + num.to_s).read

	        # scan returns an Array; the original matched against that array's
	        # to_s, so the same stringification is kept here.
	        text = html.to_s.scan(@regexp).to_s

	        if @regexp_title =~ text
	          print "#{n}:#{num.to_s}:#{$1},#{$2},"
	          file << "#{$1},#{$2},"
	        end

	        if @regexp_jingdian =~ text
	          print "#{$1}\n"
	          file << "#{$1}\n"
	        end
	      rescue Timeout::Error
	        puts "#{num} Time out"
	      end
	    end
	  end

	  nil
	end

	if $0 == __FILE__
	  # Scraper configuration. scrape reads @url, @regexp, @regexp_title and
	  # @regexp_jingdian; the two numbered variants are not referenced by the
	  # code in this script.
	  #
	  # The last three patterns had lost their "*" quantifiers in this copy
	  # (".*" had become "."), so they could match a single character only;
	  # they are restored to the form used by the other copy in the file.
	  @url = 'http://jingdian.tuniu.com/fengjing/'
	  @regexp = /\<h1\>.*span\>/
	  @regexp_jingdian = /\<h1\>(.*)\<\/h1\>/
	  @regexp_jingdian1 = /title=\".*\"\>(.*)\<\/a\>\//
	  @regexp_jingdian2 = /href=.*"\>(.*)\<\/a\>"/
	  @regexp_title = /href=.*"\>(.*)\<\/a\>\/<a.*"\>(.*)\<\/a\>/

	  # Pages 10376..38144 are divided among 40 threads: workers 1-39 take 694
	  # consecutive pages each, and worker 40 takes everything that remains up
	  # to 38144. Block parameters are plain |n| again instead of the invalid
	  # \|n\| escapes.
	  threads = (1..40).map do |n|
	    Thread.new do
	      tem_from = 10376 + (n - 1) * 694
	      tem_to = n == 40 ? 38144 : tem_from + 693
	      scrape(tem_from, tem_to, n)
	    end
	  end

	  # Wait for every worker before announcing completion.
	  threads.each { |t| t.join }
	  puts "shit"
	end