## pppoe/EventParser.rb

## douban.rb
require 'open-uri'
require 'rubygems'
require 'nokogiri'
require 'eventParser'

# Crawler entry point: fetches the Douban Hangzhou location page, collects
# every distinct link whose href contains "/event/", and prints the details
# EventParser extracts for each of them.
rootURL = "http://www.douban.com/location/hangzhou/"

# Fetch through URI#open (provided by open-uri) instead of Kernel#open:
# Kernel#open no longer fetches URLs on Ruby 3.0+, and it would treat a
# string starting with "|" as a command to run. The block form closes the
# underlying handle once the page has been parsed.
doc = URI.parse(rootURL).open { |html| Nokogiri::HTML(html) }

# Anchors without an href attribute yield nil and are skipped.
hrefs = doc.css('a').map { |anchor| anchor['href'] }
eventLinks = hrefs.select { |href| href && href =~ /\/event\// }.uniq

eventLinks.each { |link|

    puts "Parsing...#{link}"

    detail = EventParser.parse(link)

    # Array values (grouped event info) are shown comma-separated; every
    # other value is printed as-is.
    detail.each { |key, value|
        shown = value.is_a?(Array) ? value.join(', ') : value
        puts "#{key} \t\t #{shown}"
    }

    sleep(1) # prevent continual request
}

## EventParser.rb
require 'open-uri'
require 'rubygems'
require 'nokogiri'
require 'photoParser'

#rootURL = "http://www.douban.com/event/13967079/"

# Scrapes a single event page (e.g. http://www.douban.com/event/<id>/) into
# a Hash of its details.
class EventParser
    # Matches text that is empty or consists only of whitespace. For UTF-8
    # strings [[:space:]] also covers the non-breaking space (U+00A0), which
    # String#strip leaves in place.
    BLANK_TEXT = /\A[[:space:]]*\z/

    # Downloads the page at rootURL and extracts the event details.
    #
    # rootURL - String URL of the event page.
    #
    # Returns a Hash with:
    #   'Title'      => String, the text of the page's <h1> element(s)
    #   time label   => String or nil; the i-th label under div#info is paired
    #                   with the i-th direct text node of div#info
    #   event label  => Array of Strings; the non-blank texts that follow the
    #                   label inside div#info/div.obmo, up to the next label
    #
    # Side effect: prints the href of the first photo link (a blank line when
    # the page has none) to stdout.
    #
    # Raises ArgumentError when rootURL is not a URL that open-uri can fetch;
    # URI.parse raises URI::InvalidURIError for a malformed URL.
    def self.parse(rootURL)
        linkDoc = fetch_document(rootURL)

        detail = {}
        detail['Title'] = linkDoc.xpath('//h1').text

        timeInfoKeys = node_texts(linkDoc, "//div[@id='info']/span[@class='pl']/text()")
        timeInfoContents = node_texts(linkDoc, "//div[@id='info']/text()")
        # zip pads with nil, so a label without a matching text maps to nil.
        timeInfoKeys.zip(timeInfoContents) { |key, value| detail[key] = value }

        eventInfoKeys = node_texts(linkDoc, "//div[@id='info']/div[@class='obmo']/span[@class='pl']/text()")
        eventInfoContents = node_texts(linkDoc, "//div[@id='info']/div[@class='obmo']//text()")

        # Blank labels are dropped as well: blank texts are removed below, so
        # such a label could never be matched.
        eventInfoKeys = eventInfoKeys.map { |text| text.strip }.reject { |text| text =~ BLANK_TEXT }
        eventInfoContents = eventInfoContents.map { |text| text.strip }.reject { |text| text =~ BLANK_TEXT }

        detail.update(group_values_by_label(eventInfoKeys, eventInfoContents))

        # TODO: only the first photo link is located; the photos behind it are
        # not parsed yet.
        photoLink = linkDoc.xpath("//div[@class='related_info']//div[@class='pic']//a[1]").first
        photoURL = photoLink && photoLink['href']

        puts photoURL

        detail
    end

    # Fetches url and returns the parsed Nokogiri HTML document.
    #
    # Goes through URI#open (added by open-uri) rather than Kernel#open, which
    # no longer fetches URLs on Ruby 3.0+ and would run a string starting with
    # "|" as a command. The block form closes the handle after parsing.
    def self.fetch_document(url)
        uri = URI.parse(url)
        unless uri.respond_to?(:open)
            raise ArgumentError, "cannot fetch #{url.inspect}: not a URL open-uri supports"
        end
        uri.open { |io| Nokogiri::HTML(io) }
    end

    # Returns the text of every node matched by xpath in doc, as an Array of
    # Strings in document order.
    def self.node_texts(doc, xpath)
        doc.xpath(xpath).map { |node| node.text }
    end

    # Splits texts into one Array per label.
    #
    # labels - Array of label Strings, in the order they occur in texts.
    # texts  - Array of Strings containing the labels interleaved with their
    #          values.
    #
    # Each text equal to the next expected label opens a new (initially
    # empty) group; every other text is appended to the currently open group.
    # Texts before the first label are ignored, and a label that never shows
    # up in texts gets no entry.
    #
    # Returns a Hash of label => Array of value Strings.
    def self.group_values_by_label(labels, texts)
        grouped = {}
        pending = labels.dup
        current = nil

        texts.each do |text|
            if text == pending.first
                current = pending.shift
                grouped[current] = []
            elsif current
                grouped[current] << text
            end
        end

        grouped
    end

    private_class_method :fetch_document, :node_texts, :group_values_by_label
end
	require 'open-uri'
	require 'rubygems'
	require 'nokogiri'
	require 'eventParser'

	# Crawler entry point: fetches the Douban Hangzhou location page, collects
	# every distinct link whose href contains "/event/", and prints the details
	# EventParser extracts for each of them.
	rootURL = "http://www.douban.com/location/hangzhou/"

	# Fetch through URI#open (provided by open-uri) instead of Kernel#open:
	# Kernel#open no longer fetches URLs on Ruby 3.0+, and it would treat a
	# string starting with "|" as a command to run. The block form closes the
	# underlying handle once the page has been parsed.
	doc = URI.parse(rootURL).open { |html| Nokogiri::HTML(html) }

	# Anchors without an href attribute yield nil and are skipped.
	hrefs = doc.css('a').map { |anchor| anchor['href'] }
	eventLinks = hrefs.select { |href| href && href =~ /\/event\// }.uniq

	eventLinks.each { |link|

	    puts "Parsing...#{link}"

	    detail = EventParser.parse(link)

	    # Array values (grouped event info) are shown comma-separated; every
	    # other value is printed as-is.
	    detail.each { |key, value|
	        shown = value.is_a?(Array) ? value.join(', ') : value
	        puts "#{key} \t\t #{shown}"
	    }

	    sleep(1) # prevent continual request
	}