Skip to content

Instantly share code, notes, and snippets.

@pppoe
Created June 2, 2011 09:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pppoe/1004156 to your computer and use it in GitHub Desktop.
Save pppoe/1004156 to your computer and use it in GitHub Desktop.
Fetches information about events from Douban; for learning purposes only.
require 'open-uri'
require 'rubygems'
require 'nokogiri'
require 'eventParser'
# Driver script: collects every event link found on the Douban Hangzhou
# location page and prints the fields EventParser extracts for each event.
root_url = "http://www.douban.com/location/hangzhou/"

# URI.open (provided by open-uri) is used instead of Kernel#open, which no
# longer fetches URLs on Ruby 3.0+. The block form closes the response as soon
# as Nokogiri has parsed it.
doc = URI.open(root_url) { |html| Nokogiri::HTML(html) }

# Anchors without an href yield nil, so those are dropped before matching.
event_links = doc.css('a')
                 .map { |anchor| anchor['href'] }
                 .compact
                 .grep(%r{/event/})
                 .uniq

event_links.each do |link|
  puts "Parsing...#{link}"
  EventParser.parse(link).each do |key, value|
    # Multi-valued fields come back as arrays; show them on a single line.
    shown = value.is_a?(Array) ? value.join(', ') : value
    puts "#{key} \t\t #{shown}"
  end
  sleep(1) # pause between requests so the site is not hit continuously
end
require 'open-uri'
require 'rubygems'
require 'nokogiri'
require 'photoParser'
#rootURL = "http://www.douban.com/event/13967079/"
# Scrapes a single Douban event page into a Hash of labelled fields.
class EventParser
  # XPath of the block that holds the event's summary information.
  INFO_XPATH = "//div[@id='info']".freeze
  # XPath of the labelled sub-sections inside the summary block.
  SECTION_XPATH = "#{INFO_XPATH}/div[@class='obmo']".freeze
  # XPath of the first link inside each picture box of the related-info area.
  PHOTO_LINK_XPATH = "//div[@class='related_info']//div[@class='pic']//a[1]".freeze
  # Leading/trailing whitespace. [[:space:]] also covers the non-breaking space
  # (U+00A0) in UTF-8 text, which String#strip leaves in place.
  EDGE_WHITESPACE = /\A[[:space:]]+|[[:space:]]+\z/

  # Downloads and parses one event page.
  #
  # @param rootURL [String] URL of the event page
  # @return [Hash] 'Title' => String, each label found directly under the info
  #   block => the text node at the same position, and each label found in an
  #   'obmo' section => Array of the text fragments that follow it
  #
  # Side effect: prints the href of the first photo link (nil prints as an
  # empty line); photo parsing itself is not wired in.
  def self.parse(rootURL)
    # URI.open (open-uri) rather than Kernel#open, which no longer fetches
    # URLs on Ruby 3.0+; the block form closes the response after parsing.
    linkDoc = URI.open(rootURL) { |html| Nokogiri::HTML(html) }
    detail = { 'Title' => linkDoc.xpath('//h1').text }

    # Labels and the text nodes directly under the info block are paired by
    # position; a label without a counterpart maps to nil.
    # NOTE(review): this assumes the n-th label lines up with the n-th direct
    # text node -- confirm against the real page markup.
    time_keys = node_texts(linkDoc, "#{INFO_XPATH}/span[@class='pl']/text()")
    time_values = node_texts(linkDoc, "#{INFO_XPATH}/text()")
    time_keys.zip(time_values) { |label, text| detail[label] = text }

    section_keys = trimmed_texts(node_texts(linkDoc, "#{SECTION_XPATH}/span[@class='pl']/text()"))
    section_texts = trimmed_texts(node_texts(linkDoc, "#{SECTION_XPATH}//text()"))
    detail.merge!(group_by_label(section_keys, section_texts))

    puts linkDoc.xpath(PHOTO_LINK_XPATH).attribute('href')

    detail
  end

  # Returns the text of every node matched by +query+.
  def self.node_texts(doc, query)
    doc.xpath(query).map(&:text)
  end

  # Trims surrounding whitespace from each string and drops the ones left empty.
  def self.trimmed_texts(texts)
    texts.map { |text| text.gsub(EDGE_WHITESPACE, '') }.reject(&:empty?)
  end

  # Splits +texts+ (all fragments in document order, labels included) into
  # label => [fragments that follow that label].
  #
  # Labels are matched in the order given. Fragments that appear before the
  # first label are ignored, a label followed directly by the next label gets
  # an empty array, and the last label collects everything after it.
  def self.group_by_label(labels, texts)
    pending = labels.dup
    grouped = {}
    current = nil
    texts.each do |text|
      if !pending.empty? && text == pending.first
        current = pending.shift
        grouped[current] = []
      elsif current
        grouped[current] << text
      end
    end
    grouped
  end

  private_class_method :node_texts, :trimmed_texts, :group_by_label
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment