Skip to content

Instantly share code, notes, and snippets.

@shuma
Created March 5, 2012 21:02
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save shuma/1981074 to your computer and use it in GitHub Desktop.
Crawling 43612 XML files
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'rexml/document'
require 'csv'
include REXML
# Links to the individual XML documents, filled from the index page.
@urls = []

# Column accumulators for the CSV export. They are read back by position,
# so entry N of every array is meant to describe the same XML document.
# Values taken from attributes of the document's root element:
@ID = []
@lastedited = []
@expires = []

# Values taken from the text of child elements:
@titleSv = []
@titleEn = []
@identifier = []
@typeOfLevel = []
@typeOfResponsibleBody = []
@courseTyp = []
@credits = []
@degree = []
@preAcademic = []
@subjectCodeVhs = []
@descriptionSv = []

# Declared but not populated anywhere in the visible part of the script.
@visibleToSweApplicants = []
# Fetch the index page that lists every XML export.
# URI#open (provided by open-uri) is used instead of Kernel#open because
# Kernel#open no longer accepts URLs on Ruby 3.0+; the block form also closes
# the downloaded stream once Nokogiri has parsed it.
index_url = 'http://testnavet.skolverket.se/SusaNavExport/EmilExporter?GetEvent&EMILVersion=1.1&NotExpired&EEFormOfStudy=normal&EIAcademicType=UoH&SelectEI'
htmldoc = URI.parse(index_url).open { |page| Nokogiri::HTML(page) }

# Collect the href of every anchor on the page; these are the XML file links,
# stored in the @urls array.
@urls.concat(htmldoc.xpath('//a/@href').map { |href| href.content })
# Returns the text of the first element matched by +xpath+ whose text is not
# empty, or nil when there is no such element. When a block is given, every
# candidate text is passed through it before the emptiness check.
def first_text(xmldoc, xpath)
  xmldoc.elements.each(xpath) do |element|
    text = element.text.to_s
    text = yield(text) if block_given?
    return text unless text.empty?
  end
  nil
end

# Loop through the XML files and write one CSV row per document.
#
# Every document contributes exactly one value (possibly nil) to each column
# array, so index N of every array refers to the same document. Pushing zero
# or several values per document would shift later rows out of alignment.
#
# The CSV file is opened once and a row is appended per document instead of
# rewriting the whole file on every iteration. Because the block form of
# CSV.open closes the file on the way out, rows already written are kept if a
# later document raises.
CSV.open("eduction_normal.csv", "wb") do |csv|
  @urls.each do |url|
    # URI#read (open-uri) only fetches http/https/ftp URLs and closes the
    # stream itself. Kernel#open on a scraped href could open a local file or
    # spawn a "|command" pipe, and it no longer accepts URLs on Ruby 3.0+.
    xmldoc = REXML::Document.new(URI.parse(url).read)
    # Root element
    root = xmldoc.root

    # Swedish description with anything that looks like a markup tag removed.
    description_sv = first_text(xmldoc, "/educationInfo/extensionInfo/nya:textualDescription/nya:textualDescriptionPhrase | /ns:educationInfo/ns:extensionInfo/gu:guInfoExtensions/gu:guSubject/gu:descriptions/gu:description | //*[name()='ct:text']") { |text| text.gsub(/<.+?>/, "") }

    # [column array, value for this document], listed in CSV column order.
    record = [
      [@ID, root.attributes["id"]],
      [@titleSv, first_text(xmldoc, "/educationInfo/titles/title[1] | /ns:educationInfo/ns:titles/ns:title[1]")],
      # The second title goes to @titleEn (it was previously appended to @titleSv).
      [@titleEn, first_text(xmldoc, "/educationInfo/titles/title[2] | /ns:educationInfo/ns:titles/ns:title[2]")],
      [@identifier, first_text(xmldoc, "/educationInfo/identifier | /ns:educationInfo/ns:identifier")],
      [@typeOfLevel, first_text(xmldoc, "/educationInfo/educationLevelDetails/typeOfLevel | /ns:educationInfo/ns:educationLevelDetails/ns:typeOfLevel")],
      [@typeOfResponsibleBody, first_text(xmldoc, "/educationInfo/educationLevelDetails/typeOfResponsibleBody | /ns:educationInfo/ns:educationLevelDetails/ns:typeOfResponsibleBody")],
      [@courseTyp, first_text(xmldoc, "/educationInfo/educationLevelDetails/academic/course/type | /ns:educationInfo/ns:educationLevelDetails/ns:academic/ns:courseOfferingPackage/ns:type")],
      [@credits, first_text(xmldoc, "/educationInfo/credits/exact | /ns:educationInfo/ns:credits/ns:exact")],
      [@degree, first_text(xmldoc, "/educationInfo/degrees/degree | /ns:educationInfo/ns:degrees/ns:degree")],
      [@preAcademic, first_text(xmldoc, "/educationInfo/prerequisites/academic | /ns:educationInfo/ns:prerequisites/ns:academic")],
      [@subjectCodeVhs, first_text(xmldoc, "/educationInfo/subjects/subject/code[1] | /ns:educationInfo/ns:subjects/ns:subject/ns:code")],
      [@descriptionSv, description_sv],
      # Last-edited and expiry dates come from attributes of the root element.
      [@lastedited, root.attributes["lastEdited"]],
      [@expires, root.attributes["expires"]]
    ]

    # Keep the column arrays populated (aligned by document) and emit the row.
    record.each { |column, value| column << value }
    csv << record.map { |_column, value| value }
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment