Meduni Wien Campus Crawler
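// Package crawler implements a web crawler for the MedUni Wien campus
// information system (https://campus.meduniwien.ac.at) that extracts
// organisations, persons, lectures, lecture dates and locations as
// server.ImportItem records.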
package crawler
import (
"crypto/tls"
"encoding/json"
"errors"
"io/ioutil"
"log"
"math/big"
"net/http"
"net/url"
"os"
"regexp"
"strconv"
"strings"
"time"

"github.com/moovweb/gokogiri"
"github.com/moovweb/gokogiri/html"
"github.com/moovweb/gokogiri/xml"

"theduke.at/docduke/server"
)
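// Result describes the outcome of crawling a single URL.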
type Result struct {
Url string
RedirectedUrl string
Success bool
TimeStarted time.Time
TimeFinished time.Time
Result *server.ImportItem
Error error
}
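// Crawler fetches pages concurrently, follows links accepted by ShouldQueueUrl
// and hands every fetched page to the Handle callback.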
type Crawler struct {
Client *http.Client
workerCount int
handledUrls map[string]bool
totalWorkerTime *big.Int
totalWorkerTries int64
crawlQueueCount int // number of crawls currently in flight
urlQueue chan string
Urls map[string]bool
resultQueue chan Result
ShouldQueueUrl func(string) string
Handle func(string, *html.HtmlDocument, []byte, *Crawler) (*server.ImportItem, error)
OnFinish func(*Crawler)
ResultCounter map[string]int
UrlFile string
DataFile string
}
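// NewCrawler creates a Crawler that runs at most workerCount requests concurrently.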
func NewCrawler(workerCount int) *Crawler {
c := Crawler{
workerCount: workerCount,
}
c.totalWorkerTime = big.NewInt(0)
c.handledUrls = make(map[string]bool)
c.ResultCounter = make(map[string]int)
c.Urls = make(map[string]bool)
c.urlQueue = make(chan string, 9999999)
c.resultQueue = make(chan Result, workerCount*10)
c.Client = &http.Client{
Transport: &http.Transport{
// Skip TLS certificate verification for the HTTPS campus endpoints.
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
},
}
return &c
}
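// Finish runs the OnFinish callback and logs how many items of each type were found.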
func (c *Crawler) Finish() {
if c.OnFinish != nil {
c.OnFinish(c)
}
log.Println("All done!")
log.Println("Results:")
for key := range c.ResultCounter {
log.Printf("%v: %v\n", key, c.ResultCounter[key])
}
}
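// Crawl seeds the queue with the given start URLs and runs the scheduling loop
// until no URLs are queued and no crawls are in flight, then calls Finish.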
func (c *Crawler) Crawl(urls []string) {
// Queue initial urls.
for _, url := range urls {
c.urlQueue <- url
}
lastReport := time.Now()
var lastTotalWorkerTries int64 = 0
var urlFile *os.File
if c.UrlFile != "" {
f, err := os.Create(c.UrlFile)
if err == nil {
urlFile = f
defer urlFile.Close()
} else {
panic("Could not create url file at " + c.UrlFile)
}
}
var dataFile *os.File
if c.DataFile != "" {
f, err := os.Create(c.DataFile)
if err == nil {
dataFile = f
defer dataFile.Close()
} else {
panic("Could not create data file at " + c.DataFile)
}
}
for {
select {
case url := <-c.urlQueue:
_, alreadyHandled := c.handledUrls[url]
_, alreadyQueued := c.Urls[url]
if !(alreadyHandled || alreadyQueued) {
c.Urls[url] = true
}
case result := <-c.resultQueue:
if result.Success {
// Write item to file if one was found.
if result.Result != nil {
// Note: a missing key counts as 0.
c.ResultCounter[result.Result.Type]++
if dataFile != nil {
dataFile.Write(result.Result.JSON)
dataFile.WriteString("\n")
}
}
if result.RedirectedUrl != "" {
// Request was redirected.
// Prevent re-crawling the same page with redirected URL.
c.handledUrls[result.RedirectedUrl] = true
}
} else {
log.Printf("Error at %v: %v\n", result.Url, result.Error)
}
c.crawlQueueCount--
// Add time stats.
c.totalWorkerTries++
milliseconds := big.NewInt(result.TimeFinished.Sub(result.TimeStarted).Nanoseconds() / int64(time.Millisecond))
c.totalWorkerTime.Add(c.totalWorkerTime, milliseconds)
default:
// Nothing received; avoid busy-spinning the scheduling loop.
time.Sleep(time.Millisecond)
}
if len(c.Urls) < 1 && c.crawlQueueCount < 1 {
// Done!
break
}
// Start new crawl if appropriate.
if c.crawlQueueCount < c.workerCount && len(c.Urls) > 0 {
url := ""
for key := range c.Urls {
url = key
break
}
// Delete from queue.
delete(c.Urls, url)
// Only process url if it was not already handled.
if _, ok := c.handledUrls[url]; !ok {
c.crawlQueueCount++
c.handledUrls[url] = true
go crawlUrl(url, c)
if urlFile != nil {
urlFile.WriteString(url + "\n")
}
}
}
if c.totalWorkerTries > 0 && time.Since(lastReport).Seconds() >= 5 {
queuedUrls := len(c.Urls)
percent := 0.0
if c.totalWorkerTries+int64(queuedUrls) > 0 {
percent = float64(c.totalWorkerTries) / float64((c.totalWorkerTries + int64(queuedUrls))) * 100
}
msPerRequest := big.NewInt(0).Div(c.totalWorkerTime, big.NewInt(c.totalWorkerTries))
newRequests := c.totalWorkerTries - lastTotalWorkerTries
requestsPerSec := float64(newRequests) / time.Since(lastReport).Seconds()
log.Printf("Crawled %v of %v (%.2f %%) / %v active | %.0f req/sec | %v ms/req\n",
c.totalWorkerTries,
int64(queuedUrls)+c.totalWorkerTries,
percent,
c.crawlQueueCount,
requestsPerSec,
msPerRequest)
lastReport = time.Now()
lastTotalWorkerTries = c.totalWorkerTries
}
}
c.Finish()
}
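// SanitizeUrl is a hook for URL normalization; the default implementation
// returns the url unchanged.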
func (c *Crawler) SanitizeUrl(url string) string {
return url
}
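// FetchUrl GETs the url and parses the response as HTML. It returns the
// final (possibly redirected) URL, the raw body and the parsed document.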
func (c *Crawler) FetchUrl(url string) (string, []byte, *html.HtmlDocument, error) {
response, err := c.Client.Get(url)
if err != nil {
return "", nil, nil, errors.New(err.Error())
}
defer response.Body.Close()
if response.StatusCode != http.StatusOK {
return "", nil, nil, errors.New("Request failed: " + strconv.Itoa(response.StatusCode))
}
page, err := ioutil.ReadAll(response.Body)
if err != nil {
return "", nil, nil, err
}
doc, err := gokogiri.ParseHtml(page)
if err != nil {
return "", nil, nil, err
}
return response.Request.URL.String(), page, doc, nil
}
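// FetchUrlWithResult wraps FetchUrl and records timing, success and
// redirect information in a Result.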
func (c *Crawler) FetchUrlWithResult(url string) (Result, []byte, *html.HtmlDocument) {
result := Result{
Url: url,
TimeStarted: time.Now(),
}
finalUrl, body, doc, err := c.FetchUrl(url)
result.TimeFinished = time.Now()
if err != nil {
result.Success = false
result.Error = err
} else {
result.Success = true
if finalUrl != url {
result.RedirectedUrl = finalUrl
}
}
return result, body, doc
}
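// crawlUrl fetches one URL, queues every link on the page that ShouldQueueUrl
// accepts and runs the Handle callback on the parsed document.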
func crawlUrl(url string, crawler *Crawler) {
result, body, doc := crawler.FetchUrlWithResult(url)
if result.Error == nil {
if result.RedirectedUrl != "" {
url = result.RedirectedUrl
}
// Extract urls.
urls, _ := doc.Search("//a/@href")
for _, urlNode := range urls {
href := urlNode.String()
cleanUrl := crawler.ShouldQueueUrl(href)
if cleanUrl != "" {
crawler.urlQueue <- cleanUrl
}
}
// Handle.
res, err := crawler.Handle(url, doc, body, crawler)
if err != nil {
result.Error = err
result.Success = false
} else {
result.Result = res
}
doc.Free()
}
result.TimeFinished = time.Now()
crawler.resultQueue <- result
}
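// XpathGetFirst returns the first node matching xpath, or nil if there is no match.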
func XpathGetFirst(doc xml.Node, xpath string) xml.Node {
res, _ := doc.Search(xpath)
if len(res) < 1 {
return nil
} else {
return res[0]
}
}
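// ExtractUrlParam returns the value of the given query parameter, or "" if
// the URL cannot be parsed.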
func ExtractUrlParam(theUrl string, param string) string {
u, err := url.Parse(theUrl)
if err != nil {
return ""
}
return u.Query().Get(param)
}
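// ExtractUrlParamInt returns the given query parameter as an int, or 0 if it
// is missing or not numeric.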
func ExtractUrlParamInt(url string, param string) int {
v := ExtractUrlParam(strings.ToLower(url), param)
i, err := strconv.Atoi(v)
if err != nil {
return 0
} else {
return i
}
}
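// HandleLocation parses a room detail page (wbRaum.editRaum) into an ImportLocation.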
func HandleLocation(url string, doc *html.HtmlDocument, crawler *Crawler) *server.ImportLocation {
location := server.ImportLocation{}
location.Type = "location"
location.ImportID = ExtractUrlParam(strings.ToLower(url), "praumnr")
res, _ := doc.Search("//table[contains(@class, \"MaskSpacing\")]//tr")
if len(res) < 1 {
log.Printf("Error at %v: Could not find data table\n", url)
return nil
}
// The address is assembled from the "Straße/Hausnummer" and "PLZ/Ort" rows.
address := ""
for _, row := range res {
tds, _ := row.Search("./td")
if len(tds) != 2 {
continue
}
title := strings.TrimSpace(tds[0].Content())
contentNode := tds[1]
content := strings.TrimSpace(contentNode.Content())
switch title {
case "Gebäude":
location.Builing = content
case "Stockwerk":
location.Floor = content
case "Raumnummer":
location.Room = content
case "Straße/Hausnummer":
address = content
case "PLZ/Ort":
address += " " + content
location.Address = address
case "Verwendung":
location.Usage = content
case "Raumtyp":
location.RoomType = content
case "Nutzungstyp":
location.Usage = content
case "Zwischengeschoß":
case "Fläche [m2]":
case "Höhe [cm]":
case "Sitzplätze":
case "Ausrichtung":
case "Boden":
case "Reinigung":
case "Verwaltung":
case "Zusatzbezeichnung":
// TODO: implement
default:
if title != "" {
log.Printf("Error at %v: Unknown data row '%v'\n", url, title)
}
}
}
return &location
}
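// HandleLecture parses a lecture detail page (wbLv.wbShowLvDetail) into an
// ImportLecture, including its degrees, lecturers and dates.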
func HandleLecture(url string, doc *html.HtmlDocument, crawler *Crawler) *server.ImportLecture {
lecture := server.ImportLecture{}
lecture.Type = "lecture"
lecture.ImportID = ExtractUrlParam(strings.ToLower(url), "pstpspnr")
lecture.ImportUrl = url
lecture.LecturerIDs = make([]string, 0)
res, _ := doc.Search("//form/div[contains(@class, \"MaskBackground\")]//tr")
if len(res) < 1 {
log.Printf("Error at %v: Could not find data table\n", url)
return nil
}
datesUrl := ""
lecturersUrl := ""
for _, row := range res {
tds, _ := row.Search("./td")
if len(tds) != 2 {
continue
}
title := strings.TrimSpace(tds[0].Content())
contentNode := tds[1]
content := strings.TrimSpace(contentNode.Content())
contentHtml := contentNode.InnerHtml()
if strings.HasPrefix(title, "Lehr- und Lernmethode") {
title = "Lehr- und Lernmethode"
} else if strings.HasPrefix(title, "Inhaltliche Voraussetzungen") {
title = "Inhaltliche Voraussetzungen"
} else if strings.HasPrefix(title, "Anzahl der Prüfungstermine") {
title = "Anzahl der Prüfungstermine"
} else if strings.HasPrefix(title, "Ziel") {
title = "Ziel"
} else if strings.HasPrefix(title, "Beurteilungsschema") {
title = "Beurteilungsschema"
} else if strings.HasPrefix(title, "Stellung im Studienplan") {
title = "Stellung im Studienplan"
}
switch title {
case "Titel":
lecture.Title = content
case "Nummer":
lecture.Number = content
case "Art":
lecture.Kind = content
case "Semesterstunden":
hours, err := strconv.ParseFloat(content, 32)
if err == nil {
lecture.Semesterhours = float32(hours)
}
case "Angeboten im Semester":
parts := strings.Split(content, " ")
if len(parts) == 2 {
if parts[0] == "Wintersemester" {
lecture.Semester = "winter"
} else if parts[0] == "Sommersemester" {
lecture.Semester = "summer"
} else {
log.Printf("Error at %v: Unknown semester %v\n", url, parts[0])
}
year, err := strconv.Atoi(parts[1])
if err == nil {
lecture.Year = year
}
}
case "Vortragende/r (Mitwirkende/r)":
links, _ := contentNode.Search(".//a/@href")
if len(links) > 0 {
// Check if last link is to seperate page.
lastLink := links[len(links)-1].String()
if strings.HasPrefix(lastLink, "wbLvMgmt.wbLvPersListe") {
// Details will be fetched later.
lecturersUrl = lastLink
} else {
// No detail page, just add all linked persons.
for _, linkNode := range links {
link := linkNode.String()
id := ExtractUrlParam(link, "pPersonenId")
if id != "" {
lecture.LecturerIDs = append(lecture.LecturerIDs, id)
}
}
}
}
case "Organisation":
link := XpathGetFirst(contentNode, ".//a/@href")
if link != nil {
lecture.OrganisationID = ExtractUrlParam(link.String(), "corg")
}
case "Stellung im Studienplan":
re := regexp.MustCompile("(\\d+,?\\d?) ECTS")
match := re.FindStringSubmatch(contentHtml)
if match != nil {
ects, err := strconv.ParseFloat(match[1], 32)
if err == nil {
lecture.Ects = float32(ects)
}
}
rows, _ := contentNode.Search("//tr[contains(@class, \"coTableR\")]")
if rows != nil {
for _, row := range rows {
lecDegree := server.LectureDegree{}
tds, _ := row.Search("./td")
if len(tds) != 9 {
log.Printf("Error at %v: Could not parse lecture degrees\n", url)
continue
}
txt := strings.TrimSpace(tds[0].Content())
parts := strings.Split(txt, " ")
if len(parts) < 1 {
log.Printf("Error at %v: Could not parse lecture degrees\n", url)
continue
}
degree, _ := strconv.Atoi(parts[0])
lecDegree.Degree = degree
lecDegree.Module = strings.TrimSpace(tds[2].Content())
lecDegree.Kind = strings.TrimSpace(tds[3].Content())
rawSemester := strings.TrimSpace(tds[4].Content())
rawSemester = strings.Replace(rawSemester, ".", "", 1)
semester, _ := strconv.Atoi(rawSemester)
lecDegree.Semester = semester
if lecDegree.Degree > 0 && lecDegree.Semester > 0 {
lecture.Degrees = append(lecture.Degrees, lecDegree)
}
}
}
case "Inhalt":
lecture.Topics = content
case "Inhaltliche Voraussetzungen":
lecture.KnowledgeRequirements = contentHtml
case "Ziel":
lecture.Outcomes = content
case "Unterrichts-/Lehrsprachen":
lecture.Language = contentHtml
case "Lehr- und Lernmethode":
lecture.TeachingMethods = contentHtml
case "Abhaltungstermine":
datesLink := XpathGetFirst(contentNode, ".//a/@href")
if datesLink != nil {
datesUrl = datesLink.String()
}
case "Teilnahmekriterien & Anmeldung":
lecture.Registration = contentHtml
case "Voraussetzungen laut Studienplan":
lecture.UniversityRequirements = contentHtml
case "Beurteilungsschema":
lecture.Grading = contentHtml
case "Prüfungstermine & Anmeldung":
// TODO: implement.
case "Anzahl der Prüfungstermine":
// Ignore.
case "Empfohlene Fachliteratur":
lecture.ReadingList = contentHtml
case "Online Informationen":
lecture.OnlineInfo = contentHtml
case "Anmerkung":
lecture.Comments = contentHtml
case "Bezeichnung":
// TODO: implement.
default:
if title != "" {
log.Printf("Error at %v: Unknown data row '%v'\n", url, title)
}
}
}
// Get data for lecturers and lecture dates.
if datesUrl == "" {
log.Printf("Error at %v: Could not extract dates URL!\n", url)
return &lecture
}
if lecturersUrl != "" {
result, _, lecDoc := crawler.FetchUrlWithResult("https://campus.meduniwien.ac.at/med.campus/" + lecturersUrl)
crawler.resultQueue <- result
if result.Error != nil {
log.Printf("Error at %v: Could not fetch lecturer detail page %v: %v\n", url, lecturersUrl, result.Error)
} else {
links, _ := lecDoc.Search("//form/table//a/@href")
for _, linkNode := range links {
link := linkNode.String()
if !strings.HasPrefix(link, "visitenkarte.show_vcard") {
continue
}
id := ExtractUrlParam(link, "pPersonenId")
if id != "" {
lecture.LecturerIDs = append(lecture.LecturerIDs, id)
}
}
lecDoc.Free()
}
}
result, _, datesDoc := crawler.FetchUrlWithResult("https://campus.meduniwien.ac.at/med.campus/" + datesUrl)
crawler.resultQueue <- result
if result.Error != nil {
log.Printf("Error at %v: Could not fetch dates page!\n", url)
return &lecture
}
dateLinks, _ := datesDoc.Search("//table[contains(@id, \"tabLvTermine\")]//a/@href")
for _, linkNode := range dateLinks {
link := linkNode.String()
if !strings.HasPrefix(link, "!wbTermin.wbEdit") {
continue
}
linkUrl := "https://campus.meduniwien.ac.at/med.campus/" + link
result, _, dateDoc := crawler.FetchUrlWithResult(linkUrl)
crawler.resultQueue <- result
if result.Error != nil {
log.Printf("Error at %v: Could not fetch date page\n", link)
continue
}
date := server.ImportLectureDate{}
date.ImportUrl = linkUrl
dateString := ""
rows, _ := dateDoc.Search("//form[contains(@name, \"terminEditMask\")]//tr")
for _, row := range rows {
tds, _ := row.Search("./td")
if len(tds) != 2 {
continue
}
title := strings.TrimSpace(tds[0].Content())
contentNode := tds[1]
content := strings.TrimSpace(contentNode.Content())
switch title {
case "Lehrveranstaltung":
// Ignore.
case "Organisation":
// Ignore
case "Gruppe":
date.Group = content
case "Ereignis":
date.EventType = content
case "Termintyp":
date.DateType = content
case "Ressource":
link := XpathGetFirst(contentNode, ".//a/@href")
if link != nil {
date.LocationID = ExtractUrlParam(link.String(), "raumKey")
// Send link to crawler so locations are crawled.
crawler.urlQueue <- "https://campus.meduniwien.ac.at/med.campus/" + link.String()
}
case "Datum":
dateString = content
case "Uhrzeit":
re := regexp.MustCompile(".*?(\\d\\d\\:\\d\\d).*?(\\d\\d\\:\\d\\d).*")
match := re.FindStringSubmatch(content)
if match != nil {
// Start date.
start := dateString + " " + match[1]
if startDate, err := time.Parse("02.01.2006 15:04", start); err == nil {
date.Start = startDate
} else {
log.Printf("Error at %v: Could not parse start date %v: %v\n", link, start, err)
}
// End date.
end := dateString + " " + match[2]
if endDate, err := time.Parse("02.01.2006 15:04", end); err == nil {
date.End = endDate
} else {
log.Printf("Error at %v: Could not parse end date %v\n", end)
}
} else {
log.Printf("Error at %v: Coul not extract start/end times\n", link)
}
case "Vortragende/r":
urlNode := XpathGetFirst(contentNode, ".//a/@href")
if urlNode != nil {
date.LecturerID = ExtractUrlParam(urlNode.String(), "pPersonenId")
}
case "Lerneinheit":
date.Unit = content
case "Anmerkung":
date.Comments = content
default:
if title != "" {
log.Printf("Error at %v: Unknown data row: %v\n", link, title)
}
}
}
dateDoc.Free()
lecture.Dates = append(lecture.Dates, date)
}
datesDoc.Free()
return &lecture
}
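// HandlePerson parses a business card page (visitenkarte.show_vcard) into an ImportPerson.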
func HandlePerson(url string, doc *html.HtmlDocument, crawler *Crawler) *server.ImportPerson {
person := server.ImportPerson{}
person.Type = "person"
person.ImportUrl = url
person.ImportID = ExtractUrlParam(url, "pPersonenId")
person.AdditionalOrganisationIDs = make([]string, 0)
res, _ := doc.Search("//div[contains(@id, \"pageContent\")]/table[3]//table//table")
if len(res) < 1 {
log.Printf("Error at %v: Could not find data table\n", url)
return nil
}
table := res[0]
rows, _ := table.Search(".//tr")
for _, row := range rows {
th := XpathGetFirst(row, "./th")
td := XpathGetFirst(row, "./td")
if th == nil || td == nil {
continue
}
name := strings.TrimSpace(th.Content())
content := strings.TrimSpace(td.Content())
switch name {
case "Mobil":
person.PhoneMobile = content
case "Fax":
person.Fax = content
case "Telefon  MedUni Wien":
person.PhoneUniversity = content
case "Telefon extern":
person.PhoneExternal = content
case "Sprechstunde":
if content != "-" {
person.Sprechstunde = content
}
case "Zusatzinfo":
person.Info = content
case "Postadresse":
url := XpathGetFirst(td, ".//a/@href")
if url != nil {
rawId := ExtractUrlParam(url.String(), "corg")
if rawId != "" {
person.OrganisationID = rawId
}
}
case "Herr":
person.Gender = "m"
person.LastName = XpathGetFirst(td, "./span/span").InnerHtml()
person.FirstName = XpathGetFirst(td, "./span/span").NextSibling().String()
case "Frau":
person.Gender = "f"
person.LastName = XpathGetFirst(td, "./span/span").InnerHtml()
person.FirstName = XpathGetFirst(td, "./span/span").NextSibling().String()
case "E-Mail":
url := XpathGetFirst(td, ".//a/@href")
if url != nil {
email := strings.TrimSpace(strings.Replace(url.String(), "mailto:", "new", 1))
email = strings.Replace(email, "%40", "@", 1)
person.Email = email
}
case "Homepage":
url := XpathGetFirst(td, ".//a/@href")
if url != nil {
person.Website = strings.TrimSpace(url.String())
}
case "weitere Org.":
orgLinks, _ := td.Search(".//a/@href")
for _, orgLink := range orgLinks {
id := ExtractUrlParam(orgLink.String(), "corg")
if id != "" {
person.AdditionalOrganisationIDs = append(person.AdditionalOrganisationIDs, id)
}
}
case "Dienstort":
// TODO: implement.
default:
if name != "" {
log.Printf("Error at %v: unknown data row %v\n", url, name)
}
}
}
return &person
}
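// HandleOrg parses an organisation page (wbOrg.display) into an
// ImportOrganisation, fetching its address and description sub-pages.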
func HandleOrg(url string, doc *html.HtmlDocument, crawler *Crawler) *server.ImportOrganisation {
org := server.ImportOrganisation{}
org.Type = "organisation"
org.ImportUrl = url
org.ChildIDs = make([]string, 0)
parts := strings.Split(url, "=")
org.ImportID = parts[len(parts)-1]
if nameNode := XpathGetFirst(doc, "//span[contains(@class, \"wborg_name\")]/text()"); nameNode != nil {
org.Name = nameNode.String()
}
numNode := XpathGetFirst(doc, "//span[contains(@class, \"wborg_kennung\")]/text()")
if numNode != nil {
num, _ := strconv.Atoi(numNode.String())
org.Number = num
}
res, _ := doc.Search("//div[contains(@id, \"wbOrg_site_listing\")]//a")
for _, node := range res {
linkUrl := node.Attr("href")
name := node.InnerHtml()
if name == "Homepage" {
org.Website = linkUrl
} else if strings.HasPrefix(linkUrl, "wborg.display") {
// Child organization.
if rawId := ExtractUrlParam(strings.ToLower(linkUrl), "porgnr"); rawId != "" {
org.ChildIDs = append(org.ChildIDs, rawId)
}
} else if strings.HasPrefix(linkUrl, "wborg.adressen?") {
// Fetch address data.
result, _, contactDoc := crawler.FetchUrlWithResult("https://campus.meduniwien.ac.at/med.campus/" + linkUrl)
crawler.resultQueue <- result
if result.Error == nil {
res, _ := contactDoc.Search("//fieldset[contains(@class, \"MaskSCT\")]//tr")
for _, node := range res {
tds, _ := node.Search("./td")
if len(tds) < 2 {
continue
}
td1, td2 := tds[0], tds[1]
name := td1.Content()
linkRes, _ := td2.Search(".//a/@href")
switch name {
case "Adresse":
org.Address = td2.Content()
case "Telefon":
org.Phone = td2.Content()
case "Fax":
org.Fax = td2.Content()
case "E-Mailadresse":
org.Email = strings.Replace(linkRes[0].String(), "mailto:", "", 1)
case "WWW-Homepage":
org.Website = linkRes[0].String()
case "Info":
org.ContactInfo = td2.Content()
case "Sekretariat":
org.Secretary = td2.Content()
case "Bezeichnung":
// TODO: Implement.
default:
log.Printf("Unhandled data row at %v: %v\n", linkUrl, name)
}
}
contactDoc.Free()
}
} else if strings.HasPrefix(linkUrl, "orgdesc.display?corg=") {
result, _, descriptionDoc := crawler.FetchUrlWithResult("https://campus.meduniwien.ac.at/med.campus/" + linkUrl)
crawler.resultQueue <- result
if result.Error == nil {
res, _ := descriptionDoc.Search("//td[contains(@class, \"detailStd\")]")
if len(res) == 2 {
org.ShortDescription = res[0].InnerHtml()
org.LongDescription = res[1].InnerHtml()
}
descriptionDoc.Free()
} else {
log.Printf("Error at %v: Could not fetch organisation description page: %v\n", url, result.Error)
}
}
}
return &org
}
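// ShouldQueueUrl decides whether a link found on a page should be crawled.
// It returns the absolute URL to queue, or "" if the link should be ignored.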
func ShouldQueueUrl(url string) string {
ignorePrefixes := []string{
"javascript:",
"mailto:",
"#",
"http",
// organisation detail pages.
"orgdesc.display?corg=",
"wborg.adressen?",
"wblv.wbShowLvDetail",
"wbAbs.showMaskAbsBetreuer",
"wborg.cbMonoListElement",
"wbAbs.cbOverviewTheses",
"lv.listEqualLectures",
"wbTermin_List.cbLvTerminTable",
"wblv.cbReloadLvDetail",
"visitenkarte.vcardExport",
"WBANMELDUNG.durchfuehren",
"wbregisterexam.lv_termine",
"!wbTermin.wbEdit",
"wbAbs.showThesis",
"sa.gruppen_einteilung",
"sachgebiete.showOrgList",
"sachgebiete.showList",
"LV.listEqualLecturesForNode",
"wbLv.wbShowStellungInStp",
"studienplaene.lv_stellung",
"LV_TX.wbLvInfoTypen",
}
for _, val := range ignorePrefixes {
if strings.HasPrefix(url, val) {
return ""
}
}
if !strings.HasPrefix(url, "http") {
url = "https://campus.meduniwien.ac.at/med.campus/" + url
return url
}
return ""
}
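// Handle dispatches a fetched page to the matching parser and returns the
// extracted item with its JSON payload, or nil if the page holds no data.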
func Handle(url string, doc *html.HtmlDocument, body []byte, crawler *Crawler) (*server.ImportItem, error) {
url = strings.Replace(url, "https://campus.meduniwien.ac.at/med.campus/", "", 1)
var item server.ImportItem
var js []byte
lowerUrl := strings.ToLower(url)
if strings.HasPrefix(lowerUrl, "wborg.display") {
org := HandleOrg(url, doc, crawler)
if org != nil {
item = org.ImportItem
js, _ = json.Marshal(org)
}
} else if strings.HasPrefix(lowerUrl, "visitenkarte.show_vcard") {
if !strings.Contains(string(body), "Diese Visitenkarte kann nicht mehr eingesehen werden.") {
person := HandlePerson(url, doc, crawler)
if person != nil {
item = person.ImportItem
js, _ = json.Marshal(person)
}
}
} else if strings.HasPrefix(lowerUrl, "wblv.wbshowlvdetail") {
lecture := HandleLecture(url, doc, crawler)
if lecture != nil {
item = lecture.ImportItem
js, _ = json.Marshal(lecture)
}
} else if strings.HasPrefix(lowerUrl, "wbraum.editraum") {
location := HandleLocation(url, doc, crawler)
if location != nil {
item = location.ImportItem
js, _ = json.Marshal(location)
}
}
if js != nil {
item.JSON = js
return &item, nil
} else {
return nil, nil
}
}
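// OnFinish is called once the crawl has completed.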
func OnFinish(crawler *Crawler) {
}
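// Crawl runs a full crawl of campus.meduniwien.ac.at with workerCount workers,
// writing the visited URLs to urls.txt and the extracted items as JSON lines to data.json.
//
// A minimal usage sketch, assuming this package is imported as
// "theduke.at/docduke/crawler" (the actual import path is not shown in the gist):
//
//	crawler.Crawl(8)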
func Crawl(workerCount int) {
c := NewCrawler(workerCount)
c.UrlFile = "urls.txt"
c.DataFile = "data.json"
c.ShouldQueueUrl = ShouldQueueUrl
c.Handle = Handle
c.OnFinish = OnFinish
urls := []string{
"https://campus.meduniwien.ac.at/med.campus/wborg.display?porgnr=1",
"https://campus.meduniwien.ac.at/med.campus/wbsuche.lvsuchesimple",
}
c.Crawl(urls)
}