Skip to content

Instantly share code, notes, and snippets.

@alxarch
Last active August 29, 2015 14:05
Show Gist options
  • Save alxarch/ce38f1cea4694ed92463 to your computer and use it in GitHub Desktop.
PhantomJS scripts to interact with service.eudoxus.gr
node_modules/

Usage:

% coffee crawl.coffee 2014 | mysql -u root -D eudoxus -p
###
PhantomJS script to collect book info from service.eudoxus.gr

Prints the book record as a single JSON object on stdout, then exits.
Exit codes: 0 on success, 1 on missing argument or failed page load.
###

# Print usage and terminate with the given exit code.
usage = (code) ->
  console.log """
    Usage: phantomjs bookinfo.coffee <book_id>
  """
  phantom.exit code

# Eudoxus book id, first CLI argument.
id = require("system").args[1]
usage(1) unless id

pg = require("webpage").create()

# The in-page script hands the parsed record back via window.callPhantom;
# emit it as JSON on stdout and quit.
pg.onCallback = (data) ->
  console.log JSON.stringify data
  phantom.exit()

pg.open "https://service.eudoxus.gr/search/", (status) ->
  phantom.exit(1) unless status is "success"
  pg.evaluate (id) ->
    # Tiny DOM helpers (the page is GWT-generated; no jQuery available).
    $ = (sel, el = document.body) -> el.querySelector sel
    $$ = (sel, el = document.body) -> [].slice.apply el.querySelectorAll sel

    # Poll every `ms` milliseconds until `sel` matches, then call back
    # once with the matched node and stop polling.
    waitForElement = (sel, ms, callback) ->
      fn = ->
        result = $ sel
        if result?
          clearInterval interval
          callback result
      interval = setInterval fn, ms

    # Dispatch a synthetic click (the GWT widgets listen for real mouse events).
    click = (el) ->
      event = document.createEvent "MouseEvents"
      event.initEvent "click", yes, yes
      el.dispatchEvent event  # fixed: dispatchEvent takes a single argument
      event

    # Extract every field of interest from the book-details popup element
    # and return them as a flat record object.
    parseBookDetailsPopup = (popup) ->
      covers = $$(".search-popup-left .gwt-Image", popup).map (img) -> img.getAttribute "src"
      pdfs = $$(".search-popup-left .search-popup-link", popup).map (link) -> "#{link.href}"
      fields = {}
      # Label cells keyed by their (Greek) label text; fixed: scope the
      # query to `popup` like every other lookup in this function.
      $$(".search-popup-details-table td:nth-child(1)", popup).forEach (td) ->
        fields[td.textContent] = td.nextElementSibling?.textContent
      ["description", "authors", "title", "subtitle"].forEach (key) ->
        fields[key] = $(".search-popup-#{key}", popup)?.textContent
      # Implicit return: the assembled record.
      code: id
      description: fields.description
      title: fields.title
      subtitle: fields.subtitle
      # Guarded with ?. so a missing field can't throw inside the page
      # (an uncaught in-page error would leave the script hanging).
      authors: fields.authors?.replace "Συγγραφείς: ", ""
      edition: fields["Αριθμός Έκδοσης"]?.replace " εκδ.", ""
      year: fields["Έτος Έκδοσης"]
      keywords: fields["Λέξεις κλειδιά"]
      topics: fields["Θεματικές Ενότητες"]
      isbn: fields["ISBN"]
      publisher: fields["Εκδόσεις"]
      distributor: fields["Διαθέτης (Εκδότης)"]
      type: fields["Τύπος"]
      covertype: fields["Δέσιμο"]
      pages: fields["Αριθμός Σελίδων"]
      dimensions: fields["Διαστάσεις"]
      url: $(".search-popup-details-table td .gwt-Anchor", popup)?.href
      cover: covers[0]
      backcover: covers[1]
      toc: pdfs[0]
      sample: pdfs[1]

    # Wait for a search result link, click it, wait for the details popup,
    # then hand the parsed record back to the PhantomJS side.
    waitForElement ".search-resultsPanel .search-hyperlink", 50, (link) ->
      click link
      waitForElement ".search-popup", 10, (popup) ->
        data = parseBookDetailsPopup popup
        window.callPhantom data

    # Trigger the GWT search for this book id via the URL hash router.
    window.location.hash = "a/id:#{id}/0"
  , id
#!/usr/bin/env coffee
###
# Crawls service.eudoxus.gr to get all book selections for a year
# also fetches detailed book info for every selected book
# outputs mysql statements that can be piped into mysql directly or to a file
# Example:
# coffee crawler.coffee 2014 | mysql -u username -p somedbname
###
# Target selection year, first CLI argument; bail out if missing or not a number
# (parseInt yields NaN, which is falsy).
year = parseInt process.argv[2], 10
process.exit(1) unless year
fs = require "fs"
path = require "path"
async = require "async"
{exec} = require "child_process"
{format} = require "util"
simplecrawler = require "simplecrawler"
cheerio = require "cheerio"
phantomjs = require "phantomjs"
# All generated SQL is written to stdout so it can be piped straight into mysql.
output = process.stdout
# Emit the (idempotent) schema DDL up front, then clear this year's
# selections so a re-run replaces rather than duplicates them.
# NOTE: the heredoc content is literal SQL — keep it byte-for-byte.
output.write """
CREATE TABLE IF NOT EXISTS `selections` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`course_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
`course_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
`book_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
`dept_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
`dept_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
`book_desc` text COLLATE utf8_unicode_ci NOT NULL,
`position` int(11) NOT NULL,
`year` int(11) NOT NULL,
`course_season` varchar(50) COLLATE utf8_unicode_ci NOT NULL,
`course_semester` int(11) NOT NULL,
PRIMARY KEY (`id`),
KEY `book_code` (`book_code`),
KEY `dept_code` (`dept_code`),
KEY `position` (`position`),
KEY `year` (`year`),
KEY `course_code` (`course_code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1;
CREATE TABLE IF NOT EXISTS `books` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`code` varchar(45) DEFAULT NULL,
`isbn` varchar(45) DEFAULT NULL,
`title` varchar(255) DEFAULT NULL,
`subtitle` varchar(255) DEFAULT NULL,
`authors` varchar(255) DEFAULT NULL,
`description` text,
`publisher` varchar(255) DEFAULT NULL,
`distributor` varchar(255) DEFAULT NULL,
`url` varchar(255) DEFAULT NULL,
`sample` varchar(255) DEFAULT NULL,
`toc` varchar(255) DEFAULT NULL,
`cover` varchar(255) DEFAULT NULL,
`backcover` varchar(255) DEFAULT NULL,
`dimensions` varchar(45) DEFAULT NULL,
`topics` text,
`keywords` text,
`type` varchar(45) DEFAULT NULL,
`pages` varchar(45) DEFAULT NULL,
`edition` varchar(45) DEFAULT NULL,
`year` varchar(45) DEFAULT NULL,
`covertype` varchar(45) DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `book_code_idx` (`code`),
KEY `book_isbn_idx` (`isbn`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1;
DELETE FROM selections WHERE year = #{year};
"""
# Render a single-row INSERT statement for `table` from the key/value
# pairs of `obj`. Every value is stringified and double-quoted.
# Fixed: backslashes are escaped BEFORE quotes — previously a value
# ending in `\` produced `\"`, breaking out of the SQL string literal.
# NOTE(review): this is still string-built SQL; acceptable for output
# piped to mysql from this crawler, but not safe for untrusted input.
insert = (table, obj) ->
  quote = (value) ->
    escaped = value.replace(/\\/g, "\\\\").replace /"/g, '\\"'
    "\"#{escaped}\""
  keys = Object.keys obj
  values = (quote("#{obj[key]}") for key in keys)
  """
    INSERT INTO #{table} (#{keys.join ','}) VALUES (#{values.join ','});
  """
# Patterns for parsing department/course pages. The Greek literals match
# the site's rendered UI text exactly — do not translate or reformat them.
rx =
  url: new RegExp "^/public/departments/courses/(\\d+)/#{year}$"
  course: /^Μάθημα \[([^\]]+)\]: (.*)/
  book: /^Βιβλίο \[([^\]]+)\]: (.*)Λεπτομέρειες$/
  semester: /^Εξάμηνο (\d+) - (Χειμερινό|Εαρινό|Ετήσιο)$/

# Crawl only this year's course pages, over https, 4 fetches at a time.
crawler = new simplecrawler "service.eudoxus.gr"
crawler.initialPath = "/public/departments"
crawler.initialProtocol = "https"
crawler.maxConcurrency = 4
crawler.addFetchCondition (url) -> rx.url.test url.path
# Async queue that fetches per-book detail records. Each book id is handed
# to a PhantomJS subprocess (bookinfo.coffee) whose JSON stdout becomes a
# row in `books`. Runs up to 8 subprocesses concurrently.
book_queue = do ->
  script = path.join __dirname, "bookinfo.coffee"
  processed = {}  # book ids already fetched successfully (dedup across courses)
  worker = (book_id, done) ->
    if processed[book_id]?
      done()
    else
      options =
        timeout: 30000
      # NOTE(review): book_id is interpolated into a shell command; it comes
      # from crawled page text, so hostile markup could inject shell syntax.
      exec "#{phantomjs.path} #{script} #{book_id}", options, (err, stdout) ->
        if err?
          # Re-queue failed fetches (unbounded retries — a permanently
          # broken id will loop forever; consider a retry cap).
          book_queue.push book_id
        else
          # Fixed: mark the id as processed — previously this flag was
          # never set, so the dedup check above could never fire and every
          # repeated book id spawned another PhantomJS subprocess.
          processed[book_id] = yes
          book = null
          try
            book = JSON.parse stdout
          if book?
            output.write "DELETE FROM books WHERE code = '#{book_id}';\n"
            output.write insert "books", book
        done()
  async.queue worker, 8
# For every fetched department page, extract one `selections` row per book
# and queue the book id for detail fetching.
crawler.on "fetchcomplete", (item, html, response) ->
  dept_code = item.path.replace rx.url, "$1"
  $ = cheerio.load html,
    decodeEntities: yes
  # The page header renders the department as two <h2> parts; join them.
  dept_name = "#{$("#header > h2").first().text()} | #{$("#header > h2").last().text()}"
  $("ol > li > ul > li").each ->
    $li = $ @
    $ol = $li.closest "ol"
    course_text = $ol.prevAll("h2").first().text()
    semester_text = $ol.prevAll("h3").first().text()
    # Fixed: a string pattern replaces only the FIRST newline; use /\n/g
    # so multi-line book entries collapse onto a single line.
    book_text = $li.text().replace(/\n/g, " ")
    # 1-based position of the book within the course's ordered list.
    position = $li.parent().parent().prevAll().length + 1
    book =
      year: year
      dept_code: dept_code
      course_code: course_text.replace(rx.course, "$1")
      book_code: book_text.replace(rx.book, "$1")
      course_name: course_text.replace(rx.course, "$2")
      dept_name: dept_name
      course_semester: semester_text.replace(rx.semester, "$1")
      course_season: semester_text.replace(rx.semester, "$2")
      book_desc: book_text.replace(rx.book, "$2")
      position: position
    book_queue.push book.book_code
    output.write insert "selections", book
# Once the crawl finishes, exit only after the book queue has drained, so
# pending PhantomJS fetches can still write their INSERTs.
# NOTE(review): if the queue happens to be empty at the moment the crawl
# completes, `drain` never fires and the process hangs — confirm against
# async.queue's drain semantics for this version.
crawler.on "complete", ->
  book_queue.drain = -> process.exit()
crawler.start()
{
"name": "eudoxus",
"version": "0.0.0",
"description": "",
"main": "bookinfo.js",
"dependencies": {
"async": "^0.9.0",
"cheerio": "^0.17.0",
"phantomjs": "^1.9.7-15",
"simplecrawler": "^0.3.9"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"repository": {
"type": "git",
"url": "git@gist.github.com:/ce38f1cea4694ed92463.git"
},
"author": "",
"license": "ISC"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment