Created June 7, 2012 13:54
Crawler: Structure and Algorithms
== Database Structure

Table: crawler_<site>_urls
  url (str) primary
  accessed (date)
  content-type (str)
  content-length (int)
  status (str)
  etag (str; if set, the crawler can send the etag in its requests
        to avoid re-downloading unchanged content)
  response (str; only stored if content-type is HTML, CSS, XML, or RSS)
  validation_type (html|xml|css|rss)
  valid (bool; NULL until validated)
Table: crawler_queue
  id (int) primary
  url (str, not null)
  pattern (str)
  force (bool)
Table: crawler_<site>_links
  from: url (str)
  to: url (str)
  type: (a|css|img|video|audio|object|script)
Table: crawler_sites
  id (int)
  domain (str)
  start_path (str)
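The tables above can be sketched as DDL. Below is a minimal sketch in SQLite (created from Python), assuming a hypothetical site name "example" in place of <site>; `force` is renamed `force_crawl` because FORCE is a reserved word in some databases, and dates are stored as ISO-8601 strings.

```python
import sqlite3

# Sketch of the schema from the notes above, for a hypothetical site "example".
conn = sqlite3.connect(":memory:")
conn.executescript("""
CREATE TABLE crawler_example_urls (
    url             TEXT PRIMARY KEY,
    accessed        TEXT,               -- ISO-8601 date of last access
    content_type    TEXT,
    content_length  INTEGER,
    status          TEXT,
    etag            TEXT,               -- reused in later requests if set
    response        TEXT,               -- only for HTML/CSS/XML/RSS bodies
    validation_type TEXT CHECK (validation_type IN ('html','xml','css','rss')),
    valid           INTEGER DEFAULT NULL
);
CREATE TABLE crawler_queue (
    id          INTEGER PRIMARY KEY,
    url         TEXT NOT NULL,
    pattern     TEXT,
    force_crawl INTEGER                 -- "force" avoided: reserved elsewhere
);
CREATE TABLE crawler_example_links (
    from_url TEXT,
    to_url   TEXT,
    type     TEXT CHECK (type IN ('a','css','img','video','audio','object','script'))
);
CREATE TABLE crawler_sites (
    id         INTEGER,
    domain     TEXT,
    start_path TEXT
);
""")
```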
== Algorithms

shelf_life = 1 day
seed()
  for each row in urls
    if accessed < NOW - shelf_life
      add url to queue (pattern blank, force = false)
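seed() can be sketched in Python against the SQLite schema, again using a hypothetical "example" site and ISO-8601 date strings; `SHELF_LIFE` mirrors the one-day constant above.

```python
import sqlite3
from datetime import datetime, timedelta

SHELF_LIFE = timedelta(days=1)

def seed(conn, now=None):
    """Re-queue every URL whose last access is older than the shelf life.

    A sketch assuming the hypothetical crawler_example_urls /
    crawler_queue tables, with dates stored as ISO-8601 strings.
    """
    now = now or datetime.utcnow()
    cutoff = (now - SHELF_LIFE).isoformat()
    for (url,) in conn.execute(
        "SELECT url FROM crawler_example_urls WHERE accessed < ?", (cutoff,)
    ):
        # pattern blank, force = false, exactly as in the pseudocode
        conn.execute(
            "INSERT INTO crawler_queue (url, pattern, force_crawl) "
            "VALUES (?, '', 0)",
            (url,),
        )
```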
crawl()
  for each row in the queue
    if force is true
      crawlUrl(queue_id)
    else if not UrlCrawled(url)
      crawlUrl(queue_id)
    else if NOW - UrlAccessed(url) > shelf_life   // crawled, but stale
      crawlUrl(queue_id)
    remove queue_id from queue
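The branch logic above boils down to one predicate per queue item; a sketch (function and parameter names are mine, not from the notes):

```python
from datetime import datetime, timedelta

SHELF_LIFE = timedelta(days=1)

def should_crawl(force, crawled, accessed, now=None):
    """Decide whether a queue item needs a fetch, mirroring crawl() above.

    `crawled` says whether the URL is already in the urls table,
    `accessed` is its last-access datetime (or None if never crawled).
    """
    now = now or datetime.utcnow()
    if force:
        return True            # forced items are always fetched
    if not crawled:
        return True            # never seen before
    return (now - accessed) > SHELF_LIFE   # seen, but stale
```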
crawlUrl(queue_id)
  item = get from queue where id = queue_id
  http request for item.url
    returns
      status (HTTP status code)
      headers
      body
  if the url has been crawled before and the etag is the same,
    then update the accessed time to now() and return.
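The etag shortcut maps onto standard HTTP conditional requests: send the stored ETag as If-None-Match, and a 304 Not Modified response means only the accessed timestamp needs updating. A sketch of the header/response handling (the user-agent string is a made-up placeholder):

```python
def conditional_headers(stored_etag):
    """Build request headers for the ETag shortcut described above."""
    headers = {"User-Agent": "crawler-sketch/0.1"}  # hypothetical UA string
    if stored_etag:
        # Server compares this against the current ETag and can answer 304
        headers["If-None-Match"] = stored_etag
    return headers

def body_unchanged(status, stored_etag):
    """True if the cached row can be kept (just bump `accessed`)."""
    return status == 304 and stored_etag is not None
```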
  create an object with the following attributes
    url = item.url
    accessed = now()
    content-type = headers['Content-Type']
    content-length = headers['Content-Length'] // or could be based on the actual returned size
    etag = headers['ETag']
    status = status (HTTP status)
    response = body (if content is html, xml, css, or rss)
    validation_type = (one of html, xml, css, rss, or NULL)
    valid = NULL
  if validation_type not null
    valid = parse(response, validation_type)
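The notes don't pin down parse(); as a very rough stand-in, well-formedness checks from the Python standard library could cover the XML-based types, with CSS left unimplemented (real validation would need a proper validator):

```python
import xml.etree.ElementTree as ET
from html.parser import HTMLParser

def parse(response, validation_type):
    """Loose validity check standing in for parse() in the notes.

    Only well-formed XML counts as valid for xml/rss; the stdlib HTML
    parser is lenient, so html almost always passes; css returns None
    because the stdlib has no CSS validator.
    """
    if validation_type in ("xml", "rss"):   # RSS is XML underneath
        try:
            ET.fromstring(response)
            return True
        except ET.ParseError:
            return False
    if validation_type == "html":
        HTMLParser().feed(response)         # tolerant tokeniser, rarely raises
        return True
    return None                              # css: left as NULL
```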
  if validation_type == html
    old_links = get all rows in links where from == current url
    parsed_links = parse all links in response
    for each difference(old_links, parsed_links) // deleted links
      remove link from links table
    for each difference(parsed_links, old_links) // new links
      add link to links table
      if link is not in urls
        if item.pattern is null
          add to queue
        else if item.pattern not null and link matches item.pattern
          add to queue, using the same pattern and force values
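The two difference() passes are plain set differences over link rows; a sketch using (from_url, to_url, type) tuples matching the links table:

```python
def diff_links(old_links, parsed_links):
    """Split link sets into deletions and additions, as in the loops above.

    Returns (deleted, added): rows to remove from and rows to insert
    into the crawler_<site>_links table.
    """
    old, new = set(old_links), set(parsed_links)
    return old - new, new - old
```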
  add/update row in table: urls
  * implementation note: it would be nice to wrap all the SQL statements in a single transaction, so that if processing stopped after the first write and before the last, none of them would take effect.
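In SQLite that all-or-nothing behaviour comes for free from the connection's transaction handling; a sketch (the statement-list shape is my own framing, not from the notes):

```python
import sqlite3

def apply_crawl_results(conn, statements):
    """Run all of one crawl's writes atomically, per the note above.

    `statements` is a list of (sql, params) pairs; if any statement
    fails, the whole batch rolls back and the tables stay untouched.
    """
    try:
        with conn:  # sqlite3 commits on success, rolls back on exception
            for sql, params in statements:
                conn.execute(sql, params)
    except sqlite3.Error:
        pass  # batch rolled back; caller may retry or log
```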