public
Created

cheerio > jsdom for now

  • Download Gist
listobjects.coffee
CoffeeScript
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
nodeio = require 'node.io'
fs = require 'fs'
start = require './ids/american-wing.json'
 
# Splits a block of text into an array of lines.
# Accepts both CRLF and bare LF line endings so that text with Unix-style
# newlines is split too, instead of coming back as a single element.
_arrify = (str) -> str.split /\r?\n/
# For each string in arr, removes the first occurrence of either a
# parenthesised run of digits/commas (e.g. "(1,234)") or a colon, then trims
# surrounding whitespace. Returns a new array; arr itself is not modified.
_remove_nums = (arr) ->
  countOrColon = /\([0-9,]+\)|:/
  cleaned = []
  for entry in arr
    cleaned.push entry.replace(countOrColon, '').trim()
  cleaned
# Returns a copy of arr without the elements whose length is falsy
# (for strings and arrays: the empty ones).
_remove_null = (arr) ->
  hasLength = (item) -> Boolean item.length
  arr.filter hasLength
# Unwraps a one-element collection to its only element; any other value
# (including null/undefined and collections of other sizes) is returned as is.
_flatten = (arr) ->
  return arr[0] if arr?.length is 1
  arr
# Runs str through _arrify, _remove_nums, _remove_null and _flatten, in that
# order, and returns the result of the final step.
_process = (str) ->
  lines = _arrify str
  cleaned = _remove_nums lines
  _flatten _remove_null(cleaned)
# Returns a new array holding each string of arr with surrounding
# whitespace removed.
_trim = (arr) ->
  trimmed = []
  trimmed.push piece.trim() for piece in arr
  trimmed
 
# node.io job that fetches one metmuseum.org collection page per object id,
# extracts its fields into a plain object and writes it to objects/<id>.json.
class ParseObjects extends nodeio.JobClass
  # Ids to fetch, seeded with the contents of american-wing.json.
  queue: start

  # Reads ./ids/ and appends the ids of every other file to the queue.
  init: ->
    fs.readdir './ids/', (err, files) =>
      # Stop here on failure: `files` is undefined when readdir errors, so
      # falling through to the loop would throw.
      return @exit err if err?
      for file in files when file isnt 'american-wing.json'
        # NOTE(review): assumes every ids file holds an array of ids like
        # american-wing.json does; [].concat also tolerates a single value.
        ids = [].concat require "./ids/#{file}"
        # Append the ids one by one; pushing the array itself would add a
        # single nested entry that `run` would then treat as one id.
        @queue.push ids...

  # Returns up to `num` ids starting at offset `start`, or false once the
  # offset has reached the end of the queue.
  input: (start, num, callback) ->
    return false if start >= @queue.length
    # A slice that overruns the queue simply stops at its end.
    @queue[start...start + num]

  # Fetches the page for `id`, collects its fields and emits
  # {id, object} for `output`.
  run: (id) ->
    base = 'http://www.metmuseum.org/Collections/search-the-collections/'
    object = {}

    @getHtml base + id, (err, $) =>
      # `$` is unusable after a failed fetch, so do not continue parsing.
      return @retry() if err?

      object['id'] = +id
      # Resolves to null when no number is found (NaN is falsy).
      object['gallery-id'] = +$('.gallery-id a').text().match(/[0-9]+/g)?[0] or null
      object['image'] = _flatten $('a[name="art-object-fullscreen"] > img')?.attr('src')?.match /(^http.*)/g

      # Collect the first number of each related link's href; links without
      # an href or without digits are skipped rather than throwing.
      related = []
      for a in $('.related-content-container .object-info a')
        digits = $(a).attr('href')?.match /[0-9]+/g
        related.push +digits[0] if digits?
      object['related-artworks'] = related

      # add any definition lists as properties (dt text is the key, the dd at
      # the same index is the value); the dt lookup is done once up front
      terms = $('dt')
      for v, i in $('dd')
        object[_process $(terms[i]).text()] = _process $(v).text()

      # add description and provenance
      $('.promo-accordion > li').each (i, e) ->
        category = _process $(e).find('.category').text()
        content = $(e).find('.accordion-inner > p').text().trim()
        switch category
          when 'Description' then object[category] = content
          # Provenance is stored as a list of its ';'-separated entries.
          when 'Provenance' then object[category] = _trim _remove_null content.split(';')

      @emit id: id, object: object

  # Writes every emitted row to objects/<id>.json as 2-space-indented JSON.
  output: (rows) ->
    for row in rows
      fs.writeFileSync "objects/#{row.id}.json", JSON.stringify row.object, null, 2
 
# Instantiate the job with its `jsdom` and `max` options and expose it as
# `job` on the module's top-level `this`.
@job = new ParseObjects({jsdom: true, max: 10})

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.