Skip to content

Instantly share code, notes, and snippets.

@emarschner
Created May 12, 2014 16:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emarschner/31dbcea42cbab61469b1 to your computer and use it in GitHub Desktop.
Save emarschner/31dbcea42cbab61469b1 to your computer and use it in GitHub Desktop.
Crawl USPTO weekly gazette for names of non-signing inventors
resolve = require('url').resolve
express = require 'express'
request = require 'request'
cheerio = require 'cheerio'
# Starting page of the crawl: `fetch.root` downloads it, and the links found
# on it are resolved against this URL.
rootUrl = 'http://www.uspto.gov/news/og/'
# Download `url` and hand the parsed page to `callback`.
#
# url      - address passed unchanged to `request`.
# callback - invoked with the result of `cheerio.load` on the response body.
#            It is NOT invoked when the download fails; the failure is only
#            reported on stderr.
goto = (url, callback) ->
  request url, (err, res, html) ->
    # Transport-level failure: report it together with the URL that failed.
    if err?
      console.error "failed to fetch #{url}:", err
      return

    # An error page is still HTML and would be parsed without complaint, so
    # reject non-2xx responses explicitly instead of crawling them.
    unless 200 <= res?.statusCode < 300
      console.error "failed to fetch #{url}: HTTP status #{res?.statusCode}"
      return

    # Trace every page that was fetched successfully.
    console.log url
    callback cheerio.load html
# Crawl steps, one per page level: `root` -> `year` -> `week`.
# Every step downloads its page through `goto` and starts the next step for
# each qualifying link; results are written to the console, not returned.
fetch =
  # Start the crawl at `rootUrl` and pass every link matched by the selector
  # below to `fetch.year`.
  #
  # This function is also registered as the Express handler for `GET /`, so it
  # accepts the optional (req, res) pair and answers immediately; without a
  # reply the HTTP client would wait forever. Calling it with no arguments
  # still works.
  root: (req, res) ->
    res?.send 'crawl started; results are written to the server console'
    goto rootUrl, ($) ->
      for link in $ '#article .section:nth-of-type(2) li a'
        href = $(link).attr 'href'
        # `resolve` rejects a non-string target, so skip anchors without href.
        fetch.year resolve(rootUrl, href) if href?
      return

  # Scan the page at `url` for list items whose text matches "Week #<digits>"
  # and pass the first link inside each such item to `fetch.week`.
  #
  # url - absolute URL of the page; relative links are resolved against it.
  year: (url) ->
    goto url, ($) ->
      for week in $ '#article li'
        $week = $ week
        if $week.text().match(/Week\s*#\d+/)
          href = $week.find('a').attr 'href'
          # A matching item without a link has nothing to follow.
          fetch.week resolve(url, href) if href?
      return

  # Search the text of the page at `url` for inventor names and log them.
  # For each pattern that matches at least once, one array holding the
  # captured names is written with `console.log`.
  #
  # url - absolute URL of the page to scan.
  week: (url) ->
    goto url, ($) ->
      # Extract the text once; it is the same for every pattern.
      text = $('body').text()
      # Each pattern captures the name in group 1.
      # NOTE(review): the first pattern stops at the first period, so a name
      # containing one (e.g. a middle initial) would be cut short.
      patterns = [
        /non.?signing\s+inventor,\s+([^.]+)\./,
        /The\s+inventor\s+whose\s+signature\s+is\s+missing\s+\(([^)]+)\)/
      ]
      for pattern in patterns
        # A global copy finds every occurrence; `source` yields the pattern
        # body directly, regardless of any flags on the original.
        matches = text.match new RegExp(pattern.source, 'g')
        if matches?
          # Re-apply the non-global pattern to each hit to keep only group 1.
          console.log (found.replace(pattern, '$1') for found in matches)
      return
# HTTP front end: a `GET /` request starts a crawl through `fetch.root`.
port = 8081
app = express()
app.get '/', fetch.root
# Report the port from the `listen` callback, i.e. only once the server is
# really accepting connections; logging right after the call would announce
# success even when binding the port fails.
app.listen port, ->
  console.log "listening on port: #{port}"
# Expose the app to modules that `require` this file.
exports = module.exports = app
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment