Skip to content

Instantly share code, notes, and snippets.

@joepie91
Last active August 29, 2015 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joepie91/53ca2c078260e3b2e10c to your computer and use it in GitHub Desktop.
Save joepie91/53ca2c078260e3b2e10c to your computer and use it in GitHub Desktop.
Newer AnonNews stuff
S = require "string"
Promise = require "bluebird"
request = require "request"
libxml = require "libxmljs"
longest = require "longest"
cld = Promise.promisifyAll(require "cld")
urlLib = require "url"
module.exports =
# Shortens `input` to a teaser of at most `length` characters followed by "...".
#
# The cut is placed just before a non-word character where possible, so that a
# word is not split in half. When the leading `length` characters contain no
# such break (e.g. one very long word), the text is cut hard at `length`
# instead of failing.
#
# input  - the string to shorten
# length - maximum number of characters to keep before the "..."
#
# Returns `input` unchanged when it is not longer than `length`, otherwise the
# shortened text with "..." appended.
createTeaser: (input, length) ->
  return input unless input.length > length
  # The quantifier bound must be a non-negative integer to be a valid pattern.
  limit = Math.max(0, Math.floor(length))
  # [\s\S] matches every character, including all line terminators.
  re = new RegExp("^([\\s\\S]{0,#{limit}})(?:\\W|$)")
  match = re.exec(input)
  # No word boundary within the limit: fall back to a plain cut.
  teaser = if match? then match[1] else input.slice(0, limit)
  return teaser + "..."
# Converts an HTML fragment to plain text.
#
# Every <br> tag and every closing </p> tag is first replaced by a space so
# that the text on either side does not run together; the remaining tags are
# then removed with string.js `stripTags()`.
#
# input - HTML string
#
# Returns the text with tags removed.
justText: (input) ->
  # The `g` flag matters here: without it only the first occurrence is replaced.
  spaced = input
    .replace(/<br\s*\/?>/gi, " ")
    .replace(/<\/p>/gi, " ")
  return S(spaced).stripTags().s
# Detects the language of an HTML document using cld.
#
# body     - the HTML text to analyse (passed to cld with `isHTML` enabled)
# tld      - value passed to cld as `tldHint`
# encoding - value passed to cld as `httpHint`
#
# Returns a promise resolving to the `code` of the first entry in the
# detection result's `languages` list. The promise is rejected with the
# detector's error, or with an Error when the result contains no language.
getLanguage: (body, tld, encoding) ->
  options =
    isHTML: yes
    tldHint: tld
    httpHint: encoding
  # detectAsync already returns a promise, so it is returned directly rather
  # than being wrapped in a second one.
  return cld.detectAsync(body, options).then (result) ->
    code = result?.languages?[0]?.code
    throw new Error("Language detection returned no result.") unless code?
    return code
# Downloads the page at `url` and extracts descriptive metadata from its HTML.
#
# For each of the fields `title`, `image`, `description` and `sitename`, a list
# of XPath selectors is tried in order; the first node whose trimmed text is
# long enough (more than 20 characters for the description, more than 5 for
# everything else) supplies the value. Fields without a usable node are simply
# absent from the result. A `language` field is added when language detection
# succeeds; detection failure is not treated as an error.
#
# url - address of the page to inspect
#
# Returns a promise resolving to the metadata object. It is rejected when the
# request fails or the response body cannot be parsed as HTML.
getMetadata: (url) ->
  selectors =
    title: [
      "//meta[@property='twitter:title']/@content",
      "//meta[@property='og:title']/@content",
      "//meta[@name='title']/@content",
      "//*[@itemprop='headline']/text()",
      "//*[@class='news_title']/text()",
      "//h1[contains(@class, 'title')]/text()",
      "//h2[contains(@class, 'title')]/text()",
      "//title/text()"
    ],
    image: [
      "//meta[@property='twitter:image']/@content",
      "//meta[@property='og:image']/@content",
      "//img[@itemprop='image']/@src",
      "//img[contains(@class, 'size-full')]/@src",
      "(//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or @id='body' or @id='entry']//img)[1]/@src"
    ],
    description: [
      "//meta[@property='twitter:description']/@content",
      "//meta[@property='og:description']/@content",
      "//*[@itemprop='description']/text()",
      "//*[@itemprop='articleBody']/text()",
      "//*[@class='news_lead']/text()",
      "//*[@class='news_body']/text()",
      "//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or contains(@class, 'mainnewstxt') or contains(@class, 'story-body') or contains(@class, 'bodytext') or @id='body' or @id='entry']/text()",
      "//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or contains(@class, 'mainnewstxt') or contains(@class, 'story-body') or contains(@class, 'bodytext') or @id='body' or @id='entry']//p[not(contains(@class, 'wptl'))]/text()",
      "//div[@class='entry']//p/text()"
    ],
    sitename: [
      "//meta[@property='og:site_name']/@content",
      "//img[contains(@class, 'logo') or contains(@src, 'logo')]/@alt",
      "//img[contains(@class, 'logo') or contains(@src, 'logo')]/@title",
      "//h1//img/@alt"
    ]

  # Non-capturing on purpose: with a capturing group, split() would also return
  # the separators themselves, and one of them could win as "longest" part.
  titleSeparators = /(?:\s[|:\/·-]\s|&raquo;|\s&gt;\s|\s&middot;\s)/
  userAgent = "AnonNews v3 Metadata Extractor (ignore; Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36)"

  # Returns the trimmed textual content of a node returned by doc.find().
  nodeText = (node) ->
    if node.text?
      node.text().trim()
    else if node.value?
      node.value().trim()
    else
      node.toString().trim()

  # Returns the first acceptable value for `field` in `doc`, or undefined.
  pickValue = (doc, field) ->
    minLength = if field == "description" then 20 else 5
    for selector in selectors[field]
      nodes = doc.find(selector)
      continue unless nodes? and nodes.length > 0
      for node in nodes
        value = nodeText(node)
        continue unless value.length > minLength
        # Flatten every line break, not just the first one.
        value = value.replace(/\n/g, " ")
        # "Article name | Site name" style titles: keep the longest part.
        if field == "title" and value.search(titleSeparators) > -1
          value = longest(value.split(titleSeparators))
        # Image paths may be relative to the page.
        if field == "image"
          value = urlLib.resolve(url, value)
        return value
    return undefined

  return new Promise (resolve, reject) ->
    request url, {headers: {"user-agent": userAgent}}, (err, resp, body) ->
      return reject(err) if err?

      metadata = {}
      # An exception thrown inside this callback would otherwise leave the
      # promise pending forever, so parse failures become rejections.
      try
        doc = libxml.parseHtmlString(body)
        for field of selectors
          value = pickValue(doc, field)
          metadata[field] = value if value?
      catch parseError
        return reject(parseError)

      # `hostname` excludes the port, unlike `host`.
      tld = (urlLib.parse(url).hostname ? "").split(".").pop()

      # NOTE(review): the hint passed on is the Content-Encoding response
      # header; confirm against the cld documentation that this is the header
      # `httpHint` expects.
      module.exports.getLanguage(body, tld, resp.headers["content-encoding"])
        .then((language) -> metadata.language = language)
        # Language is optional: a detection failure must not fail the lookup.
        .catch(-> null)
        .finally(-> resolve metadata)
# Builds (without throwing) an Error describing a missing page.
#
# message - text for the error; a generic "not found" sentence by default
#
# Returns the Error, carrying `status` 404 and the given `message`.
NotFound: (message = "The requested page could not be found.") ->
  notFoundError = new Error()
  notFoundError.status = 404
  # Assigned after construction, exactly as received.
  notFoundError.message = message
  notFoundError
# Builds (without throwing) an Error describing invalid or incomplete input.
#
# message - text for the error; a generic "missing fields" sentence by default
#
# Returns the Error, carrying `status` 400 and the given `message`.
InputError: (message = "One or more required input fields were missing.") ->
  inputError = new Error()
  inputError.status = 400
  # Assigned after construction, exactly as received.
  inputError.message = message
  inputError
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment