Last active
August 29, 2015 14:04
-
-
Save joepie91/53ca2c078260e3b2e10c to your computer and use it in GitHub Desktop.
Newer AnonNews stuff
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
S = require "string" | |
Promise = require "bluebird" | |
request = require "request" | |
libxml = require "libxmljs" | |
longest = require "longest" | |
cld = Promise.promisifyAll(require "cld") | |
urlLib = require "url" | |
# XPath selectors tried, in order, for each metadata field. The first selector
# that yields a sufficiently long value wins (see findFieldValue).
METADATA_SELECTORS =
  title: [
    "//meta[@property='twitter:title']/@content",
    "//meta[@property='og:title']/@content",
    "//meta[@name='title']/@content",
    "//*[@itemprop='headline']/text()",
    "//*[@class='news_title']/text()",
    "//h1[contains(@class, 'title')]/text()",
    "//h2[contains(@class, 'title')]/text()",
    "//title/text()"
  ]
  image: [
    "//meta[@property='twitter:image']/@content",
    "//meta[@property='og:image']/@content",
    "//img[@itemprop='image']/@src",
    "//img[contains(@class, 'size-full')]/@src",
    "(//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or @id='body' or @id='entry']//img)[1]/@src"
  ]
  description: [
    "//meta[@property='twitter:description']/@content",
    "//meta[@property='og:description']/@content",
    "//*[@itemprop='description']/text()",
    "//*[@itemprop='articleBody']/text()",
    "//*[@class='news_lead']/text()",
    "//*[@class='news_body']/text()",
    "//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or contains(@class, 'mainnewstxt') or contains(@class, 'story-body') or contains(@class, 'bodytext') or @id='body' or @id='entry']/text()",
    "//div[contains(@class, 'embed-content') or contains(@class, 'article') or @class='body' or contains(@class, 'post_content') or contains(@class, 'entry') or contains(@class, 'mainnewstxt') or contains(@class, 'story-body') or contains(@class, 'bodytext') or @id='body' or @id='entry']//p[not(contains(@class, 'wptl'))]/text()",
    "//div[@class='entry']//p/text()"
  ]
  sitename: [
    "//meta[@property='og:site_name']/@content",
    "//img[contains(@class, 'logo') or contains(@src, 'logo')]/@alt",
    "//img[contains(@class, 'logo') or contains(@src, 'logo')]/@title",
    "//h1//img/@alt"
  ]

# Separators commonly placed between a page title and a site name. The regex
# has no `g` flag, so `.test` keeps no state between calls.
TITLE_SEPARATORS = /(\s[|:\/·-]\s|»|\s>\s|\s·\s)/

USER_AGENT = "AnonNews v3 Metadata Extractor (ignore; Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36)"

# Builds an Error carrying an HTTP status code.
# The message is assigned after construction (as the original code did), which
# leaves it an enumerable own property; kept that way in case callers
# serialise these errors.
httpError = (status, message) ->
  err = new Error
  err.status = status
  err.message = message
  return err

# Returns the trimmed string content of a node returned by `doc.find`, using
# whichever accessor the node exposes: text(), then value(), then toString().
nodeValue = (node) ->
  if node.text?
    node.text().trim()
  else if node.value?
    node.value().trim()
  else
    node.toString().trim()

# Returns the first value matched by any of `fieldSelectors` that is long
# enough to be useful (more than 20 characters for descriptions, more than 5
# for every other field), or undefined when nothing qualifies.
findFieldValue = (doc, field, fieldSelectors) ->
  minimumLength = if field is "description" then 20 else 5
  for selector in fieldSelectors
    nodes = doc.find(selector)
    continue unless nodes? and nodes.length > 0
    for node in nodes
      candidate = nodeValue node
      return candidate if candidate.length > minimumLength
  return undefined

module.exports =
  # Shortens `input` to at most `length` characters, cutting at the last
  # position that is followed by a non-word character, and appends "...".
  # Inputs that already fit are returned unchanged.
  createTeaser: (input, length) ->
    return input unless input.length > length
    # [\s\S] matches every character, including all line terminators.
    boundary = new RegExp("^([\\s\\S]{0,#{length}})(\\W|$)", "")
    match = boundary.exec(input)
    # No match means the text starts with an unbroken word longer than
    # `length`; fall back to a hard cut instead of dereferencing null.
    teaser = if match? then match[1] else input.slice(0, length)
    return teaser + "..."

  # Strips HTML tags from `input`, first turning every <br> and </p> into a
  # space so adjacent lines and paragraphs do not run together.
  justText: (input) ->
    spaced = input.replace(/<br\s?\/?>/gi, " ").replace(/<\/p>/gi, " ")
    return S(spaced).stripTags().s

  # Detects the language of an HTML document using cld.
  #   body     - HTML source to analyse.
  #   tld      - forwarded to cld as `tldHint`.
  #   encoding - forwarded to cld as `httpHint`.
  # Returns a promise for the code of the first detected language. The
  # promise rejects with an Error when detection fails or yields no language.
  getLanguage: (body, tld, encoding) ->
    options =
      isHTML: yes
      tldHint: tld
      httpHint: encoding
    return cld.detectAsync(body, options).then (result) ->
      language = result?.languages?[0]
      unless language?
        throw new Error "Language detection returned no result."
      return language.code

  # Fetches `url` and extracts title, image, description, sitename and
  # language metadata from the returned HTML.
  # Returns a promise for an object containing whichever of those fields
  # could be determined. The promise rejects when the request fails or the
  # document cannot be processed.
  getMetadata: (url) ->
    return new Promise (resolve, reject) ->
      request url, {headers: {"user-agent": USER_AGENT}}, (err, resp, body) ->
        return reject(err) if err?

        # This runs in an asynchronous callback, so an exception thrown here
        # would escape the promise and leave it pending; route it to reject.
        try
          doc = libxml.parseHtmlString(body)
          metadata = {}

          for field, fieldSelectors of METADATA_SELECTORS
            value = findFieldValue doc, field, fieldSelectors
            continue unless value?

            value = value.replace(/\r?\n/g, " ")
            if field is "title" and TITLE_SEPARATORS.test value
              # Keep the longest piece, assuming that is the actual headline
              # rather than the site name.
              value = longest(value.split(TITLE_SEPARATORS)).trim()
            if field is "image"
              # Image URLs may be relative to the page.
              value = urlLib.resolve url, value
            metadata[field] = value

          # `hostname` excludes the port, unlike `host`.
          tld = urlLib.parse(url).hostname.split(".").pop()
        catch extractionError
          return reject(extractionError)

        # cld's httpHint is meant to receive a Content-Language value.
        module.exports.getLanguage(body, tld, resp.headers["content-language"])
          .then (language) ->
            metadata.language = language
          .catch ->
            # Language detection is best-effort: on failure the metadata is
            # returned without a `language` field.
            null
          .then ->
            resolve metadata

  # Creates (does not throw) an Error with status 404.
  NotFound: (message = "The requested page could not be found.") ->
    return httpError 404, message

  # Creates (does not throw) an Error with status 400.
  InputError: (message = "One or more required input fields were missing.") ->
    return httpError 400, message
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment