Last active
December 18, 2015 08:29
-
-
Save kmwallio/5754308 to your computer and use it in GitHub Desktop.
Crappy Automated summary using Coffee-Script. Requires natural and Sylvester
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env coffee | |
natural = require 'natural' | |
sylvester = require 'sylvester' | |
fs = require 'fs' | |
# Command-line driver: summary [file] [limit]
#
# Reads the Markdown file named by the first argument, scores each sentence
# with a page-rank style iteration over a tf-idf based transition matrix, and
# prints the `limit` highest-scoring sentences in their original order.
#
# parseInt yields NaN for a missing or non-numeric limit, and NaN > 0 is
# false, so bad limits fall through to the usage message.
limit = parseInt process.argv[3], 10
if process.argv[2]? and limit > 0
  # Read in the given file
  fs.readFile process.argv[2], (err, data) ->
    # Report read failures instead of crashing on an undefined `data`
    if err
      console.error "Unable to read #{process.argv[2]}: #{err.message}"
      process.exit 1

    # Tokenize into sentences
    tokenizer = new natural.RegexpTokenizer({pattern: /[\.!\?\r\n#]+/})
    file = strip_markdown data.toString()
    sentences = (sentence.trim() for sentence in tokenizer.tokenize(file))
    # Drop fragments that are empty once trimmed
    sentences = (sentence for sentence in sentences when sentence.length > 0)
    # Nothing to rank: bail out before building an empty matrix
    if sentences.length is 0
      console.error "No sentences found in #{process.argv[2]}"
      process.exit 1

    # Take the sentences and generate the tf-idf vectors
    tfidf = new natural.TfIdf()
    tfidf.addDocument(sentence) for sentence in sentences
    transition = []
    for sentence in sentences
      row = []
      # The callback receives each document index and its measure against
      # the current sentence
      tfidf.tfidfs sentence, (i, measure) ->
        # A document whose text equals the current sentence gets weight 1;
        # every other document gets its measure shifted up by 1
        row.push(if sentence is sentences[i] then 1 else measure + 1)
      # Normalise the row so its entries sum to 1
      transition.push(transition_modified(row))

    # Find the page rank off the given transition matrix by repeatedly
    # multiplying the rank vector by it
    trans_mat = $M(transition)
    rank_vec = make_vec(sentences.length)
    for iteration in [1..250]
      rank_vec = rank_vec.x(trans_mat)

    # Create an array of objects with sentences and scores
    # (matrix element access and `idx` are both 1-based)
    scored = []
    for sentence, pos in sentences
      scored.push({score: rank_vec.e(1, pos + 1), sentence: sentence, idx: pos + 1})

    # Keep the best `limit` sentences, then restore document order
    scored.sort compare_scores
    scored = scored.slice(0, limit)
    scored.sort compare_idx
    summary = (good.sentence for good in scored)
    console.log(summary.join(". ") + ".")
else
  console.log "Usage:"
  console.log "\tsummary [file] [limit]"
# Sort comparator: orders entries by ascending `idx`.
# Returns a negative number when `a` belongs before `b`, a positive number
# when it belongs after, and 0 when both carry the same `idx`.
compare_idx = (a, b) ->
  [first, second] = [a.idx, b.idx]
  first - second
# Sort comparator: orders entries by descending `score`.
# Returns a negative number when `a` has the higher score, a positive number
# when `b` does, and 0 when the scores are equal.
compare_scores = (a, b) ->
  [left, right] = [a.score, b.score]
  right - left
# Build the starting rank vector as a single-row matrix (via `$M`):
# a leading 1 followed by one 0 for every step of the exclusive range 1...len.
make_vec = (len) ->
  zeros = (0 for step in [1...len])
  $M [[1].concat(zeros)]
# Normalise a row of scores: returns a new array in which every entry of `r`
# is divided by the sum of all entries. The input array is not modified.
transition_modified = (r) ->
  total = r.reduce ((sum, score) -> sum + score), 0
  r.map (score) -> score / total
# Reduce a Markdown string to roughly plain text.
#
# str - the Markdown source as a string.
#
# Returns the string with quotes, link/image syntax, footnote markers,
# list and blockquote markers, emphasis markers, `#` characters, HTML
# comments and inline ``` spans removed, and every whitespace run collapsed
# to a single space.
strip_markdown = (str) ->
  # Extra quotes
  str = str.replace(/"/g, '')
  # Links and Images: images go first so that an image nested inside a link
  # is unwrapped before the link itself is
  str = str.replace(/!\[(.*?)?\]\((.*?)?\)/g, '$1')
  str = str.replace(/!?\[(.*?)?\]\((.*?)?\)/g, '$1')
  # Footnotes
  str = str.replace(/\[\^(.*?)\]:?/g, '')
  # List and blockquote markers, on every line (`m` flag). This must run
  # before emphasis stripping, otherwise the `*` bullets of consecutive list
  # items are paired up and consumed as emphasis markers. A `*` only counts
  # as a bullet when followed by a space or tab, so a leading `**bold**`
  # is left for the emphasis rule.
  str = str.replace(/^[ \t]*(?:>[ \t]*|\*[ \t]+)*/gm, '')
  # Bold and italics
  str = str.replace(/\*{1,3}([^\*]*?)\*{1,3}/g, '$1')
  str = str.replace(/_{1,3}([^_]*?)_{1,3}/g, '$1')
  # Headings
  str = str.replace(/#/g, '')
  # comments
  str = str.replace(/<!--(.*?)-->/g, '')
  # Extra whitespace
  str = str.replace(/\s+/g, ' ')
  # Code: runs after the whitespace collapse so that `.` can span what were
  # originally multi-line fenced blocks
  str.replace(/```(.*?)```/g, '')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment