Last active
August 3, 2016 20:38
-
-
Save ahmadalli/d1b886a9644c8e1b5dda4369e2afceef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text.RegularExpressions; | |
using System.Threading.Tasks; | |
namespace SummaryTool
{
    /// <summary>
    /// Naive extractive text summarizer. Every sentence of a text is ranked by how many
    /// words it shares with all the other sentences, and the summary is made of the title
    /// followed by the best-ranked sentence of each paragraph.
    /// </summary>
    public static class SummaryTool
    {
        // Characters that end a sentence: a period or any line-break character.
        private static readonly char[] SentenceBreaks = { '.', '\r', '\n' };

        // Characters that end a paragraph. Handling '\r' and '\n' individually makes the
        // split independent of the platform's Environment.NewLine value.
        private static readonly char[] LineBreaks = { '\r', '\n' };

        /// <summary>
        /// Naive method for splitting a text into sentences. A sentence ends at a period
        /// or at a line break; blank fragments are dropped and the rest are trimmed.
        /// </summary>
        /// <param name="content">the text for splitting</param>
        /// <returns>List of trimmed, non-blank sentences in their original order</returns>
        private static List<string> SplitContentToSentences(string content)
        {
            return content.Split(SentenceBreaks, StringSplitOptions.RemoveEmptyEntries)
                          .Where(x => !string.IsNullOrWhiteSpace(x))
                          .Select(x => x.Trim())
                          .ToList();
        }

        /// <summary>
        /// Naive method for splitting a text into paragraphs. Every non-blank line is
        /// treated as one paragraph.
        /// </summary>
        /// <param name="content">the text for splitting</param>
        /// <returns>List of non-blank paragraphs in their original order (not trimmed)</returns>
        private static List<string> SplitContentToParagraphs(string content)
        {
            return content.Split(LineBreaks, StringSplitOptions.RemoveEmptyEntries)
                          .Where(x => !string.IsNullOrWhiteSpace(x))
                          .ToList();
        }

        /// <summary>
        /// Split a sentence into its words/tokens. Tokens are separated by any run of
        /// white-space characters, so repeated spaces never produce empty tokens.
        /// </summary>
        /// <param name="sentence">the sentence to tokenize</param>
        /// <returns>The tokens of the sentence; empty when the sentence is blank</returns>
        private static string[] SplitIntoWords(string sentence)
        {
            // A null separator array tells string.Split to split on white-space.
            return sentence.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
        }

        /// <summary>
        /// Number of distinct shared words, normalized by the average number of words
        /// of the two sentences.
        /// </summary>
        /// <param name="words1">distinct words of the first sentence</param>
        /// <param name="count1">total number of words of the first sentence (duplicates included)</param>
        /// <param name="words2">distinct words of the second sentence</param>
        /// <param name="count2">total number of words of the second sentence (duplicates included)</param>
        /// <returns>The normalized overlap, or 0 when both sentences have no words</returns>
        private static double NormalizedOverlap(HashSet<string> words1, int count1, HashSet<string> words2, int count2)
        {
            // Avoid 0/0 (NaN) when neither sentence contains a word.
            if (count1 + count2 == 0)
            {
                return 0;
            }

            int common = words1.Count(w => words2.Contains(w));
            return common / ((count1 + count2) / 2.0);
        }

        /// <summary>
        /// Calculate the intersection between 2 sentences: the number of distinct words
        /// they share (case-sensitive), normalized by their average word count.
        /// </summary>
        /// <param name="sent1">the first sentence</param>
        /// <param name="sent2">the second sentence</param>
        /// <returns>The normalized intersection, or 0 when both sentences are blank</returns>
        private static double SentencesIntersection(string sent1, string sent2)
        {
            string[] tokens1 = SplitIntoWords(sent1);
            string[] tokens2 = SplitIntoWords(sent2);

            return NormalizedOverlap(
                new HashSet<string>(tokens1, StringComparer.Ordinal), tokens1.Length,
                new HashSet<string>(tokens2, StringComparer.Ordinal), tokens2.Length);
        }

        /// <summary>
        /// Format a sentence - keep only letters, digits, white-space and '-' and drop
        /// every other character (punctuation, symbols, ...).
        /// We'll use the formatted sentence as a key in our sentences dictionary
        /// </summary>
        /// <param name="sentence">the sentence to format</param>
        /// <returns>The sentence without the dropped characters</returns>
        private static string FormatSentence(string sentence)
        {
            return new string(sentence
                .Where(c => char.IsLetterOrDigit(c) || char.IsWhiteSpace(c) || c == '-')
                .ToArray());
        }

        /// <summary>
        /// Convert the content into a dictionary &lt;K, V&gt;
        /// K = The formatted sentence (see FormatSentence)
        /// V = The rank of the sentence: the sum of its intersections with every other sentence
        /// </summary>
        /// <remarks>
        /// When several sentences produce the same formatted key, the rank of the first
        /// one is kept. The misspelled method name is kept for backward compatibility.
        /// </remarks>
        /// <param name="content">the full text to rank</param>
        /// <returns>Dictionary from formatted sentence to its rank</returns>
        /// <exception cref="ArgumentNullException"><paramref name="content"/> is null</exception>
        public static Dictionary<string, double> GetSentecesRanks(string content)
        {
            if (content == null)
            {
                throw new ArgumentNullException("content");
            }

            // Split the content into sentences
            List<string> sentences = SplitContentToSentences(content);
            int n = sentences.Count;

            // Tokenize every sentence once instead of once per compared pair
            int[] wordCounts = new int[n];
            HashSet<string>[] wordSets = new HashSet<string>[n];
            for (int i = 0; i < n; i++)
            {
                string[] words = SplitIntoWords(sentences[i]);
                wordCounts[i] = words.Length;
                wordSets[i] = new HashSet<string>(words, StringComparer.Ordinal);
            }

            // Build the sentences dictionary;
            // The score of a sentence is the sum of all its intersections
            Dictionary<string, double> ranks = new Dictionary<string, double>();
            for (int i = 0; i < n; i++)
            {
                // The duplicate check must use the same (formatted) key that is stored,
                // otherwise sentences differing only in punctuation make Add throw.
                string key = FormatSentence(sentences[i]);
                if (ranks.ContainsKey(key))
                {
                    continue;
                }

                double score = 0;
                for (int j = 0; j < n; j++)
                {
                    if (i == j)
                    {
                        continue;
                    }

                    score += NormalizedOverlap(wordSets[i], wordCounts[i], wordSets[j], wordCounts[j]);
                }

                ranks.Add(key, score);
            }

            return ranks;
        }

        /// <summary>
        /// Return the best sentence in a paragraph, in its formatted (dictionary key) form
        /// </summary>
        /// <param name="paragraph">the paragraph to pick a sentence from</param>
        /// <param name="sentences_dic">ranks keyed by formatted sentence</param>
        /// <returns>
        /// The highest-ranked key that belongs to the paragraph, or an empty string when
        /// the paragraph has fewer than 2 sentences or none of them is in the dictionary
        /// </returns>
        private static string GetBestSentence(string paragraph, Dictionary<string, double> sentences_dic)
        {
            // Split the paragraph into sentences
            List<string> sentences = SplitContentToSentences(paragraph);

            // Ignore short paragraphs
            if (sentences.Count < 2)
            {
                return "";
            }

            // Format each sentence once so the dictionary scan is a cheap set lookup
            HashSet<string> keys = new HashSet<string>(
                sentences.Select(s => FormatSentence(s)),
                StringComparer.Ordinal);

            // Get the best sentence according to the sentences dictionary; the stable
            // ordering keeps the first dictionary entry when ranks are tied.
            string best = sentences_dic
                .Where(x => keys.Contains(x.Key))
                .OrderByDescending(x => x.Value)
                .Select(x => x.Key)
                .FirstOrDefault();

            // No sentence of this paragraph is ranked: contribute nothing instead of throwing
            return best ?? "";
        }

        /// <summary>
        /// Build the summary: the trimmed title followed by the best sentence of each
        /// paragraph, one per line. Paragraphs without a usable sentence are skipped.
        /// </summary>
        /// <param name="title">the title placed on the first line of the summary</param>
        /// <param name="content">the full text to summarize</param>
        /// <param name="sentences_dic">ranks keyed by formatted sentence, e.g. from GetSentecesRanks</param>
        /// <returns>The summary, lines separated by Environment.NewLine</returns>
        /// <exception cref="ArgumentNullException">any argument is null</exception>
        public static string GetSummary(string title, string content, Dictionary<string, double> sentences_dic)
        {
            if (title == null)
            {
                throw new ArgumentNullException("title");
            }

            if (content == null)
            {
                throw new ArgumentNullException("content");
            }

            if (sentences_dic == null)
            {
                throw new ArgumentNullException("sentences_dic");
            }

            // Add the title
            List<string> lines = new List<string>();
            lines.Add(title.Trim());

            // Add the best sentence from each paragraph
            foreach (string paragraph in SplitContentToParagraphs(content))
            {
                string sentence = GetBestSentence(paragraph, sentences_dic).Trim();
                if (!string.IsNullOrWhiteSpace(sentence))
                {
                    lines.Add(sentence);
                }
            }

            return string.Join(Environment.NewLine, lines);
        }

        /// <summary>
        /// Build the summary of a text, ranking its sentences with GetSentecesRanks.
        /// </summary>
        /// <param name="title">the title placed on the first line of the summary</param>
        /// <param name="content">the full text to summarize</param>
        /// <returns>The summary, lines separated by Environment.NewLine</returns>
        /// <exception cref="ArgumentNullException">any argument is null</exception>
        public static string GetSummary(string title, string content)
        {
            if (content == null)
            {
                throw new ArgumentNullException("content");
            }

            return GetSummary(title, content, GetSentecesRanks(content));
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment