Skip to content

Instantly share code, notes, and snippets.

@ahmadalli
Last active August 3, 2016 20:38
Show Gist options
  • Save ahmadalli/d1b886a9644c8e1b5dda4369e2afceef to your computer and use it in GitHub Desktop.
Save ahmadalli/d1b886a9644c8e1b5dda4369e2afceef to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace SummaryTool
{
/// <summary>
/// Naive extractive summarizer: every sentence is ranked by how many words it shares
/// with the other sentences of the text, and the summary is made of the title followed
/// by the best-ranked sentence of each paragraph.
/// </summary>
public static class SummaryTool
{
    // Line-break sequences recognised regardless of the platform the text came from.
    // "\r\n" is listed first so a Windows line break is consumed as a single separator.
    private static readonly string[] LineSeparators = { "\r\n", "\r", "\n" };

    // A sentence ends at a period or at a line break.
    private static readonly string[] SentenceSeparators = { "\r\n", "\r", "\n", "." };

    /// <summary>
    /// Naive method for splitting a text into sentences: the text is cut at every
    /// period and at every line break.
    /// </summary>
    /// <param name="content">The text to split.</param>
    /// <returns>The trimmed, non-blank sentences in their original order.</returns>
    private static List<string> SplitContentToSentences(string content)
    {
        return content.Split(SentenceSeparators, StringSplitOptions.RemoveEmptyEntries)
            .Where(part => !string.IsNullOrWhiteSpace(part))
            .Select(part => part.Trim())
            .ToList();
    }

    /// <summary>
    /// Naive method for splitting a text into paragraphs: every non-blank line is a paragraph.
    /// </summary>
    /// <param name="content">The text to split.</param>
    /// <returns>The non-blank lines of <paramref name="content"/> in their original order.</returns>
    private static List<string> SplitContentToParagraphs(string content)
    {
        return content.Split(LineSeparators, StringSplitOptions.RemoveEmptyEntries)
            .Where(line => !string.IsNullOrWhiteSpace(line))
            .ToList();
    }

    /// <summary>
    /// Calculates the intersection score between two sentences.
    /// </summary>
    /// <param name="sent1">The first sentence.</param>
    /// <param name="sent2">The second sentence.</param>
    /// <returns>
    /// The number of distinct words common to both sentences divided by the average
    /// number of words of the two sentences; 0 when neither sentence contains a word.
    /// </returns>
    private static double SentencesIntersection(string sent1, string sent2)
    {
        // A null separator array splits on any white space; dropping empty entries keeps
        // repeated spaces from producing bogus "" tokens that would distort the score.
        string[] words1 = sent1.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
        string[] words2 = sent2.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

        int totalWords = words1.Length + words2.Length;
        if (totalWords == 0)
        {
            return 0;
        }

        // Normalize the number of shared words by the average sentence length.
        return words1.Intersect(words2).Count() / (totalWords / 2.0);
    }

    /// <summary>
    /// Formats a sentence by removing every character that is not a letter, a digit,
    /// white space or a hyphen. The formatted sentence is used as the key in the
    /// sentences dictionary.
    /// </summary>
    /// <param name="sentence">The sentence to format.</param>
    /// <returns>The sentence with all other characters removed.</returns>
    private static string FormatSentence(string sentence)
    {
        return new string(sentence
            .Where(c => char.IsLetterOrDigit(c) || char.IsWhiteSpace(c) || c == '-')
            .ToArray());
    }

    /// <summary>
    /// Converts the content into a dictionary whose keys are the formatted sentences
    /// and whose values are the ranks of those sentences.
    /// </summary>
    /// <param name="content">The text to rank.</param>
    /// <returns>
    /// A dictionary mapping each formatted sentence to the sum of its intersection scores
    /// with every other sentence. When several sentences format to the same key, the rank
    /// of the first one is kept.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
    public static Dictionary<string, double> GetSentecesRanks(string content)
    {
        if (content == null)
        {
            throw new ArgumentNullException("content");
        }

        List<string> sentences = SplitContentToSentences(content);
        int n = sentences.Count;

        // The intersection score is symmetric, so each unordered pair is evaluated once
        // and credited to both sentences. A sentence is never compared with itself.
        double[] scores = new double[n];
        for (int i = 0; i < n; i++)
        {
            for (int j = i + 1; j < n; j++)
            {
                double overlap = SentencesIntersection(sentences[i], sentences[j]);
                scores[i] += overlap;
                scores[j] += overlap;
            }
        }

        Dictionary<string, double> sentences_dic = new Dictionary<string, double>(n);
        for (int i = 0; i < n; i++)
        {
            // The duplicate check must use the same (formatted) key that is inserted;
            // otherwise two sentences that format identically make Add throw.
            string key = FormatSentence(sentences[i]);
            if (!sentences_dic.ContainsKey(key))
            {
                sentences_dic.Add(key, scores[i]);
            }
        }

        return sentences_dic;
    }

    /// <summary>
    /// Returns the best sentence of a paragraph.
    /// </summary>
    /// <param name="paragraph">The paragraph to inspect.</param>
    /// <param name="sentences_dic">Ranks keyed by formatted sentence.</param>
    /// <returns>
    /// The formatted sentence of the paragraph with the highest rank, or an empty string
    /// when the paragraph has fewer than two sentences or none of its sentences has an
    /// entry in <paramref name="sentences_dic"/>.
    /// </returns>
    private static string GetBestSentence(string paragraph, Dictionary<string, double> sentences_dic)
    {
        List<string> sentences = SplitContentToSentences(paragraph);

        // Ignore short paragraphs.
        if (sentences.Count < 2)
        {
            return "";
        }

        // Format each sentence once and look the keys up in a set instead of
        // re-formatting the whole paragraph for every dictionary entry.
        HashSet<string> paragraphKeys = new HashSet<string>(sentences.Select(s => FormatSentence(s)));

        // OrderByDescending is a stable sort, so equally ranked sentences keep the
        // dictionary's enumeration order. Dictionary keys are never null, therefore a
        // null result means that no sentence of the paragraph was ranked.
        string best = sentences_dic
            .Where(entry => paragraphKeys.Contains(entry.Key))
            .OrderByDescending(entry => entry.Value)
            .Select(entry => entry.Key)
            .FirstOrDefault();

        return best ?? "";
    }

    /// <summary>
    /// Builds the summary from precomputed sentence ranks.
    /// </summary>
    /// <param name="title">The title; it becomes the first line of the summary.</param>
    /// <param name="content">The text to summarize.</param>
    /// <param name="sentences_dic">Ranks keyed by formatted sentence, as produced by <see cref="GetSentecesRanks"/>.</param>
    /// <returns>
    /// The trimmed title followed by the best sentence of every paragraph that yields one,
    /// each on its own line.
    /// </returns>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public static string GetSummary(string title, string content, Dictionary<string, double> sentences_dic)
    {
        if (title == null)
        {
            throw new ArgumentNullException("title");
        }
        if (content == null)
        {
            throw new ArgumentNullException("content");
        }
        if (sentences_dic == null)
        {
            throw new ArgumentNullException("sentences_dic");
        }

        // Collect the lines and join them once instead of concatenating in a loop.
        List<string> lines = new List<string> { title.Trim() };

        foreach (string paragraph in SplitContentToParagraphs(content))
        {
            string sentence = GetBestSentence(paragraph, sentences_dic).Trim();
            if (!string.IsNullOrWhiteSpace(sentence))
            {
                lines.Add(sentence);
            }
        }

        return string.Join(Environment.NewLine, lines);
    }

    /// <summary>
    /// Builds the summary, ranking the sentences of <paramref name="content"/> first.
    /// </summary>
    /// <param name="title">The title; it becomes the first line of the summary.</param>
    /// <param name="content">The text to summarize.</param>
    /// <returns>
    /// The trimmed title followed by the best sentence of every paragraph that yields one,
    /// each on its own line.
    /// </returns>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public static string GetSummary(string title, string content)
    {
        if (content == null)
        {
            throw new ArgumentNullException("content");
        }

        return GetSummary(title, content, GetSentecesRanks(content));
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment