Skip to content

Instantly share code, notes, and snippets.

@bjoerntx
Created February 23, 2024 13:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bjoerntx/249c7cf677c16a5a9327ab6069423bde to your computer and use it in GitHub Desktop.
Save bjoerntx/249c7cf677c16a5a9327ab6069423bde to your computer and use it in GitHub Desktop.
/// <summary>
/// Scores every chunk by how large a share of the keyword occurrences it contains.
/// </summary>
/// <remarks>
/// Before matching, <paramref name="padding"/> characters are stripped from the start of every
/// chunk except the first and from the end of every chunk except the last (presumably the
/// overlap shared with the neighbouring chunk - confirm against the code that produces the chunks).
/// Chunks shorter than the padding are trimmed to an empty string instead of throwing.
/// Matching is done on invariant lower-cased text for both chunks and keywords.
/// For each keyword, a chunk earns (occurrences in the chunk) / (occurrences in all chunks);
/// the score of a chunk is the sum of these ratios over all keywords.
/// </remarks>
/// <param name="chunks">The text chunks to score; a <c>null</c> entry is treated as an empty chunk.</param>
/// <param name="keywords">The keywords to look for; <c>null</c> or empty entries are ignored.</param>
/// <param name="padding">Number of characters to strip from the inner edges of each chunk.</param>
/// <returns>
/// A dictionary mapping the chunk index to its score, populated in descending score order.
/// Note that <see cref="Dictionary{TKey, TValue}"/> does not formally guarantee enumeration order;
/// callers that depend on the ranking should re-sort by value.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="chunks"/> or <paramref name="keywords"/> is <c>null</c>.
/// </exception>
/// <exception cref="System.ArgumentOutOfRangeException"><paramref name="padding"/> is negative.</exception>
public static Dictionary<int, double> FindMatches(List<string> chunks, List<string> keywords, int padding = 500)
{
    if (chunks == null)
    {
        throw new System.ArgumentNullException(nameof(chunks));
    }

    if (keywords == null)
    {
        throw new System.ArgumentNullException(nameof(keywords));
    }

    if (padding < 0)
    {
        throw new System.ArgumentOutOfRangeException(nameof(padding), "Padding must not be negative.");
    }

    // Normalize the keywords the same way as the chunks; otherwise a keyword
    // containing upper-case letters could never match the lower-cased text.
    List<string> normalizedKeywords = new List<string>(keywords.Count);
    foreach (string keyword in keywords)
    {
        // A null or empty keyword carries nothing to score, so it is skipped.
        if (!string.IsNullOrEmpty(keyword))
        {
            normalizedKeywords.Add(keyword.ToLowerInvariant());
        }
    }

    // Total occurrences of each keyword across all chunks.
    Dictionary<string, int> totals = new Dictionary<string, int>();
    // Per-chunk occurrence counts, cached so every chunk is scanned only once.
    int[][] counts = new int[chunks.Count][];

    for (int i = 0; i < chunks.Count; i++)
    {
        string chunk = chunks[i] ?? string.Empty;

        // Strip the leading padding from every chunk but the first and the trailing
        // padding from every chunk but the last, clamped to the available length.
        int start = 0;
        if (i != 0)
        {
            start = padding < chunk.Length ? padding : chunk.Length;
        }

        int remaining = chunk.Length - start;
        int trailing = 0;
        if (i != chunks.Count - 1)
        {
            trailing = padding < remaining ? padding : remaining;
        }

        string trimmed = chunk.Substring(start, remaining - trailing).ToLowerInvariant();

        int[] chunkCounts = new int[normalizedKeywords.Count];
        for (int k = 0; k < normalizedKeywords.Count; k++)
        {
            string keyword = normalizedKeywords[k];
            int occurrences = trimmed.CountSubstring(keyword);
            chunkCounts[k] = occurrences;

            // TryGetValue leaves total at 0 when the keyword has not been seen yet.
            int total;
            totals.TryGetValue(keyword, out total);
            totals[keyword] = total + occurrences;
        }

        counts[i] = chunkCounts;
    }

    Dictionary<int, double> results = new Dictionary<int, double>();
    for (int chunkId = 0; chunkId < counts.Length; chunkId++)
    {
        double points = 0;
        for (int k = 0; k < normalizedKeywords.Count; k++)
        {
            int total = totals[normalizedKeywords[k]];

            // A keyword that never occurs contributes nothing (and must not divide by zero).
            if (total > 0)
            {
                points += counts[chunkId][k] / (double)total;
            }
        }

        results[chunkId] = points;
    }

    // Highest-scoring chunks first.
    return results.OrderByDescending(x => x.Value).ToDictionary(x => x.Key, x => x.Value);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment