Skip to content

Instantly share code, notes, and snippets.

@jianminchen
Created July 2, 2021 00:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jianminchen/d365ff4fd8dd785096e1116eec595ea3 to your computer and use it in GitHub Desktop.
Save jianminchen/d365ff4fd8dd785096e1116eec595ea3 to your computer and use it in GitHub Desktop.
Build a trie using csv file with million record import - test the code using Excel -> table -> filter -> startsWith
using System;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace csvReader
{
class Program
{
static void Main(string[] args)
{
runTestcase();
}
/// <summary>
/// Make sure that Trie class APIs are working.
/// StartsWithFound API - tested using two test cases, compared to Excel sheet -> table -> filter -> startWith results
/// </summary>
public static void runTestcase()
{
// Get all chars for StaticMap variable first - Do not skip the step.
staticMap = ReadCsvFileSpecialChars();
var trie = ReadCsvFileToTrie();
var search = trie.Search("facebook.com");
var search2 = trie.Search("facenama.com");
var hasPrefix = trie.StartsWith("face");
// check excel sheet - table - filter - starts with "fac"- 660 - face.be is shortest one
// starts with "face" - 226
var keys = new string[]{ "fac", "face"};
foreach (var key in keys)
{
if (trie.StartsWith(key))
{
var found = trie.StartsWithFound(key);
if (key.CompareTo("fac") == 0)
{
Debug.Assert(found.Count == 660); // verify using Excel -> table -> filter -> startWith
}
else if (key.CompareTo("face") == 0)
{
Debug.Assert(found.Count == 226); // verify using Excel -> table -> filter -> startWith
}
}
}
}
/// <summary>
/// Open file using Code, and then it shows that delimiter is comma,
/// C# code reference
/// https://stackoverflow.com/questions/5282999/reading-csv-file-and-storing-values-into-an-array
/// </summary>
public static void readCsvFile()
{
using (var reader = new StreamReader(@"C:\Users\jchen\Desktop\blogs\zenefits\top-1m.csv"))
{
var domain = new List<string>();
while (!reader.EndOfStream)
{
var line = reader.ReadLine();
var values = line.Split(',');
domain.Add(values[1]);
}
}
}
/// <summary>
/// It takes less than 20 seconds on my computer
///
/// </summary>
/// <returns></returns>
public static Trie ReadCsvFileToTrie()
{
var trie = new Trie();
using (var reader = new StreamReader(@"C:\Users\jchen\Desktop\blogs\zenefits\top-1m.csv"))
{
while (!reader.EndOfStream)
{
var line = reader.ReadLine();
var values = line.Split(',');
trie.Insert(values[1]);
}
}
return trie;
}
/// <summary>
/// figure out how many special chars
/// </summary>
/// <returns></returns>
public static Dictionary<char, int> ReadCsvFileSpecialChars()
{
var set = new HashSet<char>();
using (var reader = new StreamReader(@"C:\Users\jchen\Desktop\blogs\zenefits\top-1m.csv"))
{
while (!reader.EndOfStream)
{
var line = reader.ReadLine();
var values = line.Split(',');
foreach(var item in values[1])
set.Add(item);
}
}
var sorted = new SortedSet<char>(set);
var map = new Dictionary<char, int>();
int index = 0;
foreach (var item in sorted)
{
map.Add(item, index);
index++;
}
return map;
}
public static Dictionary<char, int> staticMap = new Dictionary<char, int>();
/// <summary>
/// Leetcode 208. Implement Trie (Prefix Tree)
/// https://leetcode.com/problems/implement-trie-prefix-tree/discuss/703925/c-trie-algorithm-practice-in-june-2020
/// Actionable Items:
/// 1. Use a map to help to define n-ary tree
/// 2. Go over csv file once to get all distinct chars first before Trie class is called.
/// Lessons learned:
/// 1. Came cross out-of-memory to handle usagoals.com, moved map to static scope called staticMap
/// </summary>
public class Trie
{
private Trie[] children;
private string word;
/** Initialize your data structure here. */
public Trie()
{
if (staticMap == null || staticMap.Count == 0)
return;
children = new Trie[staticMap.Count];
}
/** Inserts a word into the trie. */
public void Insert(string word)
{
if (word == null || word.Length == 0)
return;
var root = this;
foreach (var item in word)
{
var index = staticMap[item];
if (root.children[index] == null)
{
root.children[index] = new Trie();
}
root = root.children[index];
}
root.word = word;
}
/** Returns if the word is in the trie. */
public bool Search(string word)
{
if (word == null || word.Length == 0)
return false;
var root = this;
foreach (var item in word)
{
int index = staticMap[item];
if (root.children[index] == null)
{
return false;
}
root = root.children[index];
}
return root.word != null;
}
/// <summary>
/// Returns if there is any word in the trie that starts with the given prefix.
/// Only allow prefix with length bigger than 2
/// </summary>
/// <param name="prefix"></param>
/// <returns></returns>
public bool StartsWith(string prefix)
{
if (prefix == null || prefix.Length < 3)
return false;
var root = this;
foreach (var item in prefix)
{
var index = staticMap[item];
if (root.children[index] == null)
{
return false;
}
root = root.children[index];
}
return true;
}
/// <summary>
/// apply BFS - should be better than DFS
/// </summary>
/// <param name="prefix"></param>
/// <returns></returns>
public List<string> StartsWithFound(string prefix)
{
if (prefix == null || prefix.Length == 0)
return new List<string>();
var root = this;
var found = new List<string>();
foreach (var item in prefix)
{
var index = staticMap[item];
root = root.children[index];
}
var queue = new Queue<Trie>();
queue.Enqueue(root);
while (queue.Count > 0)
{
var queueSize = queue.Count;
for(int node = 0; node < queueSize ; node++)
{
var trie = queue.Dequeue();
// This goes first!
if (trie.word != null)
{
found.Add(trie.word);
}
var size = staticMap.Count;
for (int index = 0; index < size; index++)
{
if (trie.children[index] == null)
{
continue;
}
queue.Enqueue(trie.children[index]);
}
}
}
return found;
}
}
/**
* Your Trie object will be instantiated and called as such:
* Trie obj = new Trie();
* obj.Insert(word);
* bool param_2 = obj.Search(word);
* bool param_3 = obj.StartsWith(prefix);
*/
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment