Created
July 2, 2021 00:47
-
-
Save jianminchen/d365ff4fd8dd785096e1116eec595ea3 to your computer and use it in GitHub Desktop.
Build a trie using csv file with million record import - test the code using Excel -> table -> filter -> startsWith
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Data; | |
using System.Diagnostics; | |
using System.IO; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
namespace csvReader | |
{ | |
class Program | |
{ | |
static void Main(string[] args) | |
{ | |
runTestcase(); | |
} | |
/// <summary> | |
/// Make sure that Trie class APIs are working. | |
/// StartsWithFound API - tested using two test cases, compared to Excel sheet -> table -> filter -> startWith results | |
/// </summary> | |
public static void runTestcase() | |
{ | |
// Get all chars for StaticMap variable first - Do not skip the step. | |
staticMap = ReadCsvFileSpecialChars(); | |
var trie = ReadCsvFileToTrie(); | |
var search = trie.Search("facebook.com"); | |
var search2 = trie.Search("facenama.com"); | |
var hasPrefix = trie.StartsWith("face"); | |
// check excel sheet - table - filter - starts with "fac"- 660 - face.be is shortest one | |
// starts with "face" - 226 | |
var keys = new string[]{ "fac", "face"}; | |
foreach (var key in keys) | |
{ | |
if (trie.StartsWith(key)) | |
{ | |
var found = trie.StartsWithFound(key); | |
if (key.CompareTo("fac") == 0) | |
{ | |
Debug.Assert(found.Count == 660); // verify using Excel -> table -> filter -> startWith | |
} | |
else if (key.CompareTo("face") == 0) | |
{ | |
Debug.Assert(found.Count == 226); // verify using Excel -> table -> filter -> startWith | |
} | |
} | |
} | |
} | |
/// <summary> | |
/// Open file using Code, and then it shows that delimiter is comma, | |
/// C# code reference | |
/// https://stackoverflow.com/questions/5282999/reading-csv-file-and-storing-values-into-an-array | |
/// </summary> | |
public static void readCsvFile() | |
{ | |
using (var reader = new StreamReader(@"C:\Users\jchen\Desktop\blogs\zenefits\top-1m.csv")) | |
{ | |
var domain = new List<string>(); | |
while (!reader.EndOfStream) | |
{ | |
var line = reader.ReadLine(); | |
var values = line.Split(','); | |
domain.Add(values[1]); | |
} | |
} | |
} | |
/// <summary> | |
/// It takes less than 20 seconds on my computer | |
/// | |
/// </summary> | |
/// <returns></returns> | |
public static Trie ReadCsvFileToTrie() | |
{ | |
var trie = new Trie(); | |
using (var reader = new StreamReader(@"C:\Users\jchen\Desktop\blogs\zenefits\top-1m.csv")) | |
{ | |
while (!reader.EndOfStream) | |
{ | |
var line = reader.ReadLine(); | |
var values = line.Split(','); | |
trie.Insert(values[1]); | |
} | |
} | |
return trie; | |
} | |
/// <summary> | |
/// figure out how many special chars | |
/// </summary> | |
/// <returns></returns> | |
public static Dictionary<char, int> ReadCsvFileSpecialChars() | |
{ | |
var set = new HashSet<char>(); | |
using (var reader = new StreamReader(@"C:\Users\jchen\Desktop\blogs\zenefits\top-1m.csv")) | |
{ | |
while (!reader.EndOfStream) | |
{ | |
var line = reader.ReadLine(); | |
var values = line.Split(','); | |
foreach(var item in values[1]) | |
set.Add(item); | |
} | |
} | |
var sorted = new SortedSet<char>(set); | |
var map = new Dictionary<char, int>(); | |
int index = 0; | |
foreach (var item in sorted) | |
{ | |
map.Add(item, index); | |
index++; | |
} | |
return map; | |
} | |
public static Dictionary<char, int> staticMap = new Dictionary<char, int>(); | |
/// <summary> | |
/// Leetcode 208. Implement Trie (Prefix Tree) | |
/// https://leetcode.com/problems/implement-trie-prefix-tree/discuss/703925/c-trie-algorithm-practice-in-june-2020 | |
/// Actionable Items: | |
/// 1. Use a map to help to define n-ary tree | |
/// 2. Go over csv file once to get all distinct chars first before Trie class is called. | |
/// Lessons learned: | |
/// 1. Came cross out-of-memory to handle usagoals.com, moved map to static scope called staticMap | |
/// </summary> | |
public class Trie | |
{ | |
private Trie[] children; | |
private string word; | |
/** Initialize your data structure here. */ | |
public Trie() | |
{ | |
if (staticMap == null || staticMap.Count == 0) | |
return; | |
children = new Trie[staticMap.Count]; | |
} | |
/** Inserts a word into the trie. */ | |
public void Insert(string word) | |
{ | |
if (word == null || word.Length == 0) | |
return; | |
var root = this; | |
foreach (var item in word) | |
{ | |
var index = staticMap[item]; | |
if (root.children[index] == null) | |
{ | |
root.children[index] = new Trie(); | |
} | |
root = root.children[index]; | |
} | |
root.word = word; | |
} | |
/** Returns if the word is in the trie. */ | |
public bool Search(string word) | |
{ | |
if (word == null || word.Length == 0) | |
return false; | |
var root = this; | |
foreach (var item in word) | |
{ | |
int index = staticMap[item]; | |
if (root.children[index] == null) | |
{ | |
return false; | |
} | |
root = root.children[index]; | |
} | |
return root.word != null; | |
} | |
/// <summary> | |
/// Returns if there is any word in the trie that starts with the given prefix. | |
/// Only allow prefix with length bigger than 2 | |
/// </summary> | |
/// <param name="prefix"></param> | |
/// <returns></returns> | |
public bool StartsWith(string prefix) | |
{ | |
if (prefix == null || prefix.Length < 3) | |
return false; | |
var root = this; | |
foreach (var item in prefix) | |
{ | |
var index = staticMap[item]; | |
if (root.children[index] == null) | |
{ | |
return false; | |
} | |
root = root.children[index]; | |
} | |
return true; | |
} | |
/// <summary> | |
/// apply BFS - should be better than DFS | |
/// </summary> | |
/// <param name="prefix"></param> | |
/// <returns></returns> | |
public List<string> StartsWithFound(string prefix) | |
{ | |
if (prefix == null || prefix.Length == 0) | |
return new List<string>(); | |
var root = this; | |
var found = new List<string>(); | |
foreach (var item in prefix) | |
{ | |
var index = staticMap[item]; | |
root = root.children[index]; | |
} | |
var queue = new Queue<Trie>(); | |
queue.Enqueue(root); | |
while (queue.Count > 0) | |
{ | |
var queueSize = queue.Count; | |
for(int node = 0; node < queueSize ; node++) | |
{ | |
var trie = queue.Dequeue(); | |
// This goes first! | |
if (trie.word != null) | |
{ | |
found.Add(trie.word); | |
} | |
var size = staticMap.Count; | |
for (int index = 0; index < size; index++) | |
{ | |
if (trie.children[index] == null) | |
{ | |
continue; | |
} | |
queue.Enqueue(trie.children[index]); | |
} | |
} | |
} | |
return found; | |
} | |
} | |
/** | |
* Your Trie object will be instantiated and called as such: | |
* Trie obj = new Trie(); | |
* obj.Insert(word); | |
* bool param_2 = obj.Search(word); | |
* bool param_3 = obj.StartsWith(prefix); | |
*/ | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment