Last active
September 10, 2021 18:35
-
-
Save TNHSAesop/ce9aef04eb712d26b35bba01f9e15cdb to your computer and use it in GitHub Desktop.
Pseudo Code For How A Search Engine Might Crawl The Web (Written in C#)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Crawl loop: runs forever, handling one crawl target per iteration.
while (true)
{
    // retrieve the next crawl target from a database or other persistent data store
    string url = GetNextCrawlTarget();

    // A URL needs crawling when it has never been indexed, OR when it has been indexed
    // but has changed since the last crawl. (The previous condition AND-ed two negated
    // checks, so an indexed-but-updated URL was never re-crawled.)
    bool needsCrawl = !URLAlreadyIndexed(url) || URLUpdatedSinceLastCrawl(url);
    if (!needsCrawl)
    {
        // the crawl target does not need processing, so move on to the next one
        continue;
    }

    // crawl the url; if there are links present within the document, extract them
    // into a list of new potential crawl targets
    List<string> potentialCrawlTargetsList = CrawlURL(url);

    // check whether the potential crawl targets extracted from the document already exist.
    // If they do, discard them; if they don't, keep them for addition to the data store's
    // processing queue
    List<string> newCrawlTargetsList = DoCrawlTargetsExist(potentialCrawlTargetsList);

    // if the list of new crawl targets isn't empty, add the new crawl targets to the crawl queue
    if (newCrawlTargetsList.Count > 0)
    {
        AddCrawlTargets(newCrawlTargetsList);
    }

    // now that new potential crawl targets have been extracted, send the URL over to the indexing queue
    IndexURL(url);

    // crawl process for this URL is complete; the loop proceeds to the next iteration
}
// visit us on the web at https://tortoiseandharesoftware.com/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment