Scraper In Nim Lang
#!/usr/bin/env nimcr
# website is https so compile with nim c -r --threads:on -d:ssl bizlist_scraper.nim
import httpclient, htmlparser, os, threadpool,
  system, strutils, re, strtabs, sequtils, xmltree
# create a directory for each industry
proc mkCategories() =
  for line in lines "all.txt":
    createDir("./home/categories/$#/companies" % line)
    echo "creating '$#' directory....\n" % line
  echo """
++++++++++++++++++++++++++++++
+++ Done Creating Folders! +++
++++++++++++++++++++++++++++++
"""
# ensure no captcha
proc ensureNoCaptcha(response: string) =
  if "CaptchaScode" in response:
    echo "STOP | Too Many Requests From This IP"
    quit(QuitFailure)
# request each url
proc retrieve(url, filename, proxy: string) =
  let browser = newHttpClient()
  let Googlebot = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
  browser.headers = newHttpHeaders({"User-agent": Googlebot})
  let response = browser.request(url)
  ensureNoCaptcha(response.body)
  writeFile(filename, response.body)
  sleep(20000)
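# NOTE: the `proxy` parameter above is accepted but never used. httpclient can
# route requests through a proxy if one is passed at construction time. A minimal
# sketch, assuming `proxy` held a real proxy URL (the "proxy1" placeholder used
# below would have to be replaced with something like "http://127.0.0.1:8080"):
#
#   let browser = newHttpClient(proxy = newProxy(proxy))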
# get pages in each industry category asynchronously
proc getEachIndustry() =
  # define proxy
  let proxy = "proxy1"
  for eachIndustry in lines "all.txt":
    # getMaxPageNumber(eachCategory)
    for num in 1..10:
      let url = "https://www.businesslist.com.ng/category/$#/$#/state:lagos" % [eachIndustry.replace(" ", "-"), $num]
      let filename = "./home/categories/$#/$#.html" % [eachIndustry, $num]
      if existsFile(filename):
        echo "skipping this {$#} page $#" % [eachIndustry, $num]
        continue
      echo "downloading... { $# } page $#" % [eachIndustry, $num]
      spawn retrieve(url, filename, proxy)
  # wait for all spawned downloads to finish before the company pages are parsed
  sync()
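# The call commented out above hints at a getMaxPageNumber helper instead of the
# hard-coded 1..10 range. A hypothetical sketch, assuming the first listing page
# links to the other pages with the same /category/<name>/<n>/... URLs built above
# (the selector and link pattern are guesses, not taken from the site):
#
#   proc getMaxPageNumber(category: string): int =
#     result = 1
#     let browser = newHttpClient()
#     let firstPage = parseHtml(browser.getContent(
#       "https://www.businesslist.com.ng/category/$#/1/state:lagos" % category))
#     for link in firstPage.findAll("a"):
#       if link.attrs != nil:
#         var pageNum: array[1, string]
#         if link.attrs.getOrDefault("href").match(re"/category/[^/]+/(\d+)", pageNum):
#           result = max(result, parseInt(pageNum[0]))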
# get each company page
proc getEachCompany() =
  let proxy = "proxy1"
  for eachCategory in walkDirs("./home/categories/*"):
    for eachPage in walkFiles("$#/*.html" % eachCategory):
      let loadedPage = loadHtml eachPage
      for eachCompany in loadedPage.findAll("h4"):
        # each listing <h4> is expected to wrap an <a href="/company/..."> link
        if eachCompany.len == 0 or eachCompany[0].kind != xnElement or eachCompany[0].attrs == nil:
          continue
        let companyUrl = eachCompany[0].attrs.getOrDefault("href")
        if companyUrl.len > 0:
          let url = "https://businesslist.com.ng$#" % [companyUrl]
          let companyPath = companyUrl.replace(re"/company/\d+", "")
          let filename = "$#/companies$#.html" % [eachCategory, companyPath]
          if existsFile(filename):
            echo "skipping this $#" % [companyPath]
            continue
          echo "downloading... { $# }" % [companyPath]
          spawn retrieve(url, filename, proxy)
  sync()
# extract necessary details
# proc extractData() =
#   for line in lines "all.txt":
#     for eachCompany in walkDirs("home/categories/$#/companies/*" % line):
#       echo eachCompany
mkCategories()
getEachIndustry()
getEachCompany()
# extractData()
# writeToCSVFile()
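# A minimal sketch of the extraction step left commented out above, assuming only
# that every saved company page has a <title> element; the fields actually worth
# keeping (address, phone, ...) depend on the site's markup, which is not shown
# here, so the CSV columns below are placeholders:
#
#   proc extractData() =
#     let csv = open("companies.csv", fmWrite)
#     csv.writeLine("category,page_title,source_file")
#     for line in lines "all.txt":
#       for companyPage in walkFiles("./home/categories/$#/companies/*.html" % line):
#         let page = loadHtml(companyPage)
#         for title in page.findAll("title"):
#           csv.writeLine("$#,$#,$#" % [line, title.innerText.strip(), companyPage])
#           break
#     csv.close()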