Scraper In Nim Lang
#!/usr/bin/env nimcr
# website is https so compile with nim c -r --threads:on -d:ssl bizlist_scraper.nim
import httpclient, htmlparser, os, threadpool,
  system, strutils, re, strtabs, sequtils, xmltree
# create a directory for each industry
proc mkCategories() =
  for line in lines "all.txt":
    createDir("./home/categories/$#/companies" % line)
    echo "creating '$#' directory....\n" % line
  echo """
++++++++++++++++++++++++++++++
+++ Done Creating Folders! +++
++++++++++++++++++++++++++++++
"""
# ensure no captcha
proc ensureNoCaptcha(response: string) =
  if "CaptchaScode" in response:
    echo "STOP | Too Many Requests From This IP"
    quit(QuitFailure)
# request each url
proc retrieve(url, filename, proxy: string) =
  let browser = newHttpClient()
  let Googlebot = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
  browser.headers = newHttpHeaders({"User-agent": Googlebot})
  let response = browser.request(url)
  ensureNoCaptcha(response.body)
  writeFile(filename, response.body)
  sleep(20000)
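# NOTE: the `proxy` parameter above is accepted but never used. httpclient can
# route requests through a proxy if one is passed at construction time. A minimal
# sketch, assuming `proxy` held a real proxy URL (the "proxy1" placeholder used
# below would have to be replaced with something like "http://127.0.0.1:8080"):
#
#   let browser = newHttpClient(proxy = newProxy(proxy))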
# get pages in each industry category asynchronously
proc getEachIndustry() =
  # define proxy
  let proxy = "proxy1"
  for eachIndustry in lines "all.txt":
    # getMaxPageNumber(eachCategory)
    for num in 1..10:
      let url = "https://www.businesslist.com.ng/category/$#/$#/state:lagos" % [eachIndustry.replace(" ", "-"), $num]
      let filename = "./home/categories/$#/$#.html" % [eachIndustry, $num]
      if existsFile(filename):
        echo "skipping this {$#} page $#" % [eachIndustry, $num]
        continue
      echo "downloading... { $# } page $#" % [eachIndustry, $num]
      spawn retrieve(url, filename, proxy)
  # wait for all spawned downloads to finish before the company pages are parsed
  sync()
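# The call commented out above hints at a getMaxPageNumber helper instead of the
# hard-coded 1..10 range. A hypothetical sketch, assuming the first listing page
# links to the other pages with the same /category/<name>/<n>/... URLs built above
# (the selector and link pattern are guesses, not taken from the site):
#
#   proc getMaxPageNumber(category: string): int =
#     result = 1
#     let browser = newHttpClient()
#     let firstPage = parseHtml(browser.getContent(
#       "https://www.businesslist.com.ng/category/$#/1/state:lagos" % category))
#     for link in firstPage.findAll("a"):
#       if link.attrs != nil:
#         var pageNum: array[1, string]
#         if link.attrs.getOrDefault("href").match(re"/category/[^/]+/(\d+)", pageNum):
#           result = max(result, parseInt(pageNum[0]))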
# get each company page
proc getEachCompany() =
  let proxy = "proxy1"
  for eachCategory in walkDirs("./home/categories/*"):
    for eachPage in walkFiles("$#/*.html" % eachCategory):
      let loadedPage = loadHtml eachPage
      for eachCompany in loadedPage.findAll("h4"):
        # each listing <h4> is expected to wrap an <a href="/company/..."> link
        if eachCompany.len == 0 or eachCompany[0].kind != xnElement or eachCompany[0].attrs == nil:
          continue
        let companyUrl = eachCompany[0].attrs.getOrDefault("href")
        if companyUrl.len > 0:
          let url = "https://businesslist.com.ng$#" % [companyUrl]
          let companyPath = companyUrl.replace(re"/company/\d+", "")
          let filename = "$#/companies$#.html" % [eachCategory, companyPath]
          if existsFile(filename):
            echo "skipping this $#" % [companyPath]
            continue
          echo "downloading... { $# }" % [companyPath]
          spawn retrieve(url, filename, proxy)
  sync()
# extract necessary details
# proc extractData() =
#   for line in lines "all.txt":
#     for eachCompany in walkDirs("home/categories/$#/companies/*" % line):
#       echo eachCompany
mkCategories()
getEachIndustry()
getEachCompany()
# extractData()
# writeToCSVFile()
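# A minimal sketch of the extraction step left commented out above, assuming only
# that every saved company page has a <title> element; the fields actually worth
# keeping (address, phone, ...) depend on the site's markup, which is not shown
# here, so the CSV columns below are placeholders:
#
#   proc extractData() =
#     let csv = open("companies.csv", fmWrite)
#     csv.writeLine("category,page_title,source_file")
#     for line in lines "all.txt":
#       for companyPage in walkFiles("./home/categories/$#/companies/*.html" % line):
#         let page = loadHtml(companyPage)
#         for title in page.findAll("title"):
#           csv.writeLine("$#,$#,$#" % [line, title.innerText.strip(), companyPage])
#           break
#     csv.close()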