Skip to content

Instantly share code, notes, and snippets.

@vinodjayachandran
Created June 26, 2021 15:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vinodjayachandran/30d83e480e9fcd262f69e8e118280c6a to your computer and use it in GitHub Desktop.
Save vinodjayachandran/30d83e480e9fcd262f69e8e118280c6a to your computer and use it in GitHub Desktop.
Crawl Static HTML and Save it to File
from pathlib import Path
from urllib.request import Request, urlopen
class Crawl:
    """Download static HTML pages and store them on disk."""

    @staticmethod
    def fetch(outputDirectory, url):
        """Download ``url`` and save its body as ``<name>.html``.

        ``<name>`` is the last path segment of ``url``; trailing slashes are
        ignored so that ``https://host/docs/`` is stored as ``docs.html``
        instead of an unnamed ``.html`` file. If no segment is left at all,
        ``index`` is used.

        Args:
            outputDirectory: Directory (``str`` or path-like) that receives
                the file. It is created, including parents, when missing.
            url: Address of the page to download. The body is decoded as
                UTF-8.

        Returns:
            The decoded HTML text on success, or ``None`` when the crawl
            failed. On failure the URL is appended to
            ``<outputDirectory>/failed.txt`` so it can be retried later.
        """
        try:
            outputPath = Path(outputDirectory)
            # Create the output directory if it does not exist yet.
            outputPath.mkdir(parents=True, exist_ok=True)

            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            # Read and decode *before* touching the disk: a failed download
            # must not leave an empty or truncated .html file behind.
            with urlopen(req) as response:
                htmlContent = response.read().decode("utf-8")

            fileName = url.rstrip("/").rsplit("/", 1)[-1] or "index"
            htmlPath = outputPath / f"{fileName}.html"
            # Explicit encoding: the platform default may be unable to
            # represent characters that were just decoded from UTF-8.
            with open(htmlPath, 'w', encoding="utf-8") as htmlFile:
                htmlFile.write(htmlContent)

            print(f"{url} crawled successfully and saved at {htmlPath}")
            return htmlContent
        except Exception as e:
            # Deliberately broad: one bad URL must not abort a whole crawl.
            print(f'Crawl for {url} has failed: {e}')
            Crawl._recordFailure(outputDirectory, url)
            return None

    @staticmethod
    def _recordFailure(outputDirectory, url):
        """Append ``url`` to ``failed.txt`` without ever raising."""
        try:
            outputPath = Path(outputDirectory)
            # The failure may have happened before the directory was created.
            outputPath.mkdir(parents=True, exist_ok=True)
            with open(outputPath / "failed.txt", 'a', encoding="utf-8") as errorFile:
                errorFile.write(f"{url}\n")
        except (OSError, TypeError) as e:
            print(f'Could not record failed URL {url}: {e}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment