Created
March 3, 2021 06:59
-
-
Save sithart/d46e55f5e16ee34635109a5d7cdd68f6 to your computer and use it in GitHub Desktop.
website css, javascript, image links extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extract the JavaScript, CSS and image links referenced by a web page.

Usage:
    python <this_script> <url>

The page at <url> is downloaded and parsed, and three text files are written
to the current directory, one absolute URL per line:

    javascript_files.txt  -- ``src`` of every ``<script>`` tag
    css_files.txt         -- ``href`` of every ``<link rel="stylesheet">`` tag
    img_files.txt         -- ``src`` of every ``<img>`` tag
"""
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

# Present the request as coming from a regular desktop browser.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
)

# Seconds to wait for the server; requests applies no timeout unless one is
# passed explicitly, so without this a stalled server hangs the script.
REQUEST_TIMEOUT = 30


def _is_stylesheet(tag):
    """Return True when a ``<link>`` tag's ``rel`` contains ``stylesheet``."""
    rel = tag.attrs.get("rel") or []
    # html.parser yields ``rel`` as a list of tokens; accept a plain string
    # too so the check does not depend on the parser in use.
    if isinstance(rel, str):
        rel = rel.split()
    return any(token.lower() == "stylesheet" for token in rel)


def _collect_urls(soup, base_url, tag_name, attr_name, keep=None):
    """Return absolute URLs taken from ``attr_name`` of every ``tag_name`` tag.

    Tags whose attribute is missing or empty are skipped.  When ``keep`` is
    given, only tags for which ``keep(tag)`` is true are included.
    """
    urls = []
    for tag in soup.find_all(tag_name):
        value = tag.attrs.get(attr_name)
        if not value:
            continue
        if keep is not None and not keep(tag):
            continue
        urls.append(urljoin(base_url, value))
    return urls


def _write_lines(path, lines):
    """Write ``lines`` to ``path``, one per line, encoded as UTF-8."""
    # Explicit encoding: URLs may contain non-ASCII characters that the
    # platform default encoding cannot represent.
    with open(path, "w", encoding="utf-8") as f:
        for line in lines:
            print(line, file=f)


def main(argv=None):
    """Fetch the page named on the command line and write its asset links.

    ``argv`` is the list of command-line arguments without the program name;
    it defaults to ``sys.argv[1:]``.  Only the first argument (the page URL)
    is used.  Exits with an error message when no URL is given or when the
    page cannot be fetched.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit(f"usage: {sys.argv[0]} <url>")
    url = args[0]

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        session.headers["User-Agent"] = USER_AGENT
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            sys.exit(f"error: could not fetch {url}: {exc}")
        html = response.content
        # Resolve relative links against the URL that was actually served,
        # which differs from ``url`` when the request was redirected.
        base_url = response.url

    if not response.ok:
        # Still extract what the page contains, but do not do so silently.
        print(
            f"warning: {base_url} returned HTTP {response.status_code}",
            file=sys.stderr,
        )

    soup = bs(html, "html.parser")

    script_files = _collect_urls(soup, base_url, "script", "src")
    # <link> is also used for icons, canonical URLs, preloads, etc.; keep
    # only the tags that actually reference a stylesheet.
    css_files = _collect_urls(soup, base_url, "link", "href", keep=_is_stylesheet)
    img_files = _collect_urls(soup, base_url, "img", "src")

    print("Total script files in the page:", len(script_files))
    print("Total CSS files in the page:", len(css_files))
    print("Total IMG files in the page:", len(img_files))

    _write_lines("javascript_files.txt", script_files)
    _write_lines("css_files.txt", css_files)
    _write_lines("img_files.txt", img_files)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment