# mheidari98/asnlookup_scraper.py

## asnlookup_scraper.py
import re
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# old version and not working now
def getCfCIDR():
    try:
        r = requests.get('https://asnlookup.com/organization/cloudflare')
        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
        soup = BeautifulSoup( r.text , "html5lib")  # 'html5lib' , 'html.parser'  'lxml'
        ASs = [ b.text for b in soup.find_all('b') if b.a ]
    except Exception as e:
        logging.error(f"Error to get Cloudflare ASN: {e}")

    ASs = ASs if ASs else ['AS13335']
    logging.debug(f"AS = {', '.join(ASs)}")

    CIDRs = []
    for AS in ASs :
        r = requests.get(f'https://asnlookup.com/asn/{AS}')
        soup = BeautifulSoup( r.text , "html5lib")
        res = [ li.text for li in soup.find_all('li') if re.search("<li><a href=\"/cidr/.*0/", li.decode()) ]
        logging.debug(f"AS {AS} \t=> CIDR = {len(res)}")
        CIDRs += res

    return CIDRs

def createDriver():
    """Build the Chrome WebDriver used by the scraping code.

    Creates ``ChromeOptions`` with the ``--enable-javascript`` argument,
    the ``enable-logging`` switch excluded, and ``headless`` set to True,
    then starts Chrome through ``./chromedriver.exe`` and calls
    ``implicitly_wait(3)`` on the new driver.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_experimental_option('excludeSwitches', ['enable-logging'])
    chrome_opts.add_argument("--enable-javascript")
    chrome_opts.headless = True

    browser = webdriver.Chrome(
        options=chrome_opts,
        executable_path="./chromedriver.exe",
    )
    browser.implicitly_wait(3)
    return browser

# Scrape Cloudflare's AS numbers and their CIDR entries with a headless
# browser, writing one file per AS plus a combined organization file.
#
# The driver is created outside the guarded lookup: every later step needs
# it, so a start-up failure should surface directly instead of turning into
# a NameError on `driver` further down.
driver = createDriver()
try:
    try:
        driver.get("https://asnlookup.com/organization/cloudflare")

        # this is just to ensure that the page is loaded
        time.sleep(5)

        html_doc = driver.page_source

        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
        soup = BeautifulSoup(html_doc, 'html5lib')  # 'html5lib' , 'html.parser'  'lxml'

        ASs = [b.text for b in soup.find_all('b') if b.a]
    except Exception as e:
        ASs = []
        print(f"Error to get Cloudflare ASN: {e}")

    # Fall back to the known AS both when the lookup raised and when the
    # page yielded no AS entries (same rule getCfCIDR applies).
    ASs = ASs if ASs else ['AS13335']

    print(f"AS = {', '.join(ASs)}")

    # Compiled once; same pattern as before, applied to each <li>'s markup.
    cidr_item = re.compile("<li><a href=\"/cidr/.*0/")

    CIDRs = []
    for AS in ASs:
        driver.get(f"https://asnlookup.com/asn/{AS}")
        # Fixed pause before reading the page source.
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, "html5lib")
        res = [li.text for li in soup.find_all('li') if cidr_item.search(li.decode())]
        print(f"AS {AS} \t=> CIDR = {len(res)}")

        # One file per AS.
        with open(f"Cloudflare_{AS}.txt", 'w') as f:
            f.write('\n'.join(res))

        CIDRs += res

    # Combined output across every AS.
    with open("Cloudflare_Organization.txt", 'w') as f:
        f.write('\n'.join(CIDRs))
finally:
    # quit() ends the whole WebDriver session; running it in finally
    # releases the browser even when scraping fails part-way.
    driver.quit()
	import re
	import time
	import requests
	from bs4 import BeautifulSoup
	from selenium import webdriver

	# Old requests-based version; per the original author it is not working now.
	# NOTE(review): this repeats the getCfCIDR definition found earlier in the file.
	def getCfCIDR():
		"""Collect Cloudflare CIDR entries from asnlookup.com using ``requests``.

		The organization page is fetched first to discover Cloudflare's AS
		numbers (the text of every ``<b>`` element that contains a link). If
		that lookup raises or finds nothing, ``AS13335`` is used as the
		fallback. Each AS page is then fetched and the text of every ``<li>``
		whose markup matches the ``/cidr/`` link pattern is collected.

		Returns:
			list: Text of every matching list item across all AS pages.

		Raises:
			Exceptions from ``requests.get`` or ``BeautifulSoup`` while
			fetching the per-AS pages are not caught here; only the
			organization lookup is guarded.
		"""
		# Imported locally: the file's visible import lines do not include
		# logging, so the logging calls below would otherwise raise NameError.
		import logging

		# Bound before the try so the fallback below still works when the
		# organization lookup raises.
		ASs = []
		try:
			r = requests.get('https://asnlookup.com/organization/cloudflare')
			# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
			soup = BeautifulSoup(r.text, "html5lib")  # 'html5lib' , 'html.parser' 'lxml'
			ASs = [b.text for b in soup.find_all('b') if b.a]
		except Exception as e:
			logging.error(f"Error to get Cloudflare ASN: {e}")

		ASs = ASs if ASs else ['AS13335']
		logging.debug(f"AS = {', '.join(ASs)}")

		# Compiled once; same pattern as before, applied to each <li>'s markup.
		cidr_item = re.compile("<li><a href=\"/cidr/.*0/")

		CIDRs = []
		for AS in ASs:
			r = requests.get(f'https://asnlookup.com/asn/{AS}')
			soup = BeautifulSoup(r.text, "html5lib")
			res = [li.text for li in soup.find_all('li') if cidr_item.search(li.decode())]
			logging.debug(f"AS {AS} \t=> CIDR = {len(res)}")
			CIDRs += res

		return CIDRs

	def createDriver():
		"""Build the Chrome WebDriver used by the scraping code.

		Creates ``ChromeOptions`` with the ``--enable-javascript`` argument,
		the ``enable-logging`` switch excluded, and ``headless`` set to True,
		then starts Chrome through ``./chromedriver.exe`` and calls
		``implicitly_wait(3)`` on the new driver.

		Returns:
			The configured ``webdriver.Chrome`` instance.
		"""
		chrome_opts = webdriver.ChromeOptions()
		chrome_opts.add_experimental_option('excludeSwitches', ['enable-logging'])
		chrome_opts.add_argument("--enable-javascript")
		chrome_opts.headless = True

		browser = webdriver.Chrome(
			options=chrome_opts,
			executable_path="./chromedriver.exe",
		)
		browser.implicitly_wait(3)
		return browser

	# Scrape Cloudflare's AS numbers and their CIDR entries with a headless
	# browser, writing one file per AS plus a combined organization file.
	# NOTE(review): this repeats the script section found earlier in the file.
	#
	# The driver is created outside the guarded lookup: every later step needs
	# it, so a start-up failure should surface directly instead of turning into
	# a NameError on `driver` further down.
	driver = createDriver()
	try:
		try:
			driver.get("https://asnlookup.com/organization/cloudflare")

			# this is just to ensure that the page is loaded
			time.sleep(5)

			html_doc = driver.page_source

			# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
			soup = BeautifulSoup(html_doc, 'html5lib')  # 'html5lib' , 'html.parser' 'lxml'

			ASs = [b.text for b in soup.find_all('b') if b.a]
		except Exception as e:
			ASs = []
			print(f"Error to get Cloudflare ASN: {e}")

		# Fall back to the known AS both when the lookup raised and when the
		# page yielded no AS entries (same rule getCfCIDR applies).
		ASs = ASs if ASs else ['AS13335']

		print(f"AS = {', '.join(ASs)}")

		# Compiled once; same pattern as before, applied to each <li>'s markup.
		cidr_item = re.compile("<li><a href=\"/cidr/.*0/")

		CIDRs = []
		for AS in ASs:
			driver.get(f"https://asnlookup.com/asn/{AS}")
			# Fixed pause before reading the page source.
			time.sleep(3)
			soup = BeautifulSoup(driver.page_source, "html5lib")
			res = [li.text for li in soup.find_all('li') if cidr_item.search(li.decode())]
			print(f"AS {AS} \t=> CIDR = {len(res)}")

			# One file per AS.
			with open(f"Cloudflare_{AS}.txt", 'w') as f:
				f.write('\n'.join(res))

			CIDRs += res

		# Combined output across every AS.
		with open("Cloudflare_Organization.txt", 'w') as f:
			f.write('\n'.join(CIDRs))
	finally:
		# quit() ends the whole WebDriver session; running it in finally
		# releases the browser even when scraping fails part-way.
		driver.quit()