Created
June 27, 2023 11:12
-
-
Save rithvikvibhu/c6a03be8b4b9a2773d48937e467edf19 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage
#
# Install dependencies:
#   $ pip install selenium beautifulsoup4
# Run:
#   $ python download-gtlds-applications.py
import json | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.chrome.service import Service as ChromeService | |
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeWebDriver | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.support.ui import WebDriverWait | |
from bs4 import BeautifulSoup | |
# Start Chrome via the chromedriver executable at ./chromedriver (the path is
# relative to the current working directory).
service = ChromeService(executable_path=r"./chromedriver")
driver = webdriver.Chrome(service=service)

# Load the application status listing.
driver.get("https://gtldresult.icann.org/application-result/applicationstatus")

# Take the href of the link whose text is "2" (presumably the pager link for
# page 2 -- verify against the live page) and replace its "/2?" segment with
# "/{}?" so that page_url.format(n) produces the address of page n.
page_url = (
    driver.find_element(By.LINK_TEXT, "2")
    .get_attribute("href")
    .replace("/2?", "/{}?")
)
print("placeholder page url:", page_url)
def go_to_page(page_no: int):
    """Navigate the shared browser to results page ``page_no``.

    The address is produced by filling the module-level ``page_url`` template
    with ``page_no``. It is printed first and then loaded with the
    module-level ``driver``.
    """
    target_url = page_url.format(page_no)
    print("[*] switching to page", target_url)
    driver.get(target_url)
# Output key -> CSS class of the detail-row <div> that holds "Label: value".
_DETAIL_FIELDS = (
    ('label', 'aLabel'),
    ('contact_name', 'applicantPrimaryContactName'),
    ('contact_email', 'applicantPrimaryContactEmail'),
    ('app_id', 'tasApplicationId'),
    ('app_status', 'currentStatusDisplayable'),
    ('eval_result', 'evaluationResultDisplayable'),
)


def _value_after_colon(node):
    """Return the text that follows the first ':' in ``node``'s text.

    Returns None when ``node`` is None (the div was not found). Everything
    after the first colon is kept, so values that themselves contain a colon
    are not cut short. If the text has no colon at all, the whole stripped
    text is returned instead of raising.
    """
    if node is None:
        return None
    text = node.text.strip()
    _, separator, value = text.partition(':')
    return value.strip() if separator else text


def get_data(driver: ChromeWebDriver):
    """Extract every application shown on the page currently loaded in ``driver``.

    The page source is parsed with BeautifulSoup. Each ``div`` with class
    ``mainInfo`` yields one record, combining the cells of its overview row
    with the labelled fields found inside the ``mainInfo`` div itself.

    Args:
        driver: WebDriver whose ``page_source`` is parsed.

    Returns:
        A list of dicts, one per ``mainInfo`` div, with the keys
        ``expand_details``, ``prioritization_number``, ``application_tld``,
        ``application_applicant``, ``country_code``, ``label``,
        ``contact_name``, ``contact_email``, ``app_id``, ``app_status``,
        ``eval_result`` and ``eval_result_link``. A detail field whose div is
        missing is None, and ``eval_result_link`` is None when the evaluation
        result div is missing or contains no link.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    res = []
    for app_div in soup.find_all('div', class_='mainInfo'):
        # Overview row: the sibling that precedes the mainInfo div's
        # great-grandparent; its <td> cells hold the summary columns.
        overview_row = app_div.parent.parent.parent.find_previous_sibling()
        overview_tds = overview_row.find_all('td')

        record = {
            'expand_details': overview_tds[0].a.text.strip(),
            'prioritization_number': overview_tds[1].text.strip(),
            'application_tld': overview_tds[2].a.text.strip(),
            'application_applicant': overview_tds[3].a.text.strip(),
            'country_code': overview_tds[4].text.strip(),
        }

        # Detail row: each field lives in its own div as "Label: value".
        for key, css_class in _DETAIL_FIELDS:
            record[key] = _value_after_colon(
                app_div.find('div', class_=css_class)
            )

        # The evaluation result may or may not carry a link; only read the
        # href when an <a> is actually present.
        eval_result_div = app_div.find('div', class_='evaluationResultDisplayable')
        eval_anchor = eval_result_div.a if eval_result_div is not None else None
        record['eval_result_link'] = (
            eval_anchor.get('href') if eval_anchor is not None else None
        )

        res.append(record)
    return res
# Last results page to fetch. Hard-coded; presumably the number of pages in
# the listing -- verify against the site before running.
LAST_PAGE = 56

applications = []

try:
    # Page 1: scrape whatever page the browser is showing when we get here.
    applications.extend(get_data(driver))

    # Pages 2..LAST_PAGE
    for i in range(2, LAST_PAGE + 1):
        print('Total applications:', len(applications))
        print('Page', i, '...')
        go_to_page(i)
        apps_from_page = get_data(driver)
        print('\t', len(apps_from_page), 'applications')
        applications.extend(apps_from_page)
finally:
    # Always end the browser session, even if a page fails to load or parse,
    # so no Chrome/chromedriver process is left running.
    driver.quit()

print('Total applications:', len(applications))

with open('applications.json', 'w') as f:
    json.dump(applications, f)
print('Wrote applications to file. Done.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment