Skip to content

Instantly share code, notes, and snippets.

@rithvikvibhu
Created June 27, 2023 11:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rithvikvibhu/c6a03be8b4b9a2773d48937e467edf19 to your computer and use it in GitHub Desktop.
Save rithvikvibhu/c6a03be8b4b9a2773d48937e467edf19 to your computer and use it in GitHub Desktop.
# Usage
#
# Install dependencies:
# $ pip install selenium beautifulsoup4
# Run:
# $ python download-gtlds-applications.py
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeWebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
# Start a Chrome session driven by the chromedriver binary found at
# ./chromedriver (relative to the current working directory).
service = ChromeService(executable_path=r"./chromedriver")
driver = webdriver.Chrome(service=service)

# Load the first results page; it is also where the pager links live.
driver.get("https://gtldresult.icann.org/application-result/applicationstatus")

# Take the href of the pager link whose text is "2" and swap its page number
# for a str.format placeholder, giving a URL template usable for any page.
second_page_link = driver.find_element(By.LINK_TEXT, "2")
second_page_href = second_page_link.get_attribute("href")
page_url = second_page_href.replace("/2?", "/{}?")
print("placeholder page url:", page_url)
def go_to_page(page_no: int):
    """Point the shared browser session at results page *page_no*.

    The destination is built by substituting *page_no* into the module-level
    ``page_url`` template, logged, and then loaded with ``driver.get``.
    """
    target_url = page_url.format(page_no)
    print("[*] switching to page", target_url)
    driver.get(target_url)
def _field_value(container, css_class):
    """Return the value part of the ``Label: value`` div with *css_class*.

    Looks up the first ``<div class=css_class>`` inside *container* and
    returns the stripped text that follows the FIRST colon, so values that
    themselves contain ':' are kept whole. If the text has no colon at all
    the whole stripped text is returned. Returns None when the div is absent.
    """
    div = container.find('div', class_=css_class)
    if div is None:
        return None
    text = div.text.strip()
    _label, separator, value = text.partition(':')
    return value.strip() if separator else text


def get_data(driver: ChromeWebDriver):
    """Extract one record per application from the page currently loaded.

    Parses ``driver.page_source`` and, for every ``<div class="mainInfo">``,
    combines the cells of the associated overview row with the labelled
    fields of the detail block.

    Args:
        driver: browser session whose current page should be parsed.

    Returns:
        A list of dicts with the keys ``expand_details``,
        ``prioritization_number``, ``application_tld``,
        ``application_applicant``, ``country_code``, ``label``,
        ``contact_name``, ``contact_email``, ``app_id``, ``app_status``,
        ``eval_result`` and ``eval_result_link``. Detail fields that are not
        present on the page are None.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    res = []
    # Each div with class "mainInfo" is the detail block of one application.
    for app_div in soup.find_all('div', class_='mainInfo'):
        # Overview row: the previous sibling of the element three levels above
        # the detail block (presumably the <tr> preceding the detail <tr> --
        # verify against the page markup if the site layout changes).
        overview_tds = app_div.parent.parent.parent.find_previous_sibling().find_all('td')

        # Detail row: the evaluation result may carry a link. Guard against a
        # div without an <a>, or an <a> without href, so one such row does not
        # abort the whole scrape.
        eval_result_link = None
        eval_result_div = app_div.find('div', class_='evaluationResultDisplayable')
        if eval_result_div is not None and eval_result_div.a is not None:
            eval_result_link = eval_result_div.a.get('href')

        res.append({
            'expand_details': overview_tds[0].a.text.strip(),
            'prioritization_number': overview_tds[1].text.strip(),
            'application_tld': overview_tds[2].a.text.strip(),
            'application_applicant': overview_tds[3].a.text.strip(),
            'country_code': overview_tds[4].text.strip(),
            'label': _field_value(app_div, 'aLabel'),
            'contact_name': _field_value(app_div, 'applicantPrimaryContactName'),
            'contact_email': _field_value(app_div, 'applicantPrimaryContactEmail'),
            'app_id': _field_value(app_div, 'tasApplicationId'),
            'app_status': _field_value(app_div, 'currentStatusDisplayable'),
            'eval_result': _field_value(app_div, 'evaluationResultDisplayable'),
            'eval_result_link': eval_result_link,
        })
    return res
# Scrape every results page and write the combined records to
# applications.json in the current working directory.

# Last page number to fetch. Hard-coded; presumably the site's page count when
# the script was written -- verify before re-running.
LAST_PAGE = 56

applications = []
try:
    # Page 1 (already loaded by the initial driver.get call)
    applications.extend(get_data(driver))

    # Pages 2-56
    for i in range(2, LAST_PAGE + 1):
        print('Total applications:', len(applications))
        print('Page', i, '...')
        go_to_page(i)
        apps_from_page = get_data(driver)
        print('\t', len(apps_from_page), 'applications')
        applications.extend(apps_from_page)
finally:
    # Always end the browser session, even when a page fails to load or
    # parse, so no Chrome/chromedriver process is left running.
    driver.quit()

print('Total applications:', len(applications))
with open('applications.json', 'w') as f:
    json.dump(applications, f)
print('Wrote applications to file. Done.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment