Skip to content

Instantly share code, notes, and snippets.

@AparnaKarve
Last active February 26, 2020 22:10
Show Gist options
  • Save AparnaKarve/4549370ba72c6d7c447b38ecbca4814a to your computer and use it in GitHub Desktop.
Save AparnaKarve/4549370ba72c6d7c447b38ecbca4814a to your computer and use it in GitHub Desktop.
import csv
from urllib.request import urlopen

from bs4 import BeautifulSoup
# Per-category scrape settings for the Ansible module index.
#
# Each category defines three constants:
#   <NAME>_URL   - the "list of <category> modules" documentation page
#   <NAME>_CSV   - the CSV file the scraped rows are written to
#   <NAME>_LABEL - the category label passed along with the URL and CSV path
_DOCS_ROOT = 'https://docs.ansible.com/ansible/latest/modules'
_CSV_DIR = './CSV'


def _listing_url(slug):
    """Return the documentation URL of the module list named *slug*."""
    return f"{_DOCS_ROOT}/list_of_{slug}_modules.html"


def _csv_path(stem):
    """Return the output CSV path for the category abbreviated *stem*."""
    return f"{_CSV_DIR}/{stem}scrape.csv"


CLOUD_URL = _listing_url('cloud')
CLOUD_CSV = _csv_path('cloud')
CLOUD_LABEL = 'Cloud Management'

CLUSTER_URL = _listing_url('clustering')
CLUSTER_CSV = _csv_path('cluster')
CLUSTER_LABEL = 'Cluster Management'

COMMAND_URL = _listing_url('commands')
COMMAND_CSV = _csv_path('command')
COMMAND_LABEL = 'Command/Shell'

CRYPTO_URL = _listing_url('crypto')
CRYPTO_CSV = _csv_path('crypto')
CRYPTO_LABEL = 'Cryptography'

DB_URL = _listing_url('database')
DB_CSV = _csv_path('db')
DB_LABEL = 'Database Management'

FILE_URL = _listing_url('files')
FILE_CSV = _csv_path('file')
FILE_LABEL = 'Files Management'

ID_URL = _listing_url('identity')
ID_CSV = _csv_path('id')
ID_LABEL = 'Identity/Information Security Information Management'

INV_URL = _listing_url('inventory')
INV_CSV = _csv_path('inv')
INV_LABEL = 'Ansible Inventory Management'

MQ_URL = _listing_url('messaging')
MQ_CSV = _csv_path('mq')
MQ_LABEL = 'Message Queue Management'

MON_URL = _listing_url('monitoring')
MON_CSV = _csv_path('mon')
MON_LABEL = 'Monitoring'

NT_URL = _listing_url('net_tools')
NT_CSV = _csv_path('nt')
NT_LABEL = 'Network Management'

ANW_URL = _listing_url('network')
ANW_CSV = _csv_path('anw')
ANW_LABEL = 'Networking'

NOT_URL = _listing_url('notification')
NOT_CSV = _csv_path('not')
NOT_LABEL = 'Notification Management'

PKG_URL = _listing_url('packaging')
PKG_CSV = _csv_path('pkg')
PKG_LABEL = 'Packaging/Installation'

RM_URL = _listing_url('remote_management')
RM_CSV = _csv_path('rm')
RM_LABEL = 'Remote Management'

SRC_URL = _listing_url('source_control')
SRC_CSV = _csv_path('src')
SRC_LABEL = 'Source Control Management'

STORE_URL = _listing_url('storage')
STORE_CSV = _csv_path('store')
STORE_LABEL = 'Storage Management'

SYS_URL = _listing_url('system')
SYS_CSV = _csv_path('sys')
SYS_LABEL = 'System Management'

UTIL_URL = _listing_url('utilities')
UTIL_CSV = _csv_path('util')
UTIL_LABEL = 'Utilities'

WEB_URL = _listing_url('web_infrastructure')
WEB_CSV = _csv_path('web')
WEB_LABEL = 'Web Infrastructure Management'

WIN_URL = _listing_url('windows')
WIN_CSV = _csv_path('win')
WIN_LABEL = 'Windows Management'
# Text that separates the module name from its description in a listing entry
# (space, en dash, space).
_ENTRY_SEPARATOR = " – "


def _listing_after(heading):
    """Return the tag the scraper treats as the module listing for *heading*.

    This is the fourth tag, in document order, among all descendants of the
    heading's parent element. Returns ``None`` when the parent has fewer than
    four descendant tags, so callers can skip the heading instead of failing
    with ``IndexError``.
    """
    descendants = heading.parent.find_all()
    return descendants[3] if len(descendants) > 3 else None


def _module_entries(listing):
    """Yield ``(module, description)`` for each ``std std-ref`` span in *listing*.

    The first child of each span is converted to text and split on the first
    " – " only, so a description that itself contains the separator is kept
    whole. An entry without the separator yields an empty description. Spans
    with no children are skipped.
    """
    for span in listing.find_all('span', href=False, class_='std std-ref'):
        if not span.contents:
            continue
        module, _, description = str(span.contents[0]).partition(_ENTRY_SEPARATOR)
        yield module, description


def download_and_save_module_categories(url, file, label):
    """Download a module-list page and write the modules it names to a CSV file.

    The page is parsed with BeautifulSoup's ``html.parser``. Two passes are
    made over the headings that carry neither an ``href`` nor a ``class``
    attribute:

    * ``<h1>`` headings: entries are written with an empty ``subclass_text``,
      and only when the listing element has exactly two children.
    * ``<h2>`` headings: entries are written with the heading's first child
      (as text) in ``subclass_text``.

    The file starts with the header row
    ``module,description,class_text,subclass_text``. Rows are produced by
    ``csv.writer`` with ``"\\n"`` line endings, so a field is quoted (and
    embedded quotes doubled) only when it contains a comma, quote or newline.

    Args:
        url: Address of the page to download.
        file: Path of the CSV file to create or overwrite, written as UTF-8.
            Its directory is not created here.
        label: Value written to the ``class_text`` column of every row.

    Returns:
        The raw bytes of the downloaded page.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as response:
        html_code = response.read()
    soup = BeautifulSoup(html_code, 'html.parser')

    # newline='' stops the text layer translating the explicit "\n" terminator.
    with open(file, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file, lineterminator='\n')
        writer.writerow(['module', 'description', 'class_text', 'subclass_text'])

        for heading in soup.find_all('h1', href=False, class_=''):
            listing = _listing_after(heading)
            # NOTE(review): the two-children test presumably separates pages
            # that list modules directly under the title from pages split
            # into <h2> subsections -- confirm against the live markup.
            if listing is None or len(listing.contents) != 2:
                continue
            for module, description in _module_entries(listing):
                writer.writerow([module, description, label, ''])

        for heading in soup.find_all('h2', href=False, class_=''):
            listing = _listing_after(heading)
            if listing is None:
                continue
            sub_label = str(heading.contents[0]) if heading.contents else ''
            for module, description in _module_entries(listing):
                writer.writerow([module, description, label, sub_label])

    return html_code
# Scrape every module category in turn. Each job is the (page URL, CSV path,
# category label) triple handed to download_and_save_module_categories.
_SCRAPE_JOBS = (
    (CLOUD_URL, CLOUD_CSV, CLOUD_LABEL),
    (CLUSTER_URL, CLUSTER_CSV, CLUSTER_LABEL),
    (COMMAND_URL, COMMAND_CSV, COMMAND_LABEL),
    (CRYPTO_URL, CRYPTO_CSV, CRYPTO_LABEL),
    (DB_URL, DB_CSV, DB_LABEL),
    (FILE_URL, FILE_CSV, FILE_LABEL),
    (ID_URL, ID_CSV, ID_LABEL),
    (INV_URL, INV_CSV, INV_LABEL),
    (MQ_URL, MQ_CSV, MQ_LABEL),
    (MON_URL, MON_CSV, MON_LABEL),
    (NT_URL, NT_CSV, NT_LABEL),
    (ANW_URL, ANW_CSV, ANW_LABEL),
    (NOT_URL, NOT_CSV, NOT_LABEL),
    (PKG_URL, PKG_CSV, PKG_LABEL),
    (RM_URL, RM_CSV, RM_LABEL),
    (SRC_URL, SRC_CSV, SRC_LABEL),
    (STORE_URL, STORE_CSV, STORE_LABEL),
    (SYS_URL, SYS_CSV, SYS_LABEL),
    (UTIL_URL, UTIL_CSV, UTIL_LABEL),
    (WEB_URL, WEB_CSV, WEB_LABEL),
    (WIN_URL, WIN_CSV, WIN_LABEL),
)

for _page_url, _csv_target, _category in _SCRAPE_JOBS:
    download_and_save_module_categories(_page_url, _csv_target, _category)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment