Skip to content

Instantly share code, notes, and snippets.

@kamipatel
Last active May 26, 2022 16:09
Show Gist options
  • Save kamipatel/92e997d6580bea43a8d7b19a70bc3b74 to your computer and use it in GitHub Desktop.
Save kamipatel/92e997d6580bea43a8d7b19a70bc3b74 to your computer and use it in GitHub Desktop.
meta data scrap
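'''
Author: Kam & Stu
Setup: TBD. Judging by the imports below, running this script requires the
requests, beautifulsoup4, pandas, selenium and webdriver-manager packages,
plus a local Chrome installation for the headless browser.
'''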
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
URL = "https://developer.salesforce.com/docs/metadata-coverage/55"

# Tooltip texts that do not name a channel directly are folded into the
# column they actually describe.
LABEL_ALIASES = {
    'Classic Packaging': '1 GP Managed Packaging',
    'Applies only to Classic managed packages.': '1 GP Managed Packaging',
    'Requires package without a namespace.': 'Unlocked Packaging',
}


def support_label(tooltip):
    """Return the column label encoded in a coverage cell's tooltip text.

    If the tooltip contains ``"in "``, the label is whatever follows its
    first occurrence, stripped of surrounding whitespace; otherwise the
    whole tooltip is used unchanged. Known alternative wordings are then
    mapped through ``LABEL_ALIASES``.

    Args:
        tooltip: The ``title`` attribute of a ``<span>``; may be ``None``
            or empty.

    Returns:
        The label string, or ``''`` when the tooltip is missing or empty.
    """
    if not tooltip:
        return ''
    marker = tooltip.find("in ")
    label = tooltip[marker + 3:].strip() if marker != -1 else tooltip
    return LABEL_ALIASES.get(label, label)


def build_coverage_frame(pairs):
    """Pivot ``(metadata_type, label)`` pairs into a coverage DataFrame.

    The result has one row per distinct metadata type (first-seen order),
    a ``MetadataType`` column, and one column per distinct label holding
    ``'Y'`` where that pair was seen and NaN elsewhere.

    Args:
        pairs: Iterable of ``(metadata_type, label)`` string tuples. Pairs
            with an empty metadata type or an empty label are ignored so
            that no blank row or blank-named column is created.

    Returns:
        A ``pandas.DataFrame``; it has no columns when no usable pair was
        supplied.
    """
    coverage = {}
    for metadata_type, label in pairs:
        if not metadata_type or not label:
            continue
        coverage.setdefault(metadata_type, {})[label] = 'Y'
    # Building the frame once from plain dicts replaces the per-row
    # DataFrame.append calls, which pandas removed in 2.0.
    rows = [
        {'MetadataType': metadata_type, **labels}
        for metadata_type, labels in coverage.items()
    ]
    return pd.DataFrame(rows)


def scrape_support_pairs(driver):
    """Yield ``(row id, support label)`` for every span in the coverage table.

    Must be fully consumed while ``driver`` is still open, because it reads
    attributes from live page elements.
    """
    table_body = driver.find_element(By.XPATH, '//*[@id="tableBody"]')
    # The leading "." keeps the search inside the table body; a bare "//tr"
    # is evaluated against the whole document even when called on an element.
    for table_row in table_body.find_elements(By.XPATH, './/tr'):
        metadata_type = table_row.get_attribute("id")
        if not metadata_type:
            continue
        for span in table_row.find_elements(By.TAG_NAME, 'span'):
            yield metadata_type, support_label(span.get_attribute("title"))


def main():
    """Scrape the metadata coverage page and write the result to meta.csv."""
    # Run Chrome without a visible window. The command-line switch is used
    # because the Options.headless setter is deprecated in recent Selenium.
    options = Options()
    options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(URL)
        # The table is rendered client-side; wait for a known row id to
        # appear before reading it.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "WorkflowSend"))
        )
        print(driver.title)
        df = build_coverage_frame(scrape_support_pairs(driver))
    finally:
        # quit() ends the chromedriver process as well as the window, and
        # runs even if the wait or the scrape raised.
        driver.quit()
    print(df.columns)
    df.to_csv('meta.csv')
    print("Done!")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment