Skip to content

Instantly share code, notes, and snippets.

@jfmaes
Created April 26, 2023 07:35
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jfmaes/e95a4ee60dbad163654b8e72d863240c to your computer and use it in GitHub Desktop.
Save jfmaes/e95a4ee60dbad163654b8e72d863240c to your computer and use it in GitHub Desktop.
MSDN function definition scraper. Requires the Chromium driver (chromedriver).
import argparse
import selenium as se
from selenium import webdriver
from bs4 import BeautifulSoup
import time
def main():
parser = argparse.ArgumentParser(description = 'extract function definitions from MSDN')
parser.add_argument('--methods', help='list of methods',required=True)
args = parser.parse_args()
#methods_list = []
if args.methods:
methods_list = args.methods.split(',')
#needs chrome or firefox because javascript loads the actual results, requests doesnt deal in that sh*t
options = se.webdriver.ChromeOptions()
options.add_argument('headless')
options.add_experimental_option('excludeSwitches', ['enable-logging'])
browser = se.webdriver.Chrome(options=options)
#Get the search query from the user
for method in methods_list:
search_query = method
#Create the request for the MSDN search page
url = f'https://docs.microsoft.com/en-us/search/?terms={search_query}'
browser.get(url)
time.sleep(100/1000)
#Parse the MSDN search page
soup = BeautifulSoup(browser.page_source, 'html.parser')
#only interested in the top search result. You might need to tweak this if you want more robust searches.
function_page_link = soup.find('a', {"data-bi-name" : "searchItem.0"})
#Check if the function page link is found
if function_page_link:
#print("Checking {0} for function definition. ".format(function_page_link['href']))
#Create the request for the MSDN page of the function
browser.get(function_page_link['href'])
#Parse the MSDN page of the function
function_page_soup = BeautifulSoup(browser.page_source, 'html.parser')
#Find the syntax section of the MSDN page
function_definition = function_page_soup.find('code').text
if function_definition:
#Print the function definition
print(function_definition)
else:
print(f'Function "{search_query}" not found')
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment