@deepakness
Created July 27, 2023 15:41
Scrape the page titles and URLs from a Wikipedia search for "insects".
import csv

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://en.wikipedia.org"
SEARCH_URL = "https://en.wikipedia.org/wiki/Special:Search?search=insects&fulltext=Search&ns0=1"


def get_wikipedia_links(search_url):
    """Fetch a Special:Search results page and extract (title, URL) pairs."""
    r = requests.get(search_url)
    soup = BeautifulSoup(r.content, 'html.parser')

    links = []
    # Each search result heading is a <div class="mw-search-result-heading">
    # wrapping an <a> whose href is a relative article path.
    for div in soup.find_all('div', {'class': 'mw-search-result-heading'}):
        a = div.find('a')
        title = a.get_text()
        url = BASE_URL + a['href']
        links.append((title, url))
    return links


def save_to_csv(data, filename):
    """Write (title, URL) rows to a CSV file with a header row."""
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Title", "URL"])
        for row in data:
            writer.writerow(row)


def main():
    links = get_wikipedia_links(SEARCH_URL)
    save_to_csv(links, 'wikipedia_insects_and_bugs.csv')
    print("Data saved to 'wikipedia_insects_and_bugs.csv'.")


if __name__ == "__main__":
    main()
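
Special:Search returns only one page of results per request; MediaWiki's standard limit and offset query parameters step through the rest. A minimal sketch of paginating with the function above (the pages and per_page names are illustrative, and the assumption that later result pages share the same markup is untested here):

def get_all_links(pages=3, per_page=20):
    """Sketch: collect results across several pages by stepping the offset parameter."""
    all_links = []
    for page in range(pages):
        # limit/offset are standard Special:Search parameters; names above are illustrative.
        paged_url = f"{SEARCH_URL}&limit={per_page}&offset={page * per_page}"
        all_links.extend(get_wikipedia_links(paged_url))
    return all_links

Wikipedia's robot policy also asks scripts to identify themselves, so passing a descriptive User-Agent header to requests.get (for example, headers={"User-Agent": "insect-scraper/0.1 (you@example.com)"}) is worth adding before running this at any volume.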
@deepakness
More context about it here: https://untalkedseo.com/pseo-data-scraping/
