Gist by @pyxn, created June 5, 2023
https://gist.github.com/pyxn/82b80e5ada6b34f5a790b1f508f46fb2

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm


def get_description(url):
    """Fetch a page and return the text of the <p> that follows its "Description" heading."""
    try:
        # Time out rather than hang indefinitely on an unresponsive server.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        # Locate <h4>Description</h4> and take the paragraph immediately after it.
        description_tag = soup.find("h4", string="Description").find_next_sibling("p")
        return description_tag.text
    except Exception as e:
        # Covers network errors and pages without a "Description" heading
        # (find() returns None there, so the chained call raises AttributeError).
        print(f"Error while crawling '{url}': {e}")
        return None
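
# Quick sanity check of the parsing chain above (a hypothetical, self-contained
# snippet, not part of the original gist): the same find()/find_next_sibling()
# calls applied to an inline HTML fragment.
#
#   html = "<h4>Description</h4><p>A short product blurb.</p>"
#   soup = BeautifulSoup(html, "html.parser")
#   soup.find("h4", string="Description").find_next_sibling("p").text
#   # -> 'A short product blurb.'
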
def main():
    input_file = 'urls.csv'
    output_file = 'output.csv'

    # Read URLs from the input CSV file (expects a 'url' column).
    urls_df = pd.read_csv(input_file)

    # Extract the description for each URL, with a tqdm progress bar.
    urls_df['description'] = [get_description(url) for url in tqdm(urls_df['url'], desc="Crawling URLs")]

    # Save the result to the output CSV file.
    urls_df.to_csv(output_file, index=False)


if __name__ == "__main__":
    main()
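
For context, a minimal sketch of the input the script expects. The 'url' column name comes straight from the code; the file contents below are hypothetical examples:

import pandas as pd

# Build a sample urls.csv containing the 'url' column the script reads.
pd.DataFrame({
    "url": [
        "https://example.com/product/1",  # hypothetical URLs
        "https://example.com/product/2",
    ]
}).to_csv("urls.csv", index=False)

After a run, output.csv holds the same rows plus a 'description' column; rows whose pages failed to load or lacked a "Description" heading get None from get_description, which pandas writes out as an empty field.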