Search CEC-approved solar inverters on Google and download their datasheets (PDFs) for combined LLM dataset analysis
# https://cer.gov.au/node/4571
# Script to import a CSV file and, starting from the second row,
# combine the first three columns into a single search string.
# For each combined string, search Google for that term and keep only PDF results.
# Print the results and download the PDFs to a specific folder.
# USAGE: python3 ./search-for-device-capabilities.py
# pip install pandas googlesearch-python requests
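#
# Illustrative example of the expected input and the query built from it.
# The column names and values below are assumptions for illustration only,
# not taken from the actual CER CSV export:
#
#   Manufacturer,Model,Series,...
#   Fronius,Primo 5.0-1,GEN24,...
#
# The first data row above would be combined into the search term
# "Fronius Primo 5.0-1 GEN24", which is then searched with " filetype:pdf" appended.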
import pandas as pd
import os
import requests
import re
from googlesearch import search
# Track queries that have already been searched so duplicates are skipped
query_list = []
def search_pdf(query):
    # Use googlesearch-python to search for PDFs
    try:
        results = []
        for url in search(query + " filetype:pdf", num_results=10):
            if url.endswith('.pdf'):
                results.append(url)
        return results
    except Exception as e:
        print(f"An error occurred during the search: {e}")
        return []
def sanitize_filename(filename):
    # Replace invalid filename characters with an underscore
    return re.sub(r'[\/:*?"<>|]', '_', filename)
def download_pdf(url, folder, filename):
    # Skip the download if the file already exists on disk
    file_path = os.path.join(folder, filename)
    if os.path.exists(file_path):
        print(f"Skipping {filename}, already downloaded.")
        return
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
def combine_columns_and_search(csv_file, download_folder):
    # Read the CSV file starting from the second row
    df = pd.read_csv(csv_file, skiprows=1)
    # Ensure the dataframe has at least 3 columns
    if df.shape[1] < 3:
        print("The CSV file must have at least 3 columns.")
        return
    # Combine the first three columns into a single string
    df['combined'] = df[df.columns[0]].astype(str) + ' ' + df[df.columns[1]].astype(str) + ' ' + df[df.columns[2]].astype(str)
    # Ensure the download folder exists
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    # Perform a Google search for each combined string and collect PDF results
    for index, row in df.iterrows():
        query = row['combined']
        print(f"Searching for: {query}")
        if query in query_list:
            continue
        query_list.append(query)
        pdf_results = search_pdf(query)
        if pdf_results:
            print(f"PDF results for {query}:")
            for i, result in enumerate(pdf_results):
                print(result)
                # Generate a sanitized filename for each PDF
                filename = sanitize_filename(f"{query.replace(' ', '_')}_{i+1}.pdf")
                download_pdf(result, download_folder, filename)
        else:
            print(f"No PDF results found for {query}")
if __name__ == "__main__":
    csv_file_path = '/workspaces/der-services-platform/apps/xtdb-data-imports/device-model/cec-approved-inverters-20240930.csv'
    download_folder_path = '/workspaces/der-services-platform/apps/xtdb-data-imports/device-model/datasheets'
    combine_columns_and_search(csv_file_path, download_folder_path)