Created
October 1, 2024 23:00
-
-
Save chrismckelt/28b5f92bb062fcf21636a0d0b1c0fc96 to your computer and use it in GitHub Desktop.
Search Google for CEC solar inverters and download their datasheets for combined LLM dataset analysis.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://cer.gov.au/node/4571
#
# Script that imports a CSV file and, starting from the second row,
# combines the first three columns of each row into a single string.
# Each combined string is used as a Google search term, and only PDF
# results are returned.
# The results are printed and the PDFs are downloaded to a specific folder.
#
# USAGE: python3 ./search-for-device-capabilities.py
# Requires: pip install pandas requests googlesearch-python
import pandas as pd | |
import os | |
import requests | |
import re | |
from googlesearch import search | |
# Queries that have already been searched in this process. It is consulted
# before each search so that a repeated query is only looked up once.
query_list = []
def _is_pdf_url(url):
    """Return True if the path part of ``url`` ends in ``.pdf`` (any case)."""
    # Strip the fragment and query string first so that links such as
    # ".../datasheet.pdf?v=2" are still recognised as PDFs.
    path = url.split("#", 1)[0].split("?", 1)[0]
    return path.lower().endswith(".pdf")


def search_pdf(query):
    """Search Google for PDF documents matching ``query``.

    The query is suffixed with ``filetype:pdf`` and up to 10 results are
    requested. Only URLs whose path ends in ``.pdf`` are kept; the check is
    case-insensitive and ignores any query string or fragment.

    Args:
        query: The search term.

    Returns:
        A list of PDF URLs in the order the search produced them. An empty
        list is returned when the search raises.
    """
    try:
        return [
            url
            for url in search(query + " filetype:pdf", num_results=10)
            if _is_pdf_url(url)
        ]
    except Exception as e:
        # Deliberately best-effort: a failed search for one query is reported
        # and treated as "no results" so the caller can carry on.
        print(f"An error occurred during the search: {e}")
        return []
def sanitize_filename(filename):
    """Return ``filename`` with characters unsafe for file names replaced.

    Every backslash, forward slash, colon, asterisk, question mark, double
    quote, angle bracket and pipe is replaced with an underscore. All other
    characters are left untouched.

    Args:
        filename: The proposed file name.

    Returns:
        The sanitized file name.
    """
    # The backslash must be doubled inside the character class: a single
    # "\/" is only an escaped "/", which would leave backslashes in place.
    return re.sub(r'[\\/:*?"<>|]', '_', filename)
def download_pdf(url, folder, filename, timeout=30):
    """Download ``url`` and save it as ``filename`` inside ``folder``.

    Nothing is downloaded when the target file already exists. Request
    failures (connection errors, timeouts, non-2xx responses) are reported
    on stdout and no file is created.

    Args:
        url: Address of the PDF to fetch.
        folder: Directory the file is written to.
        filename: Name of the file to create inside ``folder``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.
    """
    file_path = os.path.join(folder, filename)
    if os.path.exists(file_path):
        print(f"Skipping {filename}, already downloaded.")
        return

    try:
        # Without a timeout a single unresponsive host would stall the run.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat HTTP error statuses as failures
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return

    # Only open the destination once the body has been fetched, so a failed
    # request never leaves an empty file behind that later runs would skip.
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded: {filename}")
def combine_columns_and_search(csv_file, download_folder):
    """Search for and download PDFs for every row of ``csv_file``.

    The first three columns of each row are converted to strings and joined
    with single spaces to form a search query. Each distinct query is passed
    to ``search_pdf`` and every returned URL is handed to ``download_pdf``
    with a file name of the form ``<query with underscores>_<n>.pdf``.
    Queries already present in the module-level ``query_list`` are skipped.

    Args:
        csv_file: Path of the CSV file to read.
        download_folder: Directory the PDFs are saved to; created if missing.

    Returns:
        None. If the CSV has fewer than three columns a message is printed
        and nothing else is done.
    """
    # skiprows=1 drops the first line of the file, so the second line
    # supplies the column names and the data starts after it.
    df = pd.read_csv(csv_file, skiprows=1)

    if df.shape[1] < 3:
        print("The CSV file must have at least 3 columns.")
        return

    # Build one query string per row from the first three columns.
    first_three = df.iloc[:, :3].astype(str)
    queries = (
        first_three.iloc[:, 0] + ' ' + first_three.iloc[:, 1] + ' ' + first_three.iloc[:, 2]
    )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_folder, exist_ok=True)

    for query in queries:
        # Check for repeats before logging, so the output only announces
        # searches that are actually performed.
        if query in query_list:
            print(f"Skipping duplicate query: {query}")
            continue
        query_list.append(query)

        print(f"Searching for: {query}")
        pdf_results = search_pdf(query)
        if not pdf_results:
            print(f"No PDF results found for {query}")
            continue

        print(f"PDF results for {query}:")
        for position, result in enumerate(pdf_results, start=1):
            print(result)
            # The 1-based position keeps multiple hits for one query apart.
            filename = sanitize_filename(f"{query.replace(' ', '_')}_{position}.pdf")
            download_pdf(result, download_folder, filename)
if __name__ == "__main__":
    # Script entry point: the input CSV and the output folder are fixed
    # paths inside the workspace.
    inverter_csv = '/workspaces/der-services-platform/apps/xtdb-data-imports/device-model/cec-approved-inverters-20240930.csv'
    datasheet_dir = '/workspaces/der-services-platform/apps/xtdb-data-imports/device-model/datasheets'
    combine_columns_and_search(
        csv_file=inverter_csv,
        download_folder=datasheet_dir,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment