Search CEC-approved solar inverters on Google and download their datasheets (PDFs) for combined LLM dataset analysis
# https://cer.gov.au/node/4571
# Script to import a CSV file and, starting from the second row,
# combine the first three columns into a single search string.
# For each combined string, search Google for that term and keep only PDF results.
# Print the results and download the PDFs to a specific folder.
# USAGE: python3 ./search-for-device-capabilities.py
# pip install pandas googlesearch-python requests
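#
# Illustrative example of the expected input and the query built from it.
# The column names and values below are assumptions for illustration only,
# not taken from the actual CER CSV export:
#
#   Manufacturer,Model,Series,...
#   Fronius,Primo 5.0-1,GEN24,...
#
# The first data row above would be combined into the search term
# "Fronius Primo 5.0-1 GEN24", which is then searched with " filetype:pdf" appended.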
import pandas as pd
import os
import requests
import re
from googlesearch import search
# Track queries that have already been searched so duplicates are skipped
query_list = []
def search_pdf(query):
    # Use googlesearch-python to search for PDFs
    try:
        results = []
        for url in search(query + " filetype:pdf", num_results=10):
            if url.endswith('.pdf'):
                results.append(url)
        return results
    except Exception as e:
        print(f"An error occurred during the search: {e}")
        return []
def sanitize_filename(filename):
    # Replace invalid filename characters with an underscore
    return re.sub(r'[\/:*?"<>|]', '_', filename)
def download_pdf(url, folder, filename):
    # Skip the download if the file already exists on disk
    file_path = os.path.join(folder, filename)
    if os.path.exists(file_path):
        print(f"Skipping {filename}, already downloaded.")
        return
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
def combine_columns_and_search(csv_file, download_folder):
    # Read the CSV file starting from the second row
    df = pd.read_csv(csv_file, skiprows=1)
    # Ensure the dataframe has at least 3 columns
    if df.shape[1] < 3:
        print("The CSV file must have at least 3 columns.")
        return
    # Combine the first three columns into a single string
    df['combined'] = df[df.columns[0]].astype(str) + ' ' + df[df.columns[1]].astype(str) + ' ' + df[df.columns[2]].astype(str)
    # Ensure the download folder exists
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    # Perform a Google search for each combined string and collect PDF results
    for index, row in df.iterrows():
        query = row['combined']
        print(f"Searching for: {query}")
        if query in query_list:
            continue
        query_list.append(query)
        pdf_results = search_pdf(query)
        if pdf_results:
            print(f"PDF results for {query}:")
            for i, result in enumerate(pdf_results):
                print(result)
                # Generate a sanitized filename for each PDF
                filename = sanitize_filename(f"{query.replace(' ', '_')}_{i+1}.pdf")
                download_pdf(result, download_folder, filename)
        else:
            print(f"No PDF results found for {query}")
if __name__ == "__main__":
    csv_file_path = '/workspaces/der-services-platform/apps/xtdb-data-imports/device-model/cec-approved-inverters-20240930.csv'
    download_folder_path = '/workspaces/der-services-platform/apps/xtdb-data-imports/device-model/datasheets'
    combine_columns_and_search(csv_file_path, download_folder_path)