quinnkeast/convert and filter.py

## convert and filter.py
import pandas as pd
import sys
from datetime import datetime, timedelta
import pytz

# Require exactly one positional argument: the CSV file to filter.
if len(sys.argv) != 2:
    # sys.exit() with a string prints it to stderr and exits with status 1,
    # so the usage text does not end up on stdout. The script name is taken
    # from argv so the message matches however the file is actually named.
    sys.exit(f"Usage: python {sys.argv[0]} <path_to_csv_file>")

# Path of the CSV file to filter, taken from the command line.
file_path = sys.argv[1]

# Device to keep: rows are selected when their 'sourcename' contains this text.
# Replace the placeholder with the real device name before running.
device_id = 'INSERT DEVICE ID'

# Fixed reference date: April 30 of the current year (naive, at midnight).
reference_date = datetime(datetime.now().year, 4, 30)

# Build timezone-aware bounds for the filter window.
timezone = pytz.timezone("Europe/Berlin")
# Step back one calendar year instead of 365 days: when the window spans a
# leap day, "365 days before April 30" is May 1, not April 30. April 30 exists
# in every year, so replace() cannot raise here.
start_date = timezone.localize(
    reference_date.replace(year=reference_date.year - 1)
)
end_date = timezone.localize(reference_date)

print(f"Filtering from {start_date} to {end_date}")

# Function to filter the CSV
def filter_csv(file_path, device=None, start=None, end=None):
    """Write the rows of a ';'-separated CSV that match a device and date window.

    A leading ``sep=`` line, if present, is skipped before parsing. Rows are
    kept when their ``sourcename`` contains ``device`` (literal text, not a
    regular expression) and their ``startdate`` lies within ``start``..``end``
    inclusive. The result is written next to the input with ``_filtered``
    inserted before the file extension, using pandas' default comma separator.

    Args:
        file_path: Path of the CSV file to read.
        device: Text to look for in ``sourcename``. Defaults to the
            module-level ``device_id``.
        start: Timezone-aware lower bound. Defaults to the module-level
            ``start_date``.
        end: Timezone-aware upper bound. Defaults to the module-level
            ``end_date``.

    Returns:
        The path of the file that was written, or ``None`` if any step failed
        (the error is printed rather than raised).
    """
    import os  # local import: only needed to build the output path

    if device is None:
        device = device_id
    if start is None:
        start = start_date
    if end is None:
        end = end_date

    try:
        # Peek at the first line to see whether a 'sep=' hint precedes the header.
        with open(file_path, 'r') as file:
            first_line = file.readline().strip()
        skip_rows = 1 if first_line.startswith('sep=') else 0

        data = pd.read_csv(file_path, delimiter=';', skiprows=skip_rows)

        # Debug: list the columns before using them, so the output is still
        # available when a required column turns out to be missing.
        print("Columns found in CSV:", data.columns.tolist())

        missing = [col for col in ('sourcename', 'startdate') if col not in data.columns]
        if missing:
            raise ValueError(f"missing required column(s): {', '.join(missing)}")

        # Normalize to UTC; unparseable values become NaT and fail both bounds.
        data['startdate'] = pd.to_datetime(data['startdate'], utc=True, errors='coerce')
        print("Sample data:", data.head())

        # regex=False: device names may contain characters such as '(' or '+'.
        # na=False: rows without a source name simply do not match, instead of
        # putting NaN into the boolean mask.
        matches_device = data['sourcename'].str.contains(device, regex=False, na=False)
        in_window = (data['startdate'] >= start) & (data['startdate'] <= end)
        filtered_data = data[matches_device & in_window]

        # Only touch the final extension. A plain str.replace('.csv', ...) would
        # rewrite every '.csv' in the path and, when there is none, would leave
        # the path unchanged and overwrite the input file.
        root, ext = os.path.splitext(file_path)
        output_path = f"{root}_filtered{ext or '.csv'}"
        filtered_data.to_csv(output_path, index=False)
        return output_path

    except Exception as e:
        # Script-level boundary: report the problem instead of a traceback.
        print(f"An error occurred: {e}")
        return None

# Run the filter on the CSV path given on the command line (file_path is read
# from sys.argv[1] near the top of the script).
filter_csv(file_path)
	import pandas as pd
	import sys
	from datetime import datetime, timedelta
	import pytz

	# Require exactly one positional argument: the CSV file to filter.
	if len(sys.argv) != 2:
		# sys.exit() with a string prints it to stderr and exits with status 1,
		# so the usage text does not end up on stdout. The script name is taken
		# from argv so the message matches however the file is actually named.
		sys.exit(f"Usage: python {sys.argv[0]} <path_to_csv_file>")

	# Path of the CSV file to filter, taken from the command line.
	file_path = sys.argv[1]

	# Device to keep: rows are selected when their 'sourcename' contains this text.
	# Replace the placeholder with the real device name before running.
	device_id = 'INSERT DEVICE ID'

	# Fixed reference date: April 30 of the current year (naive, at midnight).
	reference_date = datetime(datetime.now().year, 4, 30)

	# Build timezone-aware bounds for the filter window.
	timezone = pytz.timezone("Europe/Berlin")
	# Step back one calendar year instead of 365 days: when the window spans a
	# leap day, "365 days before April 30" is May 1, not April 30. April 30 exists
	# in every year, so replace() cannot raise here.
	start_date = timezone.localize(
		reference_date.replace(year=reference_date.year - 1)
	)
	end_date = timezone.localize(reference_date)

	print(f"Filtering from {start_date} to {end_date}")

	# Function to filter the CSV
	def filter_csv(file_path, device=None, start=None, end=None):
		"""Write the rows of a ';'-separated CSV that match a device and date window.

		A leading ``sep=`` line, if present, is skipped before parsing. Rows are
		kept when their ``sourcename`` contains ``device`` (literal text, not a
		regular expression) and their ``startdate`` lies within ``start``..``end``
		inclusive. The result is written next to the input with ``_filtered``
		inserted before the file extension, using pandas' default comma separator.

		Args:
			file_path: Path of the CSV file to read.
			device: Text to look for in ``sourcename``. Defaults to the
				module-level ``device_id``.
			start: Timezone-aware lower bound. Defaults to the module-level
				``start_date``.
			end: Timezone-aware upper bound. Defaults to the module-level
				``end_date``.

		Returns:
			The path of the file that was written, or ``None`` if any step failed
			(the error is printed rather than raised).
		"""
		import os  # local import: only needed to build the output path

		if device is None:
			device = device_id
		if start is None:
			start = start_date
		if end is None:
			end = end_date

		try:
			# Peek at the first line to see whether a 'sep=' hint precedes the header.
			with open(file_path, 'r') as file:
				first_line = file.readline().strip()
			skip_rows = 1 if first_line.startswith('sep=') else 0

			data = pd.read_csv(file_path, delimiter=';', skiprows=skip_rows)

			# Debug: list the columns before using them, so the output is still
			# available when a required column turns out to be missing.
			print("Columns found in CSV:", data.columns.tolist())

			missing = [col for col in ('sourcename', 'startdate') if col not in data.columns]
			if missing:
				raise ValueError(f"missing required column(s): {', '.join(missing)}")

			# Normalize to UTC; unparseable values become NaT and fail both bounds.
			data['startdate'] = pd.to_datetime(data['startdate'], utc=True, errors='coerce')
			print("Sample data:", data.head())

			# regex=False: device names may contain characters such as '(' or '+'.
			# na=False: rows without a source name simply do not match, instead of
			# putting NaN into the boolean mask.
			matches_device = data['sourcename'].str.contains(device, regex=False, na=False)
			in_window = (data['startdate'] >= start) & (data['startdate'] <= end)
			filtered_data = data[matches_device & in_window]

			# Only touch the final extension. A plain str.replace('.csv', ...) would
			# rewrite every '.csv' in the path and, when there is none, would leave
			# the path unchanged and overwrite the input file.
			root, ext = os.path.splitext(file_path)
			output_path = f"{root}_filtered{ext or '.csv'}"
			filtered_data.to_csv(output_path, index=False)
			return output_path

		except Exception as e:
			# Script-level boundary: report the problem instead of a traceback.
			print(f"An error occurred: {e}")
			return None

	# Run the filter on the CSV path given on the command line (file_path is read
	# from sys.argv[1] earlier in this copy of the script).
	filter_csv(file_path)