-
-
Save jmarbach/9395e89d63a967c37813679b35936ccb to your computer and use it in GitHub Desktop.
Anchor Browser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Anchor Browser Website Heartbeat Test | |
| This script loads each website listed in column A of a Google Sheet | |
| (for example, the top 100 US websites) through the Anchor Browser API | |
| and records a Yes/No success status in column B of the same row. | |
| Dependencies: | |
| pip install anchorbrowser gspread google-auth playwright | |
| Setup: | |
| 1. Get Anchor Browser API key from https://app.anchorbrowser.io/api-key | |
| 2. Set up Google Sheets API credentials: | |
| - Get service account JSON from Google Cloud Console | |
| - Base64 encode the entire JSON content | |
| - Set GOOGLE_SERVICE_ACCOUNT_JSON environment variable | |
| 3. Share your Google Sheet with the service account email | |
| Usage: | |
| python main.py | |
| """ | |
| import os | |
| import time | |
| import logging | |
| import json | |
| import base64 | |
| from typing import List | |
| import gspread | |
# Resolve the service-account ``Credentials`` class, trying several import
# locations. ``USE_SERVICE_ACCOUNT`` records whether one of them succeeded.
# NOTE(review): google-auth documents this class under
# ``google.oauth2.service_account``; the first import looks like it always
# falls through to the second -- confirm before removing it.
try:
    from google.auth.service_account import Credentials
    USE_SERVICE_ACCOUNT = True
except ImportError:
    try:
        from google.oauth2.service_account import Credentials
        USE_SERVICE_ACCOUNT = True
    except ImportError:
        # Fallback to basic auth
        from google.auth import default
        USE_SERVICE_ACCOUNT = False
        # The module-level ``logger`` is only created further down the file,
        # so referencing it here would raise NameError. Ask the logging
        # module for the same logger by name instead.
        logging.getLogger(__name__).warning("Using fallback authentication method")
| from anchorbrowser import Anchorbrowser | |
# Configuration
# The spreadsheet ID and worksheet name can be supplied through the
# environment; the defaults match the previous hard-coded values.
GOOGLE_SHEET_ID = os.getenv('GOOGLE_SHEET_ID', '')
GOOGLE_WORKSHEET_NAME = os.getenv('GOOGLE_WORKSHEET_NAME', 'anchor')
ANCHOR_API_KEY = os.getenv('ANCHOR_API_KEY')
# Base64-encoded service-account JSON (see the module docstring).
GOOGLE_SERVICE_ACCOUNT_JSON = os.getenv('GOOGLE_SERVICE_ACCOUNT_JSON')

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
class WebsiteHeartbeatTester:
    """Run Anchor Browser reachability checks for URLs kept in a Google Sheet.

    URLs are read from column A (row 1 is treated as a header) and each
    check's "Yes"/"No" outcome is written to column B of the same row.
    """

    # 1-based worksheet layout, as expected by gspread.
    URL_COLUMN = 1
    RESULT_COLUMN = 2
    FIRST_DATA_ROW = 2
    # Result-column values (lower-cased) that mark a row as already tested.
    FINAL_RESULTS = ('yes', 'no')
    # OAuth scopes requested for the service account.
    GOOGLE_SCOPES = (
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive',
    )

    def __init__(self, anchor_api_key: str, google_sheet_id: str, worksheet_name: str = 'anchor', service_account_json: str = None):
        """Store configuration and create the Anchor Browser client.

        Args:
            anchor_api_key: API key passed to ``Anchorbrowser``.
            google_sheet_id: Key of the spreadsheet to open.
            worksheet_name: Name of the worksheet inside that spreadsheet.
            service_account_json: Base64-encoded service-account JSON, or None.
        """
        self.anchor_client = Anchorbrowser(api_key=anchor_api_key)
        self.google_sheet_id = google_sheet_id
        self.worksheet_name = worksheet_name
        self.service_account_json = service_account_json
        # Both are populated by setup_google_sheets().
        self.gc = None
        self.worksheet = None

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _decode_service_account_info(encoded):
        """Decode base64-encoded service-account JSON into a dict.

        All whitespace is removed first (the ``base64`` command-line tool
        wraps long output) so the padding calculation sees only payload
        characters; missing ``=`` padding is then restored.

        Raises:
            ValueError: if the value is not base64-encoded UTF-8 JSON.
        """
        compact = ''.join(encoded.split())
        compact += '=' * (-len(compact) % 4)
        try:
            return json.loads(base64.b64decode(compact).decode('utf-8'))
        except ValueError as e:
            # binascii.Error, UnicodeDecodeError and JSONDecodeError are all
            # ValueError subclasses.
            raise ValueError(
                f"GOOGLE_SERVICE_ACCOUNT_JSON is not valid base64-encoded JSON: {e}"
            ) from e

    @staticmethod
    def _normalize_url(url):
        """Return *url*, prefixed with ``https://`` when it has no HTTP scheme."""
        # Compare case-insensitively so "HTTP://host" is not double-prefixed.
        if url.lower().startswith(('http://', 'https://')):
            return url
        return f'https://{url}'

    @staticmethod
    def _column_label(column):
        """Convert a 1-based column number to spreadsheet letters (2 -> 'B')."""
        label = ''
        while column > 0:
            column, remainder = divmod(column - 1, 26)
            label = chr(ord('A') + remainder) + label
        return label

    @classmethod
    def _has_result(cls, value):
        """Return True when *value* is an already-recorded Yes/No result."""
        return bool(value) and str(value).strip().lower() in cls.FINAL_RESULTS

    @staticmethod
    def _first_page(browser):
        """Return an existing page of *browser*, creating context/page if needed."""
        contexts = browser.contexts
        context = contexts[0] if contexts else browser.new_context()
        pages = context.pages
        return pages[0] if pages else context.new_page()

    # ------------------------------------------------------------------
    # Google Sheets access
    # ------------------------------------------------------------------
    def setup_google_sheets(self):
        """Authorize against Google Sheets and open the configured worksheet.

        Returns:
            True when ``self.worksheet`` is ready; False otherwise (the
            failure is logged, never raised).
        """
        try:
            if not self.service_account_json:
                raise Exception("No valid Google credentials found. Set GOOGLE_SERVICE_ACCOUNT_JSON environment variable")

            logger.info("Using base64-encoded Google service account JSON")
            creds_dict = self._decode_service_account_info(self.service_account_json)

            client = None
            if USE_SERVICE_ACCOUNT:
                try:
                    creds = Credentials.from_service_account_info(
                        creds_dict, scopes=list(self.GOOGLE_SCOPES)
                    )
                    client = gspread.authorize(creds)
                except Exception as e:
                    logger.warning(f"Could not build service-account credentials, falling back to gspread: {e}")

            # Let gspread construct the credentials itself when the explicit
            # path is unavailable or failed.
            used_fallback = client is None
            if used_fallback:
                client = gspread.service_account_from_dict(creds_dict)

            self.gc = client
            sheet = self.gc.open_by_key(self.google_sheet_id)
            self.worksheet = sheet.worksheet(self.worksheet_name)

            if used_fallback:
                logger.info("Successfully connected to Google Sheets using fallback method")
            else:
                logger.info(f"Successfully connected to Google Sheets (worksheet: {self.worksheet_name})")
            return True
        except Exception as e:
            logger.error(f"Failed to setup Google Sheets: {e}")
            return False

    def _get_website_rows(self) -> List[tuple]:
        """Return ``(row_number, url)`` pairs for every non-empty URL cell.

        Row numbers are the real worksheet rows, so results stay aligned
        with their URL even when column A contains blank cells.
        """
        cells = self.worksheet.col_values(self.URL_COLUMN)
        rows = []
        for row, cell in enumerate(cells, start=1):
            if row < self.FIRST_DATA_ROW:
                continue  # header
            url = (cell or '').strip()
            if url:
                rows.append((row, self._normalize_url(url)))
        return rows

    def get_websites_from_sheet(self) -> List[str]:
        """Fetch URLs from column A of the Google Sheet.

        Returns:
            The non-empty URLs below the header, each with an HTTP(S)
            scheme; an empty list if the sheet could not be read.
        """
        try:
            urls = [url for _, url in self._get_website_rows()]
            logger.info(f"Retrieved {len(urls)} websites from Google Sheet")
            return urls
        except Exception as e:
            logger.error(f"Failed to get websites from sheet: {e}")
            return []

    def should_skip_test(self, row: int, column: int) -> bool:
        """Check if a test should be skipped because it already has a result.

        Returns False when the cell cannot be read, so the row is retested.
        """
        try:
            cell_value = self.worksheet.cell(row, column).value
        except Exception as e:
            logger.debug(f"Error checking cell {row},{column}: {e}")
            return False
        if self._has_result(cell_value):
            logger.info(f"Skipping row {row}, column {self._column_label(column)} - already has result: {cell_value}")
            return True
        return False

    def update_sheet_result(self, row: int, result: bool):
        """Write "Yes" or "No" for *result* into the result column of *row*.

        Write failures are logged and otherwise ignored.
        """
        try:
            status = "Yes" if result else "No"
            self.worksheet.update_cell(row, self.RESULT_COLUMN, status)
            logger.info(f"Updated row {row}, column {self._column_label(self.RESULT_COLUMN)} with result: {status}")
        except Exception as e:
            logger.error(f"Failed to update sheet row {row}: {e}")

    # ------------------------------------------------------------------
    # Website check
    # ------------------------------------------------------------------
    def test_website(self, url: str) -> bool:
        """Load *url* in a fresh Anchor Browser session and report success.

        A site counts as reachable when navigation yields an HTTP status
        below 400 and the loaded page has a non-empty title.

        Returns:
            True on success, False on any failure; exceptions are logged.
        """
        session_id = None
        try:
            logger.info(f"Testing website: {url}")

            # Import before creating the session so a missing Playwright
            # install does not create a remote session for nothing.
            from playwright.sync_api import sync_playwright

            session = self.anchor_client.sessions.create()
            session_id = session.data.id
            cdp_url = session.data.cdp_url

            with sync_playwright() as p:
                browser = p.chromium.connect_over_cdp(cdp_url)
                try:
                    page = self._first_page(browser)
                    response = page.goto(url, timeout=30000)  # 30 second timeout

                    if not response or response.status >= 400:
                        status_code = response.status if response else "No response"
                        logger.warning(f"Failed to load {url} (Status: {status_code})")
                        return False

                    # A title is used as evidence that real content loaded.
                    title = page.title()
                    if not title:
                        logger.warning(f"Page loaded but no title found for {url}")
                        return False

                    logger.info(f"Successfully tested {url} (Status: {response.status}, Title: {title[:50]}...)")
                    return True
                finally:
                    # Close the browser connection before ending the session.
                    try:
                        browser.close()
                    except Exception as e:
                        logger.debug(f"Failed to close browser for {url}: {e}")
        except Exception as e:
            logger.warning(f"Failed to test {url}: {e}")
            return False
        finally:
            if session_id:
                # Best effort: the session might have already expired.
                # NOTE(review): confirm ``sessions.terminate`` exists in the
                # installed anchorbrowser SDK; a failure here is only logged
                # at debug level.
                try:
                    self.anchor_client.sessions.terminate(session_id)
                    logger.debug(f"Session {session_id} terminated")
                except Exception as e:
                    logger.debug(f"Failed to terminate session {session_id}: {e}")

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def run_heartbeat_test(self, limit: int = 100, delay_seconds: float = 1):
        """Run the complete heartbeat test.

        Args:
            limit: Maximum number of websites to process.
            delay_seconds: Pause after each tested site to avoid rate limits.

        Rows that already hold a Yes/No result are not retested but are
        counted in the summary. Existing results are read with one column
        read rather than one request per row.
        """
        logger.info("Starting Anchor Browser heartbeat test")

        if not self.setup_google_sheets():
            logger.error("Failed to setup Google Sheets. Exiting.")
            return

        try:
            targets = self._get_website_rows()
            logger.info(f"Retrieved {len(targets)} websites from Google Sheet")
        except Exception as e:
            logger.error(f"Failed to get websites from sheet: {e}")
            targets = []
        if not targets:
            logger.error("No websites found in the sheet. Exiting.")
            return

        targets = targets[:limit]
        if not targets:
            # e.g. limit=0; also avoids dividing by zero in the summary.
            logger.warning(f"Nothing to test with limit={limit}. Exiting.")
            return

        # Read the result column once and add its header if missing. If the
        # read fails every row is treated as untested.
        existing = []
        try:
            existing = self.worksheet.col_values(self.RESULT_COLUMN)
            if not (existing and existing[0]):
                self.worksheet.update_cell(1, self.RESULT_COLUMN, "Anchor Browser Test")
        except Exception as e:
            logger.warning(f"Failed to read results column or update header: {e}")

        result_label = self._column_label(self.RESULT_COLUMN)
        successful_tests = 0
        total_tests = len(targets)

        for position, (row, url) in enumerate(targets, start=1):
            logger.info(f"Progress: {position}/{total_tests} - Testing {url}")

            previous = existing[row - 1] if row <= len(existing) else ''
            if self._has_result(previous):
                logger.info(f"Skipping row {row}, column {result_label} - already has result: {previous}")
                if str(previous).strip().lower() == 'yes':
                    successful_tests += 1
                continue

            result = self.test_website(url)
            self.update_sheet_result(row, result)
            if result:
                successful_tests += 1

            # Add a small delay to avoid rate limiting
            time.sleep(delay_seconds)

        success_rate = (successful_tests / total_tests) * 100
        logger.info("Heartbeat test completed!")
        logger.info(f"Results: {successful_tests}/{total_tests} successful ({success_rate:.1f}%)")
def main():
    """Validate the configuration, then run the heartbeat test.

    Logs an error and returns without doing any work when the Anchor API
    key, the Google service-account JSON, or the spreadsheet ID is missing.
    """
    # Check for required configuration
    if not ANCHOR_API_KEY:
        logger.error("ANCHOR_API_KEY environment variable not set")
        logger.info("Get your API key from: https://app.anchorbrowser.io/api-key")
        return

    if not GOOGLE_SERVICE_ACCOUNT_JSON:
        logger.error("GOOGLE_SERVICE_ACCOUNT_JSON environment variable not set")
        logger.info("Set GOOGLE_SERVICE_ACCOUNT_JSON environment variable (base64-encoded JSON)")
        return

    # An empty spreadsheet ID cannot be opened; fail here with a clear
    # message instead of a less obvious error from the Sheets client later.
    if not GOOGLE_SHEET_ID:
        logger.error("GOOGLE_SHEET_ID is not set")
        logger.info("Set GOOGLE_SHEET_ID to the ID of the spreadsheet to update")
        return

    # Create and run the tester
    tester = WebsiteHeartbeatTester(
        anchor_api_key=ANCHOR_API_KEY,
        google_sheet_id=GOOGLE_SHEET_ID,
        worksheet_name=GOOGLE_WORKSHEET_NAME,
        service_account_json=GOOGLE_SERVICE_ACCOUNT_JSON,
    )

    # Run the test (limit to 100 websites as requested)
    tester.run_heartbeat_test(limit=100)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment