Skip to content

Instantly share code, notes, and snippets.

@jmarbach
Created August 27, 2025 22:01
Show Gist options
  • Select an option

  • Save jmarbach/9395e89d63a967c37813679b35936ccb to your computer and use it in GitHub Desktop.

Select an option

Save jmarbach/9395e89d63a967c37813679b35936ccb to your computer and use it in GitHub Desktop.
Anchor Browser
#!/usr/bin/env python3
"""
Anchor Browser Website Heartbeat Test
This script reads up to 100 website URLs from column A of a Google Sheet,
loads each one through an Anchor Browser session, and records the
success/failure status ("Yes"/"No") in column B of the same sheet.
Dependencies:
pip install anchorbrowser gspread google-auth playwright
Setup:
1. Get Anchor Browser API key from https://app.anchorbrowser.io/api-key
2. Set up Google Sheets API credentials:
- Get service account JSON from Google Cloud Console
- Base64 encode the entire JSON content
- Set GOOGLE_SERVICE_ACCOUNT_JSON environment variable
3. Share your Google Sheet with the service account email
Usage:
python main.py
"""
import os
import time
import logging
import json
import base64
from typing import List
import gspread
# Logging setup.
# This runs before the optional Google auth imports below because the last
# import fallback emits a warning through `logger`; configuring logging
# afterwards would raise NameError on that path.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Try different import approaches for Google auth.
# USE_SERVICE_ACCOUNT records whether a service-account `Credentials` class
# could be imported at all.
try:
    # NOTE(review): current google-auth releases do not appear to ship this
    # module, in which case the google.oauth2 import below is the one that
    # succeeds - confirm before removing this attempt.
    from google.auth.service_account import Credentials
    USE_SERVICE_ACCOUNT = True
except ImportError:
    try:
        from google.oauth2.service_account import Credentials
        USE_SERVICE_ACCOUNT = True
    except ImportError:
        # Fallback to basic auth
        from google.auth import default
        USE_SERVICE_ACCOUNT = False
        logger.warning("Using fallback authentication method")

from anchorbrowser import Anchorbrowser

# Configuration
# The sheet ID may be supplied through the environment; the default stays the
# empty string, so editing this constant in place keeps working as before.
GOOGLE_SHEET_ID = os.getenv('GOOGLE_SHEET_ID', '')
GOOGLE_WORKSHEET_NAME = 'anchor'
ANCHOR_API_KEY = os.getenv('ANCHOR_API_KEY')
GOOGLE_SERVICE_ACCOUNT_JSON = os.getenv('GOOGLE_SERVICE_ACCOUNT_JSON')
class WebsiteHeartbeatTester:
    """Check websites listed in a Google Sheet through Anchor Browser sessions.

    URLs are read from column A of the worksheet (row 1 is treated as a
    header). For every URL a "Yes"/"No" result is written to column B of the
    same row; rows that already hold a "yes"/"no" value are not re-tested.
    """

    # OAuth scopes requested for the service-account credentials.
    _SCOPES = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive'
    ]
    # Column (1-based) that receives the "Yes"/"No" result.
    _RESULT_COLUMN = 2
    # Navigation timeout passed to Playwright, in milliseconds (30 seconds).
    _PAGE_TIMEOUT_MS = 30000

    def __init__(self, anchor_api_key: str, google_sheet_id: str, worksheet_name: str = 'anchor', service_account_json: str = None):
        """Initialize the heartbeat tester with API keys and credentials.

        Args:
            anchor_api_key: API key used to construct the Anchor Browser client.
            google_sheet_id: Key of the spreadsheet to open.
            worksheet_name: Name of the worksheet inside that spreadsheet.
            service_account_json: Base64-encoded service-account JSON, or None.
        """
        self.anchor_client = Anchorbrowser(api_key=anchor_api_key)
        self.google_sheet_id = google_sheet_id
        self.worksheet_name = worksheet_name
        self.service_account_json = service_account_json
        self.gc = None          # gspread client, set by setup_google_sheets()
        self.worksheet = None   # worksheet handle, set by setup_google_sheets()

    def _decode_service_account(self) -> dict:
        """Decode the base64-encoded service-account JSON into a dict.

        Raises whatever base64/UTF-8/JSON decoding raises on malformed input.
        """
        encoded = self.service_account_json.strip()
        # Re-add '=' padding in case it was dropped from the stored value.
        encoded += '=' * (-len(encoded) % 4)
        return json.loads(base64.b64decode(encoded).decode('utf-8'))

    def _authorize(self, creds_dict: dict):
        """Return an authorized gspread client for the given credentials dict."""
        if USE_SERVICE_ACCOUNT:
            try:
                creds = Credentials.from_service_account_info(creds_dict, scopes=self._SCOPES)
                return gspread.authorize(creds)
            except Exception as e:
                logger.error(f"Error processing base64 credentials: {e}")
        # Fall back to gspread's own service-account helper. The credentials
        # were already decoded once, so only the authorization step is retried.
        client = gspread.service_account_from_dict(creds_dict)
        logger.info("Successfully connected to Google Sheets using fallback method")
        return client

    def setup_google_sheets(self):
        """Set up Google Sheets API connection.

        Returns:
            True when the worksheet was opened, False on any failure (the
            error is logged rather than raised).
        """
        try:
            if not self.service_account_json:
                raise Exception("No valid Google credentials found. Set GOOGLE_SERVICE_ACCOUNT_JSON environment variable")
            logger.info("Using base64-encoded Google service account JSON")
            creds_dict = self._decode_service_account()
            self.gc = self._authorize(creds_dict)
            sheet = self.gc.open_by_key(self.google_sheet_id)
            self.worksheet = sheet.worksheet(self.worksheet_name)
            logger.info(f"Successfully connected to Google Sheets (worksheet: {self.worksheet_name})")
            return True
        except Exception as e:
            logger.error(f"Failed to setup Google Sheets: {e}")
            return False

    def _get_website_rows(self) -> List[tuple]:
        """Return (sheet_row, url) pairs for each non-empty URL cell in column A.

        Returns an empty list (after logging the error) if the column cannot
        be read.
        """
        try:
            column_a = self.worksheet.col_values(1)
            pairs = []
            # Number rows from 2 (row 1 is the header) BEFORE dropping blanks,
            # so a blank cell in column A cannot shift later results onto the
            # wrong row.
            for row, raw in enumerate(column_a[1:], start=2):
                url = raw.strip()
                if not url:
                    continue
                # Ensure URLs have a protocol; compare case-insensitively so
                # "HTTPS://..." is not prefixed a second time.
                if not url.lower().startswith(('http://', 'https://')):
                    url = f'https://{url}'
                pairs.append((row, url))
            logger.info(f"Retrieved {len(pairs)} websites from Google Sheet")
            return pairs
        except Exception as e:
            logger.error(f"Failed to get websites from sheet: {e}")
            return []

    def get_websites_from_sheet(self) -> List[str]:
        """Fetch URLs from column A of the Google Sheet.

        The header row and empty cells are skipped, and "https://" is
        prepended to entries without a scheme. Returns [] on failure.
        """
        return [url for _, url in self._get_website_rows()]

    def _existing_result(self, row: int, column: int):
        """Return the cell's value if it is a recorded "yes"/"no", else None.

        Read errors are logged at debug level and reported as None.
        """
        try:
            cell_value = self.worksheet.cell(row, column).value
            if cell_value and cell_value.strip().lower() in ('yes', 'no'):
                return cell_value
        except Exception as e:
            logger.debug(f"Error checking cell {row},{column}: {e}")
        return None

    def should_skip_test(self, row: int, column: int) -> bool:
        """Check if a test should be skipped because it already has a result."""
        existing = self._existing_result(row, column)
        if existing is None:
            return False
        logger.info(f"Skipping row {row}, column {column} - already has result: {existing}")
        return True

    @staticmethod
    def _first_page(browser):
        """Return the first page of the first context, creating either if absent."""
        contexts = browser.contexts
        context = contexts[0] if contexts else browser.new_context()
        pages = context.pages
        return pages[0] if pages else context.new_page()

    @staticmethod
    def _page_loaded(url: str, response, page) -> bool:
        """Return True if the response status is below 400 and the page has a title."""
        if response is None or response.status >= 400:
            status_code = response.status if response is not None else "No response"
            logger.warning(f"Failed to load {url} (Status: {status_code})")
            return False
        # Additional check: a non-empty title is taken as evidence that real
        # content was loaded.
        title = page.title()
        if not title:
            logger.warning(f"Page loaded but no title found for {url}")
            return False
        logger.info(f"Successfully tested {url} (Status: {response.status}, Title: {title[:50]}...)")
        return True

    def test_website(self, url: str) -> bool:
        """Test if a website can be successfully fetched using Anchor Browser.

        Creates an Anchor Browser session, connects to it with Playwright over
        CDP, navigates to `url`, and always terminates the session afterwards.

        Returns:
            True on a successful load, False on any failure (failures are
            logged, never raised).
        """
        session_id = None
        try:
            logger.info(f"Testing website: {url}")
            # Import before creating the remote session so a missing
            # Playwright install does not leave a session to clean up.
            from playwright.sync_api import sync_playwright

            session = self.anchor_client.sessions.create()
            session_id = session.data.id
            cdp_url = session.data.cdp_url

            with sync_playwright() as p:
                browser = p.chromium.connect_over_cdp(cdp_url)
                try:
                    page = self._first_page(browser)
                    response = page.goto(url, timeout=self._PAGE_TIMEOUT_MS)
                    return self._page_loaded(url, response, page)
                finally:
                    # Close the browser connection before the session is
                    # terminated; a failure here is best-effort only.
                    try:
                        browser.close()
                    except Exception as e:
                        logger.debug(f"Failed to close browser for {url}: {e}")
        except Exception as e:
            logger.warning(f"Failed to test {url}: {e}")
            return False
        finally:
            # Clean up the session after the browser is closed.
            if session_id:
                try:
                    self.anchor_client.sessions.terminate(session_id)
                    logger.debug(f"Session {session_id} terminated")
                except Exception as e:
                    # Session might have already expired.
                    logger.debug(f"Failed to terminate session {session_id}: {e}")

    def update_sheet_result(self, row: int, result: bool):
        """Update the Google Sheet with the test result.

        Writes "Yes" or "No" to column B of `row`; write errors are logged.
        """
        try:
            status = "Yes" if result else "No"
            self.worksheet.update_cell(row, self._RESULT_COLUMN, status)  # Column B
            logger.info(f"Updated row {row}, column B with result: {status}")
        except Exception as e:
            logger.error(f"Failed to update sheet row {row}: {e}")

    def run_heartbeat_test(self, limit: int = 100):
        """Run the complete heartbeat test.

        Args:
            limit: Maximum number of websites to process. None means no
                limit; zero or a negative value processes nothing.
        """
        logger.info("Starting Anchor Browser heartbeat test")

        # Setup Google Sheets connection
        if not self.setup_google_sheets():
            logger.error("Failed to setup Google Sheets. Exiting.")
            return

        # Get websites (with their real row numbers) from the sheet
        targets = self._get_website_rows()
        if not targets:
            logger.error("No websites found in the sheet. Exiting.")
            return

        # Limit to specified number of websites
        if limit is not None:
            targets = targets[:max(limit, 0)]
        if not targets:
            # Also protects the success-rate division below.
            logger.warning("No websites selected for testing. Exiting.")
            return

        # Add header to column B if it doesn't exist
        try:
            if not self.worksheet.cell(1, self._RESULT_COLUMN).value:
                self.worksheet.update_cell(1, self._RESULT_COLUMN, "Anchor Browser Test")
        except Exception as e:
            logger.warning(f"Failed to update header: {e}")

        # Test each website
        successful_tests = 0
        total_tests = len(targets)
        for position, (row, url) in enumerate(targets, start=1):
            logger.info(f"Progress: {position}/{total_tests} - Testing {url}")

            # One read decides both whether to skip and how to count the row.
            existing = self._existing_result(row, self._RESULT_COLUMN)
            if existing is not None:
                logger.info(f"Skipping row {row}, column {self._RESULT_COLUMN} - already has result: {existing}")
                if existing.strip().lower() == 'yes':
                    successful_tests += 1
                continue

            result = self.test_website(url)
            self.update_sheet_result(row, result)
            if result:
                successful_tests += 1

            # Add a small delay to avoid rate limiting
            time.sleep(1)

        # Log summary
        success_rate = (successful_tests / total_tests) * 100
        logger.info("Heartbeat test completed!")
        logger.info(f"Results: {successful_tests}/{total_tests} successful ({success_rate:.1f}%)")
def main():
    """Main function to run the heartbeat test.

    Validates the module-level configuration (Anchor API key, Google service
    account JSON, Google Sheet ID), logs what is missing and returns early if
    anything is unset; otherwise builds a WebsiteHeartbeatTester and runs it
    for at most 100 websites.
    """
    # Check for required environment variables
    if not ANCHOR_API_KEY:
        logger.error("ANCHOR_API_KEY environment variable not set")
        logger.info("Get your API key from: https://app.anchorbrowser.io/api-key")
        return
    if not GOOGLE_SERVICE_ACCOUNT_JSON:
        logger.error("GOOGLE_SERVICE_ACCOUNT_JSON environment variable not set")
        logger.info("Set GOOGLE_SERVICE_ACCOUNT_JSON environment variable (base64-encoded JSON)")
        return
    # An empty sheet ID can never be opened; fail here with a clear message
    # instead of creating a tester that fails later.
    if not GOOGLE_SHEET_ID:
        logger.error("GOOGLE_SHEET_ID is not configured")
        logger.info("Set GOOGLE_SHEET_ID to the ID of the Google Sheet that should be updated")
        return

    # Create and run the tester
    tester = WebsiteHeartbeatTester(
        anchor_api_key=ANCHOR_API_KEY,
        google_sheet_id=GOOGLE_SHEET_ID,
        worksheet_name=GOOGLE_WORKSHEET_NAME,
        service_account_json=GOOGLE_SERVICE_ACCOUNT_JSON
    )
    # Run the test (limit to 100 websites as requested)
    tester.run_heartbeat_test(limit=100)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment