Skip to content

Instantly share code, notes, and snippets.

@pixobe
Created June 5, 2024 11:50
Show Gist options
  • Save pixobe/f20c345de1a3cd466e002a6d1381bcac to your computer and use it in GitHub Desktop.
Save pixobe/f20c345de1a3cd466e002a6d1381bcac to your computer and use it in GitHub Desktop.
code
from playwright.sync_api import sync_playwright, Playwright,TimeoutError as PlaywrightTimeoutError
import re
import csv
import os
from datetime import datetime
# Regex alternation used to pick out archive headlines of interest; it is
# compiled case-insensitively where the archive links are looked up.
text_to_search = r"TCS|Tata Consultancy Services"
def write_to_csv(publish_date, header, content, filename='tata-2023.csv'):
    """Append one article as a single row to a CSV file.

    The file is opened in append mode, so repeated calls accumulate rows and
    the file is created on first use. No column-header row is written.

    Args:
        publish_date: Value for the first column. ``csv.writer`` writes
            ``None`` as an empty field and stringifies other non-string
            values with ``str()``.
        header: Article headline, written to the second column.
        content: Article body, written to the third column after being
            converted with ``str()``.
        filename: Path of the CSV file to append to.
    """
    # newline='' lets the csv module control line endings (required by the
    # csv docs); an explicit UTF-8 encoding keeps non-ASCII article text from
    # failing or being mangled on platforms whose default encoding is not
    # UTF-8.
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([publish_date, header, str(content)])
def convert_last_updated_to_datetime(text):
    """Extract the "Last Updated: ..." timestamp from *text*.

    Recognises strings such as
    ``Last Updated: Jun 05, 2024, 11:50:00 AM IST``. The month may be an
    abbreviated or a full name, and the day and hour may have one or two
    digits.

    Args:
        text: Text that may contain a "Last Updated" stamp.

    Returns:
        A naive ``datetime`` holding the wall-clock time found in the text
        (the literal ``IST`` suffix is matched but no timezone is attached),
        or ``None`` when the stamp is missing or cannot be parsed.
    """
    pattern = r"Last Updated: (\w+ \d{1,2}, \d{4}, \d{1,2}:\d{2}:\d{2} [AP]M IST)"
    match = re.search(pattern, text)
    if not match:
        print("No 'Last Updated' pattern found in the text.")
        return None

    date_str = match.group(1)
    print(f"Extracted datetime string: {date_str}")

    # The regex accepts any word as the month, so try the abbreviated form
    # first and then the full name instead of letting strptime raise on
    # e.g. "June".
    for date_format in ("%b %d, %Y, %I:%M:%S %p IST",
                        "%B %d, %Y, %I:%M:%S %p IST"):
        try:
            return datetime.strptime(date_str, date_format)
        except ValueError:
            continue

    print(f"Could not parse 'Last Updated' datetime string: {date_str}")
    return None
def _inner_text_or_empty(page, selector):
    """Return the inner text of the first element matching *selector*, or ""."""
    element = page.query_selector(selector)
    return element.inner_text() if element else ""


def test_has_contents(playwright: Playwright):
    """Crawl the Economic Times archive and save matching articles to CSV.

    For every year/month/day combination the monthly archive page is opened,
    the link for that day is clicked, and every link on the resulting archive
    list whose name matches ``text_to_search`` (case-insensitive) is visited.
    The publish date (``.jsdtTime``), headline (``.artTitle``) and body
    (``.artData``) of each visited article are appended via ``write_to_csv``.

    Args:
        playwright: Playwright entry point used to launch Chromium.
    """
    years = ['2024']
    months = [str(month) for month in range(1, 13)]
    days = [str(day) for day in range(1, 32)]
    # Compile once instead of on every day iteration.
    headline_pattern = re.compile(text_to_search, re.IGNORECASE)

    browser = playwright.chromium.launch()
    try:
        context = browser.new_context()
        page = context.new_page()
        for year in years:
            for month in months:
                url = f"https://economictimes.indiatimes.com/archive/year-{year},month-{month}.cms"
                for day in days:
                    try:
                        # Clicking a day navigates away, so the monthly
                        # archive page is reloaded for every day.
                        page.goto(url, wait_until="domcontentloaded")
                        page.get_by_role("link", name=day, exact=True).click(timeout=5000)
                        # Wait until the archive list for that day is shown.
                        page.wait_for_url("**/archivelist/**", timeout=5000)
                        # `exact` is ignored for regex names, so it is omitted.
                        locators = page.get_by_role("link", name=headline_pattern).all()
                        for locator in locators:
                            locator.click(timeout=5000)

                            publish_date = ""
                            publish_date_element = page.query_selector(".jsdtTime")
                            if publish_date_element:
                                publish_date = convert_last_updated_to_datetime(
                                    publish_date_element.inner_text()
                                )
                            header = _inner_text_or_empty(page, ".artTitle")
                            content = _inner_text_or_empty(page, ".artData")

                            write_to_csv(publish_date, header, content)
                            page.go_back(timeout=5000, wait_until='domcontentloaded')
                    # NOTE(review): any failure abandons the remaining days of
                    # the current month. That suits non-existent days (e.g.
                    # the 31st of a 30-day month) but also skips the rest of
                    # the month after a transient error; confirm this is
                    # intended.
                    except PlaywrightTimeoutError as e:
                        print(f" ********** Timed out for {year}-{month}-{day} ********** {e}")
                        break
                    except Exception as e:
                        print(f" ********** An error occurred for {year}-{month}-{day} ********** {e}")
                        break
    finally:
        # Closing the browser also closes its contexts and pages, and makes
        # sure the Chromium process is released even on unexpected errors.
        browser.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment