Skip to content

Instantly share code, notes, and snippets.

@hmelenok
Created October 30, 2023 19:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hmelenok/60f194b1dc70a73c67acb7d718c57c58 to your computer and use it in GitHub Desktop.
Google Takeout YouTube watched video HTML to CSV script (exports to: Products,Video title,Video Link,Channel Name,Channel Link,Date)
"""Convert a Google Takeout YouTube watch-history HTML export to CSV.

Reads ``input.html`` (the Takeout "My Activity" page) and writes
``output.csv`` with one row per watched entry and the columns:
Products, Video title, Video Link, Channel Name, Channel Link, Date.
"""
from bs4 import BeautifulSoup
import csv


def extract_rows(html):
    """Parse the Takeout HTML and return one 6-tuple per activity entry.

    Each tuple is (product, video_title, video_link, channel_name,
    channel_link, date). Missing elements yield empty strings instead of
    raising, so one malformed entry cannot abort the whole export.
    """
    soup = BeautifulSoup(html, 'lxml')
    outer_cells = soup.select('.outer-cell')
    total_cells = len(outer_cells)
    print(f"Total outer cells found: {total_cells}\nStarting data extraction...")
    rows = []
    for index, outer_cell in enumerate(outer_cells, 1):
        # Product name (e.g. "YouTube"); guard against a missing title cell.
        title_el = outer_cell.select_one('.mdl-typography--title')
        product = title_el.get_text(strip=True) if title_el else ''
        # The body cell holds the links: first the video, then the channel.
        content_cell = outer_cell.select_one(
            '.content-cell.mdl-cell.mdl-cell--6-col.mdl-typography--body-1')
        links = content_cell.find_all('a') if content_cell else []
        video_title = links[0].get_text(strip=True) if links else ''
        # .get('href', '') avoids a KeyError on an <a> without an href.
        video_link = links[0].get('href', '') if links else ''
        channel_name = links[1].get_text(strip=True) if len(links) > 1 else ''
        channel_link = links[1].get('href', '') if len(links) > 1 else ''
        # The date is assumed to be the last text fragment in the cell.
        date = list(content_cell.stripped_strings)[-1] if content_cell else ''
        rows.append((product, video_title, video_link,
                     channel_name, channel_link, date))
        print(f"Processed cell {index} of {total_cells}")
    return rows


def write_csv(rows, path='output.csv'):
    """Write *rows* to *path* as CSV, preceded by a header line."""
    with open(path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Products', 'Video title', 'Video Link',
                         'Channel Name', 'Channel Link', 'Date'])
        writer.writerows(rows)


if __name__ == "__main__":
    # Load the content from the HTML file.
    with open('input.html', 'r', encoding='utf-8') as file:
        content = file.read()
    data = extract_rows(content)
    print("\nWriting data to output.csv...")
    write_csv(data)
    print("Data has been written to output.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment