Skip to content

Instantly share code, notes, and snippets.

@hmelenok
Created October 30, 2023 19:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hmelenok/60f194b1dc70a73c67acb7d718c57c58 to your computer and use it in GitHub Desktop.
Google Takeout YouTube watched video HTML to CSV script (exports to: Products,Video title,Video Link,Channel Name,Channel Link,Date)
"""Convert a Google Takeout YouTube watch-history HTML export to CSV.

Reads ``input.html`` (the Takeout "My Activity" page) and writes
``output.csv`` with one row per watched entry and the columns:
Products, Video title, Video Link, Channel Name, Channel Link, Date.
"""
from bs4 import BeautifulSoup
import csv


def extract_rows(html):
    """Parse the Takeout HTML and return one 6-tuple per activity entry.

    Each tuple is (product, video_title, video_link, channel_name,
    channel_link, date). Missing elements yield empty strings instead of
    raising, so one malformed entry cannot abort the whole export.
    """
    soup = BeautifulSoup(html, 'lxml')
    outer_cells = soup.select('.outer-cell')
    total_cells = len(outer_cells)
    print(f"Total outer cells found: {total_cells}\nStarting data extraction...")
    rows = []
    for index, outer_cell in enumerate(outer_cells, 1):
        # Product name (e.g. "YouTube"); guard against a missing title cell.
        title_el = outer_cell.select_one('.mdl-typography--title')
        product = title_el.get_text(strip=True) if title_el else ''
        # The body cell holds the links: first the video, then the channel.
        content_cell = outer_cell.select_one(
            '.content-cell.mdl-cell.mdl-cell--6-col.mdl-typography--body-1')
        links = content_cell.find_all('a') if content_cell else []
        video_title = links[0].get_text(strip=True) if links else ''
        # .get('href', '') avoids a KeyError on an <a> without an href.
        video_link = links[0].get('href', '') if links else ''
        channel_name = links[1].get_text(strip=True) if len(links) > 1 else ''
        channel_link = links[1].get('href', '') if len(links) > 1 else ''
        # The date is assumed to be the last text fragment in the cell.
        date = list(content_cell.stripped_strings)[-1] if content_cell else ''
        rows.append((product, video_title, video_link,
                     channel_name, channel_link, date))
        print(f"Processed cell {index} of {total_cells}")
    return rows


def write_csv(rows, path='output.csv'):
    """Write *rows* to *path* as CSV, preceded by a header line."""
    with open(path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Products', 'Video title', 'Video Link',
                         'Channel Name', 'Channel Link', 'Date'])
        writer.writerows(rows)


if __name__ == "__main__":
    # Load the content from the HTML file.
    with open('input.html', 'r', encoding='utf-8') as file:
        content = file.read()
    data = extract_rows(content)
    print("\nWriting data to output.csv...")
    write_csv(data)
    print("Data has been written to output.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment