redgeoff/linked-in-profile-extractor.py

## linked-in-profile-extractor.py
from bs4 import BeautifulSoup
import re
import csv

# Path of the saved HTML page that the profiles are extracted from.
html_file_path = '/mnt/data/lease-abstractions.html'

# Read the whole file into memory as UTF-8 text; the `with` block closes the
# handle as soon as the read finishes.
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Function to sanitize HTML content
def sanitize_html(content):
    r"""Return ``content`` with escaped comment markers removed and line breaks flattened.

    Two passes run in this order:

    1. Every literal marker of the form ``\x3C!--->`` (a backslash, ``x3C!``,
       three or more dashes, then ``>``) is deleted.
    2. Each run of newline, tab, or carriage-return characters becomes a
       single space, and the result is stripped of leading and trailing
       whitespace.
    """
    without_markers = re.sub(r'\\x3C!---+>', '', content)
    single_line = re.sub(r'[\n\t\r]+', ' ', without_markers)
    return single_line.strip()

# Clean the raw markup (escaped comment markers removed, line breaks and tabs
# collapsed to spaces) before it is handed to the parser.
sanitized_html = sanitize_html(html_content)

# Define the function to extract profiles using the sanitized HTML
def extract_profiles(html):
    """Collect one dict per result card found in ``html``.

    Every ``<div class="entity-result__item">`` yields a dict, even when
    nothing could be read from it. A key is only set when its source element
    exists:

    * ``"LinkedIn URL"`` - ``href`` of the first ``a.app-aware-link``, cut at
      the first ``?`` and stripped.
    * ``"Name"`` - ``alt`` text of the first ``img.presence-entity__image``,
      stripped.
    * ``"Title"`` / ``"Company"`` - text of the first
      ``div.entity-result__primary-subtitle`` with whitespace collapsed, split
      on the first ``" at "``; without that separator the whole text is the
      title and the company is ``''``.

    Returns the dicts as a list, in the order ``find_all`` produced the cards.
    """
    document = BeautifulSoup(html, 'html.parser')
    results = []

    for card in document.find_all("div", class_="entity-result__item"):
        record = {}

        # Profile link: keep only the part of the URL before the query string.
        anchor = card.find("a", class_="app-aware-link")
        if anchor and 'href' in anchor.attrs:
            url_without_query = anchor['href'].split('?')[0]
            record["LinkedIn URL"] = url_without_query.strip()

        # The name is taken from the image's alt text.
        photo = card.find("img", class_="presence-entity__image")
        if photo and 'alt' in photo.attrs:
            record["Name"] = photo['alt'].strip()

        # Subtitle: "<title> at <company>" when the separator is present.
        subtitle = card.find("div", class_="entity-result__primary-subtitle")
        if subtitle:
            words = subtitle.get_text(" ", strip=True).strip().split()
            headline = ' '.join(words)
            role, separator, employer = headline.partition(' at ')
            if separator:
                record["Title"] = role.strip()
                record["Company"] = employer.strip()
            else:
                record["Title"] = headline.strip()
                record["Company"] = ''

        results.append(record)

    return results

# Parse the sanitized markup into a list of per-profile dicts.
extracted_profiles = extract_profiles(sanitized_html)

# Destination of the CSV report.
output_csv_path = '/mnt/data/extracted_profiles.csv'

# Write one CSV row per profile. newline='' leaves line-ending handling to the
# csv module, as its documentation recommends for files passed to a writer.
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    # Column order of the output; keys a profile lacks are written as empty
    # cells (DictWriter's default restval).
    fieldnames = ['Name', 'LinkedIn URL', 'Title', 'Company']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for profile in extracted_profiles:
        # Replace the literal two-character sequence backslash + "n" (an
        # escaped newline left in the text, not a real line break) with a
        # space, then trim. Only existing keys are reassigned, so updating
        # the dict while iterating over it is safe.
        for key, value in profile.items():
            profile[key] = value.replace('\\n', ' ').strip()
        writer.writerow(profile)
	from bs4 import BeautifulSoup
	import re
	import csv

	# Path of the saved HTML page that the profiles are extracted from.
	html_file_path = '/mnt/data/lease-abstractions.html'

	# Read the whole file into memory as UTF-8 text; the `with` block closes
	# the handle as soon as the read finishes.
	with open(html_file_path, 'r', encoding='utf-8') as file:
		html_content = file.read()

	# Function to sanitize HTML content
	def sanitize_html(content):
		r"""Return ``content`` with escaped comment markers removed and line breaks flattened.

		Literal markers of the form ``\x3C!--->`` (a backslash, ``x3C!``, three
		or more dashes, then ``>``) are deleted first. Then each run of
		newline, tab, or carriage-return characters is replaced by a single
		space and the result is stripped of leading and trailing whitespace.
		"""
		# NOTE(review): a function with this same name is defined earlier in
		# this file; confirm that both copies are intended.

		# Remove specific pattern \x3C!---->
		content = re.sub(r'\\x3C!---+>', '', content)

		# Remove newlines, tabs, and any leading/trailing whitespace
		content = re.sub(r'[\n\t\r]+', ' ', content).strip()
		return content

	# Clean the raw markup (escaped comment markers removed, line breaks and
	# tabs collapsed to spaces) before it is handed to the parser.
	sanitized_html = sanitize_html(html_content)

	# Define the function to extract profiles using the sanitized HTML
	def extract_profiles(html):
		"""Collect one dict per result card found in ``html``.

		Every ``<div class="entity-result__item">`` yields a dict, even when
		nothing could be read from it. A key is only set when its source
		element exists: ``"LinkedIn URL"`` (link ``href`` without its query
		string), ``"Name"`` (image ``alt`` text), and ``"Title"`` /
		``"Company"`` (subtitle text split on the first ``" at "``; without
		that separator the whole text is the title and the company is ``''``).

		Returns the list of dicts.
		"""
		# NOTE(review): a function with this same name is defined earlier in
		# this file; confirm that both copies are intended.
		soup = BeautifulSoup(html, 'html.parser')
		profiles = []

		for entity in soup.find_all("div", class_="entity-result__item"):
			profile = {}

			# Extract LinkedIn URL without query parameters and strip whitespace
			link_tag = entity.find("a", class_="app-aware-link")
			if link_tag and 'href' in link_tag.attrs:
				profile["LinkedIn URL"] = link_tag['href'].split('?')[0].strip()

			# Extract Name and strip whitespace
			name_tag = entity.find("img", class_="presence-entity__image")
			if name_tag and 'alt' in name_tag.attrs:
				profile["Name"] = name_tag['alt'].strip()

			# Extract Title and Company and strip whitespace
			subtitle_tag = entity.find("div", class_="entity-result__primary-subtitle")
			if subtitle_tag:
				subtitle_text = subtitle_tag.get_text(" ", strip=True).strip()
				# Collapse any internal runs of whitespace to single spaces.
				subtitle_text = ' '.join(subtitle_text.split())
				if ' at ' in subtitle_text:
					title, company = subtitle_text.split(' at ', 1)
					profile["Title"] = title.strip()
					profile["Company"] = company.strip()
				else:
					profile["Title"] = subtitle_text.strip()
					profile["Company"] = ''

			profiles.append(profile)

		return profiles

	# Parse the sanitized markup into a list of per-profile dicts.
	extracted_profiles = extract_profiles(sanitized_html)

	# Destination of the CSV report.
	output_csv_path = '/mnt/data/extracted_profiles.csv'

	# Write one CSV row per profile. newline='' leaves line-ending handling to
	# the csv module, as its documentation recommends for files passed to a writer.
	with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
		# Column order of the output; keys a profile lacks are written as
		# empty cells (DictWriter's default restval).
		fieldnames = ['Name', 'LinkedIn URL', 'Title', 'Company']
		writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

		writer.writeheader()
		for profile in extracted_profiles:
			# Replace the literal two-character sequence backslash + "n" (an
			# escaped newline left in the text, not a real line break) with
			# a space, then trim. Only existing keys are reassigned, so
			# updating the dict while iterating over it is safe.
			for key, value in profile.items():
				profile[key] = value.replace('\\n', ' ').strip()
			writer.writerow(profile)