Korayem/cleant_html_and_convert_to_html.js

## cleant_html_and_convert_to_html.js
/*
Generated by GPT-4-turbo

------------------------------------------------------------------------------------------------------------
Prompt 1:
let’s create a function that cleans up an input html
1. Remove all scripts tags
2. Remove all iframes tags
3. Remove `<head>`
4. Remove CSS style tags
5. Remove `<link>` tag
6. Remove `<noscript>` tags
7. remove any properties on html and body tags
8. remove <!DOCTYPE html> even if it has attributes
9. Remove any html comments
10. Clean any attributes from all tags
11. finally remove any tags with no content inside like `<div></div>`
12. don’t use any loops like `while`


Note the runtime I am in doesnt support DOMParser or node packages. The answer should be vanillaJS.

Take a deep breath and write it in Javascript. the function name is `cleanHtml()`

------------------------------------------------------------------------------------------------------------

Prompt 2:
now write a method htmlToMarkdown() to convert output to markdown format

*/

/**
 * Reduces an HTML string to bare, attribute-free markup using regular expressions only
 * (no DOMParser and no external packages).
 *
 * In order, it removes <script>, <iframe>, <head>, <style> and <noscript> elements together
 * with their content, <link> tags, the DOCTYPE declaration and HTML comments; strips the
 * attributes from every remaining opening tag; and finally deletes elements that have
 * nothing between their opening and closing tags, repeating until none are left so that
 * nested empties such as `<div><span></span></div>` disappear entirely.
 *
 * Limitations: this is pattern matching, not parsing. A `>` inside a quoted attribute value
 * ends the tag early, and elements containing only whitespace are not treated as empty.
 *
 * @param {string} html - Markup to clean. `null` and `undefined` yield an empty string;
 *     any other value is converted with `String()`.
 * @returns {string} The cleaned markup.
 */
function cleanHtml(html) {
    if (html == null) {
        return '';
    }

    // Matches an element with nothing between its tags, e.g. `<div></div>`.
    const EMPTY_ELEMENT = /<([a-zA-Z][a-zA-Z0-9-]*)><\/\1\s*>/gi;

    // Removing an empty element can leave its parent empty, so keep pruning until the
    // markup stops changing. Recursion is used instead of a loop, in keeping with the
    // "no while loops" constraint stated in the file header.
    const pruneEmptyElements = (markup) => {
        const pruned = markup.replace(EMPTY_ELEMENT, '');
        return pruned === markup ? pruned : pruneEmptyElements(pruned);
    };

    // The `(?=[\s\/>])` lookahead pins the end of the tag name, so `<head` cannot match
    // `<header>` (and likewise for the other names).
    const stripped = String(html)
        .replace(/<script(?=[\s\/>])[^>]*>[\s\S]*?<\/script\s*>/gi, '')
        .replace(/<iframe(?=[\s\/>])[^>]*>[\s\S]*?<\/iframe\s*>/gi, '')
        .replace(/<head(?=[\s\/>])[^>]*>[\s\S]*?<\/head\s*>/gi, '')
        .replace(/<style(?=[\s\/>])[^>]*>[\s\S]*?<\/style\s*>/gi, '')
        .replace(/<link(?=[\s\/>])[^>]*>/gi, '')
        .replace(/<noscript(?=[\s\/>])[^>]*>[\s\S]*?<\/noscript\s*>/gi, '')
        .replace(/<!DOCTYPE[^>]*>/gi, '')
        .replace(/<!--[\s\S]*?-->/g, '')
        // Strips attributes from every opening tag (this covers <html> and <body> too).
        // Hyphens are part of the name so custom elements like <my-widget> stay intact.
        .replace(/<([a-zA-Z][a-zA-Z0-9-]*)[^>]*>/g, '<$1>');

    return pruneEmptyElements(stripped);
}

/**
 * Converts a subset of HTML to Markdown using regular expressions only.
 *
 * Handled elements: h1-h6, p, br, strong/b, em/i, ul/ol with li, and a[href]. Tags may carry
 * attributes and their content may span several lines. Every other tag is stripped and its
 * text kept. Runs of three or more newlines are collapsed to one blank line and the result
 * is trimmed.
 *
 * Limitations: nested lists are not supported (a list ends at the first closing tag of its
 * kind), and HTML entities are left as they are.
 *
 * @param {string} html - Markup to convert. `null` and `undefined` yield an empty string;
 *     any other value is converted with `String()`.
 * @returns {string} The Markdown text.
 */
function htmlToMarkdown(html) {
    if (html == null) {
        return '';
    }

    // Builds a pattern for a whole `<tag ...>content</tag>` element. Group 1 is the tag name
    // as written, group 2 the content. `[\s\S]` lets content span lines, and requiring
    // whitespace or `>` right after the name keeps `b` from matching `<br>` or `<body>`.
    const element = (tag) => new RegExp(`<(${tag})(?:\\s[^>]*)?>([\\s\\S]*?)</\\1\\s*>`, 'gi');

    // An anchor whose href is single- or double-quoted and may appear after other attributes.
    // Group 1/2 hold the double-/single-quoted URL, group 3 the link text.
    const LINK = /<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]+)"|'([^']+)')[^>]*>([\s\S]*?)<\/a\s*>/gi;

    // Rewrites each <li> of a list body as one Markdown line; `nextMarker` supplies the
    // bullet or the running number.
    const renderItems = (listBody, nextMarker) =>
        listBody.replace(element('li'), (_match, _tag, item) => `${nextMarker()} ${item.trim()}\n`);

    const markdown = String(html)
        // Headings must start a line and be followed by a blank line, otherwise the next
        // block would be glued onto the heading text.
        .replace(element('h[1-6]'), (_match, tag, content) => {
            const hashes = '#'.repeat(Number(tag.charAt(1)));
            return `\n\n${hashes} ${content.trim()}\n\n`;
        })
        .replace(element('p'), (_match, _tag, content) => `${content}\n\n`)
        .replace(/<br(?:\s[^>]*)?\/?>/gi, '\n')
        .replace(element('strong'), (_match, _tag, content) => `**${content}**`)
        .replace(element('b'), (_match, _tag, content) => `**${content}**`)
        .replace(element('em'), (_match, _tag, content) => `*${content}*`)
        .replace(element('i'), (_match, _tag, content) => `*${content}*`)
        .replace(element('ul'), (_match, _tag, body) => `\n${renderItems(body, () => '*')}\n`)
        .replace(element('ol'), (_match, _tag, body) => {
            let counter = 0;
            const items = renderItems(body, () => `${++counter}.`);
            return `\n${items}\n`;
        })
        .replace(LINK, (_match, doubleQuoted, singleQuoted, text) => `[${text}](${doubleQuoted || singleQuoted})`)
        // Whatever markup is left has no Markdown equivalent here; keep only its text.
        .replace(/<[^>]+>/g, '');

    return markdown.replace(/\n{3,}/g, '\n\n').trim();
}
	/*
	Generated by GPT-4-turbo

	------------------------------------------------------------------------------------------------------------
	Prompt 1:
	let’s create a function that cleans up an input html
	1. Remove all scripts tags
	2. Remove all iframes tags
	3. Remove `<head>`
	4. Remove CSS style tags
	5. Remove `<link>` tag
	6. Remove `<noscript>` tags
	7. remove any properties on html and body tags
	8. remove <!DOCTYPE html> even if it has attributes
	9. Remove any html comments
	10. Clean any attributes from all tags
	11. finally remove any tags with no content inside like `<div></div>`
	12. don’t use any loops like `while`


	Note the runtime I am in doesnt support DOMParser or node packages. The answer should be vanillaJS.

	Take a deep breath and write it in Javascript. the function name is `cleanHtml()`

	------------------------------------------------------------------------------------------------------------

	Prompt 2:
	now write a method htmlToMarkdown() to convert output to markdown format

	*/

	/**
	 * Reduces an HTML string to bare, attribute-free markup using regular expressions only
	 * (no DOMParser and no external packages).
	 *
	 * In order, it removes <script>, <iframe>, <head>, <style> and <noscript> elements together
	 * with their content, <link> tags, the DOCTYPE declaration and HTML comments; strips the
	 * attributes from every remaining opening tag; and finally deletes elements that have
	 * nothing between their opening and closing tags, repeating until none are left so that
	 * nested empties such as `<div><span></span></div>` disappear entirely.
	 *
	 * Limitations: this is pattern matching, not parsing. A `>` inside a quoted attribute value
	 * ends the tag early, and elements containing only whitespace are not treated as empty.
	 *
	 * @param {string} html - Markup to clean. `null` and `undefined` yield an empty string;
	 *     any other value is converted with `String()`.
	 * @returns {string} The cleaned markup.
	 */
	function cleanHtml(html) {
		if (html == null) {
			return '';
		}

		// Matches an element with nothing between its tags, e.g. `<div></div>`.
		const EMPTY_ELEMENT = /<([a-zA-Z][a-zA-Z0-9-]*)><\/\1\s*>/gi;

		// Removing an empty element can leave its parent empty, so keep pruning until the
		// markup stops changing. Recursion is used instead of a loop, in keeping with the
		// "no while loops" constraint stated in the file header.
		const pruneEmptyElements = (markup) => {
			const pruned = markup.replace(EMPTY_ELEMENT, '');
			return pruned === markup ? pruned : pruneEmptyElements(pruned);
		};

		// The `*` quantifiers are essential: `[^>]*` skips any attributes and `[\s\S]*?`
		// lazily spans the element's content. The `(?=[\s\/>])` lookahead pins the end of
		// the tag name, so `<head` cannot match `<header>` (and likewise for the others).
		const stripped = String(html)
			.replace(/<script(?=[\s\/>])[^>]*>[\s\S]*?<\/script\s*>/gi, '')
			.replace(/<iframe(?=[\s\/>])[^>]*>[\s\S]*?<\/iframe\s*>/gi, '')
			.replace(/<head(?=[\s\/>])[^>]*>[\s\S]*?<\/head\s*>/gi, '')
			.replace(/<style(?=[\s\/>])[^>]*>[\s\S]*?<\/style\s*>/gi, '')
			.replace(/<link(?=[\s\/>])[^>]*>/gi, '')
			.replace(/<noscript(?=[\s\/>])[^>]*>[\s\S]*?<\/noscript\s*>/gi, '')
			.replace(/<!DOCTYPE[^>]*>/gi, '')
			.replace(/<!--[\s\S]*?-->/g, '')
			// Strips attributes from every opening tag (this covers <html> and <body> too).
			// Hyphens are part of the name so custom elements like <my-widget> stay intact.
			.replace(/<([a-zA-Z][a-zA-Z0-9-]*)[^>]*>/g, '<$1>');

		return pruneEmptyElements(stripped);
	}

	/**
	 * Converts a subset of HTML to Markdown using regular expressions only.
	 *
	 * Handled elements: h1-h6, p, br, strong/b, em/i, ul/ol with li, and a[href]. Tags may carry
	 * attributes and their content may span several lines. Every other tag is stripped and its
	 * text kept. Runs of three or more newlines are collapsed to one blank line and the result
	 * is trimmed.
	 *
	 * Limitations: nested lists are not supported (a list ends at the first closing tag of its
	 * kind), and HTML entities are left as they are.
	 *
	 * @param {string} html - Markup to convert. `null` and `undefined` yield an empty string;
	 *     any other value is converted with `String()`.
	 * @returns {string} The Markdown text.
	 */
	function htmlToMarkdown(html) {
		if (html == null) {
			return '';
		}

		// Builds a pattern for a whole `<tag ...>content</tag>` element. Group 1 is the tag name
		// as written, group 2 the content. `[\s\S]*?` lets content of any length span lines, and
		// requiring whitespace or `>` right after the name keeps `b` from matching `<br>`.
		const element = (tag) => new RegExp(`<(${tag})(?:\\s[^>]*)?>([\\s\\S]*?)</\\1\\s*>`, 'gi');

		// An anchor whose href is single- or double-quoted and may appear after other attributes.
		// Group 1/2 hold the double-/single-quoted URL, group 3 the link text.
		const LINK = /<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]+)"|'([^']+)')[^>]*>([\s\S]*?)<\/a\s*>/gi;

		// Rewrites each <li> of a list body as one Markdown line; `nextMarker` supplies the
		// bullet or the running number.
		const renderItems = (listBody, nextMarker) =>
			listBody.replace(element('li'), (_match, _tag, item) => `${nextMarker()} ${item.trim()}\n`);

		const markdown = String(html)
			// Headings must start a line and be followed by a blank line, otherwise the next
			// block would be glued onto the heading text.
			.replace(element('h[1-6]'), (_match, tag, content) => {
				const hashes = '#'.repeat(Number(tag.charAt(1)));
				return `\n\n${hashes} ${content.trim()}\n\n`;
			})
			.replace(element('p'), (_match, _tag, content) => `${content}\n\n`)
			.replace(/<br(?:\s[^>]*)?\/?>/gi, '\n')
			// Bold needs the marker on both sides of the content; italic likewise.
			.replace(element('strong'), (_match, _tag, content) => `**${content}**`)
			.replace(element('b'), (_match, _tag, content) => `**${content}**`)
			.replace(element('em'), (_match, _tag, content) => `*${content}*`)
			.replace(element('i'), (_match, _tag, content) => `*${content}*`)
			.replace(element('ul'), (_match, _tag, body) => `\n${renderItems(body, () => '*')}\n`)
			.replace(element('ol'), (_match, _tag, body) => {
				let counter = 0;
				const items = renderItems(body, () => `${++counter}.`);
				return `\n${items}\n`;
			})
			.replace(LINK, (_match, doubleQuoted, singleQuoted, text) => `[${text}](${doubleQuoted || singleQuoted})`)
			// Whatever markup is left has no Markdown equivalent here; keep only its text.
			.replace(/<[^>]+>/g, '');

		return markdown.replace(/\n{3,}/g, '\n\n').trim();
	}