makkoncept/CCimportblogposts.py

## CCimportblogposts.py
import html2text
from bs4 import BeautifulSoup
import os
from shutil import copy

path = os.path.join(os.getcwd(), "_posts")
_posts_files_list = os.listdir(path)
html_list = [file for file in _posts_files_list if ".html" in file]


for html_file in html_list:
    html_file_path = os.path.join("_posts", html_file)

    dir_path = os.path.join("tech-blog-archives", html_file[11 : len(html_file) - 5])
    os.makedirs(dir_path, exist_ok=True)

    with open(html_file_path, "r") as f:
        x = f.read()

    h = html2text.HTML2Text()
    h.body_width = 0
    y = x.split("---")
    html = y[2]
    metadata = y[1]

    if "techblog.creativecommons.org" in html:
        # change the href manually, as there is no definite pattern
        print(html_file_path)
    # soup = BeautifulSoup(html, "html.parser")
    # links_list = soup.find_all("a")
    # for link in links_list:
    #     if "techblog.creativecommons.org" in link.get("href"):
    #         old_href = link.get("href")
    # link["href"] = "https://creativecommons.github.io/tech-blog-archives"
    #         print(link)
    # html = str(soup)
    # print("found the link")
    # print(html_file_path)
    # break

    if "<img" in html:
        # print(html_file_path)
        soup = BeautifulSoup(html, "html.parser")
        img_list = soup.find_all("img")
        for img in img_list:
            src = img.get("src").split("/")[1]
            img["src"] = src
            html = str(soup)
            img_path = os.path.join("img", src)
            if os.path.isfile(img_path):
                copy(img_path, dir_path)
    #             yes_number += 1
    #         else:
    #             no_number += 1

    metadata_list = metadata.split("\n")
    meta_info = {}
    for info in metadata_list:
        if "title:" in info:
            title_list = info.split(" ")
            title = " ".join(title_list[1:])
            if title[0] == "'" and title[-1] == "'":
                title = title[1 : len(title) - 1]
            meta_info["title"] = title
        elif "date:" in info:
            info_list = info.split(" ")
            meta_info["pub_date"] = info_list[1]
        elif "display_name:" in info:
            display_name_list = info.strip().split(" ")
            author = display_name_list[1]
            meta_info["author"] = author
        elif "categories:" in info:
            try:
                idx = metadata_list.index("categories:")
                categories_list = []
                for i in range(idx + 1, len(metadata_list)):
                    if "tags:" in metadata_list[i]:
                        break
                    categories_list.append(metadata_list[i].split(" ")[1])
                categories_string = ", ".join(categories_list)
                meta_info["categories"] = categories_string

            except ValueError:
                meta_info["categories"] = ""

    body = h.handle(html)
    meta_info["body"] = body

    content_string = f"""title: {meta_info.get('title')}
---
categories: {meta_info.get('categories')}
---
author: {meta_info.get('author')}
---
body:

{meta_info.get('body')}
---
pub_date: {meta_info.get('pub_date')}
"""

    path_of_file = os.path.join(dir_path, "contents.lr")
    with open(os.path.join(dir_path, "contents.lr"), "w") as f:
        f.write(content_string)
	import html2text
	from bs4 import BeautifulSoup
	import os
	from shutil import copy

	# Directory layout, relative to the current working directory.
	POSTS_DIR = "_posts"  # input: *.html posts made of front matter + HTML body
	IMAGES_DIR = "img"  # input: images referenced by the posts
	ARCHIVE_DIR = "tech-blog-archives"  # output: one sub-directory per post


	def _split_front_matter(text):
	    """Split a post into ``(metadata, html)``.

	    The text is expected to look like ``---<metadata>---<html>``.  Only the
	    first two ``---`` markers are treated as delimiters, so a ``---`` that
	    appears later in the HTML body does not truncate the body.

	    Raises:
	        ValueError: if the text does not contain two ``---`` markers.
	    """
	    parts = text.split("---", 2)
	    if len(parts) < 3:
	        raise ValueError("no '---' delimited front matter found")
	    return parts[1], parts[2]


	def _unquote(value):
	    """Return *value* without one pair of matching surrounding quotes."""
	    if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
	        inner = value[1:-1]
	        # Inside a single-quoted YAML scalar a literal quote is written ''.
	        return inner.replace("''", "'") if value[0] == "'" else inner
	    return value


	def _list_items(lines):
	    """Return the leading run of ``- item`` entries found in *lines*.

	    Collection stops at the first line that is not a list entry, so the list
	    does not have to be followed by a ``tags:`` key.
	    """
	    items = []
	    for raw_line in lines:
	        line = raw_line.strip()
	        if not line.startswith("-"):
	            break
	        item = _unquote(line[1:].strip())
	        if item:
	            items.append(item)
	    return items


	def _parse_front_matter(metadata):
	    """Extract the fields needed for ``contents.lr`` from the front matter.

	    Keys are matched at the start of the (stripped) line rather than anywhere
	    in it, so a value that merely contains e.g. ``date:`` is not mistaken for
	    the key.

	    Returns:
	        A dict holding any of ``title``, ``pub_date``, ``author`` and
	        ``categories`` that were found:

	        * ``title``      -- full value, surrounding quotes removed
	        * ``pub_date``   -- first whitespace-separated token of ``date``
	        * ``author``     -- full value of ``display_name``
	        * ``categories`` -- list entries joined with ``", "``; an empty
	          string when ``categories:`` carries an inline value instead of a
	          block list
	    """
	    lines = metadata.split("\n")
	    meta_info = {}
	    for idx, raw_line in enumerate(lines):
	        line = raw_line.strip()
	        if line.startswith("title:"):
	            meta_info["title"] = _unquote(line[len("title:"):].strip())
	        elif line.startswith("date:"):
	            tokens = _unquote(line[len("date:"):].strip()).split()
	            if tokens:
	                meta_info["pub_date"] = tokens[0]
	        elif line.startswith("display_name:"):
	            name = _unquote(line[len("display_name:"):].strip())
	            if name:
	                meta_info["author"] = name
	        elif line.startswith("categories:"):
	            if line == "categories:":
	                items = _list_items(lines[idx + 1:])
	                meta_info["categories"] = ", ".join(items)
	            else:
	                meta_info["categories"] = ""
	    return meta_info


	def _localize_images(html, dir_path):
	    """Point ``<img>`` tags at files next to the post and copy those files.

	    For every image whose ``src`` has a usable second path component, ``src``
	    is replaced by that component and, when ``img/<component>`` exists, the
	    file is copied into *dir_path*.  Images without ``src``, without a ``/``
	    in it, or with an empty second component (e.g. ``http://...``) are left
	    untouched instead of crashing or being blanked.

	    Returns:
	        The re-serialized HTML, or *html* unchanged if no tag was rewritten.
	    """
	    soup = BeautifulSoup(html, "html.parser")
	    rewritten = False
	    for img in soup.find_all("img"):
	        src = img.get("src")
	        if not src:
	            continue
	        # NOTE(review): assumes sources look like "<dir>/<file>" so that the
	        # second component is the file name -- confirm against the posts.
	        components = src.split("/")
	        if len(components) < 2 or not components[1]:
	            continue
	        file_name = components[1]
	        img["src"] = file_name
	        rewritten = True
	        img_path = os.path.join(IMAGES_DIR, file_name)
	        if os.path.isfile(img_path):
	            copy(img_path, dir_path)
	    # Serialize once, after all tags have been updated.
	    return str(soup) if rewritten else html


	def _render_contents(meta_info):
	    """Render *meta_info* as the text of a ``contents.lr`` file.

	    Missing fields are written as empty values rather than the string
	    ``None``.
	    """
	    return (
	        f"title: {meta_info.get('title', '')}\n"
	        "---\n"
	        f"categories: {meta_info.get('categories', '')}\n"
	        "---\n"
	        f"author: {meta_info.get('author', '')}\n"
	        "---\n"
	        "body:\n"
	        "\n"
	        f"{meta_info.get('body', '')}\n"
	        "---\n"
	        f"pub_date: {meta_info.get('pub_date', '')}\n"
	    )


	def main():
	    """Convert every ``_posts/*.html`` file into ``<slug>/contents.lr``."""
	    html_list = [
	        name for name in os.listdir(POSTS_DIR) if name.endswith(".html")
	    ]

	    for html_file in html_list:
	        html_file_path = os.path.join(POSTS_DIR, html_file)

	        # Drop the first 11 characters (presumably a "YYYY-MM-DD-" date
	        # prefix -- confirm against the file names) and the ".html" suffix.
	        dir_path = os.path.join(ARCHIVE_DIR, html_file[11:-5])
	        os.makedirs(dir_path, exist_ok=True)

	        with open(html_file_path, "r", encoding="utf-8") as f:
	            text = f.read()

	        try:
	            metadata, html = _split_front_matter(text)
	        except ValueError as err:
	            # Re-raise with the file name so the offending post can be found.
	            raise ValueError(f"{html_file_path}: {err}") from err

	        if "techblog.creativecommons.org" in html:
	            # These links follow no definite pattern, so the file is only
	            # reported here and the href has to be changed manually.
	            print(html_file_path)

	        if "<img" in html:
	            html = _localize_images(html, dir_path)

	        meta_info = _parse_front_matter(metadata)

	        converter = html2text.HTML2Text()
	        converter.body_width = 0  # 0 turns off html2text's line wrapping
	        meta_info["body"] = converter.handle(html)

	        contents_path = os.path.join(dir_path, "contents.lr")
	        with open(contents_path, "w", encoding="utf-8") as f:
	            f.write(_render_contents(meta_info))


	if __name__ == "__main__":
	    main()