# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @makkoncept
# Last active March 25, 2019 19:34
# Show Gist options
#   • Save makkoncept/45303a19fc8e8ee45d7f96555e7e9e76 to your computer and use it in GitHub Desktop.
import html2text
from bs4 import BeautifulSoup
import os
from shutil import copy
# Converts every ``_posts/*.html`` file (front matter delimited by ``---``
# followed by an HTML body) into ``tech-blog-archives/<slug>/contents.lr``,
# with the body converted to Markdown by html2text and locally available
# images copied next to the generated file.

POSTS_DIR = "_posts"
IMAGES_DIR = "img"
ARCHIVE_DIR = "tech-blog-archives"
HTML_SUFFIX = ".html"
# Number of leading characters dropped from a post file name to get its slug.
# NOTE(review): 11 matches a ``YYYY-MM-DD-`` date prefix -- confirm against
# the actual file names in ``_posts``.
DATE_PREFIX_LEN = 11
OLD_BLOG_HOST = "techblog.creativecommons.org"


def post_slug(file_name):
    """Return *file_name* without its fixed-width prefix and ``.html`` suffix."""
    return file_name[DATE_PREFIX_LEN:-len(HTML_SUFFIX)]


def split_front_matter(text):
    """Split a post into ``(metadata, html)``.

    Only the first two ``---`` markers are treated as delimiters, so a
    ``---`` occurring inside the body no longer truncates the post.

    Returns ``None`` when the text does not contain two markers.
    """
    parts = text.split("---", 2)
    if len(parts) < 3:
        return None
    return parts[1], parts[2]


def unquote_scalar(value):
    """Strip one pair of matching quotes from a front-matter value.

    Inside single quotes a doubled ``''`` is an escaped apostrophe and is
    collapsed to ``'``.
    """
    if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
        inner = value[1:-1]
        return inner.replace("''", "'") if value[0] == "'" else inner
    return value


def collect_list_items(lines):
    """Return the values of the leading run of ``- item`` lines in *lines*.

    Collection stops at the first line that is not a list item, so a
    following ``tags:`` key (or the end of the metadata) ends the list
    instead of raising ``IndexError``.
    """
    items = []
    for line in lines:
        entry = line.strip()
        if not entry.startswith("- "):
            break
        items.append(unquote_scalar(entry[2:].strip()))
    return items


def parse_metadata(metadata):
    """Extract title, publication date, author and categories.

    Returns a dict that may contain the keys ``title``, ``pub_date``,
    ``author`` and ``categories`` (a comma-separated string); keys whose
    line is absent from *metadata* are omitted.
    """
    lines = metadata.split("\n")
    meta_info = {}
    for idx, line in enumerate(lines):
        stripped = line.strip()
        if line.startswith("title:"):
            meta_info["title"] = unquote_scalar(line[len("title:"):].strip())
        elif line.startswith("date:"):
            # Keep only the first token (the date), dropping any time part.
            tokens = unquote_scalar(line[len("date:"):].strip()).split()
            meta_info["pub_date"] = tokens[0] if tokens else ""
        elif stripped.startswith("display_name:"):
            # NOTE(review): only the first word of the display name is kept,
            # as before; a multi-word name would be truncated -- confirm
            # whether that is intended.
            tokens = stripped[len("display_name:"):].split()
            meta_info["author"] = tokens[0] if tokens else ""
        elif line.startswith("categories:"):
            if stripped == "categories:":
                items = collect_list_items(lines[idx + 1:])
                meta_info["categories"] = ", ".join(items)
            else:
                # Inline forms such as ``categories: []`` yield no categories.
                meta_info["categories"] = ""
    return meta_info


def localize_images(html, dir_path):
    """Point local ``<img>`` tags at bare file names and copy the files.

    Each image whose ``src`` is a relative or site-absolute path is rewritten
    to its last path component; when ``img/<name>`` exists it is copied into
    *dir_path*. Images without a ``src`` and remote URLs are left untouched.

    Returns the updated HTML.
    """
    soup = BeautifulSoup(html, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src or "://" in src or src.startswith("//"):
            continue
        file_name = src.rstrip("/").split("/")[-1]
        if not file_name:
            continue
        img["src"] = file_name
        img_path = os.path.join(IMAGES_DIR, file_name)
        if os.path.isfile(img_path):
            copy(img_path, dir_path)
    # Serialize once, after all tags have been rewritten.
    return str(soup)


def escape_field_separators(text):
    """Prefix dash-only lines with one more dash.

    A line consisting of three or more dashes would otherwise be read as a
    field separator in the generated ``contents.lr``.
    """
    escaped = []
    for line in text.split("\n"):
        bare = line.strip()
        if len(bare) >= 3 and bare == "-" * len(bare):
            line = "-" + line
        escaped.append(line)
    return "\n".join(escaped)


def render_contents(meta_info):
    """Render *meta_info* as the text of a ``contents.lr`` file.

    Missing fields are written as empty values rather than the text ``None``.
    """
    body = escape_field_separators(meta_info.get("body", ""))
    return f"""title: {meta_info.get('title', '')}
---
categories: {meta_info.get('categories', '')}
---
author: {meta_info.get('author', '')}
---
body:
{body}
---
pub_date: {meta_info.get('pub_date', '')}
"""


def convert_post(html_file):
    """Convert one file from ``_posts`` into ``<archive>/<slug>/contents.lr``."""
    html_file_path = os.path.join(POSTS_DIR, html_file)
    slug = post_slug(html_file)
    if not slug:
        # An empty slug would write contents.lr into the archive root.
        print(f"skipped {html_file_path}: file name has no slug")
        return

    with open(html_file_path, "r", encoding="utf-8") as f:
        sections = split_front_matter(f.read())
    if sections is None:
        print(f"skipped {html_file_path}: no front matter found")
        return
    metadata, html = sections

    dir_path = os.path.join(ARCHIVE_DIR, slug)
    os.makedirs(dir_path, exist_ok=True)

    if OLD_BLOG_HOST in html:
        # Links to the old blog follow no definite pattern, so they are not
        # rewritten automatically; list the file for manual fixing instead.
        print(html_file_path)

    if "<img" in html:
        html = localize_images(html, dir_path)

    meta_info = parse_metadata(metadata)
    converter = html2text.HTML2Text()
    converter.body_width = 0  # 0 disables hard-wrapping of the output
    meta_info["body"] = converter.handle(html)

    with open(os.path.join(dir_path, "contents.lr"), "w", encoding="utf-8") as f:
        f.write(render_contents(meta_info))


def main():
    """Convert every ``.html`` post found in ``_posts``."""
    for file_name in sorted(os.listdir(POSTS_DIR)):
        if file_name.endswith(HTML_SUFFIX):
            convert_post(file_name)


if __name__ == "__main__":
    main()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment